acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of acryl-datahub is flagged as potentially problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +26 -23
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +125 -27
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +153 -229
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +37 -6
- datahub/cli/specific/datacontract_cli.py +54 -7
- datahub/cli/specific/dataproduct_cli.py +2 -15
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +172 -3
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/common.py +40 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +8 -4
- datahub/emitter/rest_emitter.py +103 -30
- datahub/entrypoints.py +6 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +165 -58
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +330 -25
- datahub/ingestion/graph/config.py +3 -2
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +81 -11
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +13 -5
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +6 -8
- datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
- datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/common/subtypes.py +53 -0
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
- datahub/ingestion/source/datahub/config.py +12 -9
- datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
- datahub/ingestion/source/datahub/datahub_source.py +10 -0
- datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
- datahub/ingestion/source/dbt/dbt_common.py +224 -9
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +132 -98
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +66 -7
- datahub/ingestion/source/fivetran/fivetran.py +227 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
- datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gcs/gcs_source.py +32 -4
- datahub/ingestion/source/ge_data_profiler.py +108 -31
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/api.py +28 -1
- datahub/ingestion/source/hex/hex.py +16 -5
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +123 -59
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
- datahub/ingestion/source/looker/looker_common.py +148 -79
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +503 -547
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +96 -117
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +9 -9
- datahub/ingestion/source/mlflow.py +12 -2
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +26 -5
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
- datahub/ingestion/source/nifi.py +2 -2
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +47 -21
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +10 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +6 -5
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +449 -248
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +10 -16
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
- datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
- datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +217 -25
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +24 -8
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +19 -20
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +336 -57
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +5 -5
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +22 -0
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +11 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/superset.py +314 -67
- datahub/ingestion/source/tableau/tableau.py +135 -59
- datahub/ingestion/source/tableau/tableau_common.py +9 -2
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +160 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
- datahub/ingestion/source/usage/usage_common.py +4 -3
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +6806 -4871
- datahub/metadata/_urns/urn_defs.py +1767 -1539
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18395 -16979
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +249 -5
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +56 -2
- datahub/sdk/entity_client.py +111 -9
- datahub/sdk/lineage_client.py +663 -82
- datahub/sdk/main_client.py +50 -16
- datahub/sdk/mlmodel.py +120 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +7 -3
- datahub/sdk/search_filters.py +304 -36
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +39 -59
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
- datahub/sql_parsing/sqlglot_lineage.py +196 -42
- datahub/sql_parsing/sqlglot_utils.py +12 -4
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/telemetry/telemetry.py +28 -14
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +73 -17
- datahub/utilities/file_backed_collections.py +8 -9
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +10 -1
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/feast.py
CHANGED
@@ -1,5 +1,6 @@
+import pathlib
 from dataclasses import dataclass
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Union

 import feast.types
 from feast import (
@@ -97,7 +98,7 @@ class FeastRepositorySourceConfig(
     StatefulIngestionConfigBase,
 ):
     path: str = Field(description="Path to Feast repository")
-    fs_yaml_file: Optional[
+    fs_yaml_file: Optional[pathlib.Path] = Field(
         default=None,
         description="Path to the `feature_store.yaml` file used to configure the feature store",
     )
@@ -142,17 +143,14 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     - Column types associated with each entity and feature
     """

-    platform = "feast"
-    source_config: FeastRepositorySourceConfig
-    report: StaleEntityRemovalSourceReport
-    feature_store: FeatureStore
+    platform: ClassVar[str] = "feast"

     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
-        self.source_config = config
-        self.ctx = ctx
-        self.report = StaleEntityRemovalSourceReport()
-        self.feature_store = FeatureStore(
+        self.source_config: FeastRepositorySourceConfig = config
+        self.ctx: PipelineContext = ctx
+        self.report: StaleEntityRemovalSourceReport = StaleEntityRemovalSourceReport()
+        self.feature_store: FeatureStore = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
         )
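The notable change above is `platform` becoming a `ClassVar[str]`. As a minimal sketch of the general semantics (illustrative only, using a plain dataclass rather than DataHub's actual class hierarchy): a `ClassVar` annotation marks the attribute as class-level metadata, so field-generating machinery skips it and it behaves as a shared constant rather than a per-instance field.

from dataclasses import dataclass, fields
from typing import ClassVar

@dataclass
class Demo:
    # ClassVar: class-level constant, excluded from __init__ and fields()
    platform: ClassVar[str] = "feast"
    path: str = "."

d = Demo(path="/repo")
print([f.name for f in fields(d)])  # ['path'] -- platform is not an instance field
print(Demo.platform)                # 'feast'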
datahub/ingestion/source/file.py
CHANGED
@@ -18,7 +18,9 @@ from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -187,6 +189,7 @@ class FileSourceReport(StaleEntityRemovalSourceReport):
 @platform_name("Metadata File")
 @config_class(FileSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin pulls metadata from a previously generated file.
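The `@capability(...)` decorator stacked above the source class records capability metadata on the class itself. A hedged sketch of that general decorator pattern (names and storage are illustrative, not DataHub's actual implementation):

from typing import Callable, Dict, Type

def capability(name: str, description: str) -> Callable[[Type], Type]:
    """Append a capability entry to a per-class registry."""
    def wrapper(cls: Type) -> Type:
        caps: Dict[str, str] = dict(getattr(cls, "__capabilities__", {}))
        caps[name] = description
        cls.__capabilities__ = caps  # copy so base-class registries stay intact
        return cls
    return wrapper

@capability("TEST_CONNECTION", "Enabled by default")
class GenericFileSource:
    pass

print(GenericFileSource.__capabilities__)  # {'TEST_CONNECTION': 'Enabled by default'}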
datahub/ingestion/source/fivetran/config.py
CHANGED
@@ -29,6 +29,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.ingestion.source.unity.connection import UnityCatalogConnectionConfig
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer

@@ -56,8 +57,8 @@ class Constant:
     STATUS = "status"
     USER_ID = "user_id"
     EMAIL = "email"
-    CONNECTOR_ID = "
-    CONNECTOR_NAME = "
+    CONNECTOR_ID = "connection_id"
+    CONNECTOR_NAME = "connection_name"
     CONNECTOR_TYPE_ID = "connector_type_id"
     PAUSED = "paused"
     SYNC_FREQUENCY = "sync_frequency"
@@ -67,13 +68,22 @@ class Constant:
     SUCCESSFUL = "SUCCESSFUL"
     FAILURE_WITH_TASK = "FAILURE_WITH_TASK"
     CANCELED = "CANCELED"
+    GOOGLE_SHEETS_CONNECTOR_TYPE = "google_sheets"


+# Key: Connector Type, Value: Platform ID/Name
 KNOWN_DATA_PLATFORM_MAPPING = {
+    "google_cloud_postgresql": "postgres",
     "postgres": "postgres",
     "snowflake": "snowflake",
+    Constant.GOOGLE_SHEETS_CONNECTOR_TYPE: Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
 }

+# Note: (As of Oct 2025) Fivetran Platform Connector has stale lineage metadata for Google Sheets column data (deleted/renamed).
+# Ref: https://fivetran.com/docs/connectors/files/google-sheets#deletingdata
+# TODO: Remove Google Sheets connector type from DISABLE_LINEAGE_FOR_CONNECTOR_TYPES
+DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES = [Constant.GOOGLE_SHEETS_CONNECTOR_TYPE]
+

 class SnowflakeDestinationConfig(SnowflakeConnectionConfig):
     database: str = Field(description="The fivetran connector log database.")
@@ -84,10 +94,34 @@ class BigQueryDestinationConfig(BigQueryConnectionConfig):
     dataset: str = Field(description="The fivetran connector log dataset.")


+class DatabricksDestinationConfig(UnityCatalogConnectionConfig):
+    catalog: str = Field(description="The fivetran connector log catalog.")
+    log_schema: str = Field(description="The fivetran connector log schema.")
+
+    @pydantic.validator("warehouse_id")
+    def warehouse_id_should_not_be_empty(cls, warehouse_id: Optional[str]) -> str:
+        if warehouse_id is None or (warehouse_id and warehouse_id.strip() == ""):
+            raise ValueError("Fivetran requires warehouse_id to be set")
+        return warehouse_id
+
+
+class FivetranAPIConfig(ConfigModel):
+    api_key: str = Field(description="Fivetran API key")
+    api_secret: str = Field(description="Fivetran API secret")
+    base_url: str = Field(
+        default="https://api.fivetran.com", description="Fivetran API base URL"
+    )
+    request_timeout_sec: int = Field(
+        default=30, description="Request timeout in seconds"
+    )
+
+
 class FivetranLogConfig(ConfigModel):
-    destination_platform: Literal["snowflake", "bigquery"] =
-
-
+    destination_platform: Literal["snowflake", "bigquery", "databricks"] = (
+        pydantic.Field(
+            default="snowflake",
+            description="The destination platform where fivetran connector log tables are dumped.",
+        )
     )
     snowflake_destination_config: Optional[SnowflakeDestinationConfig] = pydantic.Field(
         default=None,
@@ -97,11 +131,17 @@ class FivetranLogConfig(ConfigModel):
         default=None,
         description="If destination platform is 'bigquery', provide bigquery configuration.",
     )
+    databricks_destination_config: Optional[DatabricksDestinationConfig] = (
+        pydantic.Field(
+            default=None,
+            description="If destination platform is 'databricks', provide databricks configuration.",
+        )
+    )
     _rename_destination_config = pydantic_renamed_field(
         "destination_config", "snowflake_destination_config"
     )

-    @root_validator(
+    @root_validator(skip_on_failure=True)
     def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict:
         destination_platform = values["destination_platform"]
         if destination_platform == "snowflake":
@@ -114,6 +154,11 @@ class FivetranLogConfig(ConfigModel):
             raise ValueError(
                 "If destination platform is 'bigquery', user must provide bigquery destination configuration in the recipe."
             )
+        elif destination_platform == "databricks":
+            if "databricks_destination_config" not in values:
+                raise ValueError(
+                    "If destination platform is 'databricks', user must provide databricks destination configuration in the recipe."
+                )
         else:
             raise ValueError(
                 f"Destination platform '{destination_platform}' is not yet supported."
@@ -137,6 +182,7 @@ class MetadataExtractionPerfReport(Report):
 @dataclasses.dataclass
 class FivetranSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
+    fivetran_rest_api_call_count: int = 0
     filtered_connectors: LossyList[str] = dataclasses.field(default_factory=LossyList)
     metadata_extraction_perf: MetadataExtractionPerfReport = dataclasses.field(
         default_factory=MetadataExtractionPerfReport
@@ -148,6 +194,9 @@ class FivetranSourceReport(StaleEntityRemovalSourceReport):
     def report_connectors_dropped(self, connector: str) -> None:
         self.filtered_connectors.append(connector)

+    def report_fivetran_rest_api_call_count(self) -> None:
+        self.fivetran_rest_api_call_count += 1
+

 class PlatformDetail(ConfigModel):
     platform: Optional[str] = pydantic.Field(
@@ -194,7 +243,7 @@ class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin

     # Configuration for stateful ingestion
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = pydantic.Field(
-        default=None, description="
+        default=None, description="Fivetran Stateful Ingestion Config."
     )

     # Fivetran connector all sources to platform instance mapping
@@ -208,6 +257,16 @@ class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin
         description="A mapping of destination id to its platform/instance/env details.",
     )

+    """
+    Use Fivetran REST API to get :
+    - Google Sheets Connector details and emit related entities
+    Fivetran Platform Connector syncs limited information about the Google Sheets Connector.
+    """
+    api_config: Optional[FivetranAPIConfig] = Field(
+        default=None,
+        description="Fivetran REST API configuration, used to provide wider support for connections.",
+    )
+
     @pydantic.root_validator(pre=True)
     def compat_sources_to_database(cls, values: Dict) -> Dict:
         if "sources_to_database" in values:
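A hedged sketch of how the new pieces compose, expressed as the pydantic models above would be populated. All field values are placeholders, and the credential fields inherited from UnityCatalogConnectionConfig (workspace_url, token) are assumptions not shown in this diff:

from datahub.ingestion.source.fivetran.config import (
    DatabricksDestinationConfig,
    FivetranAPIConfig,
    FivetranLogConfig,
)

log_config = FivetranLogConfig(
    destination_platform="databricks",
    databricks_destination_config=DatabricksDestinationConfig(
        # workspace_url/token are assumed to come from UnityCatalogConnectionConfig
        workspace_url="https://adb-1234.azuredatabricks.net",
        token="dapi...",
        warehouse_id="abc123",  # must be non-empty per the validator above
        catalog="fivetran_logs",
        log_schema="fivetran_log",
    ),
)

# Optional REST API access; this is what enables the Google Sheets handling
# in the fivetran.py diff below.
api_config = FivetranAPIConfig(api_key="KEY", api_secret="SECRET")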
datahub/ingestion/source/fivetran/fivetran.py
CHANGED
@@ -1,8 +1,9 @@
 import logging
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Union
+from urllib.parse import urlparse

 import datahub.emitter.mce_builder as builder
-from datahub.api.entities.datajob import
+from datahub.api.entities.datajob import DataJob as DataJobV1
 from datahub.api.entities.dataprocess.dataprocess_instance import (
     DataProcessInstance,
     InstanceRunResult,
@@ -16,8 +17,13 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceReport,
+    StructuredLogCategory,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.fivetran.config import (
     KNOWN_DATA_PLATFORM_MAPPING,
     Constant,
@@ -31,27 +37,39 @@ from datahub.ingestion.source.fivetran.fivetran_query import (
     MAX_JOBS_PER_CONNECTOR,
     MAX_TABLE_LINEAGE_PER_CONNECTOR,
 )
+from datahub.ingestion.source.fivetran.fivetran_rest_api import FivetranAPIClient
+from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
+from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineage,
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
+    UpstreamLineage,
+)
+from datahub.metadata.schema_classes import (
+    DatasetLineageTypeClass,
+    UpstreamClass,
 )
-from datahub.
-from datahub.
+from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
+from datahub.sdk.dataflow import DataFlow
+from datahub.sdk.datajob import DataJob
+from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity

 # Logger instance
 logger = logging.getLogger(__name__)
+CORPUSER_DATAHUB = "urn:li:corpuser:datahub"


 @platform_name("Fivetran")
 @config_class(FivetranSourceConfig)
-@support_status(SupportStatus.
+@support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_FINE,
@@ -60,7 +78,6 @@ logger = logging.getLogger(__name__)
 class FivetranSource(StatefulIngestionSourceBase):
     """
     This plugin extracts fivetran users, connectors, destinations and sync history.
-    This plugin is in beta and has only been tested on Snowflake connector.
     """

     config: FivetranSourceConfig
@@ -71,12 +88,16 @@ class FivetranSource(StatefulIngestionSourceBase):
         super().__init__(config, ctx)
         self.config = config
         self.report = FivetranSourceReport()
-
         self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)
+        self.api_client: Optional[FivetranAPIClient] = None
+        self._connection_details_cache: Dict[str, FivetranConnectionDetails] = {}
+
+        if self.config.api_config:
+            self.api_client = FivetranAPIClient(self.config.api_config)

     def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
-        input_dataset_urn_list: List[DatasetUrn] = []
-        output_dataset_urn_list: List[DatasetUrn] = []
+        input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
+        output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
         fine_grained_lineage: List[FineGrainedLineage] = []

         # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.
@@ -94,8 +115,10 @@ class FivetranSource(StatefulIngestionSourceBase):
             self.report.info(
                 title="Guessing source platform for lineage",
                 message="We encountered a connector type that we don't fully support yet. "
-                "We will attempt to guess the platform based on the connector type."
-
+                "We will attempt to guess the platform based on the connector type. "
+                "Note that we use connector_id as the key not connector_name which you may see in the UI of Fivetran. ",
+                context=f"connector_name: {connector.connector_name} (connector_id: {connector.connector_id}, connector_type: {connector.connector_type})",
+                log_category=StructuredLogCategory.LINEAGE,
             )
             source_details.platform = connector.connector_type

@@ -124,17 +147,43 @@ class FivetranSource(StatefulIngestionSourceBase):
                 if source_details.include_schema_in_urn
                 else lineage.source_table.split(".", 1)[1]
             )
-            input_dataset_urn = DatasetUrn.create_from_ids(
-                platform_id=source_details.platform,
-                table_name=(
-                    f"{source_details.database.lower()}.{source_table}"
-                    if source_details.database
-                    else source_table
-                ),
-                env=source_details.env,
-                platform_instance=source_details.platform_instance,
-            )
-            input_dataset_urn_list.append(input_dataset_urn)
+            input_dataset_urn: Optional[DatasetUrn] = None
+            # Special Handling for Google Sheets Connectors
+            if connector.connector_type == Constant.GOOGLE_SHEETS_CONNECTOR_TYPE:
+                # Get Google Sheet dataset details from Fivetran API
+                # This is cached in the api_client
+                gsheets_conn_details: Optional[FivetranConnectionDetails] = (
+                    self._get_connection_details_by_id(connector.connector_id)
+                )
+
+                if gsheets_conn_details:
+                    input_dataset_urn = DatasetUrn.create_from_ids(
+                        platform_id=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
+                        table_name=self._get_gsheet_named_range_dataset_id(
+                            gsheets_conn_details
+                        ),
+                        env=source_details.env,
+                    )
+                else:
+                    self.report.warning(
+                        title="Failed to extract lineage for Google Sheets Connector",
+                        message="Unable to extract lineage for Google Sheets Connector, as the connector details are not available from Fivetran API.",
+                        context=f"{connector.connector_name} (connector_id: {connector.connector_id})",
+                    )
+            else:
+                input_dataset_urn = DatasetUrn.create_from_ids(
+                    platform_id=source_details.platform,
+                    table_name=(
+                        f"{source_details.database.lower()}.{source_table}"
+                        if source_details.database
+                        else source_table
+                    ),
+                    env=source_details.env,
+                    platform_instance=source_details.platform_instance,
+                )
+
+            if input_dataset_urn:
+                input_dataset_urn_list.append(input_dataset_urn)

             destination_table = (
                 lineage.destination_table
@@ -178,9 +227,9 @@ class FivetranSource(StatefulIngestionSourceBase):
                 )
             )

-        datajob.
-        datajob.
-        datajob.
+        datajob.set_inlets(input_dataset_urn_list)
+        datajob.set_outlets(output_dataset_urn_list)
+        datajob.set_fine_grained_lineages(fine_grained_lineage)

         return dict(
             **{
@@ -197,10 +246,10 @@ class FivetranSource(StatefulIngestionSourceBase):

     def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
         return DataFlow(
-
-
+            platform=Constant.ORCHESTRATOR,
+            name=connector.connector_id,
             env=self.config.env,
-
+            display_name=connector.connector_name,
             platform_instance=self.config.platform_instance,
         )

@@ -213,11 +262,11 @@ class FivetranSource(StatefulIngestionSourceBase):
         )
         owner_email = self.audit_log.get_user_email(connector.user_id)
         datajob = DataJob(
-
+            name=connector.connector_id,
             flow_urn=dataflow_urn,
             platform_instance=self.config.platform_instance,
-
-            owners=
+            display_name=connector.connector_name,
+            owners=[CorpUserUrn(owner_email)] if owner_email else None,
         )

         # Map connector source and destination table with dataset entity
@@ -232,21 +281,90 @@ class FivetranSource(StatefulIngestionSourceBase):
             "sync_frequency": str(connector.sync_frequency),
             "destination_id": connector.destination_id,
         }
-
-
-            **lineage_properties,
-        }
+
+        datajob.set_custom_properties({**connector_properties, **lineage_properties})

         return datajob

     def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+        # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
+        datajob_v1 = DataJobV1(
+            id=datajob.name,
+            flow_urn=datajob.flow_urn,
+            platform_instance=self.config.platform_instance,
+            name=datajob.name,
+            inlets=datajob.inlets,
+            outlets=datajob.outlets,
+            fine_grained_lineages=datajob.fine_grained_lineages,
+        )
         return DataProcessInstance.from_datajob(
-            datajob=
+            datajob=datajob_v1,
             id=job.job_id,
             clone_inlets=True,
             clone_outlets=True,
         )

+    def _get_connection_details_by_id(
+        self, connection_id: str
+    ) -> Optional[FivetranConnectionDetails]:
+        if self.api_client is None:
+            self.report.warning(
+                title="Fivetran API client is not initialized",
+                message="Google Sheets Connector details cannot be extracted, as Fivetran API client is not initialized.",
+                context=f"connector_id: {connection_id}",
+            )
+            return None
+
+        if connection_id in self._connection_details_cache:
+            return self._connection_details_cache[connection_id]
+
+        try:
+            self.report.report_fivetran_rest_api_call_count()
+            conn_details = self.api_client.get_connection_details_by_id(connection_id)
+            # Update Cache
+            if conn_details:
+                self._connection_details_cache[connection_id] = conn_details
+
+            return conn_details
+        except Exception as e:
+            self.report.warning(
+                title="Failed to get connection details for Google Sheets Connector",
+                message=f"Exception occurred while getting connection details from Fivetran API. {e}",
+                context=f"connector_id: {connection_id}",
+            )
+            return None
+
+    def _get_gsheet_sheet_id_from_url(
+        self, gsheets_conn_details: FivetranConnectionDetails
+    ) -> str:
+        # Extracting the sheet_id (1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo) from the sheet_id url
+        # "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0",
+        try:
+            parsed = urlparse(gsheets_conn_details.config.sheet_id)
+            # Example: https://docs.google.com/spreadsheets/d/<spreadsheetId>/edit
+            parts = parsed.path.split("/")
+            return parts[3] if len(parts) > 2 else ""
+        except Exception as e:
+            logger.warning(
+                f"Failed to extract sheet_id from the sheet_id url: {gsheets_conn_details.config.sheet_id}, {e}"
+            )
+
+        return ""
+
+    def _get_gsheet_named_range_dataset_id(
+        self, gsheets_conn_details: FivetranConnectionDetails
+    ) -> str:
+        sheet_id = self._get_gsheet_sheet_id_from_url(gsheets_conn_details)
+        named_range_id = (
+            f"{sheet_id}.{gsheets_conn_details.config.named_range}"
+            if sheet_id
+            else gsheets_conn_details.config.named_range
+        )
+        logger.debug(
+            f"Using gsheet_named_range_dataset_id: {named_range_id} for connector: {gsheets_conn_details.id}"
+        )
+        return named_range_id
+
     def _get_dpi_workunits(
         self, job: Job, dpi: DataProcessInstance
     ) -> Iterable[MetadataWorkUnit]:
@@ -278,17 +396,83 @@ class FivetranSource(StatefulIngestionSourceBase):

     def _get_connector_workunits(
         self, connector: Connector
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         self.report.report_connectors_scanned()
+
+        """
+        -------------------------------------------------------
+        Special Handling for Google Sheets Connectors
+        -------------------------------------------------------
+        Google Sheets source is not supported by Datahub yet.
+        As a workaround, we are emitting a dataset entity for the Google Sheet
+        and adding it to the lineage. This workaround needs to be removed once
+        Datahub supports Google Sheets source natively.
+        -------------------------------------------------------
+        """
+        if connector.connector_type == Constant.GOOGLE_SHEETS_CONNECTOR_TYPE:
+            # Get Google Sheet dataset details from Fivetran API
+            gsheets_conn_details: Optional[FivetranConnectionDetails] = (
+                self._get_connection_details_by_id(connector.connector_id)
+            )
+
+            if gsheets_conn_details:
+                gsheets_dataset = Dataset(
+                    name=self._get_gsheet_sheet_id_from_url(gsheets_conn_details),
+                    platform=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
+                    env=self.config.env,
+                    display_name=self._get_gsheet_sheet_id_from_url(
+                        gsheets_conn_details
+                    ),
+                    external_url=gsheets_conn_details.config.sheet_id,
+                    created=gsheets_conn_details.created_at,
+                    last_modified=gsheets_conn_details.source_sync_details.last_synced,
+                    subtype=DatasetSubTypes.GOOGLE_SHEETS,
+                    custom_properties={
+                        "ingested_by": "fivetran source",
+                        "connector_id": gsheets_conn_details.id,
+                    },
+                )
+                gsheets_named_range_dataset = Dataset(
+                    name=self._get_gsheet_named_range_dataset_id(gsheets_conn_details),
+                    platform=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
+                    env=self.config.env,
+                    display_name=gsheets_conn_details.config.named_range,
+                    external_url=gsheets_conn_details.config.sheet_id,
+                    created=gsheets_conn_details.created_at,
+                    last_modified=gsheets_conn_details.source_sync_details.last_synced,
+                    subtype=DatasetSubTypes.GOOGLE_SHEETS_NAMED_RANGE,
+                    custom_properties={
+                        "ingested_by": "fivetran source",
+                        "connector_id": gsheets_conn_details.id,
+                    },
+                    upstreams=UpstreamLineage(
+                        upstreams=[
+                            UpstreamClass(
+                                dataset=str(gsheets_dataset.urn),
+                                type=DatasetLineageTypeClass.VIEW,
+                                auditStamp=AuditStamp(
+                                    time=int(
+                                        gsheets_conn_details.created_at.timestamp()
+                                        * 1000
+                                    ),
+                                    actor=CORPUSER_DATAHUB,
+                                ),
+                            )
+                        ],
+                        fineGrainedLineages=None,
+                    ),
+                )
+
+                yield gsheets_dataset
+                yield gsheets_named_range_dataset
+
         # Create dataflow entity with same name as connector name
         dataflow = self._generate_dataflow_from_connector(connector)
-
-        yield mcp.as_workunit()
+        yield dataflow

         # Map Fivetran's connector entity with Datahub's datajob entity
         datajob = self._generate_datajob_from_connector(connector)
-
-        yield mcp.as_workunit()
+        yield datajob

         # Map Fivetran's job/sync history entity with Datahub's data process entity
         if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:
@@ -310,7 +494,7 @@ class FivetranSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]

-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         """
         Datahub Ingestion framework invoke this method
         """
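For reference, the sheet-id extraction in `_get_gsheet_sheet_id_from_url` above relies on the spreadsheet id being the fourth path segment of a Google Sheets URL. A standalone sketch of that parsing (hypothetical helper name; this sketch guards with `len(parts) > 3` before indexing `parts[3]`, so short paths fall through to the empty string):

from urllib.parse import urlparse

def extract_sheet_id(sheet_url: str) -> str:
    # Path "/spreadsheets/d/<id>/edit" splits to ['', 'spreadsheets', 'd', '<id>', 'edit']
    parts = urlparse(sheet_url).path.split("/")
    return parts[3] if len(parts) > 3 else ""

url = "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0"
print(extract_sheet_id(url))  # 1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo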