acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/METADATA +2617 -2590
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/RECORD +223 -189
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/pydantic_migration_helpers.py +7 -5
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1522 -569
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17758 -17097
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/sdk/search_filters.py +95 -27
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +56 -14
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/tag_entities.py (new file)
@@ -0,0 +1,292 @@
+import logging
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+from datahub.api.entities.external.external_entities import (
+    ExternalEntity,
+    ExternalEntityId,
+    LinkedResourceSet,
+    PlatformResourceRepository,
+)
+from datahub.api.entities.external.lake_formation_external_entites import (
+    LakeFormationTag,
+)
+from datahub.api.entities.platformresource.platform_resource import (
+    PlatformResource,
+    PlatformResourceKey,
+    PlatformResourceSearchFields,
+)
+from datahub.metadata.urns import TagUrn
+from datahub.utilities.search_utils import ElasticDocumentQuery
+from datahub.utilities.urns.urn import Urn
+
+logger = logging.getLogger(__name__)
+
+
+class LakeFormationTagSyncContext(BaseModel):
+    # it is intentionally empty
+    platform_instance: Optional[str] = None
+    catalog: Optional[str] = None
+
+
+class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
+    """
+    A LakeFormationTag is a unique identifier for a Lakeformation tag.
+    """
+
+    tag_key: str
+    tag_value: Optional[str] = None
+    platform_instance: Optional[str]
+    catalog: Optional[str] = None
+    exists_in_lake_formation: bool = False
+    persisted: bool = False
+
+    def __hash__(self) -> int:
+        return hash(self.to_platform_resource_key().id)
+
+    # this is a hack to make sure the property is a string and not private pydantic field
+    @staticmethod
+    def _RESOURCE_TYPE() -> str:
+        return "LakeFormationTagPlatformResource"
+
+    def to_platform_resource_key(self) -> PlatformResourceKey:
+        return PlatformResourceKey(
+            platform="glue",
+            resource_type=str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+            primary_key=f"{self.catalog}.{self.tag_key}:{self.tag_value}"
+            if self.catalog
+            else f"{self.tag_key}:{self.tag_value}",
+            platform_instance=self.platform_instance,
+        )
+
+    @classmethod
+    def from_tag(
+        cls,
+        tag: LakeFormationTag,
+        platform_instance: Optional[str],
+        platform_resource_repository: PlatformResourceRepository,
+        catalog: Optional[str] = None,
+        exists_in_lake_formation: bool = False,
+    ) -> "LakeFormationTagPlatformResourceId":
+        """
+        Creates a LakeFormationTagPlatformResourceId from a LakeFormationTag.
+        """
+
+        existing_platform_resource = cls.search_by_urn(
+            tag.to_datahub_tag_urn().urn(),
+            platform_resource_repository=platform_resource_repository,
+            tag_sync_context=LakeFormationTagSyncContext(
+                platform_instance=platform_instance,
+                catalog=catalog,
+            ),
+        )
+        if existing_platform_resource:
+            logger.info(
+                f"Found existing LakeFormationTagPlatformResourceId for tag {tag.key}: {existing_platform_resource}"
+            )
+            return existing_platform_resource
+
+        return LakeFormationTagPlatformResourceId(
+            tag_key=tag.key,
+            tag_value=tag.value if tag.value is not None else None,
+            platform_instance=platform_instance,
+            exists_in_lake_formation=exists_in_lake_formation,
+            catalog=catalog,
+            persisted=False,
+        )
+
+    @classmethod
+    def search_by_urn(
+        cls,
+        urn: str,
+        platform_resource_repository: PlatformResourceRepository,
+        tag_sync_context: LakeFormationTagSyncContext,
+    ) -> Optional["LakeFormationTagPlatformResourceId"]:
+        mapped_tags = [
+            t
+            for t in platform_resource_repository.search_by_filter(
+                ElasticDocumentQuery.create_from(
+                    (
+                        PlatformResourceSearchFields.RESOURCE_TYPE,
+                        str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+                    ),
+                    (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
+                )
+            )
+        ]
+        logger.info(
+            f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
+        )
+        if len(mapped_tags) > 0:
+            for platform_resource in mapped_tags:
+                if (
+                    platform_resource.resource_info
+                    and platform_resource.resource_info.value
+                ):
+                    lake_formation_tag_platform_resource = (
+                        LakeFormationTagPlatformResource(
+                            **platform_resource.resource_info.value.as_pydantic_object(
+                                LakeFormationTagPlatformResource
+                            ).dict()
+                        )
+                    )
+                    if (
+                        lake_formation_tag_platform_resource.id.platform_instance
+                        == tag_sync_context.platform_instance
+                        and lake_formation_tag_platform_resource.id.catalog
+                        == tag_sync_context.catalog
+                    ):
+                        lake_formation_tag_id = lake_formation_tag_platform_resource.id
+                        lake_formation_tag_id.exists_in_lake_formation = True
+                        lake_formation_tag_id.persisted = True
+                        return lake_formation_tag_id
+                else:
+                    logger.warning(
+                        f"Platform resource {platform_resource} does not have a resource_info value"
+                    )
+                    continue
+
+        # If we reach here, it means we did not find a mapped tag for the URN
+        logger.info(
+            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
+        )
+        return None
+
+    @classmethod
+    def from_datahub_urn(
+        cls,
+        urn: str,
+        platform_resource_repository: PlatformResourceRepository,
+        tag_sync_context: LakeFormationTagSyncContext,
+    ) -> "LakeFormationTagPlatformResourceId":
+        """
+        Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.
+        """
+        # First we check if we already have a mapped platform resource for this
+        # urn that is of the type UnityCatalogTagPlatformResource
+        # If we do, we can use it to create the UnityCatalogTagPlatformResourceId
+        # Else, we need to generate a new UnityCatalogTagPlatformResourceId
+        existing_platform_resource_id = cls.search_by_urn(
+            urn, platform_resource_repository, tag_sync_context
+        )
+        if existing_platform_resource_id:
+            logger.info(
+                f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
+            )
+            return existing_platform_resource_id
+
+        # Otherwise, we need to create a new UnityCatalogTagPlatformResourceId
+        new_tag_id = cls.generate_tag_id(tag_sync_context, urn)
+        if new_tag_id:
+            # we then check if this tag has already been ingested as a platform
+            # resource in the platform resource repository
+            resource_key = platform_resource_repository.get(
+                new_tag_id.to_platform_resource_key()
+            )
+            if resource_key:
+                logger.info(
+                    f"Tag {new_tag_id} already exists in platform resource repository with {resource_key}"
+                )
+                new_tag_id.exists_in_lake_formation = (
+                    True  # TODO: Check if this is a safe assumption
+                )
+            return new_tag_id
+        raise ValueError(f"Unable to create SnowflakeTagId from DataHub URN: {urn}")
+
+    @classmethod
+    def generate_tag_id(
+        cls, tag_sync_context: LakeFormationTagSyncContext, urn: str
+    ) -> "LakeFormationTagPlatformResourceId":
+        parsed_urn = Urn.from_string(urn)
+        entity_type = parsed_urn.entity_type
+        if entity_type == "tag":
+            new_tag_id = LakeFormationTagPlatformResourceId.from_datahub_tag(
+                TagUrn.from_string(urn), tag_sync_context
+            )
+        else:
+            raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
+        return new_tag_id
+
+    @classmethod
+    def from_datahub_tag(
+        cls, tag_urn: TagUrn, tag_sync_context: LakeFormationTagSyncContext
+    ) -> "LakeFormationTagPlatformResourceId":
+        tag = LakeFormationTag.from_urn(tag_urn)
+
+        return LakeFormationTagPlatformResourceId(
+            tag_key=str(tag.key),
+            tag_value=str(tag.value),
+            platform_instance=tag_sync_context.platform_instance,
+            catalog=tag_sync_context.catalog,
+            exists_in_lake_formation=False,
+        )
+
+
+class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
+    datahub_urns: LinkedResourceSet
+    managed_by_datahub: bool
+    id: LakeFormationTagPlatformResourceId
+    allowed_values: Optional[List[str]]
+
+    def get_id(self) -> ExternalEntityId:
+        return self.id
+
+    def is_managed_by_datahub(self) -> bool:
+        return self.managed_by_datahub
+
+    def datahub_linked_resources(self) -> LinkedResourceSet:
+        return self.datahub_urns
+
+    def as_platform_resource(self) -> PlatformResource:
+        return PlatformResource.create(
+            key=self.id.to_platform_resource_key(),
+            secondary_keys=[u for u in self.datahub_urns.urns],
+            value=self,
+        )
+
+    @classmethod
+    def get_from_datahub(
+        cls,
+        lake_formation_tag_id: LakeFormationTagPlatformResourceId,
+        platform_resource_repository: PlatformResourceRepository,
+        managed_by_datahub: bool = False,
+    ) -> "LakeFormationTagPlatformResource":
+        # Search for linked DataHub URNs
+        platform_resources = [
+            r
+            for r in platform_resource_repository.search_by_filter(
+                ElasticDocumentQuery.create_from(
+                    (
+                        PlatformResourceSearchFields.RESOURCE_TYPE,
+                        str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+                    ),
+                    (
+                        PlatformResourceSearchFields.PRIMARY_KEY,
+                        f"{lake_formation_tag_id.tag_key}/{lake_formation_tag_id.tag_value}",
+                    ),
+                )
+            )
+        ]
+        for platform_resource in platform_resources:
+            if (
+                platform_resource.resource_info
+                and platform_resource.resource_info.value
+            ):
+                lf_tag = LakeFormationTagPlatformResource(
+                    **platform_resource.resource_info.value.as_pydantic_object(
+                        LakeFormationTagPlatformResource
+                    ).dict()
+                )
+                if (
+                    lf_tag.id.platform_instance
+                    == lake_formation_tag_id.platform_instance
+                    and lf_tag.id.catalog == lake_formation_tag_id.catalog
+                ):
+                    return lf_tag
+        return cls(
+            id=lake_formation_tag_id,
+            datahub_urns=LinkedResourceSet(urns=[]),
+            managed_by_datahub=managed_by_datahub,
+            allowed_values=None,
+        )
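A minimal usage sketch (not part of the package diff) of how the new id model builds its platform-resource key; the field names come from the class definition above, and the tag key/value are placeholder examples:

from datahub.ingestion.source.aws.tag_entities import LakeFormationTagPlatformResourceId

tag_id = LakeFormationTagPlatformResourceId(
    tag_key="pii",
    tag_value="true",
    platform_instance=None,
    catalog="prod_catalog",
)
key = tag_id.to_platform_resource_key()
# Per to_platform_resource_key(): platform="glue",
# resource_type="LakeFormationTagPlatformResource",
# primary_key="prod_catalog.pii:true" (catalog-prefixed because catalog is set).
print(key.primary_key)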
datahub/ingestion/source/azure/azure_common.py
@@ -61,13 +61,13 @@ class AzureConnectionConfig(ConfigModel):
     def get_blob_service_client(self):
         return BlobServiceClient(
             account_url=f"https://{self.account_name}.blob.core.windows.net",
-            credential=
+            credential=self.get_credentials(),
         )
 
     def get_data_lake_service_client(self) -> DataLakeServiceClient:
         return DataLakeServiceClient(
             account_url=f"https://{self.account_name}.dfs.core.windows.net",
-            credential=
+            credential=self.get_credentials(),
         )
 
     def get_credentials(
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -4,6 +4,7 @@ import logging
 import os
 from typing import Iterable, List, Optional
 
+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -44,6 +45,7 @@ from datahub.ingestion.source.bigquery_v2.queries_extractor import (
     BigQueryQueriesExtractorConfig,
 )
 from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.ingestion.source.state.redundant_run_skip_handler import (
     RedundantLineageRunSkipHandler,
@@ -77,7 +79,14 @@ def cleanup(config: BigQueryV2Config) -> None:
     supported=False,
 )
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.BIGQUERY_PROJECT,
+        SourceCapabilityModifier.BIGQUERY_DATASET,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -99,6 +108,7 @@ def cleanup(config: BigQueryV2Config) -> None:
     SourceCapability.PARTITION_SUPPORT,
     "Enabled by default, partition keys and clustering keys are supported.",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
     def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
         super().__init__(config, ctx)
@@ -241,7 +251,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ).workunit_processor,
         ]
 
+    def _warn_deprecated_configs(self):
+        if (
+            self.config.match_fully_qualified_names is not None
+            and not self.config.match_fully_qualified_names
+            and self.config.schema_pattern is not None
+            and self.config.schema_pattern != AllowDenyPattern.allow_all()
+        ):
+            self.report.report_warning(
+                message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
+                "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
+                "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
+                context="Config option deprecation warning",
+                title="Config option deprecation warning",
+            )
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        self._warn_deprecated_configs()
         projects = get_projects(
             self.bq_schema_extractor.schema_api,
             self.report,
@@ -270,28 +296,29 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ):
                 return
 
-            with
-                f"*: {QUERIES_EXTRACTION}"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            with (
+                self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
+                BigQueryQueriesExtractor(
+                    connection=self.config.get_bigquery_client(),
+                    schema_api=self.bq_schema_extractor.schema_api,
+                    config=BigQueryQueriesExtractorConfig(
+                        window=self.config,
+                        user_email_pattern=self.config.usage.user_email_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_statistics,
+                        include_operations=self.config.usage.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        region_qualifiers=self.config.region_qualifiers,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=self.sql_parser_schema_resolver,
+                    discovered_tables=self.bq_schema_extractor.table_refs,
+                ) as queries_extractor,
+            ):
                 self.report.queries_extractor = queries_extractor.report
                 yield from queries_extractor.get_workunits_internal()
         else:
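The new `_warn_deprecated_configs` check fires when `match_fully_qualified_names` is explicitly disabled while a non-default `schema_pattern` is set. A minimal sketch (not from the package) of the config shape the warning steers users toward, using the `AllowDenyPattern` import added above; the project and dataset names are placeholders:

from datahub.configuration.common import AllowDenyPattern

# With match_fully_qualified_names=True, schema_pattern entries should match
# fully qualified names of the form "<database_name>.<schema_name>".
recommended_config = {
    "match_fully_qualified_names": True,
    "schema_pattern": AllowDenyPattern(allow=[r"my-project\.analytics_.*"]),
}
# Leaving match_fully_qualified_names=False with a non-default schema_pattern
# now triggers report_warning() at the start of get_workunits_internal().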
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
@@ -286,6 +286,7 @@ class BigQuerySchemaGenerator:
         yield from gen_database_container(
             database=database,
             name=database,
+            qualified_name=database,
             sub_types=[DatasetContainerSubTypes.BIGQUERY_PROJECT],
             domain_registry=self.domain_registry,
             domain_config=self.config.domain,
@@ -332,6 +333,7 @@ class BigQuerySchemaGenerator:
         yield from gen_schema_container(
             database=project_id,
             schema=dataset,
+            qualified_name=f"{project_id}.{dataset}",
             sub_types=[DatasetContainerSubTypes.BIGQUERY_DATASET],
             domain_registry=self.domain_registry,
             domain_config=self.config.domain,
datahub/ingestion/source/bigquery_v2/common.py
@@ -63,7 +63,7 @@ class BigQueryIdentifierBuilder:
         )
 
     def gen_user_urn(self, user_email: str) -> str:
-        return make_user_urn(user_email
+        return make_user_urn(user_email)
 
     def make_data_platform_urn(self) -> str:
         return make_data_platform_urn(self.platform)
datahub/ingestion/source/bigquery_v2/profiler.py
@@ -189,6 +189,7 @@ WHERE
 
         if len(profile_requests) == 0:
             return
+
         yield from self.generate_profile_workunits(
             profile_requests,
             max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
             db_name, schema_name, bq_table, self.config.profiling.partition_datetime
         )
 
-        if partition
+        # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+        if partition is None and bq_table.partition_info and bq_table.rows_count:
             self.report.report_warning(
                 title="Profile skipped for partitioned table",
-                message="profile skipped as
+                message="profile skipped as partition id or type was invalid",
                 context=profile_request.pretty_name,
             )
             return None
datahub/ingestion/source/bigquery_v2/queries.py
@@ -45,12 +45,12 @@ SELECT
     tos.OPTION_VALUE as comment,
     t.is_insertable_into,
     t.ddl,
-    ts.row_count,
+    ts.row_count as row_count,
     ts.size_bytes as bytes,
     p.num_partitions,
     p.max_partition_id,
-    p.active_billable_bytes,
-    p.long_term_billable_bytes,
+    p.active_billable_bytes as active_billable_bytes,
+    IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
     REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
     REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base
datahub/ingestion/source/cassandra/cassandra.py
@@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class CassandraSource(StatefulIngestionSourceBase):
datahub/ingestion/source/cassandra/cassandra_profiling.py
@@ -70,11 +70,12 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with
-                f"{keyspace_name}: {PROFILING}"
-
-
-
+            with (
+                self.report.new_stage(f"{keyspace_name}: {PROFILING}"),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(
                         self.generate_profile,
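A generic sketch (not from the package) of the pattern adopted above: grouping several context managers in one parenthesized `with` block (syntax officially supported on Python 3.10+), so the reporting stage and the thread pool share a single block. `nullcontext` stands in for `self.report.new_stage(...)`:

from concurrent.futures import ThreadPoolExecutor
from contextlib import nullcontext

with (
    nullcontext("keyspace_a: Profiling"),  # stand-in for self.report.new_stage(...)
    ThreadPoolExecutor(max_workers=2) as executor,
):
    # submit work against the shared executor while the stage is open
    futures = [executor.submit(pow, n, 2) for n in range(4)]
    results = [f.result() for f in futures]
print(results)  # [0, 1, 4, 9]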
datahub/ingestion/source/common/subtypes.py
@@ -1,5 +1,10 @@
+import logging
+from typing import Any, Dict
+
 from datahub.utilities.str_enum import StrEnum
 
+logger = logging.getLogger(__name__)
+
 
 class DatasetSubTypes(StrEnum):
     # Generic SubTypes
@@ -26,6 +31,8 @@ class DatasetSubTypes(StrEnum):
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
     API_ENDPOINT = "API Endpoint"
+    SLACK_CHANNEL = "Slack Channel"
+    PROJECTIONS = "Projections"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"
@@ -52,6 +59,8 @@ class BIContainerSubTypes(StrEnum):
     LOOKER_FOLDER = "Folder"
     LOOKML_PROJECT = "LookML Project"
     LOOKML_MODEL = "LookML Model"
+    TABLEAU_SITE = "Site"
+    TABLEAU_PROJECT = "Project"
     TABLEAU_WORKBOOK = "Workbook"
     POWERBI_DATASET = "Semantic Model"
     POWERBI_DATASET_TABLE = "Table"
@@ -74,6 +83,9 @@ class JobContainerSubTypes(StrEnum):
 
 
 class BIAssetSubTypes(StrEnum):
+    DASHBOARD = "Dashboard"
+    CHART = "Chart"
+
     # Generic SubTypes
     REPORT = "Report"
 
@@ -116,3 +128,36 @@ class MLAssetSubTypes(StrEnum):
     VERTEX_PIPELINE = "Pipeline Job"
     VERTEX_PIPELINE_TASK = "Pipeline Task"
     VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
+
+
+def create_source_capability_modifier_enum():
+    all_values: Dict[str, Any] = {}
+    source_enums = [
+        DatasetSubTypes,
+        DatasetContainerSubTypes,
+        BIContainerSubTypes,
+        FlowContainerSubTypes,
+        JobContainerSubTypes,
+        BIAssetSubTypes,
+        MLAssetSubTypes,
+    ]
+
+    for enum_class in source_enums:
+        for member in enum_class:  # type: ignore[var-annotated]
+            if member.name in all_values:
+                logger.debug(
+                    f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
+                )
+                continue
+            all_values[member.name] = member.value
+
+    enum_code = "class SourceCapabilityModifier(StrEnum):\n"
+    for name, value in all_values.items():
+        enum_code += f' {name} = "{value}"\n'
+
+    exec(enum_code, globals())
+    return globals()["SourceCapabilityModifier"]
+
+
+# This will have all values from the enums above
+SourceCapabilityModifier = create_source_capability_modifier_enum()
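A small sketch (not part of the diff) of what the dynamically generated enum exposes, based only on the members shown above; the generated SourceCapabilityModifier copies member names and values from the contributing subtype enums, which is how the BigQuery CONTAINERS capability decorator earlier in this diff can reference BIGQUERY_PROJECT and BIGQUERY_DATASET:

from datahub.ingestion.source.common.subtypes import (
    BIContainerSubTypes,
    SourceCapabilityModifier,
)

# TABLEAU_PROJECT = "Project" was added to BIContainerSubTypes above, so the
# generated enum carries the same member with the same value.
assert SourceCapabilityModifier.TABLEAU_PROJECT.value == "Project"
assert (
    SourceCapabilityModifier.TABLEAU_PROJECT.value
    == BIContainerSubTypes.TABLEAU_PROJECT.value
)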