acryl-datahub 1.2.0.4rc4__py3-none-any.whl → 1.2.0.5rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (41):
  1. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/METADATA +2631 -2631
  2. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/RECORD +41 -39
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +1 -1
  5. datahub/api/entities/external/external_entities.py +500 -15
  6. datahub/ingestion/source/aws/glue.py +18 -14
  7. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  8. datahub/ingestion/source/aws/tag_entities.py +82 -104
  9. datahub/ingestion/source/common/subtypes.py +1 -0
  10. datahub/ingestion/source/hex/api.py +2 -0
  11. datahub/ingestion/source/hex/mapper.py +16 -2
  12. datahub/ingestion/source/hex/model.py +2 -0
  13. datahub/ingestion/source/looker/looker_common.py +26 -0
  14. datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -1
  15. datahub/ingestion/source/snowflake/constants.py +1 -0
  16. datahub/ingestion/source/snowflake/snowflake_config.py +8 -1
  17. datahub/ingestion/source/snowflake/snowflake_queries.py +49 -6
  18. datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
  19. datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
  20. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
  21. datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
  22. datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
  23. datahub/ingestion/source/sql/mssql/source.py +2 -25
  24. datahub/ingestion/source/sql/mysql.py +54 -0
  25. datahub/ingestion/source/sql/postgres.py +5 -134
  26. datahub/ingestion/source/sql/sql_common.py +137 -0
  27. datahub/ingestion/source/superset.py +140 -56
  28. datahub/ingestion/source/unity/config.py +11 -0
  29. datahub/ingestion/source/unity/connection_test.py +1 -0
  30. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  31. datahub/ingestion/source/unity/proxy.py +20 -6
  32. datahub/ingestion/source/unity/report.py +9 -1
  33. datahub/ingestion/source/unity/source.py +51 -16
  34. datahub/ingestion/source/unity/tag_entities.py +49 -147
  35. datahub/metadata/_internal_schema_classes.py +1 -1
  36. datahub/metadata/schema.avsc +4 -2
  37. datahub/metadata/schemas/Operation.avsc +4 -2
  38. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/WHEEL +0 -0
  39. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/entry_points.txt +0 -0
  40. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/licenses/LICENSE +0 -0
  41. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/aws/tag_entities.py
@@ -1,5 +1,10 @@
 import logging
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional
+
+if TYPE_CHECKING:
+    from datahub.ingestion.source.aws.platform_resource_repository import (
+        GluePlatformResourceRepository,
+    )
 
 from pydantic import BaseModel
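Worth noting on the hunk above: the repository import moves under TYPE_CHECKING, so it is only evaluated by static type checkers and cannot create a circular import at runtime; the later hunks therefore refer to the class through the quoted annotation "GluePlatformResourceRepository". A minimal, self-contained sketch of the same pattern (module and class names here are illustrative, not from this package):

    # a.py -- illustrative only
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Only imported while type checking; nothing is imported at runtime,
        # so "a imports b, b imports a" cycles are avoided.
        from b import Repository

    def describe(repo: "Repository") -> str:
        # The quoted annotation is resolved lazily, so this works even though
        # Repository is never imported when the module actually runs.
        return type(repo).__name__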
 
@@ -7,7 +12,6 @@ from datahub.api.entities.external.external_entities import (
     ExternalEntity,
     ExternalEntityId,
     LinkedResourceSet,
-    PlatformResourceRepository,
 )
 from datahub.api.entities.external.lake_formation_external_entites import (
     LakeFormationTag,
@@ -15,10 +19,8 @@ from datahub.api.entities.external.lake_formation_external_entites import (
 from datahub.api.entities.platformresource.platform_resource import (
     PlatformResource,
     PlatformResourceKey,
-    PlatformResourceSearchFields,
 )
 from datahub.metadata.urns import TagUrn
-from datahub.utilities.search_utils import ElasticDocumentQuery
 from datahub.utilities.urns.urn import Urn
 
 logger = logging.getLogger(__name__)
@@ -29,8 +31,12 @@ class LakeFormationTagSyncContext(BaseModel):
     platform_instance: Optional[str] = None
     catalog: Optional[str] = None
 
+    # Making it compatible with SyncContext interface
+    def get_platform_instance(self) -> Optional[str]:
+        return self.platform_instance
+
 
-class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
+class LakeFormationTagPlatformResourceId(ExternalEntityId):
     """
     A LakeFormationTag is a unique identifier for a Lakeformation tag.
     """
@@ -42,9 +48,6 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
     exists_in_lake_formation: bool = False
     persisted: bool = False
 
-    def __hash__(self) -> int:
-        return hash(self.to_platform_resource_key().id)
-
     # this is a hack to make sure the property is a string and not private pydantic field
     @staticmethod
     def _RESOURCE_TYPE() -> str:
@@ -61,24 +64,26 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
         )
 
     @classmethod
-    def from_tag(
+    def get_or_create_from_tag(
         cls,
         tag: LakeFormationTag,
-        platform_instance: Optional[str],
-        platform_resource_repository: PlatformResourceRepository,
-        catalog: Optional[str] = None,
+        platform_resource_repository: "GluePlatformResourceRepository",
         exists_in_lake_formation: bool = False,
+        catalog_id: Optional[str] = None,
     ) -> "LakeFormationTagPlatformResourceId":
         """
        Creates a LakeFormationTagPlatformResourceId from a LakeFormationTag.
        """
 
+        # Use catalog_id if provided, otherwise fall back to repository catalog
+        effective_catalog = catalog_id or platform_resource_repository.catalog
+
         existing_platform_resource = cls.search_by_urn(
             tag.to_datahub_tag_urn().urn(),
             platform_resource_repository=platform_resource_repository,
             tag_sync_context=LakeFormationTagSyncContext(
-                platform_instance=platform_instance,
-                catalog=catalog,
+                platform_instance=platform_resource_repository.platform_instance,
+                catalog=effective_catalog,
             ),
         )
         if existing_platform_resource:
@@ -90,9 +95,9 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
         return LakeFormationTagPlatformResourceId(
             tag_key=str(tag.key),
             tag_value=str(tag.value) if tag.value is not None else None,
-            platform_instance=platform_instance,
+            platform_instance=platform_resource_repository.platform_instance,
+            catalog=effective_catalog,
             exists_in_lake_formation=exists_in_lake_formation,
-            catalog=catalog,
             persisted=False,
         )
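The rename from from_tag to get_or_create_from_tag also changes the call shape: platform instance and catalog now come from the repository, with catalog_id as an optional override. A hedged before/after sketch of a call site (the tag and repo objects are assumed; this is not copied from the package's actual callers):

    # 1.2.0.4rc4: caller supplied platform_instance and catalog explicitly.
    tag_id = LakeFormationTagPlatformResourceId.from_tag(
        tag=tag,
        platform_instance="prod-glue",      # illustrative value
        platform_resource_repository=repo,
        catalog="123456789012",             # illustrative value
    )

    # 1.2.0.5rc2: the repository carries platform_instance and catalog;
    # catalog_id is only an optional override of repo.catalog.
    tag_id = LakeFormationTagPlatformResourceId.get_or_create_from_tag(
        tag=tag,
        platform_resource_repository=repo,
    )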
 
@@ -100,64 +105,48 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
     def search_by_urn(
         cls,
         urn: str,
-        platform_resource_repository: PlatformResourceRepository,
+        platform_resource_repository: "GluePlatformResourceRepository",
         tag_sync_context: LakeFormationTagSyncContext,
     ) -> Optional["LakeFormationTagPlatformResourceId"]:
-        mapped_tags = [
-            t
-            for t in platform_resource_repository.search_by_filter(
-                ElasticDocumentQuery.create_from(
-                    (
-                        PlatformResourceSearchFields.RESOURCE_TYPE,
-                        str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
-                    ),
-                    (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
+        """
+        Search for existing Lake Formation tag entity by URN using repository caching.
+
+        This method now delegates to the repository's search_entity_by_urn method to ensure
+        consistent caching behavior across all platform implementations.
+        """
+        # Use repository's cached search method instead of duplicating search logic
+        existing_entity_id = platform_resource_repository.search_entity_by_urn(urn)
+
+        if existing_entity_id:
+            # Verify platform instance and catalog match
+            if (
+                existing_entity_id.platform_instance
+                == tag_sync_context.platform_instance
+                and existing_entity_id.catalog == tag_sync_context.catalog
+            ):
+                logger.info(
+                    f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_entity_id}"
                 )
-            )
-        ]
+                # Create a new ID with the correct state instead of mutating
+                return LakeFormationTagPlatformResourceId(
+                    tag_key=existing_entity_id.tag_key,
+                    tag_value=existing_entity_id.tag_value,
+                    platform_instance=existing_entity_id.platform_instance,
+                    catalog=existing_entity_id.catalog,
+                    exists_in_lake_formation=True,  # This tag exists in Lake Formation
+                    persisted=True,  # And it's persisted in DataHub
+                )
+
         logger.info(
-            f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
+            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
         )
-        if len(mapped_tags) > 0:
-            for platform_resource in mapped_tags:
-                if (
-                    platform_resource.resource_info
-                    and platform_resource.resource_info.value
-                ):
-                    lake_formation_tag_platform_resource = (
-                        LakeFormationTagPlatformResource(
-                            **platform_resource.resource_info.value.as_pydantic_object(
-                                LakeFormationTagPlatformResource
-                            ).dict()
-                        )
-                    )
-                    if (
-                        lake_formation_tag_platform_resource.id.platform_instance
-                        == tag_sync_context.platform_instance
-                        and lake_formation_tag_platform_resource.id.catalog
-                        == tag_sync_context.catalog
-                    ):
-                        lake_formation_tag_id = lake_formation_tag_platform_resource.id
-                        lake_formation_tag_id.exists_in_lake_formation = True
-                        lake_formation_tag_id.persisted = True
-                        return lake_formation_tag_id
-                else:
-                    logger.warning(
-                        f"Platform resource {platform_resource} does not have a resource_info value"
-                    )
-                    continue
-
-        # If we reach here, it means we did not find a mapped tag for the URN
-        logger.info(
-            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
-        )
         return None
 
     @classmethod
     def from_datahub_urn(
         cls,
         urn: str,
-        platform_resource_repository: PlatformResourceRepository,
+        platform_resource_repository: "GluePlatformResourceRepository",
         tag_sync_context: LakeFormationTagSyncContext,
     ) -> "LakeFormationTagPlatformResourceId":
         """
@@ -188,11 +177,17 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
            logger.info(
                f"Tag {new_tag_id} already exists in platform resource repository with {resource_key}"
            )
-            new_tag_id.exists_in_lake_formation = (
-                True  # TODO: Check if this is a safe assumption
+            # Create a new ID with the correct state instead of mutating
+            return LakeFormationTagPlatformResourceId(
+                tag_key=new_tag_id.tag_key,
+                tag_value=new_tag_id.tag_value,
+                platform_instance=new_tag_id.platform_instance,
+                catalog=new_tag_id.catalog,
+                exists_in_lake_formation=True,  # This tag exists in Lake Formation
+                persisted=new_tag_id.persisted,
            )
            return new_tag_id
-        raise ValueError(f"Unable to create SnowflakeTagId from DataHub URN: {urn}")
+        raise ValueError(f"Unable to create LakeFormationTagId from DataHub URN: {urn}")
 
     @classmethod
     def generate_tag_id(
@@ -223,7 +218,7 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
         )
 
 
-class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
+class LakeFormationTagPlatformResource(ExternalEntity):
     datahub_urns: LinkedResourceSet
     managed_by_datahub: bool
     id: LakeFormationTagPlatformResourceId
@@ -246,46 +241,29 @@ class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
         )
 
     @classmethod
-    def get_from_datahub(
+    def create_default(
         cls,
-        lake_formation_tag_id: LakeFormationTagPlatformResourceId,
-        platform_resource_repository: PlatformResourceRepository,
-        managed_by_datahub: bool = False,
+        entity_id: ExternalEntityId,
+        managed_by_datahub: bool,
     ) -> "LakeFormationTagPlatformResource":
-        # Search for linked DataHub URNs
-        platform_resources = [
-            r
-            for r in platform_resource_repository.search_by_filter(
-                ElasticDocumentQuery.create_from(
-                    (
-                        PlatformResourceSearchFields.RESOURCE_TYPE,
-                        str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
-                    ),
-                    (
-                        PlatformResourceSearchFields.PRIMARY_KEY,
-                        f"{lake_formation_tag_id.tag_key}/{lake_formation_tag_id.tag_value}",
-                    ),
-                )
-            )
-        ]
-        for platform_resource in platform_resources:
-            if (
-                platform_resource.resource_info
-                and platform_resource.resource_info.value
-            ):
-                lf_tag = LakeFormationTagPlatformResource(
-                    **platform_resource.resource_info.value.as_pydantic_object(
-                        LakeFormationTagPlatformResource
-                    ).dict()
-                )
-                if (
-                    lf_tag.id.platform_instance
-                    == lake_formation_tag_id.platform_instance
-                    and lf_tag.id.catalog == lake_formation_tag_id.catalog
-                ):
-                    return lf_tag
+        """Create a default Lake Formation tag entity when none found in DataHub."""
+        # Type narrowing: we know this will be a LakeFormationTagPlatformResourceId
+        assert isinstance(entity_id, LakeFormationTagPlatformResourceId), (
+            f"Expected LakeFormationTagPlatformResourceId, got {type(entity_id)}"
+        )
+
+        # Create a new entity ID with correct default state instead of mutating
+        default_entity_id = LakeFormationTagPlatformResourceId(
+            tag_key=entity_id.tag_key,
+            tag_value=entity_id.tag_value,
+            platform_instance=entity_id.platform_instance,
+            catalog=entity_id.catalog,
+            exists_in_lake_formation=False,  # New entities don't exist in Lake Formation yet
+            persisted=False,  # New entities are not persisted yet
+        )
+
         return cls(
-            id=lake_formation_tag_id,
+            id=default_entity_id,
             datahub_urns=LinkedResourceSet(urns=[]),
             managed_by_datahub=managed_by_datahub,
             allowed_values=None,
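get_from_datahub's DataHub search moves into the repository layer, and what remains here is a create_default factory for the case where nothing was found. A short illustrative call, assuming a tag_id obtained earlier (for example from get_or_create_from_tag):

    entity = LakeFormationTagPlatformResource.create_default(
        entity_id=tag_id,
        managed_by_datahub=False,
    )
    # Per the hunk above: no linked DataHub URNs yet, and the id is rebuilt with
    # exists_in_lake_formation=False and persisted=False rather than mutated.
    assert entity.datahub_urns.urns == []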

datahub/ingestion/source/common/subtypes.py
@@ -30,6 +30,7 @@ class DatasetSubTypes(StrEnum):
     NEO4J_NODE = "Neo4j Node"
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
+    DYNAMIC_TABLE = "Dynamic Table"
     API_ENDPOINT = "API Endpoint"
     SLACK_CHANNEL = "Slack Channel"
     PROJECTIONS = "Projections"

datahub/ingestion/source/hex/api.py
@@ -375,6 +375,7 @@ class HexApi:
             description=hex_item.description,
             created_at=hex_item.created_at,
             last_edited_at=hex_item.last_edited_at,
+            last_published_at=hex_item.last_published_at,
             status=status,
             categories=categories,
             collections=collections,
@@ -389,6 +390,7 @@ class HexApi:
             description=hex_item.description,
             created_at=hex_item.created_at,
             last_edited_at=hex_item.last_edited_at,
+            last_published_at=hex_item.last_published_at,
             status=status,
             categories=categories,
             collections=collections,

datahub/ingestion/source/hex/mapper.py
@@ -122,7 +122,7 @@ class Mapper:
             lastModified=self._change_audit_stamps(
                 created_at=project.created_at, last_edited_at=project.last_edited_at
             ),
-            externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{project.id}",
+            externalUrl=self._get_project_or_component_external_url(project),
             customProperties=dict(id=project.id),
             datasetEdges=self._dataset_edges(project.upstream_datasets),
             # TODO: support schema field upstream, maybe InputFields?
@@ -173,7 +173,7 @@ class Mapper:
             lastModified=self._change_audit_stamps(
                 created_at=component.created_at, last_edited_at=component.last_edited_at
             ),
-            externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{component.id}",
+            externalUrl=self._get_project_or_component_external_url(component),
             customProperties=dict(id=component.id),
         )
 
@@ -242,6 +242,20 @@ class Mapper:
         assert isinstance(dashboard_urn, DashboardUrn)
         return dashboard_urn
 
+    def _get_project_or_component_external_url(
+        self,
+        project_or_component: Union[Project, Component],
+    ) -> Optional[str]:
+        if project_or_component.last_published_at is None:
+            return (
+                f"{self._base_url}/{self._workspace_name}/hex/{project_or_component.id}"
+            )
+        else:
+            # published Projects/Components have a different URL that everybody, not just editors, can access
+            return (
+                f"{self._base_url}/{self._workspace_name}/app/{project_or_component.id}"
+            )
+
     def _change_audit_stamps(
         self, created_at: Optional[datetime], last_edited_at: Optional[datetime]
     ) -> ChangeAuditStampsClass:
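The new _get_project_or_component_external_url keeps the editor-only /hex/ URL for drafts and switches to the generally accessible /app/ URL once last_published_at is set. A standalone sketch of the same branching (base URL and workspace name are placeholders):

    from datetime import datetime
    from typing import Optional

    def external_url(base_url: str, workspace: str, item_id: str,
                     last_published_at: Optional[datetime]) -> str:
        # Mirrors the mapper change: drafts keep the editor URL, published
        # projects/components get the URL that non-editors can open too.
        if last_published_at is None:
            return f"{base_url}/{workspace}/hex/{item_id}"
        return f"{base_url}/{workspace}/app/{item_id}"

    print(external_url("https://app.hex.tech", "acme", "abc123", None))
    # https://app.hex.tech/acme/hex/abc123
    print(external_url("https://app.hex.tech", "acme", "abc123", datetime(2024, 1, 1)))
    # https://app.hex.tech/acme/app/abc123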

datahub/ingestion/source/hex/model.py
@@ -46,6 +46,7 @@ class Project:
     title: str
     description: Optional[str]
     last_edited_at: Optional[datetime] = None
+    last_published_at: Optional[datetime] = None
     created_at: Optional[datetime] = None
     status: Optional[Status] = None
     categories: Optional[List[Category]] = None  # TODO: emit category description!
@@ -67,6 +68,7 @@ class Component:
     title: str
     description: Optional[str]
     last_edited_at: Optional[datetime] = None
+    last_published_at: Optional[datetime] = None
     created_at: Optional[datetime] = None
     status: Optional[Status] = None
     categories: Optional[List[Category]] = None

datahub/ingestion/source/looker/looker_common.py
@@ -379,6 +379,14 @@ class ExploreUpstreamViewField:
                : -(len(self.field.field_group_variant.lower()) + 1)
            ]
 
+        # Validate that field_name is not empty to prevent invalid schema field URNs
+        if not field_name or not field_name.strip():
+            logger.warning(
+                f"Empty field name detected for field '{self.field.name}' in explore '{self.explore.name}'. "
+                f"Skipping field to prevent invalid schema field URN generation."
+            )
+            return None
+
         assert view_name  # for lint false positive
 
         project_include: ProjectInclude = ProjectInclude(
@@ -1351,7 +1359,25 @@ class LookerExplore:
         fine_grained_lineages = []
         if config.extract_column_level_lineage:
             for field in self.fields or []:
+                # Skip creating fine-grained lineage for empty field names to prevent invalid schema field URNs
+                if not field.name or not field.name.strip():
+                    logger.warning(
+                        f"Skipping fine-grained lineage for field with empty name in explore '{self.name}'"
+                    )
+                    continue
+
                 for upstream_column_ref in field.upstream_fields:
+                    # Skip creating fine-grained lineage for empty column names to prevent invalid schema field URNs
+                    if (
+                        not upstream_column_ref.column
+                        or not upstream_column_ref.column.strip()
+                    ):
+                        logger.warning(
+                            f"Skipping some fine-grained lineage for field '{field.name}' in explore '{self.name}' "
+                            f"due to empty upstream column name in table '{upstream_column_ref.table}'"
+                        )
+                        continue
+
                     fine_grained_lineages.append(
                         FineGrainedLineageClass(
                             upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,

datahub/ingestion/source/mock_data/datahub_mock_data.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple, Union
 
 from pydantic import Field
 
@@ -14,6 +14,7 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import Source, SourceReport, StructuredLogCategory
+from datahub.ingestion.api.source_helpers import AutoSystemMetadata, auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
@@ -31,6 +32,7 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
     UpstreamLineageClass,
 )
+from datahub.sdk.entity import Entity
 from datahub.utilities.str_enum import StrEnum
 
 logger = logging.getLogger(__name__)
@@ -165,6 +167,14 @@ class DataHubMockDataSource(Source):
         self.report = DataHubMockDataReport()
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        workunit_processors = [AutoSystemMetadata(self.ctx).stamp]
+        return self._apply_workunit_processors(
+            workunit_processors, auto_workunit(self.get_workunits_internal())
+        )
+
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]]:
         # We don't want any implicit aspects to be produced
         # so we are not using get_workunits_internal
 

datahub/ingestion/source/snowflake/constants.py
@@ -55,6 +55,7 @@ class SnowflakeObjectDomain(StrEnum):
     ICEBERG_TABLE = "iceberg table"
     STREAM = "stream"
     PROCEDURE = "procedure"
+    DYNAMIC_TABLE = "dynamic table"
 
 
 GENERIC_PERMISSION_ERROR_KEY = "permission-error"

datahub/ingestion/source/snowflake/snowflake_config.py
@@ -356,11 +356,18 @@ class SnowflakeV2Config(
 
     pushdown_deny_usernames: List[str] = Field(
         default=[],
-        description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+        description="List of snowflake usernames (SQL LIKE patterns, e.g., 'SERVICE_%', '%_PROD', 'TEST_USER') which will NOT be considered for lineage/usage/queries extraction. "
         "This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
         "Only applicable if `use_queries_v2` is enabled.",
     )
 
+    pushdown_allow_usernames: List[str] = Field(
+        default=[],
+        description="List of snowflake usernames (SQL LIKE patterns, e.g., 'ANALYST_%', '%_USER', 'MAIN_ACCOUNT') which WILL be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering in only specific users. "
+        "Only applicable if `use_queries_v2` is enabled. If not specified, all users not in deny list are included.",
+    )
+
     push_down_database_pattern_access_history: bool = Field(
         default=False,
         description="If enabled, pushes down database pattern filtering to the access_history table for improved performance. "
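For context, the two pushdown_* fields are meant to be combined: deny patterns exclude users, allow patterns restrict extraction to matching users, and both only apply when use_queries_v2 is on. A hedged sketch of the relevant recipe fragment, written as a Python dict; only use_queries_v2 and the two pushdown_* keys are taken from this diff, the rest of the source config is omitted:

    snowflake_source_config = {
        "use_queries_v2": True,
        # Skip high-volume service accounts entirely (SQL LIKE patterns).
        "pushdown_deny_usernames": ["SERVICE_%", "%_ETL"],
        # Only consider analyst users plus one named account.
        "pushdown_allow_usernames": ["ANALYST_%", "MAIN_ACCOUNT"],
    }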

datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -89,10 +89,17 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
 
     pushdown_deny_usernames: List[str] = pydantic.Field(
         default=[],
-        description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+        description="List of snowflake usernames (SQL LIKE patterns, e.g., 'SERVICE_%', '%_PROD', 'TEST_USER') which will NOT be considered for lineage/usage/queries extraction. "
         "This is primarily useful for improving performance by filtering out users with extremely high query volumes.",
     )
 
+    pushdown_allow_usernames: List[str] = pydantic.Field(
+        default=[],
+        description="List of snowflake usernames (SQL LIKE patterns, e.g., 'ANALYST_%', '%_USER', 'MAIN_ACCOUNT') which WILL be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering in only specific users. "
+        "If not specified, all users not in deny list are included.",
+    )
+
     user_email_pattern: AllowDenyPattern = pydantic.Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for user emails to filter in usage.",
@@ -396,6 +403,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             end_time=self.config.window.end_time,
             bucket_duration=self.config.window.bucket_duration,
             deny_usernames=self.config.pushdown_deny_usernames,
+            allow_usernames=self.config.pushdown_allow_usernames,
             dedup_strategy=self.config.query_dedup_strategy,
             database_pattern=self.filters.filter_config.database_pattern
             if self.config.push_down_database_pattern_access_history
@@ -740,7 +748,8 @@ class QueryLogQueryBuilder:
         start_time: datetime,
         end_time: datetime,
         bucket_duration: BucketDuration,
-        deny_usernames: Optional[List[str]],
+        deny_usernames: Optional[List[str]] = None,
+        allow_usernames: Optional[List[str]] = None,
         max_tables_per_query: int = 20,
         dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
         database_pattern: Optional[AllowDenyPattern] = None,
@@ -753,10 +762,7 @@ class QueryLogQueryBuilder:
         self.max_tables_per_query = max_tables_per_query
         self.dedup_strategy = dedup_strategy
 
-        self.users_filter = "TRUE"
-        if deny_usernames:
-            user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
-            self.users_filter = f"user_name NOT IN ({user_not_in})"
+        self.users_filter = self._build_user_filter(deny_usernames, allow_usernames)
 
         self.access_history_database_filter = (
             self._build_access_history_database_filter_condition(
@@ -767,6 +773,43 @@ class QueryLogQueryBuilder:
         self.time_bucket_size = bucket_duration.value
         assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")
 
+    def _build_user_filter(
+        self,
+        deny_usernames: Optional[List[str]] = None,
+        allow_usernames: Optional[List[str]] = None,
+    ) -> str:
+        """
+        Build user filter SQL condition based on deny and allow username patterns.
+
+        Args:
+            deny_usernames: List of username patterns to exclude (SQL LIKE patterns)
+            allow_usernames: List of username patterns to include (SQL LIKE patterns)
+
+        Returns:
+            SQL WHERE condition string for filtering users
+        """
+        user_filters = []
+
+        if deny_usernames:
+            deny_conditions = []
+            for pattern in deny_usernames:
+                # Escape single quotes for SQL safety
+                escaped_pattern = pattern.replace("'", "''")
+                deny_conditions.append(f"user_name NOT ILIKE '{escaped_pattern}'")
+            if deny_conditions:
+                user_filters.append(f"({' AND '.join(deny_conditions)})")
+
+        if allow_usernames:
+            allow_conditions = []
+            for pattern in allow_usernames:
+                # Escape single quotes for SQL safety
+                escaped_pattern = pattern.replace("'", "''")
+                allow_conditions.append(f"user_name ILIKE '{escaped_pattern}'")
+            if allow_conditions:
+                user_filters.append(f"({' OR '.join(allow_conditions)})")
+
+        return " AND ".join(user_filters) if user_filters else "TRUE"
+
     def _build_access_history_database_filter_condition(
         self,
         database_pattern: Optional[AllowDenyPattern],
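Net effect of _build_user_filter: deny patterns become an AND of NOT ILIKE conditions, allow patterns an OR of ILIKE conditions, and the two groups are joined with AND (falling back to TRUE when both lists are empty). A small standalone sketch that reproduces the construction from the hunk above and shows the resulting SQL:

    from typing import List, Optional

    def _escape(pattern: str) -> str:
        # Escape single quotes for SQL safety, as in the hunk above.
        return pattern.replace("'", "''")

    def build_user_filter(deny: Optional[List[str]] = None,
                          allow: Optional[List[str]] = None) -> str:
        parts = []
        if deny:
            parts.append("(" + " AND ".join(
                f"user_name NOT ILIKE '{_escape(p)}'" for p in deny) + ")")
        if allow:
            parts.append("(" + " OR ".join(
                f"user_name ILIKE '{_escape(p)}'" for p in allow) + ")")
        return " AND ".join(parts) if parts else "TRUE"

    print(build_user_filter(deny=["SERVICE_%"], allow=["ANALYST_%", "MAIN_ACCOUNT"]))
    # (user_name NOT ILIKE 'SERVICE_%') AND (user_name ILIKE 'ANALYST_%' OR user_name ILIKE 'MAIN_ACCOUNT')
    print(build_user_filter())
    # TRUE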