acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/cli/specific/dataset_cli.py +26 -10
- datahub/emitter/mce_builder.py +1 -3
- datahub/emitter/mcp_builder.py +8 -0
- datahub/emitter/request_helper.py +19 -14
- datahub/emitter/response_helper.py +25 -18
- datahub/emitter/rest_emitter.py +23 -7
- datahub/errors.py +8 -0
- datahub/ingestion/api/source.py +7 -2
- datahub/ingestion/api/source_helpers.py +14 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/graph/client.py +26 -20
- datahub/ingestion/graph/filters.py +62 -17
- datahub/ingestion/sink/datahub_rest.py +2 -2
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +17 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +193 -140
- datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
- datahub/ingestion/source/mlflow.py +217 -8
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/config.py +31 -4
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +41 -24
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
- datahub/ingestion/source/redshift/lineage_v2.py +9 -1
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/sigma/config.py +3 -4
- datahub/ingestion/source/sigma/sigma.py +10 -6
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/oracle.py +51 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +291 -35
- datahub/ingestion/source/usage/usage_common.py +0 -65
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +313 -2
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +24 -1
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +94 -37
- datahub/sql_parsing/split_statements.py +17 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/testing/mcp_diff.py +1 -18
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/source/vertexai.py +0 -697
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_source.py

@@ -1,7 +1,6 @@
 import logging
-import re
-from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional

 from datahub.emitter.mce_builder import (
@@ -28,7 +27,10 @@ from datahub.ingestion.source.dremio.dremio_api import (
     DremioEdition,
 )
 from datahub.ingestion.source.dremio.dremio_aspects import DremioAspects
-from datahub.ingestion.source.dremio.dremio_config import
+from datahub.ingestion.source.dremio.dremio_config import (
+    DremioSourceConfig,
+    DremioSourceMapping,
+)
 from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
     DremioToDataHubSourceTypeMapping,
 )
@@ -39,6 +41,7 @@ from datahub.ingestion.source.dremio.dremio_entities import (
     DremioDatasetType,
     DremioGlossaryTerm,
     DremioQuery,
+    DremioSourceContainer,
 )
 from datahub.ingestion.source.dremio.dremio_profiling import DremioProfiler
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
@@ -65,6 +68,17 @@ from datahub.sql_parsing.sql_parsing_aggregator import (
 logger = logging.getLogger(__name__)


+@dataclass
+class DremioSourceMapEntry:
+    platform: str
+    source_name: str
+    dremio_source_category: str
+    root_path: str = ""
+    database_name: str = ""
+    platform_instance: Optional[str] = None
+    env: Optional[str] = None
+
+
 @platform_name("Dremio")
 @config_class(DremioSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
@@ -112,7 +126,7 @@ class DremioSource(StatefulIngestionSourceBase):
         self.default_db = "dremio"
         self.config = config
         self.report = DremioSourceReport()
-        self.source_map: Dict[str,
+        self.source_map: Dict[str, DremioSourceMapEntry] = dict()

         # Initialize API operations
         dremio_api = DremioAPIOperations(self.config, self.report)
@@ -152,111 +166,12 @@ class DremioSource(StatefulIngestionSourceBase):
     def get_platform(self) -> str:
         return "dremio"

-    def _build_source_map(self) -> Dict[str,
-        """
-        Builds a source mapping dictionary to support external lineage generation across
-        multiple Dremio sources, based on provided configuration mappings.
-
-        This method operates as follows:
-
-        1. If a source mapping is present in the config:
-           - For each source in the Dremio catalog, if the mapping's `source_name` matches
-             the `dremio_source_type`, `root_path` and `database_name` are added to the mapping
-             information, along with the platform, platform instance, and environment if they exist.
-             This allows constructing the full URN for upstream lineage.
-
-        2. If a source mapping is absent in the configuration:
-           - Default mappings are created for each source name, setting `env` and `platform_instance`
-             to default values and classifying the source type. This ensures all sources have a
-             mapping, even if specific configuration details are missing.
-
-        Returns:
-            Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
-            (lowercased) and each value is another dictionary containing:
-                - `platform`: The source platform.
-                - `source_name`: The source name.
-                - `dremio_source_type`: The type mapped to DataHub,
-                  e.g., "database", "folder".
-                - Optional `root_path`, `database_name`, `platform_instance`,
-                  and `env` if provided in the configuration.
-        Example:
-            This method is used internally within the class to generate mappings before
-            creating cross-platform lineage.
-
-        """
-
-        source_map = {}
+    def _build_source_map(self) -> Dict[str, DremioSourceMapEntry]:
         dremio_sources = self.dremio_catalog.get_sources()
+        source_mappings_config = self.config.source_mappings or []

-
-
-            if isinstance(source.dremio_source_type, str):
-                source_type = source.dremio_source_type.lower()
-                root_path = source.root_path.lower() if source.root_path else ""
-                database_name = (
-                    source.database_name.lower() if source.database_name else ""
-                )
-                source_present = False
-                source_platform_name = source_name
-
-                for mapping in self.config.source_mappings or []:
-                    if re.search(mapping.source_name, source_type, re.IGNORECASE):
-                        source_platform_name = mapping.source_name.lower()
-
-                    datahub_source_type = (
-                        DremioToDataHubSourceTypeMapping.get_datahub_source_type(
-                            source_type
-                        )
-                    )
-
-                    if re.search(mapping.platform, datahub_source_type, re.IGNORECASE):
-                        source_platform_name = source_platform_name.lower()
-                        source_map[source_platform_name] = {
-                            "platform": mapping.platform,
-                            "source_name": mapping.source_name,
-                            "dremio_source_type": DremioToDataHubSourceTypeMapping.get_category(
-                                source_type,
-                            ),
-                            "root_path": root_path,
-                            "database_name": database_name,
-                            "platform_instance": mapping.platform_instance,
-                            "env": mapping.env,
-                        }
-                        source_present = True
-                        break
-
-                if not source_present:
-                    try:
-                        dremio_source_type = (
-                            DremioToDataHubSourceTypeMapping.get_category(source_type)
-                        )
-                    except Exception as exc:
-                        logger.info(
-                            f"Source {source_type} is not a standard Dremio source type. "
-                            f"Adding source_type {source_type} to mapping as database. Error: {exc}"
-                        )
-
-                        DremioToDataHubSourceTypeMapping.add_mapping(
-                            source_type, source_name
-                        )
-                        dremio_source_type = (
-                            DremioToDataHubSourceTypeMapping.get_category(source_type)
-                        )
-
-                    source_map[source_platform_name.lower()] = {
-                        "platform": source_type,
-                        "source_name": source_name,
-                        "dremio_source_type": dremio_source_type,
-                    }
-
-            else:
-                logger.error(
-                    f'Source "{source.container_name}" is broken. Containers will not be created for source.'
-                )
-                logger.error(
-                    f'No new cross-platform lineage will be emitted for source "{source.container_name}".'
-                )
-                logger.error("Fix this source in Dremio to fix this issue.")
+        source_map = build_dremio_source_map(dremio_sources, source_mappings_config)
+        logger.info(f"Full source map: {source_map}")

         return source_map

@@ -431,6 +346,7 @@ class DremioSource(StatefulIngestionSourceBase):
             dremio_path=dataset_info.path,
             dremio_dataset=dataset_info.resource_name,
         )
+        logger.debug(f"Upstream dataset for {dataset_urn}: {upstream_urn}")

         if upstream_urn:
             upstream_lineage = UpstreamLineage(
@@ -596,25 +512,23 @@ class DremioSource(StatefulIngestionSourceBase):
         if not mapping:
             return None

-        platform = mapping.
+        platform = mapping.platform
         if not platform:
             return None

-        platform_instance = mapping.
-
-        )
-        env = mapping.get("env", self.config.env)
+        platform_instance = mapping.platform_instance
+        env = mapping.env or self.config.env

         root_path = ""
         database_name = ""

-        if mapping.
-            if mapping.
-                root_path = f"{mapping
+        if mapping.dremio_source_category == "file_object_storage":
+            if mapping.root_path:
+                root_path = f"{mapping.root_path[1:]}/"
             dremio_dataset = f"{root_path}{'/'.join(dremio_path[1:])}/{dremio_dataset}"
         else:
-            if mapping.
-                database_name = f"{mapping
+            if mapping.database_name:
+                database_name = f"{mapping.database_name}."
             dremio_dataset = (
                 f"{database_name}{'.'.join(dremio_path[1:])}.{dremio_dataset}"
             )
@@ -639,3 +553,68 @@ class DremioSource(StatefulIngestionSourceBase):
         Get the source report.
         """
         return self.report
+
+
+def build_dremio_source_map(
+    dremio_sources: Iterable[DremioSourceContainer],
+    source_mappings_config: List[DremioSourceMapping],
+) -> Dict[str, DremioSourceMapEntry]:
+    """
+    Builds a source mapping dictionary to support external lineage generation across
+    multiple Dremio sources, based on provided configuration mappings.
+
+    This method operates as follows:
+
+    Returns:
+        Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
+        (lowercased) and each value is another entry containing:
+            - `platform`: The source platform.
+            - `source_name`: The source name.
+            - `dremio_source_category`: The type mapped to DataHub,
+              e.g., "database", "folder".
+            - Optional `root_path`, `database_name`, `platform_instance`,
+              and `env` if provided in the configuration.
+    Example:
+        This method is used internally within the class to generate mappings before
+        creating cross-platform lineage.

+    """
+    source_map = {}
+    for source in dremio_sources:
+        current_source_name = source.container_name
+
+        source_type = source.dremio_source_type.lower()
+        source_category = DremioToDataHubSourceTypeMapping.get_category(source_type)
+        datahub_platform = DremioToDataHubSourceTypeMapping.get_datahub_platform(
+            source_type
+        )
+        root_path = source.root_path.lower() if source.root_path else ""
+        database_name = source.database_name.lower() if source.database_name else ""
+        source_present = False
+
+        for mapping in source_mappings_config:
+            if mapping.source_name.lower() == current_source_name.lower():
+                source_map[current_source_name.lower()] = DremioSourceMapEntry(
+                    platform=mapping.platform,
+                    source_name=mapping.source_name,
+                    dremio_source_category=source_category,
+                    root_path=root_path,
+                    database_name=database_name,
+                    platform_instance=mapping.platform_instance,
+                    env=mapping.env,
+                )
+                source_present = True
+                break
+
+        if not source_present:
+            source_map[current_source_name.lower()] = DremioSourceMapEntry(
+                platform=datahub_platform,
+                source_name=current_source_name,
+                dremio_source_category=source_category,
+                root_path=root_path,
+                database_name=database_name,
+                platform_instance=None,
+                env=None,
+            )
+
+    return source_map
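The refactor above replaces the untyped per-source dictionaries with the DremioSourceMapEntry dataclass and moves the mapping logic into the module-level build_dremio_source_map helper, keyed by lowercased source name. The snippet below is a minimal, self-contained sketch of that lookup pattern, not the DataHub code itself: SourceMapEntry, lookup_entry, and the sample source names are illustrative stand-ins.

from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class SourceMapEntry:
    # Stand-in for the DremioSourceMapEntry added in this release.
    platform: str
    source_name: str
    dremio_source_category: str
    root_path: str = ""
    database_name: str = ""
    platform_instance: Optional[str] = None
    env: Optional[str] = None


def lookup_entry(
    source_map: Dict[str, SourceMapEntry], dremio_path: List[str]
) -> Optional[SourceMapEntry]:
    # Keys are lowercased source names, mirroring build_dremio_source_map.
    return source_map.get(dremio_path[0].lower()) if dremio_path else None


source_map = {
    "s3_lake": SourceMapEntry(
        platform="s3",
        source_name="S3_Lake",
        dremio_source_category="file_object_storage",
        root_path="/lake",
    ),
    "warehouse": SourceMapEntry(
        platform="snowflake",
        source_name="warehouse",
        dremio_source_category="database",
        database_name="analytics",
        env="PROD",
    ),
}

entry = lookup_entry(source_map, ["warehouse", "public", "orders"])
assert entry is not None and entry.platform == "snowflake"

Compared with the old dictionary entries, attribute access such as entry.platform or entry.dremio_source_category fails loudly on a typo, which is what the mapping.platform and mapping.dremio_source_category reads later in this diff rely on.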
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.graph.filters import RemovedStatusFilter
+from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError

 logger = logging.getLogger(__name__)

-QUERY_ENTITIES = """
-query listEntities($input: ScrollAcrossEntitiesInput!) {
-  scrollAcrossEntities(input: $input) {
-    nextScrollId
-    count
-    searchResults {
-      entity {
-        ... on QueryEntity {
-          urn
-        }
-        ... on DataProcessInstance {
-          urn
-        }
-      }
-    }
-  }
-}
-"""
-

 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     )

     entity_types: Optional[List[str]] = Field(
-        default
+        # A default value is required otherwise QUERY and DATAPROCESS_INSTANCE won't be included
+        default=[
+            "dataset",
+            "dashboard",
+            "chart",
+            "mlmodel",
+            "mlmodelGroup",
+            "mlfeatureTable",
+            "mlfeature",
+            "mlprimaryKey",
+            "dataFlow",
+            "dataJob",
+            "glossaryTerm",
+            "glossaryNode",
+            "tag",
+            "role",
+            "corpuser",
+            "corpGroup",
+            "container",
+            "domain",
+            "dataProduct",
+            "notebook",
+            "businessAttribute",
+            "schemaField",
+            "query",
+            "dataProcessInstance",
+        ],
         description="List of entity types to cleanup",
     )

@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
+        default_factory=TopKDict
+    )
     num_soft_deleted_entity_removal_started: int = 0
     num_hard_deleted: int = 0
     num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
     )
     runtime_limit_reached: bool = False
     deletion_limit_reached: bool = False
+    num_soft_deleted_entity_found: int = 0
+    num_soft_deleted_entity_invalid_urn: int = 0


 class SoftDeletedEntitiesCleanup:
@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
-        self.start_time =
+        self.start_time = time.time()
         self._report_lock: Lock = Lock()
         self.last_print_time = 0.0

@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
         with self._report_lock:
             self.report.num_soft_deleted_retained_due_to_age += 1

+    def _increment_retained_by_type(self, type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
+                self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
+                + 1
+            )
+
     def _increment_removal_started_count(self) -> None:
         """Thread-safe method to update report fields"""
         with self._report_lock:
@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
         )
         self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

-    def delete_entity(self, urn:
+    def delete_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

-        entity_urn = Urn.from_string(urn)
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
         if self._deletion_limit_reached() or self._times_up():
             return
         self._increment_removal_started_count()
-        self.ctx.graph.delete_entity(urn=urn, hard=True)
+        self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
         self.ctx.graph.delete_references_to_urn(
-            urn=urn,
+            urn=urn.urn(),
             dry_run=False,
         )
-        self._update_report(urn,
+        self._update_report(urn.urn(), urn.entity_type)

-    def delete_soft_deleted_entity(self, urn:
+    def delete_soft_deleted_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

         retention_time = (
@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
             - self.config.retention_days * 24 * 60 * 60
         )

-        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
+        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
         if "status" in aspect["aspects"]:
             if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
                 "status"
@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)
             else:
                 self._increment_retained_count()
+                self._increment_retained_by_type(urn.entity_type)

     def _print_report(self) -> None:
         time_taken = round(time.time() - self.last_print_time, 1)
@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
             self.last_print_time = time.time()
             logger.info(f"\n{self.report.as_string()}")

-    def _process_futures(self, futures: Dict[Future,
+    def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
         done, not_done = wait(futures, return_when=FIRST_COMPLETED)
         futures = {future: urn for future, urn in futures.items() if future in not_done}

@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
                 self.report.failure(
                     title="Failed to delete entity",
                     message="Failed to delete entity",
-                    context=futures[future],
+                    context=futures[future].urn(),
                     exc=future.exception(),
                 )
             self.report.num_soft_deleted_entity_processed += 1
@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
             time.sleep(self.config.delay)
         return futures

-    def
+    def _get_urns(self) -> Iterable[str]:
         assert self.ctx.graph
-
-
-
-
-
-        # to avoid a giant stacktrace by having a smaller batch size in first call
-        # This will be remove in future version after server with fix has been
-        # around for a while
-        batch_size = 10
-
-        while True:
-            try:
-                if entity_type not in self.report.num_calls_made:
-                    self.report.num_calls_made[entity_type] = 1
-                else:
-                    self.report.num_calls_made[entity_type] += 1
-                self._print_report()
-                result = self.ctx.graph.execute_graphql(
-                    graphql_query,
-                    {
-                        "input": {
-                            "types": [entity_type],
-                            "query": "*",
-                            "scrollId": scroll_id if scroll_id else None,
-                            "count": batch_size,
-                            "orFilters": [
-                                {
-                                    "and": [
-                                        {
-                                            "field": "removed",
-                                            "values": ["true"],
-                                            "condition": "EQUAL",
-                                        }
-                                    ]
-                                }
-                            ],
-                        }
-                    },
-                )
-            except Exception as e:
-                self.report.failure(
-                    f"While trying to get {entity_type} with {scroll_id}", exc=e
-                )
-                break
-            scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities:
-                break
-            search_results = scroll_across_entities.get("searchResults")
-            count = scroll_across_entities.get("count")
-            if not count or not search_results:
-                # Due to a server bug we cannot rely on just count as it was returning response like this
-                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
-                break
-            if entity_type == "DATA_PROCESS_INSTANCE":
-                # Temp workaround. See note in beginning of the function
-                # We make the batch size = config after call has succeeded once
-                batch_size = self.config.batch_size
-            scroll_id = scroll_across_entities.get("nextScrollId")
-            if entity_type not in self.report.num_entities_found:
-                self.report.num_entities_found[entity_type] = 0
-            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
-                "count"
+        # Entities created in the retention period are not considered for deletion
+        created_from = int(
+            (
+                datetime.now(timezone.utc).timestamp()
+                - self.config.retention_days * 24 * 60 * 60
             )
-
-
+            * 1000
+        )
+
+        entity_types = self.config.entity_types
+        # dataProcessInstance is a special case where we need to get the entities separately
+        # because we need to filter based on created time we don't stream to many dataProcessInstance entities at once
+        # Gc source soft-deletes dataProcessInstance entities which causes to have a lot of soft deleted entities
+        if (
+            self.config.entity_types
+            and "dataProcessInstance" in self.config.entity_types
+        ):
+            entity_types = self.config.entity_types.copy()
+            yield from self.ctx.graph.get_urns_by_filter(
+                entity_types=["dataProcessInstance"],
+                platform=self.config.platform,
+                env=self.config.env,
+                query=self.config.query,
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=self.config.batch_size,
+                extraFilters=[
+                    SearchFilterRule(
+                        field="created",
+                        condition="LESS_THAN",
+                        values=[f"{created_from}"],
+                    ).to_raw()
+                ],
+            )
+
+            entity_types.remove("dataProcessInstance")

-    def _get_urns(self) -> Iterable[str]:
-        assert self.ctx.graph
         yield from self.ctx.graph.get_urns_by_filter(
-            entity_types=
+            entity_types=entity_types,
             platform=self.config.platform,
             env=self.config.env,
             query=self.config.query,
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")

     def _times_up(self) -> bool:
         if (
@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
             return
         self.start_time = time.time()

-        futures: Dict[Future,
+        futures: Dict[Future, Urn] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             for urn in self._get_urns():
+                try:
+                    self.report.num_soft_deleted_entity_found += 1
+                    soft_deleted_urn = Urn.from_string(urn)
+                except InvalidUrnError as e:
+                    logger.error(f"Failed to parse urn {urn} with error {e}")
+                    self.report.num_soft_deleted_entity_invalid_urn += 1
+                    continue
+
                 self._print_report()
                 while len(futures) >= self.config.futures_max_at_time:
                     futures = self._process_futures(futures)
                 if self._deletion_limit_reached() or self._times_up():
                     break
-                future = executor.submit(
-
+                future = executor.submit(
+                    self.delete_soft_deleted_entity, soft_deleted_urn
+                )
+                futures[future] = soft_deleted_urn

             logger.info(f"Waiting for {len(futures)} futures to complete")
             while len(futures) > 0:
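The cleanup source above now collects soft-deleted URNs through get_urns_by_filter, limiting dataProcessInstance entities to those created before a retention cutoff expressed in epoch milliseconds, and it parses every URN before scheduling a hard delete on a thread pool. The sketch below reproduces only the cutoff arithmetic and the validate-then-submit shape; valid_urns and hard_delete are made-up placeholders for the Urn.from_string/InvalidUrnError handling and graph.delete_entity calls, not DataHub APIs.

from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from typing import Iterable, List


def created_cutoff_ms(retention_days: int) -> int:
    # Entities created after this epoch-millisecond cutoff are retained.
    return int(
        (datetime.now(timezone.utc).timestamp() - retention_days * 24 * 60 * 60) * 1000
    )


def valid_urns(raw: Iterable[str]) -> List[str]:
    # Placeholder shape check; the real code uses Urn.from_string and InvalidUrnError.
    kept = []
    for urn in raw:
        if not urn.startswith("urn:li:"):
            print(f"skipping invalid urn: {urn}")
            continue
        kept.append(urn)
    return kept


def hard_delete(urn: str) -> str:
    # Placeholder for graph.delete_entity(urn, hard=True) plus reference cleanup.
    return f"deleted {urn}"


print("created <", created_cutoff_ms(retention_days=10))
urns = valid_urns(
    ["urn:li:dataset:(urn:li:dataPlatform:hive,db.tbl,PROD)", "not-a-urn"]
)
with ThreadPoolExecutor(max_workers=4) as pool:
    for outcome in pool.map(hard_delete, urns):
        print(outcome)

Counting parse failures separately (num_soft_deleted_entity_invalid_urn in the report) keeps one malformed URN from aborting the whole run, which is the point of the try/except added around Urn.from_string.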
datahub/ingestion/source/ge_data_profiler.py

@@ -602,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower()
+            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -610,6 +610,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                         )
                     ).scalar()
                 )
+            elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
+                column_profile.median = str(
+                    self.dataset.engine.execute(
+                        sa.select(
+                            sa.text(
+                                f"approx_percentile(`{column}`, 0.5) as approx_median"
+                            )
+                        ).select_from(self.dataset._table)
+                    ).scalar()
+                )
             elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
                 column_profile.median = str(
                     self.dataset.engine.execute(
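For Databricks, the profiler above now derives the column median from approx_percentile(col, 0.5) injected as a raw SQL fragment via sa.text. The snippet below is a small SQLAlchemy sketch of assembling and rendering such a statement; the sales table and price column are made up, and no Databricks connection is needed just to print the SQL.

import sqlalchemy as sa

column = "price"  # hypothetical column name
stmt = sa.select(
    sa.text(f"approx_percentile(`{column}`, 0.5) as approx_median")
).select_from(sa.table("sales"))  # hypothetical table name

# Renders roughly as: SELECT approx_percentile(`price`, 0.5) as approx_median FROM sales
print(stmt)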