acryl-datahub 1.0.0.1rc2__py3-none-any.whl → 1.0.0.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/METADATA +2569 -2569
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/RECORD +37 -35
- datahub/_version.py +1 -1
- datahub/emitter/rest_emitter.py +2 -2
- datahub/ingestion/graph/client.py +6 -11
- datahub/ingestion/graph/filters.py +22 -2
- datahub/ingestion/source/common/subtypes.py +1 -1
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/mlflow.py +19 -1
- datahub/ingestion/source/redshift/lineage_v2.py +7 -0
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +153 -13
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/metadata/schema.avsc +2 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/main_client.py +2 -1
- datahub/sdk/search_filters.py +18 -23
- datahub/sql_parsing/split_statements.py +17 -3
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.graph.filters import RemovedStatusFilter
+from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError

 logger = logging.getLogger(__name__)

-QUERY_ENTITIES = """
-query listEntities($input: ScrollAcrossEntitiesInput!) {
-  scrollAcrossEntities(input: $input) {
-    nextScrollId
-    count
-    searchResults {
-      entity {
-        ... on QueryEntity {
-          urn
-        }
-        ... on DataProcessInstance {
-          urn
-        }
-      }
-    }
-  }
-}
-"""
-

 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(

@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     )

     entity_types: Optional[List[str]] = Field(
-        default
+        # A default value is required otherwise QUERY and DATAPROCESS_INSTANCE won't be included
+        default=[
+            "dataset",
+            "dashboard",
+            "chart",
+            "mlmodel",
+            "mlmodelGroup",
+            "mlfeatureTable",
+            "mlfeature",
+            "mlprimaryKey",
+            "dataFlow",
+            "dataJob",
+            "glossaryTerm",
+            "glossaryNode",
+            "tag",
+            "role",
+            "corpuser",
+            "corpGroup",
+            "container",
+            "domain",
+            "dataProduct",
+            "notebook",
+            "businessAttribute",
+            "schemaField",
+            "query",
+            "dataProcessInstance",
+        ],
         description="List of entity types to cleanup",
     )

@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
+        default_factory=TopKDict
+    )
     num_soft_deleted_entity_removal_started: int = 0
     num_hard_deleted: int = 0
     num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)

@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
     )
     runtime_limit_reached: bool = False
     deletion_limit_reached: bool = False
+    num_soft_deleted_entity_found: int = 0
+    num_soft_deleted_entity_invalid_urn: int = 0


 class SoftDeletedEntitiesCleanup:

@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
-        self.start_time =
+        self.start_time = time.time()
         self._report_lock: Lock = Lock()
         self.last_print_time = 0.0

@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
         with self._report_lock:
             self.report.num_soft_deleted_retained_due_to_age += 1

+    def _increment_retained_by_type(self, type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
+                self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
+                + 1
+            )
+
     def _increment_removal_started_count(self) -> None:
         """Thread-safe method to update report fields"""
         with self._report_lock:

@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
            )
            self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

-    def delete_entity(self, urn:
+    def delete_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

-        entity_urn = Urn.from_string(urn)
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"

@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
         if self._deletion_limit_reached() or self._times_up():
             return
         self._increment_removal_started_count()
-        self.ctx.graph.delete_entity(urn=urn, hard=True)
+        self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
         self.ctx.graph.delete_references_to_urn(
-            urn=urn,
+            urn=urn.urn(),
             dry_run=False,
         )
-        self._update_report(urn,
+        self._update_report(urn.urn(), urn.entity_type)

-    def delete_soft_deleted_entity(self, urn:
+    def delete_soft_deleted_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

         retention_time = (

@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
             - self.config.retention_days * 24 * 60 * 60
         )

-        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
+        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
         if "status" in aspect["aspects"]:
             if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
                 "status"

@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)
             else:
                 self._increment_retained_count()
+                self._increment_retained_by_type(urn.entity_type)

     def _print_report(self) -> None:
         time_taken = round(time.time() - self.last_print_time, 1)

@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
             self.last_print_time = time.time()
             logger.info(f"\n{self.report.as_string()}")

-    def _process_futures(self, futures: Dict[Future,
+    def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
         done, not_done = wait(futures, return_when=FIRST_COMPLETED)
         futures = {future: urn for future, urn in futures.items() if future in not_done}

@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
                 self.report.failure(
                     title="Failed to delete entity",
                     message="Failed to delete entity",
-                    context=futures[future],
+                    context=futures[future].urn(),
                     exc=future.exception(),
                 )
             self.report.num_soft_deleted_entity_processed += 1

@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
                 time.sleep(self.config.delay)
         return futures

-    def
+    def _get_urns(self) -> Iterable[str]:
         assert self.ctx.graph
-
-
-
-
-
-        # to avoid a giant stacktrace by having a smaller batch size in first call
-        # This will be remove in future version after server with fix has been
-        # around for a while
-        batch_size = 10
-
-        while True:
-            try:
-                if entity_type not in self.report.num_calls_made:
-                    self.report.num_calls_made[entity_type] = 1
-                else:
-                    self.report.num_calls_made[entity_type] += 1
-                self._print_report()
-                result = self.ctx.graph.execute_graphql(
-                    graphql_query,
-                    {
-                        "input": {
-                            "types": [entity_type],
-                            "query": "*",
-                            "scrollId": scroll_id if scroll_id else None,
-                            "count": batch_size,
-                            "orFilters": [
-                                {
-                                    "and": [
-                                        {
-                                            "field": "removed",
-                                            "values": ["true"],
-                                            "condition": "EQUAL",
-                                        }
-                                    ]
-                                }
-                            ],
-                        }
-                    },
-                )
-            except Exception as e:
-                self.report.failure(
-                    f"While trying to get {entity_type} with {scroll_id}", exc=e
-                )
-                break
-            scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities:
-                break
-            search_results = scroll_across_entities.get("searchResults")
-            count = scroll_across_entities.get("count")
-            if not count or not search_results:
-                # Due to a server bug we cannot rely on just count as it was returning response like this
-                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
-                break
-            if entity_type == "DATA_PROCESS_INSTANCE":
-                # Temp workaround. See note in beginning of the function
-                # We make the batch size = config after call has succeeded once
-                batch_size = self.config.batch_size
-            scroll_id = scroll_across_entities.get("nextScrollId")
-            if entity_type not in self.report.num_entities_found:
-                self.report.num_entities_found[entity_type] = 0
-            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
-                "count"
+        # Entities created in the retention period are not considered for deletion
+        created_from = int(
+            (
+                datetime.now(timezone.utc).timestamp()
+                - self.config.retention_days * 24 * 60 * 60
             )
-
-
+            * 1000
+        )
+
+        entity_types = self.config.entity_types
+        # dataProcessInstance is a special case where we need to get the entities separately
+        # because we need to filter based on created time we don't stream to many dataProcessInstance entities at once
+        # Gc source soft-deletes dataProcessInstance entities which causes to have a lot of soft deleted entities
+        if (
+            self.config.entity_types
+            and "dataProcessInstance" in self.config.entity_types
+        ):
+            entity_types = self.config.entity_types.copy()
+            yield from self.ctx.graph.get_urns_by_filter(
+                entity_types=["dataProcessInstance"],
+                platform=self.config.platform,
+                env=self.config.env,
+                query=self.config.query,
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=self.config.batch_size,
+                extraFilters=[
+                    SearchFilterRule(
+                        field="created",
+                        condition="LESS_THAN",
+                        values=[f"{created_from}"],
+                    ).to_raw()
+                ],
+            )
+
+            entity_types.remove("dataProcessInstance")

-    def _get_urns(self) -> Iterable[str]:
-        assert self.ctx.graph
         yield from self.ctx.graph.get_urns_by_filter(
-            entity_types=
+            entity_types=entity_types,
             platform=self.config.platform,
             env=self.config.env,
             query=self.config.query,
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")

     def _times_up(self) -> bool:
         if (

@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
             return
         self.start_time = time.time()

-        futures: Dict[Future,
+        futures: Dict[Future, Urn] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             for urn in self._get_urns():
+                try:
+                    self.report.num_soft_deleted_entity_found += 1
+                    soft_deleted_urn = Urn.from_string(urn)
+                except InvalidUrnError as e:
+                    logger.error(f"Failed to parse urn {urn} with error {e}")
+                    self.report.num_soft_deleted_entity_invalid_urn += 1
+                    continue
+
                 self._print_report()
                 while len(futures) >= self.config.futures_max_at_time:
                     futures = self._process_futures(futures)
                 if self._deletion_limit_reached() or self._times_up():
                     break
-                future = executor.submit(
-
+                future = executor.submit(
+                    self.delete_soft_deleted_entity, soft_deleted_urn
+                )
+                futures[future] = soft_deleted_urn

             logger.info(f"Waiting for {len(futures)} futures to complete")
             while len(futures) > 0:
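For orientation, the rework above drops the hand-written scrollAcrossEntities GraphQL pagination and instead pushes a "created LESS_THAN cutoff" filter into get_urns_by_filter for dataProcessInstance entities. A minimal sketch of the cutoff arithmetic and filter construction, using only the helpers visible in the diff (the retention value below is illustrative):

    from datetime import datetime, timezone

    from datahub.ingestion.graph.filters import SearchFilterRule

    retention_days = 10  # stand-in for config.retention_days

    # Cutoff in epoch milliseconds: entities created before this moment are old enough to delete.
    created_from = int(
        (datetime.now(timezone.utc).timestamp() - retention_days * 24 * 60 * 60) * 1000
    )

    # Raw filter passed via get_urns_by_filter(extraFilters=[...]), as in the diff above.
    created_filter = SearchFilterRule(
        field="created",
        condition="LESS_THAN",
        values=[str(created_from)],
    ).to_raw()

For a 10-day window this subtracts 864,000 seconds from the current UTC timestamp; the `* 1000` in the diff converts seconds to milliseconds.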
datahub/ingestion/source/ge_data_profiler.py

@@ -602,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower()
+            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(

@@ -610,6 +610,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                         )
                     ).scalar()
                 )
+            elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
+                column_profile.median = str(
+                    self.dataset.engine.execute(
+                        sa.select(
+                            sa.text(
+                                f"approx_percentile(`{column}`, 0.5) as approx_median"
+                            )
+                        ).select_from(self.dataset._table)
+                    ).scalar()
+                )
             elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
                 column_profile.median = str(
                     self.dataset.engine.execute(
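The new Databricks branch computes an approximate median with approx_percentile(col, 0.5) rather than an exact median(). A standalone sketch of the expression it builds, assuming a SQLAlchemy engine pointed at Databricks (table and column names are placeholders):

    import sqlalchemy as sa

    column = "amount"

    # Mirrors the diff: a textual approx_percentile projection selected from the profiled table.
    stmt = sa.select(
        sa.text(f"approx_percentile(`{column}`, 0.5) as approx_median")
    ).select_from(sa.table("orders"))

    # Renders roughly as:
    #   SELECT approx_percentile(`amount`, 0.5) as approx_median FROM orders
    # median_value = str(engine.execute(stmt).scalar())

An approximate percentile is far cheaper than an exact median on large Delta tables, which is presumably why the approximate form is used for this dialect.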
datahub/ingestion/source/mlflow.py

@@ -1,4 +1,5 @@
 import json
+import os
 import time
 from dataclasses import dataclass
 from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union

@@ -115,6 +116,13 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
         default=None, description="Mapping of source type to datahub platform"
     )

+    username: Optional[str] = Field(
+        default=None, description="Username for MLflow authentication"
+    )
+    password: Optional[str] = Field(
+        default=None, description="Password for MLflow authentication"
+    )
+

 @dataclass
 class MLflowRegisteredModelStageInfo:

@@ -161,7 +169,17 @@ class MLflowSource(StatefulIngestionSourceBase):
         self.ctx = ctx
         self.config = config
         self.report = StaleEntityRemovalSourceReport()
-        self.client =
+        self.client = self._configure_client()
+
+    def _configure_client(self) -> MlflowClient:
+        if bool(self.config.username) != bool(self.config.password):
+            raise ValueError("Both username and password must be set together")
+
+        if self.config.username and self.config.password:
+            os.environ["MLFLOW_TRACKING_USERNAME"] = self.config.username
+            os.environ["MLFLOW_TRACKING_PASSWORD"] = self.config.password
+
+        return MlflowClient(
             tracking_uri=self.config.tracking_uri,
             registry_uri=self.config.registry_uri,
         )
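The MLflow change adds basic-auth support: username and password must be supplied together, and they reach MLflow through its standard MLFLOW_TRACKING_USERNAME / MLFLOW_TRACKING_PASSWORD environment variables before the client is built. A minimal sketch of that mechanism outside the source (URI and credentials are placeholders):

    import os

    from mlflow import MlflowClient

    # MLflow reads basic-auth credentials from these environment variables.
    os.environ["MLFLOW_TRACKING_USERNAME"] = "ingestion-bot"
    os.environ["MLFLOW_TRACKING_PASSWORD"] = "not-a-real-password"

    client = MlflowClient(tracking_uri="https://mlflow.example.com")
    # Subsequent calls such as client.search_registered_models() authenticate with basic auth.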
datahub/ingestion/source/redshift/lineage_v2.py

@@ -400,6 +400,10 @@ class RedshiftSqlLineageV2(Closeable):
         db_schemas: Dict[str, Dict[str, RedshiftSchema]],
     ) -> None:
         for schema_name, tables in all_tables[self.database].items():
+            logger.info(f"External table lineage: checking schema {schema_name}")
+            if not db_schemas[self.database].get(schema_name):
+                logger.warning(f"Schema {schema_name} not found")
+                continue
             for table in tables:
                 schema = db_schemas[self.database][schema_name]
                 if (

@@ -407,6 +411,9 @@ class RedshiftSqlLineageV2(Closeable):
                     and schema.is_external_schema()
                     and schema.external_platform
                 ):
+                    logger.info(
+                        f"External table lineage: processing table {schema_name}.{table.name}"
+                    )
                     # external_db_params = schema.option
                     upstream_platform = schema.external_platform.lower()

datahub/ingestion/source/redshift/query.py

@@ -44,7 +44,7 @@ class RedshiftCommonQuery:
        SELECT
            schema_name,
            schema_type,
-           schema_option,
+           cast(null as varchar(1024)) as schema_option,
            cast(null as varchar(256)) as external_platform,
            cast(null as varchar(256)) as external_database
        FROM svv_redshift_schemas
datahub/ingestion/source/snowflake/snowflake_config.py

@@ -100,7 +100,15 @@ class SnowflakeFilterConfig(SQLFilterConfig):

     stream_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
-        description="Regex patterns for streams to filter in ingestion.
+        description="Regex patterns for streams to filter in ingestion. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )
+
+    procedure_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for procedures to filter in ingestion. "
+        "Specify regex to match the entire procedure name in database.schema.procedure format. "
+        "e.g. to match all procedures starting with customer in Customer database and public schema,"
+        " use the regex 'Customer.public.customer.*'",
     )

     match_fully_qualified_names: bool = Field(

@@ -284,6 +292,11 @@ class SnowflakeV2Config(
         description="If enabled, streams will be ingested as separate entities from tables/views.",
     )

+    include_procedures: bool = Field(
+        default=True,
+        description="If enabled, procedures will be ingested as pipelines/tasks.",
+    )
+
     structured_property_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description=(
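The new procedure_pattern follows the same convention as the other Snowflake filters: an AllowDenyPattern matched against the fully qualified database.schema.procedure name. A small illustration, assuming the usual AllowDenyPattern helper from datahub.configuration.common (pattern values are made up):

    from datahub.configuration.common import AllowDenyPattern

    procedure_pattern = AllowDenyPattern(
        allow=["ANALYTICS\\.PUBLIC\\..*"],  # keep procedures in ANALYTICS.PUBLIC
        deny=[".*\\.STAGING\\..*"],         # drop anything in a STAGING schema
    )

    assert procedure_pattern.allowed("ANALYTICS.PUBLIC.REFRESH_ORDERS")
    assert not procedure_pattern.allowed("ANALYTICS.STAGING.REFRESH_ORDERS")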
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -164,6 +164,23 @@ class SnowflakeQuery:
        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
        order by table_schema, table_name"""

+    @staticmethod
+    def procedures_for_database(db_name: Optional[str]) -> str:
+        db_clause = f'"{db_name}".' if db_name is not None else ""
+        return f"""
+        SELECT procedure_catalog AS "PROCEDURE_CATALOG",
+          procedure_schema AS "PROCEDURE_SCHEMA",
+          procedure_name AS "PROCEDURE_NAME",
+          procedure_language AS "PROCEDURE_LANGUAGE",
+          argument_signature AS "ARGUMENT_SIGNATURE",
+          data_type AS "PROCEDURE_RETURN_TYPE",
+          procedure_definition AS "PROCEDURE_DEFINITION",
+          created AS "CREATED",
+          last_altered AS "LAST_ALTERED",
+          comment AS "COMMENT"
+        FROM {db_clause}information_schema.procedures
+        order by procedure_schema, procedure_name"""
+
     @staticmethod
     def get_all_tags():
         return """
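procedures_for_database only renders SQL text; the caller executes it. A quick usage sketch (the database name is illustrative):

    from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery

    sql = SnowflakeQuery.procedures_for_database("ANALYTICS")
    # Selects PROCEDURE_CATALOG, PROCEDURE_SCHEMA, PROCEDURE_NAME, argument signature,
    # return type, definition, timestamps and comment from
    # "ANALYTICS".information_schema.procedures, ordered by schema and name.
    # Passing None omits the database qualifier.
    print(sql)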
datahub/ingestion/source/snowflake/snowflake_report.py

@@ -105,6 +105,7 @@ class SnowflakeV2Report(
     databases_scanned: int = 0
     tags_scanned: int = 0
     streams_scanned: int = 0
+    procedures_scanned: int = 0

     include_usage_stats: bool = False
     include_operational_stats: bool = False

@@ -163,6 +164,8 @@ class SnowflakeV2Report(
             self.tags_scanned += 1
         elif ent_type == "stream":
             self.streams_scanned += 1
+        elif ent_type == "procedure":
+            self.procedures_scanned += 1
         else:
             raise KeyError(f"Unknown entity {ent_type}.")

datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -14,6 +14,7 @@ from datahub.ingestion.source.snowflake.snowflake_query import (
     SnowflakeQuery,
 )
 from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
+from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure
 from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.prefix_batch_builder import PrefixGroup, build_prefix_batches
 from datahub.utilities.serialized_lru_cache import serialized_lru_cache

@@ -714,3 +715,31 @@ class SnowflakeDataDictionary(SupportsAsObj):
                 stream_pagination_marker = stream_name

         return streams
+
+    @serialized_lru_cache(maxsize=1)
+    def get_procedures_for_database(
+        self, db_name: str
+    ) -> Dict[str, List[BaseProcedure]]:
+        procedures: Dict[str, List[BaseProcedure]] = {}
+        cur = self.connection.query(
+            SnowflakeQuery.procedures_for_database(db_name),
+        )
+
+        for procedure in cur:
+            if procedure["PROCEDURE_SCHEMA"] not in procedures:
+                procedures[procedure["PROCEDURE_SCHEMA"]] = []
+
+            procedures[procedure["PROCEDURE_SCHEMA"]].append(
+                BaseProcedure(
+                    name=procedure["PROCEDURE_NAME"],
+                    language=procedure["PROCEDURE_LANGUAGE"],
+                    argument_signature=procedure["ARGUMENT_SIGNATURE"],
+                    return_type=procedure["PROCEDURE_RETURN_TYPE"],
+                    procedure_definition=procedure["PROCEDURE_DEFINITION"],
+                    created=procedure["CREATED"],
+                    last_altered=procedure["LAST_ALTERED"],
+                    comment=procedure["COMMENT"],
+                    extra_properties=None,
+                )
+            )
+        return procedures