acryl-datahub 1.0.0.1rc3__py3-none-any.whl → 1.0.0.1rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.graph.filters import RemovedStatusFilter
+from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError

 logger = logging.getLogger(__name__)

-QUERY_ENTITIES = """
-query listEntities($input: ScrollAcrossEntitiesInput!) {
-  scrollAcrossEntities(input: $input) {
-    nextScrollId
-    count
-    searchResults {
-      entity {
-        ... on QueryEntity {
-          urn
-        }
-        ... on DataProcessInstance {
-          urn
-        }
-      }
-    }
-  }
-}
-"""
-

 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     )

     entity_types: Optional[List[str]] = Field(
-        default=None,
+        # A default value is required otherwise QUERY and DATAPROCESS_INSTANCE won't be included
+        default=[
+            "dataset",
+            "dashboard",
+            "chart",
+            "mlmodel",
+            "mlmodelGroup",
+            "mlfeatureTable",
+            "mlfeature",
+            "mlprimaryKey",
+            "dataFlow",
+            "dataJob",
+            "glossaryTerm",
+            "glossaryNode",
+            "tag",
+            "role",
+            "corpuser",
+            "corpGroup",
+            "container",
+            "domain",
+            "dataProduct",
+            "notebook",
+            "businessAttribute",
+            "schemaField",
+            "query",
+            "dataProcessInstance",
+        ],
         description="List of entity types to cleanup",
     )

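Note on the hunk above: with the old `default=None`, soft-deleted `query` and `dataProcessInstance` entities were only reachable through the separate GraphQL helper this release removes, so the config now ships an explicit default list naming every supported type. A minimal, self-contained sketch of the override behaviour, using plain pydantic as a stand-in for the package's `ConfigModel` (the sketch class is illustrative, not the real one):

```python
from typing import List, Optional

from pydantic import BaseModel, Field


class CleanupConfigSketch(BaseModel):
    # Illustrative stand-in for SoftDeletedEntitiesCleanupConfig, not the real class.
    entity_types: Optional[List[str]] = Field(
        default=["dataset", "query", "dataProcessInstance"],
        description="List of entity types to cleanup",
    )


print(CleanupConfigSketch().entity_types)  # explicit default list is used
print(CleanupConfigSketch(entity_types=["dataset"]).entity_types)  # user override wins
```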
@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
+        default_factory=TopKDict
+    )
     num_soft_deleted_entity_removal_started: int = 0
     num_hard_deleted: int = 0
     num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
     )
     runtime_limit_reached: bool = False
     deletion_limit_reached: bool = False
+    num_soft_deleted_entity_found: int = 0
+    num_soft_deleted_entity_invalid_urn: int = 0


 class SoftDeletedEntitiesCleanup:
@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
-        self.start_time = 0.0
+        self.start_time = time.time()
         self._report_lock: Lock = Lock()
         self.last_print_time = 0.0

@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
         with self._report_lock:
             self.report.num_soft_deleted_retained_due_to_age += 1

+    def _increment_retained_by_type(self, type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
+                self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
+                + 1
+            )
+
     def _increment_removal_started_count(self) -> None:
         """Thread-safe method to update report fields"""
         with self._report_lock:
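The new per-type counter uses the same lock-guarded read-modify-write as the existing report counters, because deletions run on a thread pool and update the report concurrently. A minimal standalone sketch of the pattern (plain dict instead of the package's `TopKDict`):

```python
import threading
from concurrent.futures import ThreadPoolExecutor

counts: dict = {}
lock = threading.Lock()


def record(entity_type: str) -> None:
    # Without the lock, concurrent read-modify-write of the same entry can drop updates.
    with lock:
        counts[entity_type] = counts.get(entity_type, 0) + 1


with ThreadPoolExecutor(max_workers=4) as pool:
    for entity_type in ["dataset", "dataJob", "dataset", "query"] * 100:
        pool.submit(record, entity_type)

print(counts)  # e.g. {'dataset': 200, 'dataJob': 100, 'query': 100}
```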
@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
                 )
             self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

-    def delete_entity(self, urn: str) -> None:
+    def delete_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

-        entity_urn = Urn.from_string(urn)
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
         if self._deletion_limit_reached() or self._times_up():
             return
         self._increment_removal_started_count()
-        self.ctx.graph.delete_entity(urn=urn, hard=True)
+        self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
         self.ctx.graph.delete_references_to_urn(
-            urn=urn,
+            urn=urn.urn(),
             dry_run=False,
         )
-        self._update_report(urn, entity_urn.entity_type)
+        self._update_report(urn.urn(), urn.entity_type)

-    def delete_soft_deleted_entity(self, urn: str) -> None:
+    def delete_soft_deleted_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

         retention_time = (
@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
             - self.config.retention_days * 24 * 60 * 60
         )

-        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
+        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
         if "status" in aspect["aspects"]:
             if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
                 "status"
@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)
             else:
                 self._increment_retained_count()
+                self._increment_retained_by_type(urn.entity_type)

     def _print_report(self) -> None:
         time_taken = round(time.time() - self.last_print_time, 1)
@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
             self.last_print_time = time.time()
             logger.info(f"\n{self.report.as_string()}")

-    def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+    def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
         done, not_done = wait(futures, return_when=FIRST_COMPLETED)
         futures = {future: urn for future, urn in futures.items() if future in not_done}

@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
                 self.report.failure(
                     title="Failed to delete entity",
                     message="Failed to delete entity",
-                    context=futures[future],
+                    context=futures[future].urn(),
                     exc=future.exception(),
                 )
             self.report.num_soft_deleted_entity_processed += 1
@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
             time.sleep(self.config.delay)
         return futures

-    def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
+    def _get_urns(self) -> Iterable[str]:
         assert self.ctx.graph
-        scroll_id: Optional[str] = None
-
-        batch_size = self.config.batch_size
-        if entity_type == "DATA_PROCESS_INSTANCE":
-            # Due to a bug in Data process instance querying this is a temp workaround
-            # to avoid a giant stacktrace by having a smaller batch size in first call
-            # This will be remove in future version after server with fix has been
-            # around for a while
-            batch_size = 10
-
-        while True:
-            try:
-                if entity_type not in self.report.num_calls_made:
-                    self.report.num_calls_made[entity_type] = 1
-                else:
-                    self.report.num_calls_made[entity_type] += 1
-                self._print_report()
-                result = self.ctx.graph.execute_graphql(
-                    graphql_query,
-                    {
-                        "input": {
-                            "types": [entity_type],
-                            "query": "*",
-                            "scrollId": scroll_id if scroll_id else None,
-                            "count": batch_size,
-                            "orFilters": [
-                                {
-                                    "and": [
-                                        {
-                                            "field": "removed",
-                                            "values": ["true"],
-                                            "condition": "EQUAL",
-                                        }
-                                    ]
-                                }
-                            ],
-                        }
-                    },
-                )
-            except Exception as e:
-                self.report.failure(
-                    f"While trying to get {entity_type} with {scroll_id}", exc=e
-                )
-                break
-            scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities:
-                break
-            search_results = scroll_across_entities.get("searchResults")
-            count = scroll_across_entities.get("count")
-            if not count or not search_results:
-                # Due to a server bug we cannot rely on just count as it was returning response like this
-                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
-                break
-            if entity_type == "DATA_PROCESS_INSTANCE":
-                # Temp workaround. See note in beginning of the function
-                # We make the batch size = config after call has succeeded once
-                batch_size = self.config.batch_size
-            scroll_id = scroll_across_entities.get("nextScrollId")
-            if entity_type not in self.report.num_entities_found:
-                self.report.num_entities_found[entity_type] = 0
-            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
-                "count"
+        # Entities created in the retention period are not considered for deletion
+        created_from = int(
+            (
+                datetime.now(timezone.utc).timestamp()
+                - self.config.retention_days * 24 * 60 * 60
             )
-            for query in search_results:
-                yield query["entity"]["urn"]
+            * 1000
+        )
+
+        entity_types = self.config.entity_types
+        # dataProcessInstance is a special case where we need to get the entities separately
+        # because we need to filter based on created time we don't stream to many dataProcessInstance entities at once
+        # Gc source soft-deletes dataProcessInstance entities which causes to have a lot of soft deleted entities
+        if (
+            self.config.entity_types
+            and "dataProcessInstance" in self.config.entity_types
+        ):
+            entity_types = self.config.entity_types.copy()
+            yield from self.ctx.graph.get_urns_by_filter(
+                entity_types=["dataProcessInstance"],
+                platform=self.config.platform,
+                env=self.config.env,
+                query=self.config.query,
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=self.config.batch_size,
+                extraFilters=[
+                    SearchFilterRule(
+                        field="created",
+                        condition="LESS_THAN",
+                        values=[f"{created_from}"],
+                    ).to_raw()
+                ],
+            )
+
+            entity_types.remove("dataProcessInstance")

-    def _get_urns(self) -> Iterable[str]:
-        assert self.ctx.graph
         yield from self.ctx.graph.get_urns_by_filter(
-            entity_types=self.config.entity_types,
+            entity_types=entity_types,
             platform=self.config.platform,
             env=self.config.env,
             query=self.config.query,
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")

     def _times_up(self) -> bool:
         if (
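In the rewritten `_get_urns`, dataProcessInstance entities are fetched through a separate `get_urns_by_filter` call that adds a `created`-before-cutoff filter, so instances younger than the retention window are never streamed at all; the cutoff is expressed in epoch milliseconds, hence the `* 1000`. A hedged sketch of that arithmetic and of the field/condition/values triple the diff passes to `SearchFilterRule` (the retention value here is illustrative):

```python
from datetime import datetime, timezone

retention_days = 10  # illustrative; the real value comes from the cleanup config

# Entities created after this point are retained; the backend expects epoch milliseconds.
created_from = int(
    (datetime.now(timezone.utc).timestamp() - retention_days * 24 * 60 * 60) * 1000
)

# Same field/condition/values that the diff hands to SearchFilterRule(...).to_raw().
created_rule = {
    "field": "created",
    "condition": "LESS_THAN",
    "values": [f"{created_from}"],
}
print(created_rule)
```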
@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
             return
         self.start_time = time.time()

-        futures: Dict[Future, str] = dict()
+        futures: Dict[Future, Urn] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             for urn in self._get_urns():
+                try:
+                    self.report.num_soft_deleted_entity_found += 1
+                    soft_deleted_urn = Urn.from_string(urn)
+                except InvalidUrnError as e:
+                    logger.error(f"Failed to parse urn {urn} with error {e}")
+                    self.report.num_soft_deleted_entity_invalid_urn += 1
+                    continue
+
                 self._print_report()
                 while len(futures) >= self.config.futures_max_at_time:
                     futures = self._process_futures(futures)
                     if self._deletion_limit_reached() or self._times_up():
                         break
-                future = executor.submit(self.delete_soft_deleted_entity, urn)
-                futures[future] = urn
+                future = executor.submit(
+                    self.delete_soft_deleted_entity, soft_deleted_urn
+                )
+                futures[future] = soft_deleted_urn

         logger.info(f"Waiting for {len(futures)} futures to complete")
         while len(futures) > 0:
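Because `_get_urns` yields plain strings from search, the cleanup loop now parses each candidate once up front, counting and skipping invalid URNs instead of letting them fail inside a deletion worker. A small sketch of that defensive parsing, reusing the `Urn` and `InvalidUrnError` imports the diff adds (the sample URNs are illustrative):

```python
from datahub.utilities.urns._urn_base import Urn
from datahub.utilities.urns.error import InvalidUrnError

candidates = [
    "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)",
    "not-a-valid-urn",  # would previously have surfaced as a failed deletion future
]

invalid = 0
for candidate in candidates:
    try:
        parsed = Urn.from_string(candidate)
    except InvalidUrnError as e:
        invalid += 1
        print(f"Skipping {candidate}: {e}")
        continue
    print(parsed.entity_type, parsed.urn())

print(f"{invalid} invalid urn(s) skipped")
```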
@@ -602,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower() in [SNOWFLAKE, DATABRICKS]:
+            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -610,6 +610,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                         )
                     ).scalar()
                 )
+            elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
+                column_profile.median = str(
+                    self.dataset.engine.execute(
+                        sa.select(
+                            sa.text(
+                                f"approx_percentile(`{column}`, 0.5) as approx_median"
+                            )
+                        ).select_from(self.dataset._table)
+                    ).scalar()
+                )
             elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
                 column_profile.median = str(
                     self.dataset.engine.execute(
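Databricks gets its own branch because the profiler now computes the median there with `approx_percentile` instead of a `median` aggregate. A hedged sketch of the expression that branch builds, compiled with plain SQLAlchemy against an illustrative table and column (no engine needed just to inspect the SQL):

```python
import sqlalchemy as sa

column = "price"           # illustrative column name
table = sa.table("sales")  # stands in for self.dataset._table in the diff

stmt = sa.select(
    sa.text(f"approx_percentile(`{column}`, 0.5) as approx_median")
).select_from(table)

print(stmt)
# roughly: SELECT approx_percentile(`price`, 0.5) as approx_median FROM sales
```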
@@ -115,7 +115,7 @@ class PowerBiAPI:
         if scan_result is None:
             return results

-        for scanned_dashboard in scan_result.get(Constant.DASHBOARDS, []):
+        for scanned_dashboard in scan_result.get(Constant.DASHBOARDS) or []:
             # Iterate through response and create a list of PowerBiAPI.Dashboard
             dashboard_id = scanned_dashboard.get("id")
             tags = self._parse_endorsement(
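The recurring `.get(key, [])` → `.get(key) or []` change in this file handles scan payloads where the key is present but explicitly `null`: `dict.get` only falls back to its default when the key is missing entirely. A short standalone sketch of the difference:

```python
scan_result = {"dashboards": None}  # key present, value explicitly null in the payload

print(scan_result.get("dashboards", []))    # -> None (default is NOT used)
print(scan_result.get("dashboards") or [])  # -> []   (safe to iterate)

for dashboard in scan_result.get("dashboards") or []:
    print(dashboard)  # loop body simply never runs
```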
@@ -133,17 +133,17 @@ class PowerBiAPI:
         if scan_result is None:
             return results

-        reports: List[dict] = scan_result.get(Constant.REPORTS, [])
+        reports: List[dict] = scan_result.get(Constant.REPORTS) or []

         for report in reports:
-            report_id = report.get(Constant.ID, None)
+            report_id = report.get(Constant.ID)
             if report_id is None:
                 logger.warning(
                     f"Report id is none. Skipping endorsement tag for report instance {report}"
                 )
                 continue
             endorsements = self._parse_endorsement(
-                report.get(Constant.ENDORSEMENT_DETAIL, None)
+                report.get(Constant.ENDORSEMENT_DETAIL)
             )
             results[report_id] = endorsements

@@ -339,7 +339,7 @@ class PowerBiAPI:
         if not endorsements:
             return []

-        endorsement = endorsements.get(Constant.ENDORSEMENT, None)
+        endorsement = endorsements.get(Constant.ENDORSEMENT)
         if not endorsement:
             return []

@@ -396,7 +396,7 @@ class PowerBiAPI:

             if self.__config.extract_endorsements_to_tags:
                 dataset_instance.tags = self._parse_endorsement(
-                    dataset_dict.get(Constant.ENDORSEMENT_DETAIL, None)
+                    dataset_dict.get(Constant.ENDORSEMENT_DETAIL)
                 )

             dataset_map[dataset_instance.id] = dataset_instance
@@ -407,7 +407,7 @@ class PowerBiAPI:
                 else dataset_instance.id
             )
             logger.debug(f"dataset_dict = {dataset_dict}")
-            for table in dataset_dict.get(Constant.TABLES, []):
+            for table in dataset_dict.get(Constant.TABLES) or []:
                 expression: Optional[str] = (
                     table[Constant.SOURCE][0][Constant.EXPRESSION]
                     if table.get(Constant.SOURCE) is not None
@@ -430,10 +430,10 @@ class PowerBiAPI:
                             column["dataType"], FIELD_TYPE_MAPPING["Null"]
                         ),
                     )
-                    for column in table.get("columns", [])
+                    for column in table.get("columns") or []
                 ],
                 measures=[
-                    Measure(**measure) for measure in table.get("measures", [])
+                    Measure(**measure) for measure in table.get("measures") or []
                 ],
                 dataset=dataset_instance,
                 row_count=None,
@@ -480,7 +480,7 @@ class PowerBiAPI:
                     )
                 )
                 if app_id is None:  # In PowerBI one workspace can have one app
-                    app_id = report.get(Constant.APP_ID)
+                    app_id = report[Constant.APP_ID]

         raw_app_dashboards: List[Dict] = []
         # Filter app dashboards
@@ -488,7 +488,7 @@ class PowerBiAPI:
             if dashboard.get(Constant.APP_ID):
                 raw_app_dashboards.append(dashboard)
                 if app_id is None:  # In PowerBI, one workspace contains one app
-                    app_id = report[Constant.APP_ID]
+                    app_id = dashboard[Constant.APP_ID]

         # workspace doesn't have an App. Above two loops can be avoided
         # if app_id is available at root level in workspace_metadata
@@ -230,7 +230,8 @@ class RedshiftSqlLineageV2(Closeable):
         )

         # Populate lineage for external tables.
-        self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)
+        if not self.config.skip_external_tables:
+            self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)

     def _populate_lineage_agg(
         self,
@@ -400,6 +401,10 @@ class RedshiftSqlLineageV2(Closeable):
         db_schemas: Dict[str, Dict[str, RedshiftSchema]],
     ) -> None:
         for schema_name, tables in all_tables[self.database].items():
+            logger.info(f"External table lineage: checking schema {schema_name}")
+            if not db_schemas[self.database].get(schema_name):
+                logger.warning(f"Schema {schema_name} not found")
+                continue
             for table in tables:
                 schema = db_schemas[self.database][schema_name]
                 if (
@@ -407,6 +412,9 @@ class RedshiftSqlLineageV2(Closeable):
                     and schema.is_external_schema()
                     and schema.external_platform
                 ):
+                    logger.info(
+                        f"External table lineage: processing table {schema_name}.{table.name}"
+                    )
                     # external_db_params = schema.option
                     upstream_platform = schema.external_platform.lower()

@@ -44,7 +44,7 @@ class RedshiftCommonQuery:
         SELECT
             schema_name,
             schema_type,
-            schema_option,
+            cast(null as varchar(1024)) as schema_option,
             cast(null as varchar(256)) as external_platform,
             cast(null as varchar(256)) as external_database
         FROM svv_redshift_schemas