acryl-datahub-cloud 0.3.10rc4__py3-none-any.whl → 0.3.16.1rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published.

Potentially problematic release: this version of acryl-datahub-cloud has been flagged as possibly problematic.

Files changed (243)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py +1 -1
  3. acryl_datahub_cloud/acryl_cs_issues/models.py +5 -3
  4. acryl_datahub_cloud/action_request/action_request_owner_source.py +37 -8
  5. acryl_datahub_cloud/datahub_forms_notifications/__init__.py +0 -0
  6. acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +569 -0
  7. acryl_datahub_cloud/datahub_forms_notifications/get_feature_flag.gql +7 -0
  8. acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
  9. acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
  10. acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
  11. acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
  12. acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +39 -19
  13. acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +60 -25
  14. acryl_datahub_cloud/datahub_reporting/extract_graph.py +9 -3
  15. acryl_datahub_cloud/datahub_reporting/extract_sql.py +248 -52
  16. acryl_datahub_cloud/datahub_reporting/forms.py +1 -1
  17. acryl_datahub_cloud/datahub_reporting/forms_config.py +3 -2
  18. acryl_datahub_cloud/datahub_restore/source.py +3 -2
  19. acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
  20. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
  21. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +532 -109
  22. acryl_datahub_cloud/elasticsearch/graph_service.py +76 -14
  23. acryl_datahub_cloud/graphql_utils.py +64 -0
  24. acryl_datahub_cloud/lineage_features/source.py +555 -49
  25. acryl_datahub_cloud/metadata/_urns/urn_defs.py +2390 -1938
  26. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionworkflow/__init__.py +53 -0
  27. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/anomaly/__init__.py +2 -0
  28. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  29. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +6 -2
  30. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  31. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/conversation/__init__.py +29 -0
  32. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
  33. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
  34. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  35. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +8 -0
  36. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +8 -0
  37. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/knowledge/__init__.py +33 -0
  38. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  39. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +14 -0
  40. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
  41. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  42. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/monitor/__init__.py +6 -0
  43. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
  44. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  45. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  46. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  47. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +28 -0
  48. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  49. acryl_datahub_cloud/metadata/schema.avsc +27843 -23200
  50. acryl_datahub_cloud/metadata/schema_classes.py +29901 -24310
  51. acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +235 -2
  52. acryl_datahub_cloud/metadata/schemas/ActionWorkflowInfo.avsc +683 -0
  53. acryl_datahub_cloud/metadata/schemas/ActionWorkflowKey.avsc +21 -0
  54. acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
  55. acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
  56. acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +75 -0
  57. acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
  58. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +375 -212
  59. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +147 -20
  60. acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +1 -1
  61. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +191 -21
  62. acryl_datahub_cloud/metadata/schemas/{AssertionSummary.avsc → AssertionRunSummary.avsc} +15 -2
  63. acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -0
  64. acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
  65. acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +7 -3
  66. acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +20 -6
  67. acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
  68. acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
  69. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
  70. acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +16 -5
  71. acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
  72. acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +7 -3
  73. acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
  74. acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
  75. acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  76. acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +18 -2
  77. acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
  78. acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +4 -1
  79. acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +304 -2
  80. acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +86 -0
  81. acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +11 -5
  82. acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
  83. acryl_datahub_cloud/metadata/schemas/DataContractKey.avsc +2 -1
  84. acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +15 -5
  85. acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
  86. acryl_datahub_cloud/metadata/schemas/DataHubAiConversationInfo.avsc +256 -0
  87. acryl_datahub_cloud/metadata/schemas/DataHubAiConversationKey.avsc +22 -0
  88. acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +234 -0
  89. acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +22 -0
  90. acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  91. acryl_datahub_cloud/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  92. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  93. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +308 -0
  94. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  95. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  96. acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  97. acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +13 -4
  98. acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +8 -0
  99. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
  100. acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +3 -1
  101. acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
  102. acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
  103. acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +2 -0
  104. acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +6 -3
  105. acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +4 -2
  106. acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +5 -0
  107. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +10 -2
  108. acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +12 -5
  109. acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  110. acryl_datahub_cloud/metadata/schemas/DocumentInfo.avsc +407 -0
  111. acryl_datahub_cloud/metadata/schemas/DocumentKey.avsc +35 -0
  112. acryl_datahub_cloud/metadata/schemas/DocumentSettings.avsc +79 -0
  113. acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +2 -0
  114. acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +7 -3
  115. acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +2 -1
  116. acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +2 -1
  117. acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
  118. acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +2 -1
  119. acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +2 -1
  120. acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
  121. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
  122. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
  123. acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
  124. acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +2 -1
  125. acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +2 -1
  126. acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +4 -2
  127. acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +5 -0
  128. acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
  129. acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
  130. acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
  131. acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
  132. acryl_datahub_cloud/metadata/schemas/FormKey.avsc +3 -1
  133. acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
  134. acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +30 -0
  135. acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +416 -0
  136. acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +2 -1
  137. acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
  138. acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
  139. acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +3 -1
  140. acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +2 -0
  141. acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
  142. acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
  143. acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
  144. acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +71 -1
  145. acryl_datahub_cloud/metadata/schemas/InputFields.avsc +2 -1
  146. acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
  147. acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
  148. acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +145 -0
  149. acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +4 -1
  150. acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +4 -1
  151. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +7 -1
  152. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  153. acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +9 -1
  154. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +9 -1
  155. acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +4 -2
  156. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +4 -1
  157. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +424 -97
  158. acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +65 -44
  159. acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  160. acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +84 -29
  161. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +221 -23
  162. acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +9 -1
  163. acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +128 -3
  164. acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +5 -2
  165. acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
  166. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +91 -4
  167. acryl_datahub_cloud/metadata/schemas/Operation.avsc +17 -0
  168. acryl_datahub_cloud/metadata/schemas/Ownership.avsc +71 -1
  169. acryl_datahub_cloud/metadata/schemas/QueryProperties.avsc +4 -2
  170. acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +2 -13
  171. acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  172. acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +3 -1
  173. acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +3 -1
  174. acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +3 -0
  175. acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +2 -1
  176. acryl_datahub_cloud/metadata/schemas/SemanticContent.avsc +123 -0
  177. acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
  178. acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
  179. acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  180. acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +136 -5
  181. acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +2 -1
  182. acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +147 -0
  183. acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +3 -1
  184. acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +2 -1
  185. acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +9 -0
  186. acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +10 -0
  187. acryl_datahub_cloud/metadata/schemas/__init__.py +3 -3
  188. acryl_datahub_cloud/notifications/__init__.py +0 -0
  189. acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
  190. acryl_datahub_cloud/sdk/__init__.py +69 -0
  191. acryl_datahub_cloud/sdk/assertion/__init__.py +58 -0
  192. acryl_datahub_cloud/sdk/assertion/assertion_base.py +779 -0
  193. acryl_datahub_cloud/sdk/assertion/column_metric_assertion.py +191 -0
  194. acryl_datahub_cloud/sdk/assertion/column_value_assertion.py +431 -0
  195. acryl_datahub_cloud/sdk/assertion/freshness_assertion.py +201 -0
  196. acryl_datahub_cloud/sdk/assertion/schema_assertion.py +268 -0
  197. acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +212 -0
  198. acryl_datahub_cloud/sdk/assertion/smart_freshness_assertion.py +165 -0
  199. acryl_datahub_cloud/sdk/assertion/smart_sql_assertion.py +156 -0
  200. acryl_datahub_cloud/sdk/assertion/smart_volume_assertion.py +162 -0
  201. acryl_datahub_cloud/sdk/assertion/sql_assertion.py +273 -0
  202. acryl_datahub_cloud/sdk/assertion/types.py +20 -0
  203. acryl_datahub_cloud/sdk/assertion/volume_assertion.py +156 -0
  204. acryl_datahub_cloud/sdk/assertion_client/__init__.py +0 -0
  205. acryl_datahub_cloud/sdk/assertion_client/column_metric.py +545 -0
  206. acryl_datahub_cloud/sdk/assertion_client/column_value.py +617 -0
  207. acryl_datahub_cloud/sdk/assertion_client/freshness.py +371 -0
  208. acryl_datahub_cloud/sdk/assertion_client/helpers.py +166 -0
  209. acryl_datahub_cloud/sdk/assertion_client/schema.py +358 -0
  210. acryl_datahub_cloud/sdk/assertion_client/smart_column_metric.py +540 -0
  211. acryl_datahub_cloud/sdk/assertion_client/smart_freshness.py +373 -0
  212. acryl_datahub_cloud/sdk/assertion_client/smart_sql.py +411 -0
  213. acryl_datahub_cloud/sdk/assertion_client/smart_volume.py +380 -0
  214. acryl_datahub_cloud/sdk/assertion_client/sql.py +410 -0
  215. acryl_datahub_cloud/sdk/assertion_client/volume.py +446 -0
  216. acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
  217. acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +1470 -0
  218. acryl_datahub_cloud/sdk/assertion_input/column_assertion_constants.py +114 -0
  219. acryl_datahub_cloud/sdk/assertion_input/column_assertion_utils.py +284 -0
  220. acryl_datahub_cloud/sdk/assertion_input/column_metric_assertion_input.py +759 -0
  221. acryl_datahub_cloud/sdk/assertion_input/column_metric_constants.py +109 -0
  222. acryl_datahub_cloud/sdk/assertion_input/column_value_assertion_input.py +810 -0
  223. acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +305 -0
  224. acryl_datahub_cloud/sdk/assertion_input/schema_assertion_input.py +413 -0
  225. acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +793 -0
  226. acryl_datahub_cloud/sdk/assertion_input/smart_freshness_assertion_input.py +218 -0
  227. acryl_datahub_cloud/sdk/assertion_input/smart_sql_assertion_input.py +181 -0
  228. acryl_datahub_cloud/sdk/assertion_input/smart_volume_assertion_input.py +189 -0
  229. acryl_datahub_cloud/sdk/assertion_input/sql_assertion_input.py +320 -0
  230. acryl_datahub_cloud/sdk/assertion_input/volume_assertion_input.py +635 -0
  231. acryl_datahub_cloud/sdk/assertions_client.py +1074 -0
  232. acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
  233. acryl_datahub_cloud/sdk/entities/assertion.py +439 -0
  234. acryl_datahub_cloud/sdk/entities/monitor.py +291 -0
  235. acryl_datahub_cloud/sdk/entities/subscription.py +100 -0
  236. acryl_datahub_cloud/sdk/errors.py +34 -0
  237. acryl_datahub_cloud/sdk/resolver_client.py +42 -0
  238. acryl_datahub_cloud/sdk/subscription_client.py +737 -0
  239. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/METADATA +49 -43
  240. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/RECORD +243 -145
  241. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/WHEEL +1 -1
  242. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/entry_points.txt +1 -0
  243. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/top_level.txt +0 -0
acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py
@@ -22,12 +22,14 @@ from polars.datatypes import DataTypeClass
 from pydantic import Field
 from scipy.stats import expon
 
+from acryl_datahub_cloud.datahub_usage_reporting.excluded import EXCLUDED_PATTERNS
 from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder
 from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
     UsageFeaturePatchBuilder,
 )
 from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
 from acryl_datahub_cloud.metadata.schema_classes import (
+    CorpUserUsageFeaturesClass,
     QueryUsageFeaturesClass,
     UsageFeaturesClass,
 )
@@ -40,7 +42,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DatahubClientConfig
@@ -114,12 +116,12 @@ class DataHubUsageFeatureReportingSourceConfig(
         30, description="Timeout in seconds for the search queries."
     )
     extract_batch_size: int = Field(
-        1000,
+        5000,
         description="The number of documents to retrieve in each batch from ElasticSearch or OpenSearch.",
     )
 
     extract_delay: Optional[float] = Field(
-        0.25,
+        0,
         description="The delay in seconds between each batch extraction from ElasticSearch or OpenSearch.",
     )
 
@@ -135,6 +137,10 @@ class DataHubUsageFeatureReportingSourceConfig(
         None,
         description="Optional configuration for stateful ingestion, including stale metadata removal.",
     )
+    user_usage_enabled: bool = Field(
+        True,
+        description="Flag to enable or disable user usage statistics collection.",
+    )
     dataset_usage_enabled: bool = Field(
         True,
         description="Flag to enable or disable dataset usage statistics collection.",
@@ -177,7 +183,7 @@ class DataHubUsageFeatureReportingSourceConfig(
     # This option is only needed here until we are sure that the streaming mode is stable.
     # then we can remove it and control it with the streaming_mode option.
     experimental_full_streaming: bool = Field(
-        False,
+        True,
         description="Flag to enable full streaming mode.'",
     )
 
@@ -191,6 +197,11 @@ class DataHubUsageFeatureReportingSourceConfig(
         description="Flag to generate MCP patch for usage features.'",
     )
 
+    excluded_platforms: List[str] = Field(
+        EXCLUDED_PATTERNS,
+        description="List of platforms to exclude from usage statistics collection. This is done to avoid invite user functionality to be filled with service accounts.",
+    )
+
 
 def exp_cdf(series: polars.Series) -> polars.Series:
     with PerfTimer() as timer:
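Several ingestion defaults change in this release: extract_batch_size goes from 1000 to 5000, extract_delay from 0.25 to 0, experimental_full_streaming from False to True, and the new user_usage_enabled and excluded_platforms fields appear. A minimal, hypothetical sketch of the overrides (as a plain Python dict) that would restore the old batching behaviour; field names come from the hunks above, and whether other config fields are required depends on the installed version:

    # Hypothetical overrides for DataHubUsageFeatureReportingSourceConfig;
    # the field names are taken from this diff, the values are illustrative.
    source_config_overrides = {
        "extract_batch_size": 1000,  # new default is 5000
        "extract_delay": 0.25,  # new default is 0 (no pause between batches)
        "experimental_full_streaming": False,  # new default is True
        "user_usage_enabled": True,  # new flag in this release
        # "excluded_platforms" defaults to EXCLUDED_PATTERNS from
        # datahub_usage_reporting/excluded.py (patterns matched against user ids)
    }
    print(source_config_overrides)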
@@ -228,7 +239,7 @@ def exp_cdf(series: polars.Series) -> polars.Series:
 
 
 @dataclass
-class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
+class DatahubUsageFeatureReport(StatefulIngestionReport, IngestionStageReport):
     dataset_platforms_count: Dict[str, int] = field(
         default_factory=lambda: defaultdict(lambda: 0)
     )
@@ -241,10 +252,6 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
         default_factory=lambda: defaultdict(lambda: PerfTimer())
     )
 
-    dataset_usage_processing_time: PerfTimer = PerfTimer()
-    dashboard_usage_processing_time: PerfTimer = PerfTimer()
-    chart_usage_processing_time: PerfTimer = PerfTimer()
-    query_usage_processing_time: PerfTimer = PerfTimer()
     query_platforms_count: Dict[str, int] = field(
         default_factory=lambda: defaultdict(lambda: 0)
     )
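The dedicated per-aspect PerfTimer fields are dropped because stage timing now flows through the report's new_stage context manager, as the get_workunits_internal hunk further down shows. A self-contained stand-in sketch of that pattern (not DataHub's actual IngestionStageReport implementation):

    import time
    from contextlib import contextmanager

    @contextmanager
    def new_stage(name: str):
        # Stand-in for report.new_stage(...): time the enclosed block and
        # report the duration when the stage exits.
        start = time.perf_counter()
        try:
            yield
        finally:
            print(f"{name} took {time.perf_counter() - start:.3f} seconds")

    with new_stage("generate dataset usage"):
        time.sleep(0.1)  # placeholder for yielding the stage's workunits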
@@ -395,18 +402,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 "last_modified_at": (
                     doc["_source"]["lastModifiedAt"]
                     if "lastModifiedAt" in doc["_source"]
-                    else (
-                        doc["_source"]["lastModifiedAt"]
-                        if "lastModifiedAt" in doc["_source"]
-                        else None
-                    )
+                    else (doc["_source"].get("lastModifiedAt", None))
                 ),
                 "platform": doc["_source"]["platform"],
-                "removed": (
-                    doc["_source"]["removed"]
-                    if "removed" in doc["_source"]
-                    else False
-                ),
+                "removed": (doc["_source"].get("removed", False)),
             }
 
         time_taken = timer.elapsed_seconds()
@@ -509,11 +508,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 "eventGranularity": doc["_source"].get("eventGranularity"),
                 "totalSqlQueries": doc["_source"].get("totalSqlQueries", 0),
                 "uniqueUserCount": doc["_source"].get("uniqueUserCount", 0),
-                "userCounts": (
-                    doc["_source"]["event"]["userCounts"]
-                    if "userCounts" in doc["_source"]["event"]
-                    else None
-                ),
+                "userCounts": (doc["_source"]["event"].get("userCounts", None)),
                 "platform": platform,
             }
         except KeyError as e:
@@ -525,7 +520,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         time_taken = timer.elapsed_seconds()
         logger.info(f"DatasetUsage processing took {time_taken:.3f} seconds")
 
-    def search_score(
+    def search_score(  # noqa: C901
         self, urn: str, last_update_time: int, usage_percentile: int
     ) -> SearchRankingMultipliers:
         usage_search_score_multiplier = 1.0
@@ -622,27 +617,27 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 [endpoint],
                 http_auth=(user, password),
                 use_ssl=(
-                    True
-                    if self.config.search_index
-                    and self.config.search_index.use_ssl
-                    else False
+                    bool(
+                        self.config.search_index
+                        and self.config.search_index.use_ssl
+                    )
                 ),
             )
 
-            response = server.create_pit(index, keep_alive="10m")
+            # response = server.create_pit(index, keep_alive="10m")
 
             # TODO: Save PIT, we can resume processing based on <pit, search_after> tuple
-            pit = response.get("pit_id")
-            query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
+            # pit = response.get("pit_id")
+            # query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
         else:
             server = Elasticsearch(
                 [endpoint],
                 http_auth=(user, password),
                 use_ssl=(
-                    True
-                    if self.config.search_index
-                    and self.config.search_index.use_ssl
-                    else False
+                    bool(
+                        self.config.search_index
+                        and self.config.search_index.use_ssl
+                    )
                 ),
             )
 
@@ -737,23 +732,26 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 polars.Duration(): pa.duration("ns"),
             }
 
-            if polars_dtype in [type(key) for key in type_mapping.keys()]:
+            if polars_dtype in [type(key) for key in type_mapping]:
                 return type_mapping[polars_dtype]
             elif polars_dtype == polars.Categorical:
                 return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
             elif isinstance(polars_dtype, polars.Struct):
                 return pa.struct(
-                    {
-                        field.name: convert_dtype(field.dtype)
+                    [
+                        pa.field(field.name, convert_dtype(field.dtype))
                         for field in polars_dtype.fields
-                    }
+                    ]
                 )
             elif isinstance(polars_dtype, polars.List):
                 return pa.list_(convert_dtype(polars_dtype.inner))
             else:
                 raise ValueError(f"Unsupported Polars dtype: {polars_dtype}")
 
-        fields = [(name, convert_dtype(dtype)) for name, dtype in polars_schema.items()]
+        fields = [
+            pa.field(name, convert_dtype(dtype))
+            for name, dtype in polars_schema.items()
+        ]
         return pa.schema(fields)
 
     def batch_write_parquet(
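The hunk above changes the Polars-to-Arrow schema conversion to build struct types from explicit pa.field objects instead of a plain dict. A standalone sketch of that construction (not the package's convert_dtype, and only handling two dtypes):

    import polars
    import pyarrow as pa

    inner = polars.Struct({"a": polars.Int64, "b": polars.Utf8})
    # Build the Arrow struct from pa.field(...) entries, mirroring the fix above.
    arrow_struct = pa.struct(
        [
            pa.field(f.name, pa.int64() if f.dtype == polars.Int64 else pa.string())
            for f in inner.fields
        ]
    )
    print(arrow_struct)  # struct<a: int64, b: string>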
@@ -846,7 +844,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .drop(["removed"])
         )
 
-        return wdf.collect(streaming=self.config.streaming_mode).lazy()
+        return wdf
 
     def load_write_usage_server_side_aggregation(
         self, soft_deleted_entities_df: polars.LazyFrame
@@ -935,6 +933,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
         return dataset_df
 
+    def generate_user_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
+        with polars.StringCache():
+            user_usage_lf = self.generate_user_usage()
+            yield from self.generate_user_usage_mcp_from_lazyframe(user_usage_lf)
+
     def generate_dataset_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
         with polars.StringCache():
             dataset_usage_df = self.generate_dataset_usage()
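The new generate_user_usage_mcps, like its siblings, runs under polars.StringCache(). A minimal standalone illustration of why: Categorical columns built under one shared string cache can be combined safely, which is not guaranteed across separate caches:

    import polars

    with polars.StringCache():
        left = polars.DataFrame({"urn": ["a", "b"]}).with_columns(
            polars.col("urn").cast(polars.Categorical)
        )
        right = polars.DataFrame({"urn": ["b"], "count": [1]}).with_columns(
            polars.col("urn").cast(polars.Categorical)
        )
        # Both frames share one string cache, so the categorical join is valid.
        print(left.join(right, on="urn", how="left"))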
@@ -970,48 +973,35 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if self.config.user_usage_enabled:
+            with self.report.new_stage("generate user usage"):
+                yield from self.generate_user_usage_mcps()
+
         if self.config.dataset_usage_enabled:
-            with self.report.dataset_usage_processing_time as timer:
-                self.report.new_stage("generate dataset usage")
+            with self.report.new_stage("generate dataset usage"):
                 yield from self.generate_dataset_usage_mcps()
-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Dataset Usage generation took {time_taken:.3f} seconds")
 
         if self.config.dashboard_usage_enabled:
-            with self.report.dashboard_usage_processing_time as timer:
-                self.report.new_stage("generate dashboard usage")
+            with self.report.new_stage("generate dashboard usage"):
                 yield from self.generate_dashboard_usage_mcps()
 
-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Dashboard Usage generation took {time_taken:.3f}")
-
         if self.config.chart_usage_enabled:
-            with self.report.chart_usage_processing_time as timer:
-                self.report.new_stage("generate chart usage")
-
+            with self.report.new_stage("generate chart usage"):
                 yield from self.generate_chart_usage_mcps()
 
-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Chart Usage generation took {time_taken:.3f}")
-
         if self.config.query_usage_enabled:
-            with self.report.query_usage_processing_time as timer:
-                self.report.new_stage("generate query usage")
-
+            with self.report.new_stage("generate query usage"):
                 yield from self.generate_query_usage_mcps()
 
-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Query Usage generation took {time_taken:.3f}")
+        with self.report.new_stage("end so time is calculated for last stage"):
+            pass
 
     def generate_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-        num = 0
         for row in lazy_frame.collect(
-            streaming=self.config.experimental_full_streaming
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
         ).to_struct():
-            num += 1
-
             if "siblings" in row and row["siblings"]:
                 logger.info(f"Siblings found for urn: {row['urn']} -> row['siblings']")
 
@@ -1067,7 +1057,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 uniqueUserPercentileLast30Days=int(
                     row.get("distinct_user_rank_percentile", 0) or 0
                 ),
-                writeCountLast30Days=int(row.get("write_rank_percentile", 0) or 0)
+                writeCountLast30Days=int(row.get("write_count", 0) or 0)
                 if not self.config.disable_write_usage
                 else None,
                 writeCountPercentileLast30Days=int(
@@ -1101,10 +1091,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_query_usage_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-        num = 0
-        for row in lazy_frame.collect().iter_rows(named=True):
-            num += 1
-
+        for row in lazy_frame.collect(
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
+        ).iter_rows(named=True):
             query_usage_features = QueryUsageFeaturesClass(
                 queryCountLast30Days=int(row.get("totalSqlQueries", 0) or 0),
                 queryCountTotal=None,  # This is not implemented
@@ -1124,6 +1113,47 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 row["urn"], query_usage_features
             )
 
+    def _convert_platform_pairs_to_dict(
+        self,
+        platform_pairs: Optional[List[Dict[str, Any]]],
+        value_key: str = "platform_total",
+    ) -> Optional[Dict[str, Any]]:
+        """Convert list of platform usage structs to dictionary."""
+        if not platform_pairs:
+            return None
+
+        return {
+            pair["platform_urn"]: pair[value_key]
+            for pair in platform_pairs
+            if pair["platform_urn"] is not None
+        }
+
+    def generate_user_usage_mcp_from_lazyframe(
+        self, lazy_frame: polars.LazyFrame
+    ) -> Iterable[MetadataWorkUnit]:
+        for row in lazy_frame.collect(
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
+        ).iter_rows(named=True):
+            user_usage_features = CorpUserUsageFeaturesClass(
+                userUsageTotalPast30Days=int(
+                    row.get("userUsageTotalPast30Days", 0) or 0
+                ),
+                userPlatformUsageTotalsPast30Days=self._convert_platform_pairs_to_dict(
+                    row.get("platform_usage_pairs", [])
+                ),
+                userPlatformUsagePercentilePast30Days=self._convert_platform_pairs_to_dict(
+                    row.get("platform_usage_percentiles", []),
+                    "platform_rank_percentile",
+                ),
+                userUsagePercentilePast30Days=row.get("userUsagePercentilePast30Days"),
+                userTopDatasetsByUsage=self._convert_top_datasets_to_dict(
+                    row.get("top_datasets_map", [])
+                ),
+            )
+            yield MetadataChangeProposalWrapper(
+                entityUrn=row["user"], aspect=user_usage_features
+            ).as_workunit(is_primary_source=False)
+
     def generate_usage_feature_mcp(
         self, urn: str, usage_feature: UsageFeaturesClass
     ) -> Iterable[MetadataWorkUnit]:
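The new generate_user_usage_mcp_from_lazyframe collects with engine="streaming" (the replacement for the older streaming= keyword in recent Polars releases) and flattens list-of-struct columns into the plain dicts that CorpUserUsageFeaturesClass expects. A standalone replica of the helper's logic, to show the assumed row shape:

    from typing import Any, Dict, List, Optional

    def convert_platform_pairs_to_dict(
        platform_pairs: Optional[List[Dict[str, Any]]],
        value_key: str = "platform_total",
    ) -> Optional[Dict[str, Any]]:
        # Replica of _convert_platform_pairs_to_dict from the hunk above.
        if not platform_pairs:
            return None
        return {
            pair["platform_urn"]: pair[value_key]
            for pair in platform_pairs
            if pair["platform_urn"] is not None
        }

    pairs = [
        {"platform_urn": "urn:li:dataPlatform:snowflake", "platform_total": 42.0},
        {"platform_urn": None, "platform_total": 3.0},  # dropped: no platform urn
    ]
    print(convert_platform_pairs_to_dict(pairs))
    # {'urn:li:dataPlatform:snowflake': 42.0}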
@@ -1158,9 +1188,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
         return self.generate_dashboard_chart_usage(entity_index, usage_index)
 
-    def generate_dashboard_chart_usage(
-        self, entity_index: str, usage_index: str
-    ) -> polars.LazyFrame:
+    def _generate_dashboard_chart_entities(self, entity_index: str) -> polars.LazyFrame:
         entity_schema = {
             "entity_urn": polars.Categorical,
             "removed": polars.Boolean,
@@ -1177,7 +1205,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             process_function=self.soft_deleted_batch,
         )
 
-        dashboard_usage_schema = {
+        return entities_df
+
+    def _generate_dashboard_chart_usage(
+        self, entities_df: polars.LazyFrame, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_usage_schema = {
             "timestampMillis": polars.Int64,
             "lastObserved": polars.Int64,
             "urn": polars.Categorical,
@@ -1195,7 +1228,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         }
 
         lf = self.load_data_from_es_to_lf(
-            schema=dashboard_usage_schema,
+            schema=entities_usage_schema,
             index=usage_index,
             query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
             process_function=self.process_dashboard_usage,
@@ -1214,6 +1247,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .alias("row_num")
         ).filter(polars.col("row_num") == 1)
 
+        return lf
+
+    def generate_dashboard_chart_usage(
+        self, entity_index: str, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_df = self._generate_dashboard_chart_entities(entity_index)
+
+        lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
+
         # lf = lf.filter(polars.col("urn") == "urn:li:dashboard:(looker,dashboards.8)")
         # "urn:li:dashboard:(looker,dashboards.8)"
 
@@ -1287,7 +1329,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .is_not_null()
             # We only want to downrank datasets that have a search score multiplier greater than 1. 1 is the minimum score of a dataset
             .and_(polars.col("combinedSearchRankingMultiplier").ne(1))
-        )  # noqa: E712
+        )
         .filter(polars.col("removed") == False)  # noqa: E712
         .drop(["removed"])
         .drop(["last_modified_at"])
@@ -1326,7 +1368,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         query_entities = self.load_data_from_es_to_lf(
             schema=query_entities_schema,
             index=entity_index,
-            query=QueryBuilder.get_query_entities_query(),
+            query=QueryBuilder.get_query_entities_query(self.config.lookback_days),
             process_function=self.queries_entities_batch,
         )
 
@@ -1383,6 +1425,380 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
         return usage_with_top_users_with_ranks
 
+    def _generate_user_usage_for_dataset(self) -> polars.LazyFrame:
+        datasets_lf = self.get_datasets()
+        if self.config.set_upstream_table_max_modification_time_for_views:
+            datasets_lf = self.set_table_modification_time_for_views(datasets_lf)
+
+        lf = self.load_dataset_usage()
+
+        # Polaris/pandas join merges the join column into one column and that's why we need to filter based on the removed column
+        lf = (
+            lf.join(datasets_lf, left_on="urn", right_on="entity_urn", how="left")
+            .filter(polars.col("removed") == False)  # noqa: E712
+            .drop(["removed"])
+        )
+
+        users_lf = (
+            lf.explode("userCounts")
+            .unnest("userCounts")
+            .filter(polars.col("user").is_not_null())
+        )
+
+        user_dataset_usage_lf = self._create_user_dataset_usage_map(users_lf)
+        return user_dataset_usage_lf
+
+    @staticmethod
+    def _convert_top_datasets_to_dict(
+        top_datasets_list: Optional[List[Dict[str, Any]]],
+    ) -> Optional[Dict[str, float]]:
+        """
+        Convert list of top datasets structs to dictionary as expected by CorpUserUsageFeatures schema.
+
+        Args:
+            top_datasets_list: List of dictionaries with 'dataset_urn' and 'count' keys
+
+        Returns:
+            Dictionary mapping dataset URN to usage count, or None if input is empty
+        """
+        if not top_datasets_list:
+            return None
+
+        top_datasets_dict = {
+            item["dataset_urn"]: float(item["count"])
+            for item in top_datasets_list
+            if isinstance(item, dict) and "dataset_urn" in item and "count" in item
+        }
+
+        return top_datasets_dict if top_datasets_dict else None
+
+    def _create_user_dataset_usage_map(
+        self, users_lf: polars.LazyFrame, top_n: int = 25
+    ) -> polars.LazyFrame:
+        """
+        Creates a lazyframe with user string and map of top N datasets by usage.
+
+        Args:
+            users_lf: LazyFrame containing user usage data with columns: user, urn, platform, count
+            top_n: Number of top datasets to include per user (default: 25)
+
+        Returns:
+            LazyFrame with columns:
+            - user: string column containing the user identifier
+            - top_datasets_map: list of structs with dataset_urn (string), count (int), and platform_urn (string)
+            - userUsageTotalPast30Days: total usage count for the user across all datasets
+            - userPlatformUsageTotalsPast30Days: map from platform URN to usage totals
+        """
+
+        # Create intermediate lazy frame with filtered users and aggregated counts
+        user_dataset_aggregated = (
+            users_lf.filter(polars.col("user").str.contains("@"))
+            .group_by("user", "urn", "platform")
+            .agg(polars.col("count").sum().alias("total_count"))
+            .with_columns(
+                # Direct string formatting - vectorized operation
+                polars.format("urn:li:dataPlatform:{}", polars.col("platform")).alias(
+                    "platform_urn"
+                )
+            )
+        )
+
+        # Calculate user totals
+        user_totals = user_dataset_aggregated.group_by("user").agg(
+            polars.col("total_count").sum().alias("userUsageTotalPast30Days")
+        )
+
+        # Calculate platform totals for each user - keep as list of structs
+        platform_totals = (
+            user_dataset_aggregated.group_by("user", "platform_urn")
+            .agg(polars.col("total_count").sum().alias("platform_total"))
+            .filter(polars.col("platform_urn").is_not_null())
+            .group_by("user")
+            .agg(
+                polars.struct(
+                    [
+                        polars.col("platform_urn"),
+                        polars.col("platform_total").cast(polars.Float64),
+                    ]
+                ).alias("platform_usage_pairs")
+            )
+        )
+
+        # Calculate top datasets
+        top_datasets = (
+            user_dataset_aggregated.with_columns(
+                polars.col("total_count")
+                .rank(descending=True, method="ordinal")
+                .over("user")
+                .alias("dataset_rank")
+            )
+            .filter(polars.col("dataset_rank") <= top_n)
+            .group_by("user")
+            .agg(
+                polars.struct(
+                    [
+                        polars.col("urn").alias("dataset_urn"),
+                        polars.col("total_count").alias("count"),
+                        polars.col("platform_urn"),
+                    ]
+                )
+                .sort_by("total_count", descending=True)
+                .alias("top_datasets_map")
+            )
+        )
+
+        # Join all results
+        return top_datasets.join(user_totals, on="user", how="left").join(
+            platform_totals, on="user", how="left"
+        )
+
+    def _combine_user_usage_data(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """
+        Combines user usage data from dataset, dashboard, and chart sources.
+
+        Args:
+            dataset_usage_lf: LazyFrame with dataset usage data containing top_datasets_map
+            dashboard_usage_lf: LazyFrame with dashboard usage data
+            chart_usage_lf: LazyFrame with chart usage data
+
+        Returns:
+            Combined LazyFrame with aggregated usage data per user
+        """
+        user_totals = self._combine_user_totals(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        platform_pairs = self._combine_platform_pairs(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        result = user_totals.join(platform_pairs, on="user", how="left")
+
+        return result.with_columns(
+            polars.col("platform_usage_pairs").fill_null(polars.lit([]))
+        )
+
+    def _filter_users(self, users_lf: polars.LazyFrame) -> polars.LazyFrame:
+        filter_condition = polars.col("user").str.contains("@")
+        for pattern in self.config.excluded_platforms:
+            filter_condition = filter_condition & ~polars.col("user").str.contains(
+                pattern
+            )
+
+        return users_lf.filter(filter_condition)
+
+    def _combine_user_totals(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """Combine user totals and top_datasets_map from all sources."""
+        # Collect all unique users in one operation
+        all_users_lf = (
+            polars.concat(
+                [
+                    dataset_usage_lf.select("user"),
+                    dashboard_usage_lf.select("user"),
+                    chart_usage_lf.select("user"),
+                ]
+            )
+            .unique()
+            .pipe(self._filter_users)
+        )
+
+        return (
+            all_users_lf.join(
+                dataset_usage_lf.select(
+                    ["user", "top_datasets_map", "userUsageTotalPast30Days"]
+                ),
+                on="user",
+                how="left",
+            )
+            .join(
+                dashboard_usage_lf.select(["user", "userUsageTotalPast30Days"]),
+                on="user",
+                how="left",
+                suffix="_dashboard",
+            )
+            .join(
+                chart_usage_lf.select(["user", "userUsageTotalPast30Days"]),
+                on="user",
+                how="left",
+                suffix="_chart",
+            )
+            .with_columns(
+                [
+                    # Sum with explicit null handling
+                    (
+                        polars.col("userUsageTotalPast30Days").fill_null(0)
+                        + polars.col("userUsageTotalPast30Days_dashboard").fill_null(0)
+                        + polars.col("userUsageTotalPast30Days_chart").fill_null(0)
+                    ).alias("userUsageTotalPast30Days")
+                ]
+            )
+            .select(["user", "top_datasets_map", "userUsageTotalPast30Days"])
+        )
+
+    def _combine_platform_pairs(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """Combine platform usage pairs from all sources."""
+        all_platforms = []
+
+        # Extract platforms from each source
+        for source_lf, col_name in [
+            (dataset_usage_lf, "platform_usage_pairs"),
+            (dashboard_usage_lf, "platform_usage_pairs"),
+            (chart_usage_lf, "platform_usage_pairs"),
+        ]:
+            platforms = self._extract_platforms_from_source(source_lf, col_name)
+            if platforms is not None:
+                all_platforms.append(platforms)
+
+        if not all_platforms:
+            # Return empty result if no platforms found
+            return polars.LazyFrame({"user": [], "platform_usage_pairs": []})
+
+        # Combine all platforms and aggregate by user + platform
+        combined_platforms = polars.concat(all_platforms, how="vertical_relaxed")
+        aggregated = combined_platforms.group_by("user", "platform_urn").agg(
+            polars.col("platform_total").sum().alias("platform_total")
+        )
+
+        # Rebuild platform_usage_pairs structure
+        return aggregated.group_by("user").agg(
+            polars.struct(
+                [polars.col("platform_urn"), polars.col("platform_total")]
+            ).alias("platform_usage_pairs")
+        )
+
+    def _extract_platforms_from_source(
+        self, source_lf: polars.LazyFrame, col_name: str
+    ) -> polars.LazyFrame | None:
+        """Extract platform data from a source LazyFrame."""
+        try:
+            return (
+                source_lf.select(["user", col_name])
+                .filter(polars.col(col_name).is_not_null())
+                .filter(polars.col(col_name).list.len() > 0)
+                .explode(col_name)
+                .unnest(col_name)
+                .filter(polars.col("platform_urn").is_not_null())
+                .select(["user", "platform_urn", "platform_total"])
+            )
+        except polars.exceptions.ColumnNotFoundError:
+            return None
+
+    def add_platform_usage_percentiles(
+        self, user_usage_lf: polars.LazyFrame
+    ) -> polars.LazyFrame:
+        """
+        Add platform usage percentiles to user usage data.
+
+        Args:
+            user_usage_lf: LazyFrame with user usage data containing platform_usage_pairs column
+
+        Returns:
+            LazyFrame with additional platform_usage_percentiles column
+        """
+        # First explode the platform_usage_pairs to work with individual platform usage records
+        platform_usage_exploded = (
+            user_usage_lf.explode("platform_usage_pairs")
+            .unnest("platform_usage_pairs")
+            .filter(polars.col("platform_urn").is_not_null())
+        )
+
+        # Use the existing gen_rank_and_percentile method to calculate percentiles
+        platform_percentiles_with_ranks = self.gen_rank_and_percentile(
+            lf=platform_usage_exploded,
+            count_field="platform_total",
+            urn_field="user",
+            platform_field="platform_urn",
+            prefix="platform_",
+            use_exp_cdf=False,
+        )
+
+        # Group back by user and create the percentiles structure
+        platform_percentiles = platform_percentiles_with_ranks.group_by("user").agg(
+            polars.struct(
+                [
+                    polars.col("platform_urn"),
+                    polars.col("platform_rank_percentile").cast(polars.Float64),
+                ]
+            ).alias("platform_usage_percentiles")
+        )
+
+        # Join the percentiles back to the original user_usage_lf
+        return user_usage_lf.join(platform_percentiles, on="user", how="left")
+
+    def _generate_user_usage_for_dashboard_charts(
+        self, entity_index: str, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_df = self._generate_dashboard_chart_entities(entity_index)
+        lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
+
+        # Process dashboard usage data into user usage format (similar to dataset version)
+        users_lf = (
+            lf.explode("userCounts")
+            .unnest("userCounts")
+            .filter(polars.col("user").is_not_null())
+            .rename({"usageCount": "count"})  # Rename to match dataset schema
+        )
+
+        user_dashboard_usage_lf = self._create_user_dataset_usage_map(users_lf)
+        return user_dashboard_usage_lf
+
+    def generate_user_usage(self) -> polars.LazyFrame:
+        dataset_usage_lf = self._generate_user_usage_for_dataset()
+
+        usage_index = "dashboard_dashboardusagestatisticsaspect_v1"
+        entity_index = "dashboardindex_v2"
+        dashboard_usage_lf = self._generate_user_usage_for_dashboard_charts(
+            entity_index, usage_index
+        )
+
+        entity_index = "chartindex_v2"
+        usage_index = "chart_chartusagestatisticsaspect_v1"
+        chart_usage_lf = self._generate_user_usage_for_dashboard_charts(
+            entity_index, usage_index
+        )
+
+        # Combine all three usage sources
+        lf = self._combine_user_usage_data(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        lf = self.add_platform_usage_percentiles(lf)
+
+        # Add user usage percentiles across all users (not grouped by platform)
+        # Create a temporary platform field for percentile calculation
+        lf = lf.with_columns(polars.lit("all_users").alias("temp_platform"))
+
+        lf = self.gen_rank_and_percentile(
+            lf=lf,
+            count_field="userUsageTotalPast30Days",
+            urn_field="user",
+            platform_field="temp_platform",
+            prefix="userUsage",
+            use_exp_cdf=False,
+        )
+
+        # Rename the percentile column to match the schema field name and remove temp field
+        lf = lf.rename(
+            {"userUsagerank_percentile": "userUsagePercentilePast30Days"}
+        ).drop("temp_platform")
+
+        return lf
+
     def generate_dataset_usage(self) -> polars.LazyFrame:
         datasets_lf = self.get_datasets()
         if self.config.set_upstream_table_max_modification_time_for_views:
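The core of _create_user_dataset_usage_map above is a top-N-per-user selection: rank the usage counts within each user's window, then keep only the best-ranked rows. A minimal standalone sketch of that pattern with made-up data:

    import polars

    lf = polars.LazyFrame(
        {
            "user": ["u1", "u1", "u1", "u2"],
            "urn": ["d1", "d2", "d3", "d1"],
            "total_count": [10, 30, 20, 5],
        }
    )
    top2 = (
        lf.with_columns(
            polars.col("total_count")
            .rank(descending=True, method="ordinal")
            .over("user")  # rank within each user's datasets
            .alias("dataset_rank")
        )
        .filter(polars.col("dataset_rank") <= 2)  # keep each user's top 2
        .collect()
    )
    print(top2)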
@@ -1503,11 +1919,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             # called `Option::unwrap()` on a `None` value
             # Which only happens if we don't collect immediately
             # return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True).collect().lazy()
-            return (
-                polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
-                .collect()
-                .lazy()
-            )
+            return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
 
     def load_dataset_usage(self) -> polars.LazyFrame:
         index = "dataset_datasetusagestatisticsaspect_v1"
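The hunk above stops force-collecting the parquet scan and returns the LazyFrame directly, so downstream filters can be pushed into the scan instead of materializing the whole file up front. A small self-contained sketch of the deferred pattern:

    import tempfile

    import polars

    with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp:
        polars.DataFrame({"urn": ["a", "b"], "count": [1, 2]}).write_parquet(tmp.name)
        lf = polars.scan_parquet(tmp.name, low_memory=True)  # no IO yet
        # The filter is pushed down into the parquet scan at collect time.
        print(lf.filter(polars.col("count") > 1).collect())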
@@ -1624,23 +2036,40 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         delay: Optional[float] = None,
     ) -> Iterable[Dict[str, Any]]:
         processed_count = 0
+        scroll_id = None
         while True:
             with PerfTimer() as timer:
                 logger.debug(f"ES query: {query}")
-                results = server.search(
-                    body=query,
-                    size=batch_size,
-                    index=(
-                        index
-                        if not self.config.search_index.opensearch_dialect
-                        else None
-                    ),
-                    params=(
-                        {"timeout": self.config.query_timeout}
-                        if self.config.search_index.opensearch_dialect
-                        else {"request_timeout": self.config.query_timeout}
-                    ),
-                )
+                if not scroll_id:
+                    logger.debug(
+                        f"Getting inital data from index {index} without scroll id"
+                    )
+                    results = server.search(
+                        body=query,
+                        size=batch_size,
+                        scroll="2m",
+                        index=index,
+                        params=(
+                            {"timeout": self.config.query_timeout}
+                            if self.config.search_index.opensearch_dialect
+                            else {"request_timeout": self.config.query_timeout}
+                        ),
+                    )
+                else:
+                    logger.debug(
+                        f"Getting data from index {index} using scroll_id: {scroll_id}"
+                    )
+                    results = server.scroll(
+                        scroll_id=scroll_id,
+                        scroll="2m",
+                        params=(
+                            {"timeout": self.config.query_timeout}
+                            if self.config.search_index.opensearch_dialect
+                            else {"request_timeout": self.config.query_timeout}
+                        ),
+                    )
+                scroll_id = results["_scroll_id"]
+
             if not aggregation_key:
                 yield from process_function(results["hits"]["hits"])
 
@@ -1651,7 +2080,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 )
                 if len(results["hits"]["hits"]) < batch_size:
                     break
-                query.update({"search_after": results["hits"]["hits"][-1]["sort"]})
             else:
                 yield from process_function(
                     results["aggregations"][aggregation_key]["buckets"]
@@ -1661,16 +2089,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                     < batch_size
                 ):
                     break
-                if "after_key" in results["aggregations"][aggregation_key]:
-                    query["aggs"][aggregation_key]["composite"]["after"] = results[
-                        "aggregations"
-                    ][aggregation_key]["after_key"]
-
-            if delay:
-                logger.debug(
-                    f"Sleeping for {delay} seconds before getting next batch from ES"
-                )
-                time.sleep(delay)
+            if delay:
+                logger.debug(
+                    f"Sleeping for {delay} seconds before getting next batch from ES"
+                )
+                time.sleep(delay)
 
-    def get_report(self) -> SourceReport:
+    def get_report(self) -> "DatahubUsageFeatureReport":
         return self.report
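Pagination over Elasticsearch/OpenSearch moves from search_after (and the commented-out point-in-time handle) to the scroll API, with the scroll id threaded through each iteration. A minimal sketch of that loop against a hypothetical local cluster (endpoint and index name are placeholders; error handling and the per-dialect timeout params are omitted):

    from elasticsearch import Elasticsearch

    server = Elasticsearch(["http://localhost:9200"])  # placeholder endpoint
    batch_size = 1000
    scroll_id = None
    while True:
        if not scroll_id:
            # First page: a normal search that opens a 2-minute scroll context.
            results = server.search(
                index="my_index",  # placeholder index
                body={"query": {"match_all": {}}},
                size=batch_size,
                scroll="2m",
            )
        else:
            # Subsequent pages: continue from the scroll id of the last response.
            results = server.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = results["_scroll_id"]
        hits = results["hits"]["hits"]
        # ... process hits ...
        if len(hits) < batch_size:
            break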