acryl-datahub-cloud 0.3.13.3__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of acryl-datahub-cloud has been flagged as potentially problematic.

Files changed (51)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
  3. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +462 -34
  4. acryl_datahub_cloud/metadata/_urns/urn_defs.py +2034 -2034
  5. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
  6. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +6 -0
  7. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
  8. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  9. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  10. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +6 -0
  11. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +6 -0
  12. acryl_datahub_cloud/metadata/schema.avsc +24776 -24109
  13. acryl_datahub_cloud/metadata/schema_classes.py +1581 -696
  14. acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +95 -0
  15. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +0 -21
  16. acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
  17. acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +2 -1
  18. acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
  19. acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
  20. acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +2 -0
  21. acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +127 -2
  22. acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +93 -0
  23. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +21 -2
  24. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +77 -1
  25. acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +1 -0
  26. acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  27. acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +1 -0
  28. acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +82 -0
  29. acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
  30. acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +1 -0
  31. acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
  32. acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
  33. acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +69 -0
  34. acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
  35. acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +104 -100
  36. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +98 -45
  37. acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +127 -2
  38. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +79 -2
  39. acryl_datahub_cloud/metadata/schemas/Ownership.avsc +69 -0
  40. acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  41. acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +2 -0
  42. acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
  43. acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +127 -2
  44. acryl_datahub_cloud/sdk/assertions_client.py +21 -7
  45. acryl_datahub_cloud/sdk/resolver_client.py +4 -1
  46. acryl_datahub_cloud/sdk/subscription_client.py +8 -3
  47. {acryl_datahub_cloud-0.3.13.3.dist-info → acryl_datahub_cloud-0.3.14.dist-info}/METADATA +44 -44
  48. {acryl_datahub_cloud-0.3.13.3.dist-info → acryl_datahub_cloud-0.3.14.dist-info}/RECORD +51 -45
  49. {acryl_datahub_cloud-0.3.13.3.dist-info → acryl_datahub_cloud-0.3.14.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_cloud-0.3.13.3.dist-info → acryl_datahub_cloud-0.3.14.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_cloud-0.3.13.3.dist-info → acryl_datahub_cloud-0.3.14.dist-info}/top_level.txt +0 -0
Diff of acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py (file 3 above):

@@ -22,12 +22,14 @@ from polars.datatypes import DataTypeClass
 from pydantic import Field
 from scipy.stats import expon
 
+from acryl_datahub_cloud.datahub_usage_reporting.excluded import EXCLUDED_PATTERNS
 from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder
 from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
     UsageFeaturePatchBuilder,
 )
 from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
 from acryl_datahub_cloud.metadata.schema_classes import (
+    CorpUserUsageFeaturesClass,
     QueryUsageFeaturesClass,
     UsageFeaturesClass,
 )
@@ -135,6 +137,10 @@ class DataHubUsageFeatureReportingSourceConfig(
         None,
         description="Optional configuration for stateful ingestion, including stale metadata removal.",
     )
+    user_usage_enabled: bool = Field(
+        True,
+        description="Flag to enable or disable user usage statistics collection.",
+    )
     dataset_usage_enabled: bool = Field(
         True,
         description="Flag to enable or disable dataset usage statistics collection.",
@@ -191,6 +197,11 @@ class DataHubUsageFeatureReportingSourceConfig(
         description="Flag to generate MCP patch for usage features.'",
     )
 
+    excluded_platforms: List[str] = Field(
+        EXCLUDED_PATTERNS,
+        description="List of platforms to exclude from usage statistics collection. This is done to avoid invite user functionality to be filled with service accounts.",
+    )
+
 
 def exp_cdf(series: polars.Series) -> polars.Series:
     with PerfTimer() as timer:
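
Despite the name, excluded_platforms holds string patterns that are matched against user identifiers (see _filter_users in the final hunk), which is how service accounts are kept out of the invite-user surface. A self-contained sketch of the same filter on toy data; "airflow" is an invented stand-in for the real EXCLUDED_PATTERNS entries in excluded.py:

    import polars

    users = polars.LazyFrame({"user": ["alice@corp.com", "airflow@corp.com", "svc-bot"]})
    condition = polars.col("user").str.contains("@")  # human users carry an email-like id
    for pattern in ["airflow"]:  # stand-in for config.excluded_platforms
        condition = condition & ~polars.col("user").str.contains(pattern)
    print(users.filter(condition).collect())  # keeps only alice@corp.com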
@@ -241,10 +252,6 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
         default_factory=lambda: defaultdict(lambda: PerfTimer())
     )
 
-    dataset_usage_processing_time: PerfTimer = PerfTimer()
-    dashboard_usage_processing_time: PerfTimer = PerfTimer()
-    chart_usage_processing_time: PerfTimer = PerfTimer()
-    query_usage_processing_time: PerfTimer = PerfTimer()
     query_platforms_count: Dict[str, int] = field(
         default_factory=lambda: defaultdict(lambda: 0)
     )
@@ -923,6 +930,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
         return dataset_df
 
+    def generate_user_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
+        with polars.StringCache():
+            user_usage_lf = self.generate_user_usage()
+            yield from self.generate_user_usage_mcp_from_lazyframe(user_usage_lf)
+
     def generate_dataset_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
         with polars.StringCache():
            dataset_usage_df = self.generate_dataset_usage()
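
Like the other generators, the new method does its collection inside polars.StringCache(): the schemas in this file declare urn columns as polars.Categorical, and categorical columns built under separate caches cannot be joined without re-encoding (older polars versions raise outright). A minimal standalone illustration of the pattern, with toy data:

    import polars

    with polars.StringCache():
        # Both categoricals share one cache, so the join compares integer codes directly.
        left = polars.LazyFrame({"urn": ["a", "b"]}).with_columns(
            polars.col("urn").cast(polars.Categorical)
        )
        right = polars.LazyFrame({"urn": ["b"]}).with_columns(
            polars.col("urn").cast(polars.Categorical)
        )
        print(left.join(right, on="urn", how="inner").collect())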
@@ -958,38 +970,27 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if self.config.user_usage_enabled:
+            self.report.new_stage("generate user usage")
+            yield from self.generate_user_usage_mcps()
+
         if self.config.dataset_usage_enabled:
-            with self.report.dataset_usage_processing_time as timer:
-                self.report.new_stage("generate dataset usage")
-                yield from self.generate_dataset_usage_mcps()
-                time_taken = timer.elapsed_seconds()
-                logger.info(f"Dataset Usage generation took {time_taken:.3f} seconds")
+            self.report.new_stage("generate dataset usage")
+            yield from self.generate_dataset_usage_mcps()
 
         if self.config.dashboard_usage_enabled:
-            with self.report.dashboard_usage_processing_time as timer:
-                self.report.new_stage("generate dashboard usage")
-                yield from self.generate_dashboard_usage_mcps()
-
-                time_taken = timer.elapsed_seconds()
-                logger.info(f"Dashboard Usage generation took {time_taken:.3f}")
+            self.report.new_stage("generate dashboard usage")
+            yield from self.generate_dashboard_usage_mcps()
 
         if self.config.chart_usage_enabled:
-            with self.report.chart_usage_processing_time as timer:
-                self.report.new_stage("generate chart usage")
-
-                yield from self.generate_chart_usage_mcps()
-
-                time_taken = timer.elapsed_seconds()
-                logger.info(f"Chart Usage generation took {time_taken:.3f}")
+            self.report.new_stage("generate chart usage")
+            yield from self.generate_chart_usage_mcps()
 
         if self.config.query_usage_enabled:
-            with self.report.query_usage_processing_time as timer:
-                self.report.new_stage("generate query usage")
-
-                yield from self.generate_query_usage_mcps()
+            self.report.new_stage("generate query usage")
+            yield from self.generate_query_usage_mcps()
 
-                time_taken = timer.elapsed_seconds()
-                logger.info(f"Query Usage generation took {time_taken:.3f}")
+        self.report.new_stage("end so time is calculated for last stage")
 
     def generate_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
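
The per-section PerfTimer fields and logger.info calls are gone because stage timing now lives in the report: each new_stage() call ends the previous stage, so the sentinel stage at the bottom is needed for the last real stage to get a duration. A hypothetical sketch of that pattern (not the actual IngestionStageReport API):

    import time

    class StageReport:
        """Toy model of stage-based timing: starting a stage closes the previous one."""

        def __init__(self) -> None:
            self._name: str | None = None
            self._start = 0.0

        def new_stage(self, name: str) -> None:
            now = time.perf_counter()
            if self._name is not None:
                print(f"{self._name} took {now - self._start:.3f}s")
            self._name, self._start = name, now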
@@ -1052,7 +1053,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 uniqueUserPercentileLast30Days=int(
                     row.get("distinct_user_rank_percentile", 0) or 0
                 ),
-                writeCountLast30Days=int(row.get("write_rank_percentile", 0) or 0)
+                writeCountLast30Days=int(row.get("write_count", 0) or 0)
                 if not self.config.disable_write_usage
                 else None,
                 writeCountPercentileLast30Days=int(
@@ -1108,6 +1109,47 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 row["urn"], query_usage_features
             )
 
+    def _convert_platform_pairs_to_dict(
+        self,
+        platform_pairs: Optional[List[Dict[str, Any]]],
+        value_key: str = "platform_total",
+    ) -> Optional[Dict[str, Any]]:
+        """Convert list of platform usage structs to dictionary."""
+        if not platform_pairs:
+            return None
+
+        return {
+            pair["platform_urn"]: pair[value_key]
+            for pair in platform_pairs
+            if pair["platform_urn"] is not None
+        }
+
+    def generate_user_usage_mcp_from_lazyframe(
+        self, lazy_frame: polars.LazyFrame
+    ) -> Iterable[MetadataWorkUnit]:
+        for row in lazy_frame.collect(
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
+        ).iter_rows(named=True):
+            user_usage_features = CorpUserUsageFeaturesClass(
+                userUsageTotalPast30Days=int(
+                    row.get("userUsageTotalPast30Days", 0) or 0
+                ),
+                userPlatformUsageTotalsPast30Days=self._convert_platform_pairs_to_dict(
+                    row.get("platform_usage_pairs", [])
+                ),
+                userPlatformUsagePercentilePast30Days=self._convert_platform_pairs_to_dict(
+                    row.get("platform_usage_percentiles", []),
+                    "platform_rank_percentile",
+                ),
+                userUsagePercentilePast30Days=row.get("userUsagePercentilePast30Days"),
+                userTopDatasetsByUsage=self._convert_top_datasets_to_dict(
+                    row.get("top_datasets_map", [])
+                ),
+            )
+            yield MetadataChangeProposalWrapper(
+                entityUrn=row["user"], aspect=user_usage_features
+            ).as_workunit(is_primary_source=False)
+
     def generate_usage_feature_mcp(
         self, urn: str, usage_feature: UsageFeaturesClass
     ) -> Iterable[MetadataWorkUnit]:
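
To make the struct-to-map conversion concrete, a hypothetical input/output pair for _convert_platform_pairs_to_dict, using the field names from the code above (values invented):

    pairs = [
        {"platform_urn": "urn:li:dataPlatform:snowflake", "platform_total": 42.0},
        {"platform_urn": None, "platform_total": 3.0},  # dropped: null platform_urn
    ]
    # _convert_platform_pairs_to_dict(pairs)
    #   -> {"urn:li:dataPlatform:snowflake": 42.0}
    # Passing value_key="platform_rank_percentile" reads the percentile field instead,
    # as done for userPlatformUsagePercentilePast30Days above.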
@@ -1142,9 +1184,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
         return self.generate_dashboard_chart_usage(entity_index, usage_index)
 
-    def generate_dashboard_chart_usage(
-        self, entity_index: str, usage_index: str
-    ) -> polars.LazyFrame:
+    def _generate_dashboard_chart_entities(self, entity_index: str) -> polars.LazyFrame:
         entity_schema = {
             "entity_urn": polars.Categorical,
             "removed": polars.Boolean,
@@ -1161,7 +1201,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             process_function=self.soft_deleted_batch,
         )
 
-        dashboard_usage_schema = {
+        return entities_df
+
+    def _generate_dashboard_chart_usage(
+        self, entities_df: polars.LazyFrame, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_usage_schema = {
             "timestampMillis": polars.Int64,
             "lastObserved": polars.Int64,
             "urn": polars.Categorical,
@@ -1179,7 +1224,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         }
 
         lf = self.load_data_from_es_to_lf(
-            schema=dashboard_usage_schema,
+            schema=entities_usage_schema,
             index=usage_index,
             query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
             process_function=self.process_dashboard_usage,
@@ -1198,6 +1243,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .alias("row_num")
         ).filter(polars.col("row_num") == 1)
 
+        return lf
+
+    def generate_dashboard_chart_usage(
+        self, entity_index: str, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_df = self._generate_dashboard_chart_entities(entity_index)
+
+        lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
+
         # lf = lf.filter(polars.col("urn") == "urn:li:dashboard:(looker,dashboards.8)")
         # "urn:li:dashboard:(looker,dashboards.8)"
 
@@ -1367,6 +1421,380 @@
 
         return usage_with_top_users_with_ranks
 
+    def _generate_user_usage_for_dataset(self) -> polars.LazyFrame:
+        datasets_lf = self.get_datasets()
+        if self.config.set_upstream_table_max_modification_time_for_views:
+            datasets_lf = self.set_table_modification_time_for_views(datasets_lf)
+
+        lf = self.load_dataset_usage()
+
+        # Polars/pandas join merges the join column into one column and that's why we need to filter based on the removed column
+        lf = (
+            lf.join(datasets_lf, left_on="urn", right_on="entity_urn", how="left")
+            .filter(polars.col("removed") == False)  # noqa: E712
+            .drop(["removed"])
+        )
+
+        users_lf = (
+            lf.explode("userCounts")
+            .unnest("userCounts")
+            .filter(polars.col("user").is_not_null())
+        )
+
+        user_dataset_usage_lf = self._create_user_dataset_usage_map(users_lf)
+        return user_dataset_usage_lf
+
+    @staticmethod
+    def _convert_top_datasets_to_dict(
+        top_datasets_list: Optional[List[Dict[str, Any]]],
+    ) -> Optional[Dict[str, float]]:
+        """
+        Convert list of top datasets structs to dictionary as expected by CorpUserUsageFeatures schema.
+
+        Args:
+            top_datasets_list: List of dictionaries with 'dataset_urn' and 'count' keys
+
+        Returns:
+            Dictionary mapping dataset URN to usage count, or None if input is empty
+        """
+        if not top_datasets_list:
+            return None
+
+        top_datasets_dict = {
+            item["dataset_urn"]: float(item["count"])
+            for item in top_datasets_list
+            if isinstance(item, dict) and "dataset_urn" in item and "count" in item
+        }
+
+        return top_datasets_dict if top_datasets_dict else None
+
+    def _create_user_dataset_usage_map(
+        self, users_lf: polars.LazyFrame, top_n: int = 25
+    ) -> polars.LazyFrame:
+        """
+        Creates a lazyframe with user string and map of top N datasets by usage.
+
+        Args:
+            users_lf: LazyFrame containing user usage data with columns: user, urn, platform, count
+            top_n: Number of top datasets to include per user (default: 25)
+
+        Returns:
+            LazyFrame with columns:
+            - user: string column containing the user identifier
+            - top_datasets_map: list of structs with dataset_urn (string), count (int), and platform_urn (string)
+            - userUsageTotalPast30Days: total usage count for the user across all datasets
+            - userPlatformUsageTotalsPast30Days: map from platform URN to usage totals
+        """
+
+        # Create intermediate lazy frame with filtered users and aggregated counts
+        user_dataset_aggregated = (
+            users_lf.filter(polars.col("user").str.contains("@"))
+            .group_by("user", "urn", "platform")
+            .agg(polars.col("count").sum().alias("total_count"))
+            .with_columns(
+                # Direct string formatting - vectorized operation
+                polars.format("urn:li:dataPlatform:{}", polars.col("platform")).alias(
+                    "platform_urn"
+                )
+            )
+        )
+
+        # Calculate user totals
+        user_totals = user_dataset_aggregated.group_by("user").agg(
+            polars.col("total_count").sum().alias("userUsageTotalPast30Days")
+        )
+
+        # Calculate platform totals for each user - keep as list of structs
+        platform_totals = (
+            user_dataset_aggregated.group_by("user", "platform_urn")
+            .agg(polars.col("total_count").sum().alias("platform_total"))
+            .filter(polars.col("platform_urn").is_not_null())
+            .group_by("user")
+            .agg(
+                polars.struct(
+                    [
+                        polars.col("platform_urn"),
+                        polars.col("platform_total").cast(polars.Float64),
+                    ]
+                ).alias("platform_usage_pairs")
+            )
+        )
+
+        # Calculate top datasets
+        top_datasets = (
+            user_dataset_aggregated.with_columns(
+                polars.col("total_count")
+                .rank(descending=True, method="ordinal")
+                .over("user")
+                .alias("dataset_rank")
+            )
+            .filter(polars.col("dataset_rank") <= top_n)
+            .group_by("user")
+            .agg(
+                polars.struct(
+                    [
+                        polars.col("urn").alias("dataset_urn"),
+                        polars.col("total_count").alias("count"),
+                        polars.col("platform_urn"),
+                    ]
+                )
+                .sort_by("total_count", descending=True)
+                .alias("top_datasets_map")
+            )
+        )
+
+        # Join all results
+        return top_datasets.join(user_totals, on="user", how="left").join(
+            platform_totals, on="user", how="left"
+        )
+
+    def _combine_user_usage_data(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """
+        Combines user usage data from dataset, dashboard, and chart sources.
+
+        Args:
+            dataset_usage_lf: LazyFrame with dataset usage data containing top_datasets_map
+            dashboard_usage_lf: LazyFrame with dashboard usage data
+            chart_usage_lf: LazyFrame with chart usage data
+
+        Returns:
+            Combined LazyFrame with aggregated usage data per user
+        """
+        user_totals = self._combine_user_totals(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        platform_pairs = self._combine_platform_pairs(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        result = user_totals.join(platform_pairs, on="user", how="left")
+
+        return result.with_columns(
+            polars.col("platform_usage_pairs").fill_null(polars.lit([]))
+        )
+
+    def _filter_users(self, users_lf: polars.LazyFrame) -> polars.LazyFrame:
+        filter_condition = polars.col("user").str.contains("@")
+        for pattern in self.config.excluded_platforms:
+            filter_condition = filter_condition & ~polars.col("user").str.contains(
+                pattern
+            )
+
+        return users_lf.filter(filter_condition)
+
+    def _combine_user_totals(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """Combine user totals and top_datasets_map from all sources."""
+        # Collect all unique users in one operation
+        all_users_lf = (
+            polars.concat(
+                [
+                    dataset_usage_lf.select("user"),
+                    dashboard_usage_lf.select("user"),
+                    chart_usage_lf.select("user"),
+                ]
+            )
+            .unique()
+            .pipe(self._filter_users)
+        )
+
+        return (
+            all_users_lf.join(
+                dataset_usage_lf.select(
+                    ["user", "top_datasets_map", "userUsageTotalPast30Days"]
+                ),
+                on="user",
+                how="left",
+            )
+            .join(
+                dashboard_usage_lf.select(["user", "userUsageTotalPast30Days"]),
+                on="user",
+                how="left",
+                suffix="_dashboard",
+            )
+            .join(
+                chart_usage_lf.select(["user", "userUsageTotalPast30Days"]),
+                on="user",
+                how="left",
+                suffix="_chart",
+            )
+            .with_columns(
+                [
+                    # Sum with explicit null handling
+                    (
+                        polars.col("userUsageTotalPast30Days").fill_null(0)
+                        + polars.col("userUsageTotalPast30Days_dashboard").fill_null(0)
+                        + polars.col("userUsageTotalPast30Days_chart").fill_null(0)
+                    ).alias("userUsageTotalPast30Days")
+                ]
+            )
+            .select(["user", "top_datasets_map", "userUsageTotalPast30Days"])
+        )
+
+    def _combine_platform_pairs(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """Combine platform usage pairs from all sources."""
+        all_platforms = []
+
+        # Extract platforms from each source
+        for source_lf, col_name in [
+            (dataset_usage_lf, "platform_usage_pairs"),
+            (dashboard_usage_lf, "platform_usage_pairs"),
+            (chart_usage_lf, "platform_usage_pairs"),
+        ]:
+            platforms = self._extract_platforms_from_source(source_lf, col_name)
+            if platforms is not None:
+                all_platforms.append(platforms)
+
+        if not all_platforms:
+            # Return empty result if no platforms found
+            return polars.LazyFrame({"user": [], "platform_usage_pairs": []})
+
+        # Combine all platforms and aggregate by user + platform
+        combined_platforms = polars.concat(all_platforms, how="vertical_relaxed")
+        aggregated = combined_platforms.group_by("user", "platform_urn").agg(
+            polars.col("platform_total").sum().alias("platform_total")
+        )
+
+        # Rebuild platform_usage_pairs structure
+        return aggregated.group_by("user").agg(
+            polars.struct(
+                [polars.col("platform_urn"), polars.col("platform_total")]
+            ).alias("platform_usage_pairs")
+        )
+
+    def _extract_platforms_from_source(
+        self, source_lf: polars.LazyFrame, col_name: str
+    ) -> polars.LazyFrame | None:
+        """Extract platform data from a source LazyFrame."""
+        try:
+            return (
+                source_lf.select(["user", col_name])
+                .filter(polars.col(col_name).is_not_null())
+                .filter(polars.col(col_name).list.len() > 0)
+                .explode(col_name)
+                .unnest(col_name)
+                .filter(polars.col("platform_urn").is_not_null())
+                .select(["user", "platform_urn", "platform_total"])
+            )
+        except polars.exceptions.ColumnNotFoundError:
+            return None
+
+    def add_platform_usage_percentiles(
+        self, user_usage_lf: polars.LazyFrame
+    ) -> polars.LazyFrame:
+        """
+        Add platform usage percentiles to user usage data.
+
+        Args:
+            user_usage_lf: LazyFrame with user usage data containing platform_usage_pairs column
+
+        Returns:
+            LazyFrame with additional platform_usage_percentiles column
+        """
+        # First explode the platform_usage_pairs to work with individual platform usage records
+        platform_usage_exploded = (
+            user_usage_lf.explode("platform_usage_pairs")
+            .unnest("platform_usage_pairs")
+            .filter(polars.col("platform_urn").is_not_null())
+        )
+
+        # Use the existing gen_rank_and_percentile method to calculate percentiles
+        platform_percentiles_with_ranks = self.gen_rank_and_percentile(
+            lf=platform_usage_exploded,
+            count_field="platform_total",
+            urn_field="user",
+            platform_field="platform_urn",
+            prefix="platform_",
+            use_exp_cdf=False,
+        )
+
+        # Group back by user and create the percentiles structure
+        platform_percentiles = platform_percentiles_with_ranks.group_by("user").agg(
+            polars.struct(
+                [
+                    polars.col("platform_urn"),
+                    polars.col("platform_rank_percentile").cast(polars.Float64),
+                ]
+            ).alias("platform_usage_percentiles")
+        )
+
+        # Join the percentiles back to the original user_usage_lf
+        return user_usage_lf.join(platform_percentiles, on="user", how="left")
+
+    def _generate_user_usage_for_dashboard_charts(
+        self, entity_index: str, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_df = self._generate_dashboard_chart_entities(entity_index)
+        lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
+
+        # Process dashboard usage data into user usage format (similar to dataset version)
+        users_lf = (
+            lf.explode("userCounts")
+            .unnest("userCounts")
+            .filter(polars.col("user").is_not_null())
+            .rename({"usageCount": "count"})  # Rename to match dataset schema
+        )
+
+        user_dashboard_usage_lf = self._create_user_dataset_usage_map(users_lf)
+        return user_dashboard_usage_lf
+
+    def generate_user_usage(self) -> polars.LazyFrame:
+        dataset_usage_lf = self._generate_user_usage_for_dataset()
+
+        usage_index = "dashboard_dashboardusagestatisticsaspect_v1"
+        entity_index = "dashboardindex_v2"
+        dashboard_usage_lf = self._generate_user_usage_for_dashboard_charts(
+            entity_index, usage_index
+        )
+
+        entity_index = "chartindex_v2"
+        usage_index = "chart_chartusagestatisticsaspect_v1"
+        chart_usage_lf = self._generate_user_usage_for_dashboard_charts(
+            entity_index, usage_index
+        )
+
+        # Combine all three usage sources
+        lf = self._combine_user_usage_data(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        lf = self.add_platform_usage_percentiles(lf)
+
+        # Add user usage percentiles across all users (not grouped by platform)
+        # Create a temporary platform field for percentile calculation
+        lf = lf.with_columns(polars.lit("all_users").alias("temp_platform"))
+
+        lf = self.gen_rank_and_percentile(
+            lf=lf,
+            count_field="userUsageTotalPast30Days",
+            urn_field="user",
+            platform_field="temp_platform",
+            prefix="userUsage",
+            use_exp_cdf=False,
+        )
+
+        # Rename the percentile column to match the schema field name and remove temp field
+        lf = lf.rename(
+            {"userUsagerank_percentile": "userUsagePercentilePast30Days"}
+        ).drop("temp_platform")
+
+        return lf
+
     def generate_dataset_usage(self) -> polars.LazyFrame:
         datasets_lf = self.get_datasets()
         if self.config.set_upstream_table_max_modification_time_for_views:
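
To see the shape of what _create_user_dataset_usage_map emits, here is a self-contained toy run of its first two aggregation steps (column names from the diff; data invented):

    import polars

    users_lf = polars.LazyFrame(
        {
            "user": ["a@x.com", "a@x.com", "b@x.com"],
            "urn": ["urn:li:dataset:1", "urn:li:dataset:2", "urn:li:dataset:1"],
            "platform": ["snowflake", "snowflake", "looker"],
            "count": [5, 3, 7],
        }
    )
    # Aggregate per (user, dataset, platform) and derive the platform URN,
    # as the method above does before ranking top datasets.
    agg = (
        users_lf.filter(polars.col("user").str.contains("@"))
        .group_by("user", "urn", "platform")
        .agg(polars.col("count").sum().alias("total_count"))
        .with_columns(
            polars.format("urn:li:dataPlatform:{}", polars.col("platform")).alias(
                "platform_urn"
            )
        )
    )
    user_totals = agg.group_by("user").agg(
        polars.col("total_count").sum().alias("userUsageTotalPast30Days")
    )
    print(user_totals.collect())  # a@x.com -> 8, b@x.com -> 7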