acryl-datahub-cloud 0.3.13.2rc4__py3-none-any.whl → 0.3.14rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub-cloud might be problematic.

Files changed (33)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +443 -34
  3. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +6 -0
  4. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
  5. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  6. acryl_datahub_cloud/metadata/schema.avsc +445 -107
  7. acryl_datahub_cloud/metadata/schema_classes.py +420 -19
  8. acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +95 -0
  9. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +0 -21
  10. acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +2 -1
  11. acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
  12. acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +2 -0
  13. acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +93 -0
  14. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +13 -2
  15. acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  16. acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +69 -0
  17. acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
  18. acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +104 -100
  19. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +98 -45
  20. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +71 -0
  21. acryl_datahub_cloud/metadata/schemas/Ownership.avsc +69 -0
  22. acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  23. acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +2 -0
  24. acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
  25. acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
  26. acryl_datahub_cloud/sdk/assertions_client.py +21 -7
  27. acryl_datahub_cloud/sdk/resolver_client.py +4 -1
  28. acryl_datahub_cloud/sdk/subscription_client.py +8 -3
  29. {acryl_datahub_cloud-0.3.13.2rc4.dist-info → acryl_datahub_cloud-0.3.14rc0.dist-info}/METADATA +48 -48
  30. {acryl_datahub_cloud-0.3.13.2rc4.dist-info → acryl_datahub_cloud-0.3.14rc0.dist-info}/RECORD +33 -30
  31. {acryl_datahub_cloud-0.3.13.2rc4.dist-info → acryl_datahub_cloud-0.3.14rc0.dist-info}/WHEEL +0 -0
  32. {acryl_datahub_cloud-0.3.13.2rc4.dist-info → acryl_datahub_cloud-0.3.14rc0.dist-info}/entry_points.txt +0 -0
  33. {acryl_datahub_cloud-0.3.13.2rc4.dist-info → acryl_datahub_cloud-0.3.14rc0.dist-info}/top_level.txt +0 -0
acryl_datahub_cloud/_codegen_config.json

@@ -1,6 +1,6 @@
 {
     "name": "acryl-datahub-cloud",
-    "version": "0.3.13.2rc4",
+    "version": "0.3.14rc0",
     "install_requires": [
         "avro-gen3==0.7.16",
         "acryl-datahub"
acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py

@@ -28,6 +28,7 @@ from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder imp
 )
 from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
 from acryl_datahub_cloud.metadata.schema_classes import (
+    CorpUserUsageFeaturesClass,
     QueryUsageFeaturesClass,
     UsageFeaturesClass,
 )
@@ -135,6 +136,10 @@ class DataHubUsageFeatureReportingSourceConfig(
         None,
         description="Optional configuration for stateful ingestion, including stale metadata removal.",
     )
+    user_usage_enabled: bool = Field(
+        True,
+        description="Flag to enable or disable user usage statistics collection.",
+    )
     dataset_usage_enabled: bool = Field(
         True,
         description="Flag to enable or disable dataset usage statistics collection.",
@@ -241,10 +246,6 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
         default_factory=lambda: defaultdict(lambda: PerfTimer())
     )

-    dataset_usage_processing_time: PerfTimer = PerfTimer()
-    dashboard_usage_processing_time: PerfTimer = PerfTimer()
-    chart_usage_processing_time: PerfTimer = PerfTimer()
-    query_usage_processing_time: PerfTimer = PerfTimer()
     query_platforms_count: Dict[str, int] = field(
         default_factory=lambda: defaultdict(lambda: 0)
     )
@@ -923,6 +924,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

         return dataset_df

+    def generate_user_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
+        with polars.StringCache():
+            user_usage_lf = self.generate_user_usage()
+            yield from self.generate_user_usage_mcp_from_lazyframe(user_usage_lf)
+
     def generate_dataset_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
         with polars.StringCache():
             dataset_usage_df = self.generate_dataset_usage()
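As with the existing generators, the new user-usage path runs under `polars.StringCache()`. The schemas here declare `polars.Categorical` columns, and combining Categorical data produced outside a shared cache can force re-encoding or, in some polars versions, raise; the global cache keeps the dictionary encodings aligned across frames. A toy illustration (invented data):

```python
# Why the generators run under polars.StringCache(): Categorical columns built
# under one shared cache use the same physical encoding, so joins and
# comparisons across frames are cheap and well-defined.
import polars

with polars.StringCache():
    left = polars.LazyFrame({"urn": ["a", "b"]}).with_columns(
        polars.col("urn").cast(polars.Categorical)
    )
    right = polars.LazyFrame({"urn": ["b", "c"]}).with_columns(
        polars.col("urn").cast(polars.Categorical)
    )
    # Same cache -> the join operates on the shared dictionary encoding.
    joined = left.join(right, on="urn", how="inner").collect()
    assert joined["urn"].to_list() == ["b"]
```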
@@ -958,38 +964,27 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         ]

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if self.config.user_usage_enabled:
+            self.report.new_stage("generate user usage")
+            yield from self.generate_user_usage_mcps()
+
         if self.config.dataset_usage_enabled:
-            with self.report.dataset_usage_processing_time as timer:
-                self.report.new_stage("generate dataset usage")
-                yield from self.generate_dataset_usage_mcps()
-                time_taken = timer.elapsed_seconds()
-                logger.info(f"Dataset Usage generation took {time_taken:.3f} seconds")
+            self.report.new_stage("generate dataset usage")
+            yield from self.generate_dataset_usage_mcps()

         if self.config.dashboard_usage_enabled:
-            with self.report.dashboard_usage_processing_time as timer:
-                self.report.new_stage("generate dashboard usage")
-                yield from self.generate_dashboard_usage_mcps()
-
-                time_taken = timer.elapsed_seconds()
-                logger.info(f"Dashboard Usage generation took {time_taken:.3f}")
+            self.report.new_stage("generate dashboard usage")
+            yield from self.generate_dashboard_usage_mcps()

         if self.config.chart_usage_enabled:
-            with self.report.chart_usage_processing_time as timer:
-                self.report.new_stage("generate chart usage")
-
-                yield from self.generate_chart_usage_mcps()
-
-                time_taken = timer.elapsed_seconds()
-                logger.info(f"Chart Usage generation took {time_taken:.3f}")
+            self.report.new_stage("generate chart usage")
+            yield from self.generate_chart_usage_mcps()

         if self.config.query_usage_enabled:
-            with self.report.query_usage_processing_time as timer:
-                self.report.new_stage("generate query usage")
-
-                yield from self.generate_query_usage_mcps()
+            self.report.new_stage("generate query usage")
+            yield from self.generate_query_usage_mcps()

-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Query Usage generation took {time_taken:.3f}")
+        self.report.new_stage("end so time is calculated for last stage")

     def generate_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
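The timing refactor replaces the four per-source `PerfTimer` context managers and `logger.info` lines with the report's stage bookkeeping: each `new_stage(...)` call implicitly closes the previous stage, which is why the trailing `new_stage("end ...")` call exists purely to time the last one. An illustrative stand-in (the real `IngestionStageReport` ships with acryl-datahub and is not reproduced here):

```python
# Hedged sketch of stage-based timing: each new_stage() ends the previous
# stage and starts the next, so only a terminal call is needed to capture
# the duration of the final stage. StageReportSketch is illustrative only.
import time

class StageReportSketch:
    def __init__(self) -> None:
        self.durations: dict = {}
        self._current = None
        self._started = 0.0

    def new_stage(self, name: str) -> None:
        if self._current is not None:
            self.durations[self._current] = time.monotonic() - self._started
        self._current = name
        self._started = time.monotonic()

report = StageReportSketch()
report.new_stage("generate dataset usage")
# ... yield dataset work units ...
report.new_stage("generate query usage")
# ... yield query work units ...
report.new_stage("end so time is calculated for last stage")
print(report.durations)  # one entry per completed stage
```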
@@ -1052,7 +1047,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 uniqueUserPercentileLast30Days=int(
                     row.get("distinct_user_rank_percentile", 0) or 0
                 ),
-                writeCountLast30Days=int(row.get("write_rank_percentile", 0) or 0)
+                writeCountLast30Days=int(row.get("write_count", 0) or 0)
                 if not self.config.disable_write_usage
                 else None,
                 writeCountPercentileLast30Days=int(
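This one-line change is a genuine bug fix: `writeCountLast30Days` was previously populated from the percentile column, so it carried a 0-100 rank rather than an actual write count. With invented values:

```python
# Invented row illustrating the fix: before, writeCountLast30Days was filled
# from the percentile column; after, it reads the real write count.
row = {"write_count": 1234, "write_rank_percentile": 97}
before = int(row.get("write_rank_percentile", 0) or 0)  # 97 -- a percentile, not a count
after = int(row.get("write_count", 0) or 0)             # 1234 -- the actual count
assert (before, after) == (97, 1234)
```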
@@ -1108,6 +1103,47 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 row["urn"], query_usage_features
             )

+    def _convert_platform_pairs_to_dict(
+        self,
+        platform_pairs: Optional[List[Dict[str, Any]]],
+        value_key: str = "platform_total",
+    ) -> Optional[Dict[str, Any]]:
+        """Convert list of platform usage structs to dictionary."""
+        if not platform_pairs:
+            return None
+
+        return {
+            pair["platform_urn"]: pair[value_key]
+            for pair in platform_pairs
+            if pair["platform_urn"] is not None
+        }
+
+    def generate_user_usage_mcp_from_lazyframe(
+        self, lazy_frame: polars.LazyFrame
+    ) -> Iterable[MetadataWorkUnit]:
+        for row in lazy_frame.collect(
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
+        ).iter_rows(named=True):
+            user_usage_features = CorpUserUsageFeaturesClass(
+                userUsageTotalPast30Days=int(
+                    row.get("userUsageTotalPast30Days", 0) or 0
+                ),
+                userPlatformUsageTotalsPast30Days=self._convert_platform_pairs_to_dict(
+                    row.get("platform_usage_pairs", [])
+                ),
+                userPlatformUsagePercentilePast30Days=self._convert_platform_pairs_to_dict(
+                    row.get("platform_usage_percentiles", []),
+                    "platform_rank_percentile",
+                ),
+                userUsagePercentilePast30Days=row.get("userUsagePercentilePast30Days"),
+                userTopDatasetsByUsage=self._convert_top_datasets_to_dict(
+                    row.get("top_datasets_map", [])
+                ),
+            )
+            yield MetadataChangeProposalWrapper(
+                entityUrn=row["user"], aspect=user_usage_features
+            ).as_workunit(is_primary_source=False)
+
     def generate_usage_feature_mcp(
         self, urn: str, usage_feature: UsageFeaturesClass
     ) -> Iterable[MetadataWorkUnit]:
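`_convert_platform_pairs_to_dict` bridges the polars output shape and the aspect schema: `group_by(...).agg(polars.struct(...))` yields a list of structs per user, while the `CorpUserUsageFeatures` aspect expects a plain map keyed by platform URN. With invented sample data, the conversion behaves like this:

```python
# Invented sample row illustrating the struct-list -> dict conversion performed
# by _convert_platform_pairs_to_dict above.
platform_usage_pairs = [
    {"platform_urn": "urn:li:dataPlatform:snowflake", "platform_total": 42.0},
    {"platform_urn": "urn:li:dataPlatform:looker", "platform_total": 7.0},
    {"platform_urn": None, "platform_total": 3.0},  # dropped: null platform
]

converted = {
    pair["platform_urn"]: pair["platform_total"]
    for pair in platform_usage_pairs
    if pair["platform_urn"] is not None
}
assert converted == {
    "urn:li:dataPlatform:snowflake": 42.0,
    "urn:li:dataPlatform:looker": 7.0,
}
```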
@@ -1142,9 +1178,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

         return self.generate_dashboard_chart_usage(entity_index, usage_index)

-    def generate_dashboard_chart_usage(
-        self, entity_index: str, usage_index: str
-    ) -> polars.LazyFrame:
+    def _generate_dashboard_chart_entities(self, entity_index: str) -> polars.LazyFrame:
         entity_schema = {
             "entity_urn": polars.Categorical,
             "removed": polars.Boolean,
@@ -1161,7 +1195,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             process_function=self.soft_deleted_batch,
         )

-        dashboard_usage_schema = {
+        return entities_df
+
+    def _generate_dashboard_chart_usage(
+        self, entities_df: polars.LazyFrame, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_usage_schema = {
             "timestampMillis": polars.Int64,
             "lastObserved": polars.Int64,
             "urn": polars.Categorical,
@@ -1179,7 +1218,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         }

         lf = self.load_data_from_es_to_lf(
-            schema=dashboard_usage_schema,
+            schema=entities_usage_schema,
             index=usage_index,
             query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
             process_function=self.process_dashboard_usage,
@@ -1198,6 +1237,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .alias("row_num")
         ).filter(polars.col("row_num") == 1)

+        return lf
+
+    def generate_dashboard_chart_usage(
+        self, entity_index: str, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_df = self._generate_dashboard_chart_entities(entity_index)
+
+        lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
+
         # lf = lf.filter(polars.col("urn") == "urn:li:dashboard:(looker,dashboards.8)")
         # "urn:li:dashboard:(looker,dashboards.8)"

@@ -1367,6 +1415,367 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

         return usage_with_top_users_with_ranks

+    def _generate_user_usage_for_dataset(self) -> polars.LazyFrame:
+        datasets_lf = self.get_datasets()
+        if self.config.set_upstream_table_max_modification_time_for_views:
+            datasets_lf = self.set_table_modification_time_for_views(datasets_lf)
+
+        lf = self.load_dataset_usage()
+
+        # Polaris/pandas join merges the join column into one column and that's why we need to filter based on the removed column
+        lf = (
+            lf.join(datasets_lf, left_on="urn", right_on="entity_urn", how="left")
+            .filter(polars.col("removed") == False)  # noqa: E712
+            .drop(["removed"])
+        )
+
+        users_lf = (
+            lf.explode("userCounts")
+            .unnest("userCounts")
+            .filter(polars.col("user").is_not_null())
+        )
+
+        user_dataset_usage_lf = self._create_user_dataset_usage_map(users_lf)
+        return user_dataset_usage_lf
+
+    @staticmethod
+    def _convert_top_datasets_to_dict(
+        top_datasets_list: Optional[List[Dict[str, Any]]],
+    ) -> Optional[Dict[str, float]]:
+        """
+        Convert list of top datasets structs to dictionary as expected by CorpUserUsageFeatures schema.
+
+        Args:
+            top_datasets_list: List of dictionaries with 'dataset_urn' and 'count' keys
+
+        Returns:
+            Dictionary mapping dataset URN to usage count, or None if input is empty
+        """
+        if not top_datasets_list:
+            return None
+
+        top_datasets_dict = {
+            item["dataset_urn"]: float(item["count"])
+            for item in top_datasets_list
+            if isinstance(item, dict) and "dataset_urn" in item and "count" in item
+        }
+
+        return top_datasets_dict if top_datasets_dict else None
+
+    def _create_user_dataset_usage_map(
+        self, users_lf: polars.LazyFrame, top_n: int = 25
+    ) -> polars.LazyFrame:
+        """
+        Creates a lazyframe with user string and map of top N datasets by usage.
+
+        Args:
+            users_lf: LazyFrame containing user usage data with columns: user, urn, platform, count
+            top_n: Number of top datasets to include per user (default: 25)
+
+        Returns:
+            LazyFrame with columns:
+            - user: string column containing the user identifier
+            - top_datasets_map: list of structs with dataset_urn (string), count (int), and platform_urn (string)
+            - userUsageTotalPast30Days: total usage count for the user across all datasets
+            - userPlatformUsageTotalsPast30Days: map from platform URN to usage totals
+        """
+
+        # Create intermediate lazy frame with filtered users and aggregated counts
+        user_dataset_aggregated = (
+            users_lf.filter(polars.col("user").str.contains("@"))
+            .group_by("user", "urn", "platform")
+            .agg(polars.col("count").sum().alias("total_count"))
+            .with_columns(
+                # Direct string formatting - vectorized operation
+                polars.format("urn:li:dataPlatform:{}", polars.col("platform")).alias(
+                    "platform_urn"
+                )
+            )
+        )
+
+        # Calculate user totals
+        user_totals = user_dataset_aggregated.group_by("user").agg(
+            polars.col("total_count").sum().alias("userUsageTotalPast30Days")
+        )
+
+        # Calculate platform totals for each user - keep as list of structs
+        platform_totals = (
+            user_dataset_aggregated.group_by("user", "platform_urn")
+            .agg(polars.col("total_count").sum().alias("platform_total"))
+            .filter(polars.col("platform_urn").is_not_null())
+            .group_by("user")
+            .agg(
+                polars.struct(
+                    [
+                        polars.col("platform_urn"),
+                        polars.col("platform_total").cast(polars.Float64),
+                    ]
+                ).alias("platform_usage_pairs")
+            )
+        )
+
+        # Calculate top datasets
+        top_datasets = (
+            user_dataset_aggregated.with_columns(
+                polars.col("total_count")
+                .rank(descending=True, method="ordinal")
+                .over("user")
+                .alias("dataset_rank")
+            )
+            .filter(polars.col("dataset_rank") <= top_n)
+            .group_by("user")
+            .agg(
+                polars.struct(
+                    [
+                        polars.col("urn").alias("dataset_urn"),
+                        polars.col("total_count").alias("count"),
+                        polars.col("platform_urn"),
+                    ]
+                )
+                .sort_by("total_count", descending=True)
+                .alias("top_datasets_map")
+            )
+        )
+
+        # Join all results
+        return top_datasets.join(user_totals, on="user", how="left").join(
+            platform_totals, on="user", how="left"
+        )
+
+    def _combine_user_usage_data(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """
+        Combines user usage data from dataset, dashboard, and chart sources.
+
+        Args:
+            dataset_usage_lf: LazyFrame with dataset usage data containing top_datasets_map
+            dashboard_usage_lf: LazyFrame with dashboard usage data
+            chart_usage_lf: LazyFrame with chart usage data
+
+        Returns:
+            Combined LazyFrame with aggregated usage data per user
+        """
+        user_totals = self._combine_user_totals(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        platform_pairs = self._combine_platform_pairs(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        result = user_totals.join(platform_pairs, on="user", how="left")
+
+        return result.with_columns(
+            polars.col("platform_usage_pairs").fill_null(polars.lit([]))
+        )
+
+    def _combine_user_totals(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """Combine user totals and top_datasets_map from all sources."""
+        # Collect all unique users in one operation
+        all_users_lf = polars.concat(
+            [
+                dataset_usage_lf.select("user"),
+                dashboard_usage_lf.select("user"),
+                chart_usage_lf.select("user"),
+            ]
+        ).unique()
+
+        return (
+            all_users_lf.join(
+                dataset_usage_lf.select(
+                    ["user", "top_datasets_map", "userUsageTotalPast30Days"]
+                ),
+                on="user",
+                how="left",
+            )
+            .join(
+                dashboard_usage_lf.select(["user", "userUsageTotalPast30Days"]),
+                on="user",
+                how="left",
+                suffix="_dashboard",
+            )
+            .join(
+                chart_usage_lf.select(["user", "userUsageTotalPast30Days"]),
+                on="user",
+                how="left",
+                suffix="_chart",
+            )
+            .with_columns(
+                [
+                    # Sum with explicit null handling
+                    (
+                        polars.col("userUsageTotalPast30Days").fill_null(0)
+                        + polars.col("userUsageTotalPast30Days_dashboard").fill_null(0)
+                        + polars.col("userUsageTotalPast30Days_chart").fill_null(0)
+                    ).alias("userUsageTotalPast30Days")
+                ]
+            )
+            .select(["user", "top_datasets_map", "userUsageTotalPast30Days"])
+        )
+
+    def _combine_platform_pairs(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """Combine platform usage pairs from all sources."""
+        all_platforms = []
+
+        # Extract platforms from each source
+        for source_lf, col_name in [
+            (dataset_usage_lf, "platform_usage_pairs"),
+            (dashboard_usage_lf, "platform_usage_pairs"),
+            (chart_usage_lf, "platform_usage_pairs"),
+        ]:
+            platforms = self._extract_platforms_from_source(source_lf, col_name)
+            if platforms is not None:
+                all_platforms.append(platforms)
+
+        if not all_platforms:
+            # Return empty result if no platforms found
+            return polars.LazyFrame({"user": [], "platform_usage_pairs": []})
+
+        # Combine all platforms and aggregate by user + platform
+        combined_platforms = polars.concat(all_platforms, how="vertical_relaxed")
+        aggregated = combined_platforms.group_by("user", "platform_urn").agg(
+            polars.col("platform_total").sum().alias("platform_total")
+        )
+
+        # Rebuild platform_usage_pairs structure
+        return aggregated.group_by("user").agg(
+            polars.struct(
+                [polars.col("platform_urn"), polars.col("platform_total")]
+            ).alias("platform_usage_pairs")
+        )
+
+    def _extract_platforms_from_source(
+        self, source_lf: polars.LazyFrame, col_name: str
+    ) -> polars.LazyFrame | None:
+        """Extract platform data from a source LazyFrame."""
+        try:
+            return (
+                source_lf.select(["user", col_name])
+                .filter(polars.col(col_name).is_not_null())
+                .filter(polars.col(col_name).list.len() > 0)
+                .explode(col_name)
+                .unnest(col_name)
+                .filter(polars.col("platform_urn").is_not_null())
+                .select(["user", "platform_urn", "platform_total"])
+            )
+        except polars.exceptions.ColumnNotFoundError:
+            return None
+
+    def add_platform_usage_percentiles(
+        self, user_usage_lf: polars.LazyFrame
+    ) -> polars.LazyFrame:
+        """
+        Add platform usage percentiles to user usage data.
+
+        Args:
+            user_usage_lf: LazyFrame with user usage data containing platform_usage_pairs column
+
+        Returns:
+            LazyFrame with additional platform_usage_percentiles column
+        """
+        # First explode the platform_usage_pairs to work with individual platform usage records
+        platform_usage_exploded = (
+            user_usage_lf.explode("platform_usage_pairs")
+            .unnest("platform_usage_pairs")
+            .filter(polars.col("platform_urn").is_not_null())
+        )
+
+        # Use the existing gen_rank_and_percentile method to calculate percentiles
+        platform_percentiles_with_ranks = self.gen_rank_and_percentile(
+            lf=platform_usage_exploded,
+            count_field="platform_total",
+            urn_field="user",
+            platform_field="platform_urn",
+            prefix="platform_",
+            use_exp_cdf=False,
+        )
+
+        # Group back by user and create the percentiles structure
+        platform_percentiles = platform_percentiles_with_ranks.group_by("user").agg(
+            polars.struct(
+                [
+                    polars.col("platform_urn"),
+                    polars.col("platform_rank_percentile").cast(polars.Float64),
+                ]
+            ).alias("platform_usage_percentiles")
+        )
+
+        # Join the percentiles back to the original user_usage_lf
+        return user_usage_lf.join(platform_percentiles, on="user", how="left")
+
+    def _generate_user_usage_for_dashboard_charts(
+        self, entity_index: str, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_df = self._generate_dashboard_chart_entities(entity_index)
+        lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
+
+        # Process dashboard usage data into user usage format (similar to dataset version)
+        users_lf = (
+            lf.explode("userCounts")
+            .unnest("userCounts")
+            .filter(polars.col("user").is_not_null())
+            .rename({"usageCount": "count"})  # Rename to match dataset schema
+        )
+
+        user_dashboard_usage_lf = self._create_user_dataset_usage_map(users_lf)
+        return user_dashboard_usage_lf
+
+    def generate_user_usage(self) -> polars.LazyFrame:
+        dataset_usage_lf = self._generate_user_usage_for_dataset()
+
+        usage_index = "dashboard_dashboardusagestatisticsaspect_v1"
+        entity_index = "dashboardindex_v2"
+        dashboard_usage_lf = self._generate_user_usage_for_dashboard_charts(
+            entity_index, usage_index
+        )
+
+        entity_index = "chartindex_v2"
+        usage_index = "chart_chartusagestatisticsaspect_v1"
+        chart_usage_lf = self._generate_user_usage_for_dashboard_charts(
+            entity_index, usage_index
+        )
+
+        # Combine all three usage sources
+        lf = self._combine_user_usage_data(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        lf = self.add_platform_usage_percentiles(lf)
+
+        # Add user usage percentiles across all users (not grouped by platform)
+        # Create a temporary platform field for percentile calculation
+        lf = lf.with_columns(polars.lit("all_users").alias("temp_platform"))
+
+        lf = self.gen_rank_and_percentile(
+            lf=lf,
+            count_field="userUsageTotalPast30Days",
+            urn_field="user",
+            platform_field="temp_platform",
+            prefix="userUsage",
+            use_exp_cdf=False,
+        )
+
+        # Rename the percentile column to match the schema field name and remove temp field
+        lf = lf.rename(
+            {"userUsagerank_percentile": "userUsagePercentilePast30Days"}
+        ).drop("temp_platform")
+
+        return lf
+
     def generate_dataset_usage(self) -> polars.LazyFrame:
         datasets_lf = self.get_datasets()
         if self.config.set_upstream_table_max_modification_time_for_views:
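The heart of the new code is the `group_by` / `rank().over()` / struct-repacking pipeline in `_create_user_dataset_usage_map`. A self-contained toy run of that core idea (invented data, `top_n` shrunk to 2):

```python
# Toy illustration (invented data) of the ranking step above: aggregate counts
# per (user, urn), rank datasets within each user, keep the top N, and re-pack
# the survivors as a sorted list of structs per user.
import polars

users_lf = polars.LazyFrame(
    {
        "user": ["alice@x.io"] * 3 + ["bob@x.io"],
        "urn": ["ds1", "ds2", "ds3", "ds1"],
        "count": [10, 30, 20, 5],
    }
)

top_n = 2
top_datasets = (
    users_lf.group_by("user", "urn")
    .agg(polars.col("count").sum().alias("total_count"))
    .with_columns(
        polars.col("total_count")
        .rank(descending=True, method="ordinal")
        .over("user")
        .alias("dataset_rank")
    )
    .filter(polars.col("dataset_rank") <= top_n)
    .group_by("user")
    .agg(
        polars.struct(
            [
                polars.col("urn").alias("dataset_urn"),
                polars.col("total_count").alias("count"),
            ]
        )
        .sort_by("total_count", descending=True)
        .alias("top_datasets_map")
    )
    .collect()
)
print(top_datasets)
# alice keeps ds2 (30) and ds3 (20); ds1 (10) falls outside the top 2.
```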
acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py

@@ -15,13 +15,16 @@ from .....schema_classes import CorpUserCredentialsClass
 from .....schema_classes import CorpUserEditableInfoClass
 from .....schema_classes import CorpUserHomePageSettingsClass
 from .....schema_classes import CorpUserInfoClass
+from .....schema_classes import CorpUserInvitationStatusClass
 from .....schema_classes import CorpUserSettingsClass
 from .....schema_classes import CorpUserStatusClass
 from .....schema_classes import CorpUserViewsSettingsClass
 from .....schema_classes import GroupMembershipClass
+from .....schema_classes import InvitationStatusClass
 from .....schema_classes import InviteTokenClass
 from .....schema_classes import NativeGroupMembershipClass
 from .....schema_classes import RoleMembershipClass
+from .....schema_classes import TokenTypeClass


 CorpGroupEditableInfo = CorpGroupEditableInfoClass
@@ -32,12 +35,15 @@ CorpUserCredentials = CorpUserCredentialsClass
 CorpUserEditableInfo = CorpUserEditableInfoClass
 CorpUserHomePageSettings = CorpUserHomePageSettingsClass
 CorpUserInfo = CorpUserInfoClass
+CorpUserInvitationStatus = CorpUserInvitationStatusClass
 CorpUserSettings = CorpUserSettingsClass
 CorpUserStatus = CorpUserStatusClass
 CorpUserViewsSettings = CorpUserViewsSettingsClass
 GroupMembership = GroupMembershipClass
+InvitationStatus = InvitationStatusClass
 InviteToken = InviteTokenClass
 NativeGroupMembership = NativeGroupMembershipClass
 RoleMembership = RoleMembershipClass
+TokenType = TokenTypeClass

 # fmt: on
acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py

@@ -7,6 +7,7 @@
 # pylint: skip-file
 # fmt: off
 # isort: skip_file
+from .......schema_classes import CorpUserUsageFeaturesClass
 from .......schema_classes import CostCurrencyCodeClass
 from .......schema_classes import CostFeaturesClass
 from .......schema_classes import LineageFeaturesClass
@@ -14,6 +15,7 @@ from .......schema_classes import StorageFeaturesClass
 from .......schema_classes import UsageFeaturesClass


+CorpUserUsageFeatures = CorpUserUsageFeaturesClass
 CostCurrencyCode = CostCurrencyCodeClass
 CostFeatures = CostFeaturesClass
 LineageFeatures = LineageFeaturesClass
acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py

@@ -9,9 +9,13 @@
 # isort: skip_file
 from .......schema_classes import EntityChangeEventClass
 from .......schema_classes import ParametersClass
+from .......schema_classes import RelationshipChangeEventClass
+from .......schema_classes import RelationshipChangeOperationClass


 EntityChangeEvent = EntityChangeEventClass
 Parameters = ParametersClass
+RelationshipChangeEvent = RelationshipChangeEventClass
+RelationshipChangeOperation = RelationshipChangeOperationClass

 # fmt: on
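As the aliasing above shows, these generated `pegasus2avro` modules are thin re-export layers over `schema_classes`. A sanity check (assuming the 0.3.14rc0 wheel is installed) confirms the new event names are the same objects:

```python
# Sanity check: the pegasus2avro names are plain aliases of the
# schema_classes definitions, per the assignments in the diff above.
from acryl_datahub_cloud.metadata.com.linkedin.pegasus2avro.platform.event.v1 import (
    RelationshipChangeEvent,
    RelationshipChangeOperation,
)
from acryl_datahub_cloud.metadata.schema_classes import (
    RelationshipChangeEventClass,
    RelationshipChangeOperationClass,
)

assert RelationshipChangeEvent is RelationshipChangeEventClass
assert RelationshipChangeOperation is RelationshipChangeOperationClass
```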