acryl-datahub-cloud 0.3.13.3rc0__py3-none-any.whl → 0.3.14rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic. Click here for more details.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +443 -34
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- acryl_datahub_cloud/metadata/schema.avsc +445 -107
- acryl_datahub_cloud/metadata/schema_classes.py +420 -19
- acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +95 -0
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
- acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +93 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +13 -2
- acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
- acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +104 -100
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +98 -45
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +71 -0
- acryl_datahub_cloud/metadata/schemas/Ownership.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- acryl_datahub_cloud/sdk/assertions_client.py +21 -7
- acryl_datahub_cloud/sdk/resolver_client.py +4 -1
- acryl_datahub_cloud/sdk/subscription_client.py +8 -3
- {acryl_datahub_cloud-0.3.13.3rc0.dist-info → acryl_datahub_cloud-0.3.14rc0.dist-info}/METADATA +50 -50
- {acryl_datahub_cloud-0.3.13.3rc0.dist-info → acryl_datahub_cloud-0.3.14rc0.dist-info}/RECORD +33 -30
- {acryl_datahub_cloud-0.3.13.3rc0.dist-info → acryl_datahub_cloud-0.3.14rc0.dist-info}/WHEEL +0 -0
- {acryl_datahub_cloud-0.3.13.3rc0.dist-info → acryl_datahub_cloud-0.3.14rc0.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_cloud-0.3.13.3rc0.dist-info → acryl_datahub_cloud-0.3.14rc0.dist-info}/top_level.txt +0 -0
|
@@ -28,6 +28,7 @@ from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder imp
|
|
|
28
28
|
)
|
|
29
29
|
from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
|
|
30
30
|
from acryl_datahub_cloud.metadata.schema_classes import (
|
|
31
|
+
CorpUserUsageFeaturesClass,
|
|
31
32
|
QueryUsageFeaturesClass,
|
|
32
33
|
UsageFeaturesClass,
|
|
33
34
|
)
|
|
@@ -135,6 +136,10 @@ class DataHubUsageFeatureReportingSourceConfig(
|
|
|
135
136
|
None,
|
|
136
137
|
description="Optional configuration for stateful ingestion, including stale metadata removal.",
|
|
137
138
|
)
|
|
139
|
+
user_usage_enabled: bool = Field(
|
|
140
|
+
True,
|
|
141
|
+
description="Flag to enable or disable user usage statistics collection.",
|
|
142
|
+
)
|
|
138
143
|
dataset_usage_enabled: bool = Field(
|
|
139
144
|
True,
|
|
140
145
|
description="Flag to enable or disable dataset usage statistics collection.",
|
|
@@ -241,10 +246,6 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
|
|
|
241
246
|
default_factory=lambda: defaultdict(lambda: PerfTimer())
|
|
242
247
|
)
|
|
243
248
|
|
|
244
|
-
dataset_usage_processing_time: PerfTimer = PerfTimer()
|
|
245
|
-
dashboard_usage_processing_time: PerfTimer = PerfTimer()
|
|
246
|
-
chart_usage_processing_time: PerfTimer = PerfTimer()
|
|
247
|
-
query_usage_processing_time: PerfTimer = PerfTimer()
|
|
248
249
|
query_platforms_count: Dict[str, int] = field(
|
|
249
250
|
default_factory=lambda: defaultdict(lambda: 0)
|
|
250
251
|
)
|
|
@@ -923,6 +924,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
923
924
|
|
|
924
925
|
return dataset_df
|
|
925
926
|
|
|
927
|
+
def generate_user_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
|
|
928
|
+
with polars.StringCache():
|
|
929
|
+
user_usage_lf = self.generate_user_usage()
|
|
930
|
+
yield from self.generate_user_usage_mcp_from_lazyframe(user_usage_lf)
|
|
931
|
+
|
|
926
932
|
def generate_dataset_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
|
|
927
933
|
with polars.StringCache():
|
|
928
934
|
dataset_usage_df = self.generate_dataset_usage()
|
|
@@ -958,38 +964,27 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
958
964
|
]
|
|
959
965
|
|
|
960
966
|
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
|
|
967
|
+
if self.config.user_usage_enabled:
|
|
968
|
+
self.report.new_stage("generate user usage")
|
|
969
|
+
yield from self.generate_user_usage_mcps()
|
|
970
|
+
|
|
961
971
|
if self.config.dataset_usage_enabled:
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
yield from self.generate_dataset_usage_mcps()
|
|
965
|
-
time_taken = timer.elapsed_seconds()
|
|
966
|
-
logger.info(f"Dataset Usage generation took {time_taken:.3f} seconds")
|
|
972
|
+
self.report.new_stage("generate dataset usage")
|
|
973
|
+
yield from self.generate_dataset_usage_mcps()
|
|
967
974
|
|
|
968
975
|
if self.config.dashboard_usage_enabled:
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
yield from self.generate_dashboard_usage_mcps()
|
|
972
|
-
|
|
973
|
-
time_taken = timer.elapsed_seconds()
|
|
974
|
-
logger.info(f"Dashboard Usage generation took {time_taken:.3f}")
|
|
976
|
+
self.report.new_stage("generate dashboard usage")
|
|
977
|
+
yield from self.generate_dashboard_usage_mcps()
|
|
975
978
|
|
|
976
979
|
if self.config.chart_usage_enabled:
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
yield from self.generate_chart_usage_mcps()
|
|
981
|
-
|
|
982
|
-
time_taken = timer.elapsed_seconds()
|
|
983
|
-
logger.info(f"Chart Usage generation took {time_taken:.3f}")
|
|
980
|
+
self.report.new_stage("generate chart usage")
|
|
981
|
+
yield from self.generate_chart_usage_mcps()
|
|
984
982
|
|
|
985
983
|
if self.config.query_usage_enabled:
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
yield from self.generate_query_usage_mcps()
|
|
984
|
+
self.report.new_stage("generate query usage")
|
|
985
|
+
yield from self.generate_query_usage_mcps()
|
|
990
986
|
|
|
991
|
-
|
|
992
|
-
logger.info(f"Query Usage generation took {time_taken:.3f}")
|
|
987
|
+
self.report.new_stage("end so time is calculated for last stage")
|
|
993
988
|
|
|
994
989
|
def generate_mcp_from_lazyframe(
|
|
995
990
|
self, lazy_frame: polars.LazyFrame
|
|
@@ -1052,7 +1047,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1052
1047
|
uniqueUserPercentileLast30Days=int(
|
|
1053
1048
|
row.get("distinct_user_rank_percentile", 0) or 0
|
|
1054
1049
|
),
|
|
1055
|
-
writeCountLast30Days=int(row.get("
|
|
1050
|
+
writeCountLast30Days=int(row.get("write_count", 0) or 0)
|
|
1056
1051
|
if not self.config.disable_write_usage
|
|
1057
1052
|
else None,
|
|
1058
1053
|
writeCountPercentileLast30Days=int(
|
|
@@ -1108,6 +1103,47 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1108
1103
|
row["urn"], query_usage_features
|
|
1109
1104
|
)
|
|
1110
1105
|
|
|
1106
|
+
def _convert_platform_pairs_to_dict(
|
|
1107
|
+
self,
|
|
1108
|
+
platform_pairs: Optional[List[Dict[str, Any]]],
|
|
1109
|
+
value_key: str = "platform_total",
|
|
1110
|
+
) -> Optional[Dict[str, Any]]:
|
|
1111
|
+
"""Convert list of platform usage structs to dictionary."""
|
|
1112
|
+
if not platform_pairs:
|
|
1113
|
+
return None
|
|
1114
|
+
|
|
1115
|
+
return {
|
|
1116
|
+
pair["platform_urn"]: pair[value_key]
|
|
1117
|
+
for pair in platform_pairs
|
|
1118
|
+
if pair["platform_urn"] is not None
|
|
1119
|
+
}
|
|
1120
|
+
|
|
1121
|
+
def generate_user_usage_mcp_from_lazyframe(
|
|
1122
|
+
self, lazy_frame: polars.LazyFrame
|
|
1123
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
1124
|
+
for row in lazy_frame.collect(
|
|
1125
|
+
engine="streaming" if self.config.experimental_full_streaming else "auto"
|
|
1126
|
+
).iter_rows(named=True):
|
|
1127
|
+
user_usage_features = CorpUserUsageFeaturesClass(
|
|
1128
|
+
userUsageTotalPast30Days=int(
|
|
1129
|
+
row.get("userUsageTotalPast30Days", 0) or 0
|
|
1130
|
+
),
|
|
1131
|
+
userPlatformUsageTotalsPast30Days=self._convert_platform_pairs_to_dict(
|
|
1132
|
+
row.get("platform_usage_pairs", [])
|
|
1133
|
+
),
|
|
1134
|
+
userPlatformUsagePercentilePast30Days=self._convert_platform_pairs_to_dict(
|
|
1135
|
+
row.get("platform_usage_percentiles", []),
|
|
1136
|
+
"platform_rank_percentile",
|
|
1137
|
+
),
|
|
1138
|
+
userUsagePercentilePast30Days=row.get("userUsagePercentilePast30Days"),
|
|
1139
|
+
userTopDatasetsByUsage=self._convert_top_datasets_to_dict(
|
|
1140
|
+
row.get("top_datasets_map", [])
|
|
1141
|
+
),
|
|
1142
|
+
)
|
|
1143
|
+
yield MetadataChangeProposalWrapper(
|
|
1144
|
+
entityUrn=row["user"], aspect=user_usage_features
|
|
1145
|
+
).as_workunit(is_primary_source=False)
|
|
1146
|
+
|
|
1111
1147
|
def generate_usage_feature_mcp(
|
|
1112
1148
|
self, urn: str, usage_feature: UsageFeaturesClass
|
|
1113
1149
|
) -> Iterable[MetadataWorkUnit]:
|
|
@@ -1142,9 +1178,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1142
1178
|
|
|
1143
1179
|
return self.generate_dashboard_chart_usage(entity_index, usage_index)
|
|
1144
1180
|
|
|
1145
|
-
def generate_dashboard_chart_usage(
|
|
1146
|
-
self, entity_index: str, usage_index: str
|
|
1147
|
-
) -> polars.LazyFrame:
|
|
1181
|
+
def _generate_dashboard_chart_entities(self, entity_index: str) -> polars.LazyFrame:
|
|
1148
1182
|
entity_schema = {
|
|
1149
1183
|
"entity_urn": polars.Categorical,
|
|
1150
1184
|
"removed": polars.Boolean,
|
|
@@ -1161,7 +1195,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1161
1195
|
process_function=self.soft_deleted_batch,
|
|
1162
1196
|
)
|
|
1163
1197
|
|
|
1164
|
-
|
|
1198
|
+
return entities_df
|
|
1199
|
+
|
|
1200
|
+
def _generate_dashboard_chart_usage(
|
|
1201
|
+
self, entities_df: polars.LazyFrame, usage_index: str
|
|
1202
|
+
) -> polars.LazyFrame:
|
|
1203
|
+
entities_usage_schema = {
|
|
1165
1204
|
"timestampMillis": polars.Int64,
|
|
1166
1205
|
"lastObserved": polars.Int64,
|
|
1167
1206
|
"urn": polars.Categorical,
|
|
@@ -1179,7 +1218,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1179
1218
|
}
|
|
1180
1219
|
|
|
1181
1220
|
lf = self.load_data_from_es_to_lf(
|
|
1182
|
-
schema=
|
|
1221
|
+
schema=entities_usage_schema,
|
|
1183
1222
|
index=usage_index,
|
|
1184
1223
|
query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
|
|
1185
1224
|
process_function=self.process_dashboard_usage,
|
|
@@ -1198,6 +1237,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1198
1237
|
.alias("row_num")
|
|
1199
1238
|
).filter(polars.col("row_num") == 1)
|
|
1200
1239
|
|
|
1240
|
+
return lf
|
|
1241
|
+
|
|
1242
|
+
def generate_dashboard_chart_usage(
|
|
1243
|
+
self, entity_index: str, usage_index: str
|
|
1244
|
+
) -> polars.LazyFrame:
|
|
1245
|
+
entities_df = self._generate_dashboard_chart_entities(entity_index)
|
|
1246
|
+
|
|
1247
|
+
lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
|
|
1248
|
+
|
|
1201
1249
|
# lf = lf.filter(polars.col("urn") == "urn:li:dashboard:(looker,dashboards.8)")
|
|
1202
1250
|
# "urn:li:dashboard:(looker,dashboards.8)"
|
|
1203
1251
|
|
|
@@ -1367,6 +1415,367 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1367
1415
|
|
|
1368
1416
|
return usage_with_top_users_with_ranks
|
|
1369
1417
|
|
|
1418
|
+
def _generate_user_usage_for_dataset(self) -> polars.LazyFrame:
|
|
1419
|
+
datasets_lf = self.get_datasets()
|
|
1420
|
+
if self.config.set_upstream_table_max_modification_time_for_views:
|
|
1421
|
+
datasets_lf = self.set_table_modification_time_for_views(datasets_lf)
|
|
1422
|
+
|
|
1423
|
+
lf = self.load_dataset_usage()
|
|
1424
|
+
|
|
1425
|
+
# Polars/pandas join merges the join column into one column and that's why we need to filter based on the removed column
|
|
1426
|
+
lf = (
|
|
1427
|
+
lf.join(datasets_lf, left_on="urn", right_on="entity_urn", how="left")
|
|
1428
|
+
.filter(polars.col("removed") == False) # noqa: E712
|
|
1429
|
+
.drop(["removed"])
|
|
1430
|
+
)
|
|
1431
|
+
|
|
1432
|
+
users_lf = (
|
|
1433
|
+
lf.explode("userCounts")
|
|
1434
|
+
.unnest("userCounts")
|
|
1435
|
+
.filter(polars.col("user").is_not_null())
|
|
1436
|
+
)
|
|
1437
|
+
|
|
1438
|
+
user_dataset_usage_lf = self._create_user_dataset_usage_map(users_lf)
|
|
1439
|
+
return user_dataset_usage_lf
|
|
1440
|
+
|
|
1441
|
+
@staticmethod
|
|
1442
|
+
def _convert_top_datasets_to_dict(
|
|
1443
|
+
top_datasets_list: Optional[List[Dict[str, Any]]],
|
|
1444
|
+
) -> Optional[Dict[str, float]]:
|
|
1445
|
+
"""
|
|
1446
|
+
Convert list of top datasets structs to dictionary as expected by CorpUserUsageFeatures schema.
|
|
1447
|
+
|
|
1448
|
+
Args:
|
|
1449
|
+
top_datasets_list: List of dictionaries with 'dataset_urn' and 'count' keys
|
|
1450
|
+
|
|
1451
|
+
Returns:
|
|
1452
|
+
Dictionary mapping dataset URN to usage count, or None if input is empty
|
|
1453
|
+
"""
|
|
1454
|
+
if not top_datasets_list:
|
|
1455
|
+
return None
|
|
1456
|
+
|
|
1457
|
+
top_datasets_dict = {
|
|
1458
|
+
item["dataset_urn"]: float(item["count"])
|
|
1459
|
+
for item in top_datasets_list
|
|
1460
|
+
if isinstance(item, dict) and "dataset_urn" in item and "count" in item
|
|
1461
|
+
}
|
|
1462
|
+
|
|
1463
|
+
return top_datasets_dict if top_datasets_dict else None
|
|
1464
|
+
|
|
1465
|
+
def _create_user_dataset_usage_map(
|
|
1466
|
+
self, users_lf: polars.LazyFrame, top_n: int = 25
|
|
1467
|
+
) -> polars.LazyFrame:
|
|
1468
|
+
"""
|
|
1469
|
+
Creates a lazyframe with user string and map of top N datasets by usage.
|
|
1470
|
+
|
|
1471
|
+
Args:
|
|
1472
|
+
users_lf: LazyFrame containing user usage data with columns: user, urn, platform, count
|
|
1473
|
+
top_n: Number of top datasets to include per user (default: 25)
|
|
1474
|
+
|
|
1475
|
+
Returns:
|
|
1476
|
+
LazyFrame with columns:
|
|
1477
|
+
- user: string column containing the user identifier
|
|
1478
|
+
- top_datasets_map: list of structs with dataset_urn (string), count (int), and platform_urn (string)
|
|
1479
|
+
- userUsageTotalPast30Days: total usage count for the user across all datasets
|
|
1480
|
+
- platform_usage_pairs: list of structs with platform_urn (string) and platform_total (float)
|
|
1481
|
+
"""
|
|
1482
|
+
|
|
1483
|
+
# Create intermediate lazy frame with filtered users and aggregated counts
|
|
1484
|
+
user_dataset_aggregated = (
|
|
1485
|
+
users_lf.filter(polars.col("user").str.contains("@"))
|
|
1486
|
+
.group_by("user", "urn", "platform")
|
|
1487
|
+
.agg(polars.col("count").sum().alias("total_count"))
|
|
1488
|
+
.with_columns(
|
|
1489
|
+
# Direct string formatting - vectorized operation
|
|
1490
|
+
polars.format("urn:li:dataPlatform:{}", polars.col("platform")).alias(
|
|
1491
|
+
"platform_urn"
|
|
1492
|
+
)
|
|
1493
|
+
)
|
|
1494
|
+
)
|
|
1495
|
+
|
|
1496
|
+
# Calculate user totals
|
|
1497
|
+
user_totals = user_dataset_aggregated.group_by("user").agg(
|
|
1498
|
+
polars.col("total_count").sum().alias("userUsageTotalPast30Days")
|
|
1499
|
+
)
|
|
1500
|
+
|
|
1501
|
+
# Calculate platform totals for each user - keep as list of structs
|
|
1502
|
+
platform_totals = (
|
|
1503
|
+
user_dataset_aggregated.group_by("user", "platform_urn")
|
|
1504
|
+
.agg(polars.col("total_count").sum().alias("platform_total"))
|
|
1505
|
+
.filter(polars.col("platform_urn").is_not_null())
|
|
1506
|
+
.group_by("user")
|
|
1507
|
+
.agg(
|
|
1508
|
+
polars.struct(
|
|
1509
|
+
[
|
|
1510
|
+
polars.col("platform_urn"),
|
|
1511
|
+
polars.col("platform_total").cast(polars.Float64),
|
|
1512
|
+
]
|
|
1513
|
+
).alias("platform_usage_pairs")
|
|
1514
|
+
)
|
|
1515
|
+
)
|
|
1516
|
+
|
|
1517
|
+
# Calculate top datasets
|
|
1518
|
+
top_datasets = (
|
|
1519
|
+
user_dataset_aggregated.with_columns(
|
|
1520
|
+
polars.col("total_count")
|
|
1521
|
+
.rank(descending=True, method="ordinal")
|
|
1522
|
+
.over("user")
|
|
1523
|
+
.alias("dataset_rank")
|
|
1524
|
+
)
|
|
1525
|
+
.filter(polars.col("dataset_rank") <= top_n)
|
|
1526
|
+
.group_by("user")
|
|
1527
|
+
.agg(
|
|
1528
|
+
polars.struct(
|
|
1529
|
+
[
|
|
1530
|
+
polars.col("urn").alias("dataset_urn"),
|
|
1531
|
+
polars.col("total_count").alias("count"),
|
|
1532
|
+
polars.col("platform_urn"),
|
|
1533
|
+
]
|
|
1534
|
+
)
|
|
1535
|
+
.sort_by("total_count", descending=True)
|
|
1536
|
+
.alias("top_datasets_map")
|
|
1537
|
+
)
|
|
1538
|
+
)
|
|
1539
|
+
|
|
1540
|
+
# Join all results
|
|
1541
|
+
return top_datasets.join(user_totals, on="user", how="left").join(
|
|
1542
|
+
platform_totals, on="user", how="left"
|
|
1543
|
+
)
|
|
1544
|
+
|
|
1545
|
+
def _combine_user_usage_data(
|
|
1546
|
+
self,
|
|
1547
|
+
dataset_usage_lf: polars.LazyFrame,
|
|
1548
|
+
dashboard_usage_lf: polars.LazyFrame,
|
|
1549
|
+
chart_usage_lf: polars.LazyFrame,
|
|
1550
|
+
) -> polars.LazyFrame:
|
|
1551
|
+
"""
|
|
1552
|
+
Combines user usage data from dataset, dashboard, and chart sources.
|
|
1553
|
+
|
|
1554
|
+
Args:
|
|
1555
|
+
dataset_usage_lf: LazyFrame with dataset usage data containing top_datasets_map
|
|
1556
|
+
dashboard_usage_lf: LazyFrame with dashboard usage data
|
|
1557
|
+
chart_usage_lf: LazyFrame with chart usage data
|
|
1558
|
+
|
|
1559
|
+
Returns:
|
|
1560
|
+
Combined LazyFrame with aggregated usage data per user
|
|
1561
|
+
"""
|
|
1562
|
+
user_totals = self._combine_user_totals(
|
|
1563
|
+
dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
|
|
1564
|
+
)
|
|
1565
|
+
|
|
1566
|
+
platform_pairs = self._combine_platform_pairs(
|
|
1567
|
+
dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
|
|
1568
|
+
)
|
|
1569
|
+
|
|
1570
|
+
result = user_totals.join(platform_pairs, on="user", how="left")
|
|
1571
|
+
|
|
1572
|
+
return result.with_columns(
|
|
1573
|
+
polars.col("platform_usage_pairs").fill_null(polars.lit([]))
|
|
1574
|
+
)
|
|
1575
|
+
|
|
1576
|
+
def _combine_user_totals(
|
|
1577
|
+
self,
|
|
1578
|
+
dataset_usage_lf: polars.LazyFrame,
|
|
1579
|
+
dashboard_usage_lf: polars.LazyFrame,
|
|
1580
|
+
chart_usage_lf: polars.LazyFrame,
|
|
1581
|
+
) -> polars.LazyFrame:
|
|
1582
|
+
"""Sum per-user usage totals across all sources; top_datasets_map is taken from the dataset source only."""
|
|
1583
|
+
# Collect all unique users in one operation
|
|
1584
|
+
all_users_lf = polars.concat(
|
|
1585
|
+
[
|
|
1586
|
+
dataset_usage_lf.select("user"),
|
|
1587
|
+
dashboard_usage_lf.select("user"),
|
|
1588
|
+
chart_usage_lf.select("user"),
|
|
1589
|
+
]
|
|
1590
|
+
).unique()
|
|
1591
|
+
|
|
1592
|
+
return (
|
|
1593
|
+
all_users_lf.join(
|
|
1594
|
+
dataset_usage_lf.select(
|
|
1595
|
+
["user", "top_datasets_map", "userUsageTotalPast30Days"]
|
|
1596
|
+
),
|
|
1597
|
+
on="user",
|
|
1598
|
+
how="left",
|
|
1599
|
+
)
|
|
1600
|
+
.join(
|
|
1601
|
+
dashboard_usage_lf.select(["user", "userUsageTotalPast30Days"]),
|
|
1602
|
+
on="user",
|
|
1603
|
+
how="left",
|
|
1604
|
+
suffix="_dashboard",
|
|
1605
|
+
)
|
|
1606
|
+
.join(
|
|
1607
|
+
chart_usage_lf.select(["user", "userUsageTotalPast30Days"]),
|
|
1608
|
+
on="user",
|
|
1609
|
+
how="left",
|
|
1610
|
+
suffix="_chart",
|
|
1611
|
+
)
|
|
1612
|
+
.with_columns(
|
|
1613
|
+
[
|
|
1614
|
+
# Sum with explicit null handling
|
|
1615
|
+
(
|
|
1616
|
+
polars.col("userUsageTotalPast30Days").fill_null(0)
|
|
1617
|
+
+ polars.col("userUsageTotalPast30Days_dashboard").fill_null(0)
|
|
1618
|
+
+ polars.col("userUsageTotalPast30Days_chart").fill_null(0)
|
|
1619
|
+
).alias("userUsageTotalPast30Days")
|
|
1620
|
+
]
|
|
1621
|
+
)
|
|
1622
|
+
.select(["user", "top_datasets_map", "userUsageTotalPast30Days"])
|
|
1623
|
+
)
|
|
1624
|
+
|
|
1625
|
+
def _combine_platform_pairs(
|
|
1626
|
+
self,
|
|
1627
|
+
dataset_usage_lf: polars.LazyFrame,
|
|
1628
|
+
dashboard_usage_lf: polars.LazyFrame,
|
|
1629
|
+
chart_usage_lf: polars.LazyFrame,
|
|
1630
|
+
) -> polars.LazyFrame:
|
|
1631
|
+
"""Combine platform usage pairs from all sources."""
|
|
1632
|
+
all_platforms = []
|
|
1633
|
+
|
|
1634
|
+
# Extract platforms from each source
|
|
1635
|
+
for source_lf, col_name in [
|
|
1636
|
+
(dataset_usage_lf, "platform_usage_pairs"),
|
|
1637
|
+
(dashboard_usage_lf, "platform_usage_pairs"),
|
|
1638
|
+
(chart_usage_lf, "platform_usage_pairs"),
|
|
1639
|
+
]:
|
|
1640
|
+
platforms = self._extract_platforms_from_source(source_lf, col_name)
|
|
1641
|
+
if platforms is not None:
|
|
1642
|
+
all_platforms.append(platforms)
|
|
1643
|
+
|
|
1644
|
+
if not all_platforms:
|
|
1645
|
+
# Return empty result if no platforms found
|
|
1646
|
+
return polars.LazyFrame({"user": [], "platform_usage_pairs": []})
|
|
1647
|
+
|
|
1648
|
+
# Combine all platforms and aggregate by user + platform
|
|
1649
|
+
combined_platforms = polars.concat(all_platforms, how="vertical_relaxed")
|
|
1650
|
+
aggregated = combined_platforms.group_by("user", "platform_urn").agg(
|
|
1651
|
+
polars.col("platform_total").sum().alias("platform_total")
|
|
1652
|
+
)
|
|
1653
|
+
|
|
1654
|
+
# Rebuild platform_usage_pairs structure
|
|
1655
|
+
return aggregated.group_by("user").agg(
|
|
1656
|
+
polars.struct(
|
|
1657
|
+
[polars.col("platform_urn"), polars.col("platform_total")]
|
|
1658
|
+
).alias("platform_usage_pairs")
|
|
1659
|
+
)
|
|
1660
|
+
|
|
1661
|
+
def _extract_platforms_from_source(
|
|
1662
|
+
self, source_lf: polars.LazyFrame, col_name: str
|
|
1663
|
+
) -> polars.LazyFrame | None:
|
|
1664
|
+
"""Extract platform data from a source LazyFrame."""
|
|
1665
|
+
try:
|
|
1666
|
+
return (
|
|
1667
|
+
source_lf.select(["user", col_name])
|
|
1668
|
+
.filter(polars.col(col_name).is_not_null())
|
|
1669
|
+
.filter(polars.col(col_name).list.len() > 0)
|
|
1670
|
+
.explode(col_name)
|
|
1671
|
+
.unnest(col_name)
|
|
1672
|
+
.filter(polars.col("platform_urn").is_not_null())
|
|
1673
|
+
.select(["user", "platform_urn", "platform_total"])
|
|
1674
|
+
)
|
|
1675
|
+
except polars.exceptions.ColumnNotFoundError:
|
|
1676
|
+
return None
|
|
1677
|
+
|
|
1678
|
+
def add_platform_usage_percentiles(
|
|
1679
|
+
self, user_usage_lf: polars.LazyFrame
|
|
1680
|
+
) -> polars.LazyFrame:
|
|
1681
|
+
"""
|
|
1682
|
+
Add platform usage percentiles to user usage data.
|
|
1683
|
+
|
|
1684
|
+
Args:
|
|
1685
|
+
user_usage_lf: LazyFrame with user usage data containing platform_usage_pairs column
|
|
1686
|
+
|
|
1687
|
+
Returns:
|
|
1688
|
+
LazyFrame with additional platform_usage_percentiles column
|
|
1689
|
+
"""
|
|
1690
|
+
# First explode the platform_usage_pairs to work with individual platform usage records
|
|
1691
|
+
platform_usage_exploded = (
|
|
1692
|
+
user_usage_lf.explode("platform_usage_pairs")
|
|
1693
|
+
.unnest("platform_usage_pairs")
|
|
1694
|
+
.filter(polars.col("platform_urn").is_not_null())
|
|
1695
|
+
)
|
|
1696
|
+
|
|
1697
|
+
# Use the existing gen_rank_and_percentile method to calculate percentiles
|
|
1698
|
+
platform_percentiles_with_ranks = self.gen_rank_and_percentile(
|
|
1699
|
+
lf=platform_usage_exploded,
|
|
1700
|
+
count_field="platform_total",
|
|
1701
|
+
urn_field="user",
|
|
1702
|
+
platform_field="platform_urn",
|
|
1703
|
+
prefix="platform_",
|
|
1704
|
+
use_exp_cdf=False,
|
|
1705
|
+
)
|
|
1706
|
+
|
|
1707
|
+
# Group back by user and create the percentiles structure
|
|
1708
|
+
platform_percentiles = platform_percentiles_with_ranks.group_by("user").agg(
|
|
1709
|
+
polars.struct(
|
|
1710
|
+
[
|
|
1711
|
+
polars.col("platform_urn"),
|
|
1712
|
+
polars.col("platform_rank_percentile").cast(polars.Float64),
|
|
1713
|
+
]
|
|
1714
|
+
).alias("platform_usage_percentiles")
|
|
1715
|
+
)
|
|
1716
|
+
|
|
1717
|
+
# Join the percentiles back to the original user_usage_lf
|
|
1718
|
+
return user_usage_lf.join(platform_percentiles, on="user", how="left")
|
|
1719
|
+
|
|
1720
|
+
def _generate_user_usage_for_dashboard_charts(
|
|
1721
|
+
self, entity_index: str, usage_index: str
|
|
1722
|
+
) -> polars.LazyFrame:
|
|
1723
|
+
entities_df = self._generate_dashboard_chart_entities(entity_index)
|
|
1724
|
+
lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
|
|
1725
|
+
|
|
1726
|
+
# Process dashboard usage data into user usage format (similar to dataset version)
|
|
1727
|
+
users_lf = (
|
|
1728
|
+
lf.explode("userCounts")
|
|
1729
|
+
.unnest("userCounts")
|
|
1730
|
+
.filter(polars.col("user").is_not_null())
|
|
1731
|
+
.rename({"usageCount": "count"}) # Rename to match dataset schema
|
|
1732
|
+
)
|
|
1733
|
+
|
|
1734
|
+
user_dashboard_usage_lf = self._create_user_dataset_usage_map(users_lf)
|
|
1735
|
+
return user_dashboard_usage_lf
|
|
1736
|
+
|
|
1737
|
+
def generate_user_usage(self) -> polars.LazyFrame:
|
|
1738
|
+
dataset_usage_lf = self._generate_user_usage_for_dataset()
|
|
1739
|
+
|
|
1740
|
+
usage_index = "dashboard_dashboardusagestatisticsaspect_v1"
|
|
1741
|
+
entity_index = "dashboardindex_v2"
|
|
1742
|
+
dashboard_usage_lf = self._generate_user_usage_for_dashboard_charts(
|
|
1743
|
+
entity_index, usage_index
|
|
1744
|
+
)
|
|
1745
|
+
|
|
1746
|
+
entity_index = "chartindex_v2"
|
|
1747
|
+
usage_index = "chart_chartusagestatisticsaspect_v1"
|
|
1748
|
+
chart_usage_lf = self._generate_user_usage_for_dashboard_charts(
|
|
1749
|
+
entity_index, usage_index
|
|
1750
|
+
)
|
|
1751
|
+
|
|
1752
|
+
# Combine all three usage sources
|
|
1753
|
+
lf = self._combine_user_usage_data(
|
|
1754
|
+
dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
|
|
1755
|
+
)
|
|
1756
|
+
|
|
1757
|
+
lf = self.add_platform_usage_percentiles(lf)
|
|
1758
|
+
|
|
1759
|
+
# Add user usage percentiles across all users (not grouped by platform)
|
|
1760
|
+
# Create a temporary platform field for percentile calculation
|
|
1761
|
+
lf = lf.with_columns(polars.lit("all_users").alias("temp_platform"))
|
|
1762
|
+
|
|
1763
|
+
lf = self.gen_rank_and_percentile(
|
|
1764
|
+
lf=lf,
|
|
1765
|
+
count_field="userUsageTotalPast30Days",
|
|
1766
|
+
urn_field="user",
|
|
1767
|
+
platform_field="temp_platform",
|
|
1768
|
+
prefix="userUsage",
|
|
1769
|
+
use_exp_cdf=False,
|
|
1770
|
+
)
|
|
1771
|
+
|
|
1772
|
+
# Rename the percentile column to match the schema field name and remove temp field
|
|
1773
|
+
lf = lf.rename(
|
|
1774
|
+
{"userUsagerank_percentile": "userUsagePercentilePast30Days"}
|
|
1775
|
+
).drop("temp_platform")
|
|
1776
|
+
|
|
1777
|
+
return lf
|
|
1778
|
+
|
|
1370
1779
|
def generate_dataset_usage(self) -> polars.LazyFrame:
|
|
1371
1780
|
datasets_lf = self.get_datasets()
|
|
1372
1781
|
if self.config.set_upstream_table_max_modification_time_for_views:
|
|
@@ -15,13 +15,16 @@ from .....schema_classes import CorpUserCredentialsClass
|
|
|
15
15
|
from .....schema_classes import CorpUserEditableInfoClass
|
|
16
16
|
from .....schema_classes import CorpUserHomePageSettingsClass
|
|
17
17
|
from .....schema_classes import CorpUserInfoClass
|
|
18
|
+
from .....schema_classes import CorpUserInvitationStatusClass
|
|
18
19
|
from .....schema_classes import CorpUserSettingsClass
|
|
19
20
|
from .....schema_classes import CorpUserStatusClass
|
|
20
21
|
from .....schema_classes import CorpUserViewsSettingsClass
|
|
21
22
|
from .....schema_classes import GroupMembershipClass
|
|
23
|
+
from .....schema_classes import InvitationStatusClass
|
|
22
24
|
from .....schema_classes import InviteTokenClass
|
|
23
25
|
from .....schema_classes import NativeGroupMembershipClass
|
|
24
26
|
from .....schema_classes import RoleMembershipClass
|
|
27
|
+
from .....schema_classes import TokenTypeClass
|
|
25
28
|
|
|
26
29
|
|
|
27
30
|
CorpGroupEditableInfo = CorpGroupEditableInfoClass
|
|
@@ -32,12 +35,15 @@ CorpUserCredentials = CorpUserCredentialsClass
|
|
|
32
35
|
CorpUserEditableInfo = CorpUserEditableInfoClass
|
|
33
36
|
CorpUserHomePageSettings = CorpUserHomePageSettingsClass
|
|
34
37
|
CorpUserInfo = CorpUserInfoClass
|
|
38
|
+
CorpUserInvitationStatus = CorpUserInvitationStatusClass
|
|
35
39
|
CorpUserSettings = CorpUserSettingsClass
|
|
36
40
|
CorpUserStatus = CorpUserStatusClass
|
|
37
41
|
CorpUserViewsSettings = CorpUserViewsSettingsClass
|
|
38
42
|
GroupMembership = GroupMembershipClass
|
|
43
|
+
InvitationStatus = InvitationStatusClass
|
|
39
44
|
InviteToken = InviteTokenClass
|
|
40
45
|
NativeGroupMembership = NativeGroupMembershipClass
|
|
41
46
|
RoleMembership = RoleMembershipClass
|
|
47
|
+
TokenType = TokenTypeClass
|
|
42
48
|
|
|
43
49
|
# fmt: on
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
# pylint: skip-file
|
|
8
8
|
# fmt: off
|
|
9
9
|
# isort: skip_file
|
|
10
|
+
from .......schema_classes import CorpUserUsageFeaturesClass
|
|
10
11
|
from .......schema_classes import CostCurrencyCodeClass
|
|
11
12
|
from .......schema_classes import CostFeaturesClass
|
|
12
13
|
from .......schema_classes import LineageFeaturesClass
|
|
@@ -14,6 +15,7 @@ from .......schema_classes import StorageFeaturesClass
|
|
|
14
15
|
from .......schema_classes import UsageFeaturesClass
|
|
15
16
|
|
|
16
17
|
|
|
18
|
+
CorpUserUsageFeatures = CorpUserUsageFeaturesClass
|
|
17
19
|
CostCurrencyCode = CostCurrencyCodeClass
|
|
18
20
|
CostFeatures = CostFeaturesClass
|
|
19
21
|
LineageFeatures = LineageFeaturesClass
|
|
@@ -9,9 +9,13 @@
|
|
|
9
9
|
# isort: skip_file
|
|
10
10
|
from .......schema_classes import EntityChangeEventClass
|
|
11
11
|
from .......schema_classes import ParametersClass
|
|
12
|
+
from .......schema_classes import RelationshipChangeEventClass
|
|
13
|
+
from .......schema_classes import RelationshipChangeOperationClass
|
|
12
14
|
|
|
13
15
|
|
|
14
16
|
EntityChangeEvent = EntityChangeEventClass
|
|
15
17
|
Parameters = ParametersClass
|
|
18
|
+
RelationshipChangeEvent = RelationshipChangeEventClass
|
|
19
|
+
RelationshipChangeOperation = RelationshipChangeOperationClass
|
|
16
20
|
|
|
17
21
|
# fmt: on
|