acryl-datahub-cloud 0.3.13.3__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic. Click here for more details.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +462 -34
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +2034 -2034
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +6 -0
- acryl_datahub_cloud/metadata/schema.avsc +24776 -24109
- acryl_datahub_cloud/metadata/schema_classes.py +1581 -696
- acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +95 -0
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
- acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
- acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +127 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +93 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +21 -2
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +77 -1
- acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +82 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
- acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +104 -100
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +98 -45
- acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +127 -2
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +79 -2
- acryl_datahub_cloud/metadata/schemas/Ownership.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +127 -2
- acryl_datahub_cloud/sdk/assertions_client.py +21 -7
- acryl_datahub_cloud/sdk/resolver_client.py +4 -1
- acryl_datahub_cloud/sdk/subscription_client.py +8 -3
- {acryl_datahub_cloud-0.3.13.3.dist-info → acryl_datahub_cloud-0.3.14.dist-info}/METADATA +44 -44
- {acryl_datahub_cloud-0.3.13.3.dist-info → acryl_datahub_cloud-0.3.14.dist-info}/RECORD +51 -45
- {acryl_datahub_cloud-0.3.13.3.dist-info → acryl_datahub_cloud-0.3.14.dist-info}/WHEEL +0 -0
- {acryl_datahub_cloud-0.3.13.3.dist-info → acryl_datahub_cloud-0.3.14.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_cloud-0.3.13.3.dist-info → acryl_datahub_cloud-0.3.14.dist-info}/top_level.txt +0 -0
|
@@ -22,12 +22,14 @@ from polars.datatypes import DataTypeClass
|
|
|
22
22
|
from pydantic import Field
|
|
23
23
|
from scipy.stats import expon
|
|
24
24
|
|
|
25
|
+
from acryl_datahub_cloud.datahub_usage_reporting.excluded import EXCLUDED_PATTERNS
|
|
25
26
|
from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder
|
|
26
27
|
from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
|
|
27
28
|
UsageFeaturePatchBuilder,
|
|
28
29
|
)
|
|
29
30
|
from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
|
|
30
31
|
from acryl_datahub_cloud.metadata.schema_classes import (
|
|
32
|
+
CorpUserUsageFeaturesClass,
|
|
31
33
|
QueryUsageFeaturesClass,
|
|
32
34
|
UsageFeaturesClass,
|
|
33
35
|
)
|
|
@@ -135,6 +137,10 @@ class DataHubUsageFeatureReportingSourceConfig(
|
|
|
135
137
|
None,
|
|
136
138
|
description="Optional configuration for stateful ingestion, including stale metadata removal.",
|
|
137
139
|
)
|
|
140
|
+
user_usage_enabled: bool = Field(
|
|
141
|
+
True,
|
|
142
|
+
description="Flag to enable or disable user usage statistics collection.",
|
|
143
|
+
)
|
|
138
144
|
dataset_usage_enabled: bool = Field(
|
|
139
145
|
True,
|
|
140
146
|
description="Flag to enable or disable dataset usage statistics collection.",
|
|
@@ -191,6 +197,11 @@ class DataHubUsageFeatureReportingSourceConfig(
|
|
|
191
197
|
description="Flag to generate MCP patch for usage features.'",
|
|
192
198
|
)
|
|
193
199
|
|
|
200
|
+
excluded_platforms: List[str] = Field(
|
|
201
|
+
EXCLUDED_PATTERNS,
|
|
202
|
+
description="List of platforms to exclude from usage statistics collection. This is done to avoid invite user functionality to be filled with service accounts.",
|
|
203
|
+
)
|
|
204
|
+
|
|
194
205
|
|
|
195
206
|
def exp_cdf(series: polars.Series) -> polars.Series:
|
|
196
207
|
with PerfTimer() as timer:
|
|
@@ -241,10 +252,6 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
|
|
|
241
252
|
default_factory=lambda: defaultdict(lambda: PerfTimer())
|
|
242
253
|
)
|
|
243
254
|
|
|
244
|
-
dataset_usage_processing_time: PerfTimer = PerfTimer()
|
|
245
|
-
dashboard_usage_processing_time: PerfTimer = PerfTimer()
|
|
246
|
-
chart_usage_processing_time: PerfTimer = PerfTimer()
|
|
247
|
-
query_usage_processing_time: PerfTimer = PerfTimer()
|
|
248
255
|
query_platforms_count: Dict[str, int] = field(
|
|
249
256
|
default_factory=lambda: defaultdict(lambda: 0)
|
|
250
257
|
)
|
|
@@ -923,6 +930,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
923
930
|
|
|
924
931
|
return dataset_df
|
|
925
932
|
|
|
933
|
+
def generate_user_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
    """Yield the work units produced from the combined user usage frame.

    The lazy frame is built by ``generate_user_usage`` and turned into work
    units by ``generate_user_usage_mcp_from_lazyframe``; both steps run inside
    a single ``polars.StringCache`` context.
    """
    with polars.StringCache():
        yield from self.generate_user_usage_mcp_from_lazyframe(
            self.generate_user_usage()
        )
|
|
937
|
+
|
|
926
938
|
def generate_dataset_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
|
|
927
939
|
with polars.StringCache():
|
|
928
940
|
dataset_usage_df = self.generate_dataset_usage()
|
|
@@ -958,38 +970,27 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
958
970
|
]
|
|
959
971
|
|
|
960
972
|
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
|
|
973
|
+
if self.config.user_usage_enabled:
|
|
974
|
+
self.report.new_stage("generate user usage")
|
|
975
|
+
yield from self.generate_user_usage_mcps()
|
|
976
|
+
|
|
961
977
|
if self.config.dataset_usage_enabled:
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
yield from self.generate_dataset_usage_mcps()
|
|
965
|
-
time_taken = timer.elapsed_seconds()
|
|
966
|
-
logger.info(f"Dataset Usage generation took {time_taken:.3f} seconds")
|
|
978
|
+
self.report.new_stage("generate dataset usage")
|
|
979
|
+
yield from self.generate_dataset_usage_mcps()
|
|
967
980
|
|
|
968
981
|
if self.config.dashboard_usage_enabled:
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
yield from self.generate_dashboard_usage_mcps()
|
|
972
|
-
|
|
973
|
-
time_taken = timer.elapsed_seconds()
|
|
974
|
-
logger.info(f"Dashboard Usage generation took {time_taken:.3f}")
|
|
982
|
+
self.report.new_stage("generate dashboard usage")
|
|
983
|
+
yield from self.generate_dashboard_usage_mcps()
|
|
975
984
|
|
|
976
985
|
if self.config.chart_usage_enabled:
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
yield from self.generate_chart_usage_mcps()
|
|
981
|
-
|
|
982
|
-
time_taken = timer.elapsed_seconds()
|
|
983
|
-
logger.info(f"Chart Usage generation took {time_taken:.3f}")
|
|
986
|
+
self.report.new_stage("generate chart usage")
|
|
987
|
+
yield from self.generate_chart_usage_mcps()
|
|
984
988
|
|
|
985
989
|
if self.config.query_usage_enabled:
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
yield from self.generate_query_usage_mcps()
|
|
990
|
+
self.report.new_stage("generate query usage")
|
|
991
|
+
yield from self.generate_query_usage_mcps()
|
|
990
992
|
|
|
991
|
-
|
|
992
|
-
logger.info(f"Query Usage generation took {time_taken:.3f}")
|
|
993
|
+
self.report.new_stage("end so time is calculated for last stage")
|
|
993
994
|
|
|
994
995
|
def generate_mcp_from_lazyframe(
|
|
995
996
|
self, lazy_frame: polars.LazyFrame
|
|
@@ -1052,7 +1053,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1052
1053
|
uniqueUserPercentileLast30Days=int(
|
|
1053
1054
|
row.get("distinct_user_rank_percentile", 0) or 0
|
|
1054
1055
|
),
|
|
1055
|
-
writeCountLast30Days=int(row.get("
|
|
1056
|
+
writeCountLast30Days=int(row.get("write_count", 0) or 0)
|
|
1056
1057
|
if not self.config.disable_write_usage
|
|
1057
1058
|
else None,
|
|
1058
1059
|
writeCountPercentileLast30Days=int(
|
|
@@ -1108,6 +1109,47 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1108
1109
|
row["urn"], query_usage_features
|
|
1109
1110
|
)
|
|
1110
1111
|
|
|
1112
|
+
def _convert_platform_pairs_to_dict(
|
|
1113
|
+
self,
|
|
1114
|
+
platform_pairs: Optional[List[Dict[str, Any]]],
|
|
1115
|
+
value_key: str = "platform_total",
|
|
1116
|
+
) -> Optional[Dict[str, Any]]:
|
|
1117
|
+
"""Convert list of platform usage structs to dictionary."""
|
|
1118
|
+
if not platform_pairs:
|
|
1119
|
+
return None
|
|
1120
|
+
|
|
1121
|
+
return {
|
|
1122
|
+
pair["platform_urn"]: pair[value_key]
|
|
1123
|
+
for pair in platform_pairs
|
|
1124
|
+
if pair["platform_urn"] is not None
|
|
1125
|
+
}
|
|
1126
|
+
|
|
1127
|
+
def generate_user_usage_mcp_from_lazyframe(
    self, lazy_frame: polars.LazyFrame
) -> Iterable[MetadataWorkUnit]:
    """Collect ``lazy_frame`` and yield one work unit per row.

    Each row is mapped onto a ``CorpUserUsageFeaturesClass`` aspect and emitted
    for the entity named in the row's ``user`` column, with
    ``is_primary_source=False``.

    Args:
        lazy_frame: Frame whose rows are read via the keys ``user``,
            ``userUsageTotalPast30Days``, ``platform_usage_pairs``,
            ``platform_usage_percentiles``, ``userUsagePercentilePast30Days``
            and ``top_datasets_map``.
    """
    collected = lazy_frame.collect(
        engine="streaming" if self.config.experimental_full_streaming else "auto"
    )

    for record in collected.iter_rows(named=True):
        # A missing or null total is reported as 0.
        total_usage = int(record.get("userUsageTotalPast30Days", 0) or 0)
        platform_totals = self._convert_platform_pairs_to_dict(
            record.get("platform_usage_pairs", [])
        )
        platform_percentiles = self._convert_platform_pairs_to_dict(
            record.get("platform_usage_percentiles", []),
            "platform_rank_percentile",
        )
        usage_percentile = record.get("userUsagePercentilePast30Days")
        top_datasets = self._convert_top_datasets_to_dict(
            record.get("top_datasets_map", [])
        )

        aspect = CorpUserUsageFeaturesClass(
            userUsageTotalPast30Days=total_usage,
            userPlatformUsageTotalsPast30Days=platform_totals,
            userPlatformUsagePercentilePast30Days=platform_percentiles,
            userUsagePercentilePast30Days=usage_percentile,
            userTopDatasetsByUsage=top_datasets,
        )
        proposal = MetadataChangeProposalWrapper(
            entityUrn=record["user"], aspect=aspect
        )
        yield proposal.as_workunit(is_primary_source=False)
|
|
1152
|
+
|
|
1111
1153
|
def generate_usage_feature_mcp(
|
|
1112
1154
|
self, urn: str, usage_feature: UsageFeaturesClass
|
|
1113
1155
|
) -> Iterable[MetadataWorkUnit]:
|
|
@@ -1142,9 +1184,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1142
1184
|
|
|
1143
1185
|
return self.generate_dashboard_chart_usage(entity_index, usage_index)
|
|
1144
1186
|
|
|
1145
|
-
def
|
|
1146
|
-
self, entity_index: str, usage_index: str
|
|
1147
|
-
) -> polars.LazyFrame:
|
|
1187
|
+
def _generate_dashboard_chart_entities(self, entity_index: str) -> polars.LazyFrame:
|
|
1148
1188
|
entity_schema = {
|
|
1149
1189
|
"entity_urn": polars.Categorical,
|
|
1150
1190
|
"removed": polars.Boolean,
|
|
@@ -1161,7 +1201,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1161
1201
|
process_function=self.soft_deleted_batch,
|
|
1162
1202
|
)
|
|
1163
1203
|
|
|
1164
|
-
|
|
1204
|
+
return entities_df
|
|
1205
|
+
|
|
1206
|
+
def _generate_dashboard_chart_usage(
|
|
1207
|
+
self, entities_df: polars.LazyFrame, usage_index: str
|
|
1208
|
+
) -> polars.LazyFrame:
|
|
1209
|
+
entities_usage_schema = {
|
|
1165
1210
|
"timestampMillis": polars.Int64,
|
|
1166
1211
|
"lastObserved": polars.Int64,
|
|
1167
1212
|
"urn": polars.Categorical,
|
|
@@ -1179,7 +1224,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1179
1224
|
}
|
|
1180
1225
|
|
|
1181
1226
|
lf = self.load_data_from_es_to_lf(
|
|
1182
|
-
schema=
|
|
1227
|
+
schema=entities_usage_schema,
|
|
1183
1228
|
index=usage_index,
|
|
1184
1229
|
query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
|
|
1185
1230
|
process_function=self.process_dashboard_usage,
|
|
@@ -1198,6 +1243,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1198
1243
|
.alias("row_num")
|
|
1199
1244
|
).filter(polars.col("row_num") == 1)
|
|
1200
1245
|
|
|
1246
|
+
return lf
|
|
1247
|
+
|
|
1248
|
+
def generate_dashboard_chart_usage(
|
|
1249
|
+
self, entity_index: str, usage_index: str
|
|
1250
|
+
) -> polars.LazyFrame:
|
|
1251
|
+
entities_df = self._generate_dashboard_chart_entities(entity_index)
|
|
1252
|
+
|
|
1253
|
+
lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
|
|
1254
|
+
|
|
1201
1255
|
# lf = lf.filter(polars.col("urn") == "urn:li:dashboard:(looker,dashboards.8)")
|
|
1202
1256
|
# "urn:li:dashboard:(looker,dashboards.8)"
|
|
1203
1257
|
|
|
@@ -1367,6 +1421,380 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1367
1421
|
|
|
1368
1422
|
return usage_with_top_users_with_ranks
|
|
1369
1423
|
|
|
1424
|
+
def _generate_user_usage_for_dataset(self) -> polars.LazyFrame:
    """Build the per-user usage frame from dataset usage statistics.

    Loads datasets and dataset usage, keeps usage rows whose joined dataset has
    ``removed == False``, explodes ``userCounts`` into one row per user, and
    hands the result to ``_create_user_dataset_usage_map``.
    """
    datasets_lf = self.get_datasets()
    if self.config.set_upstream_table_max_modification_time_for_views:
        datasets_lf = self.set_table_modification_time_for_views(datasets_lf)

    usage_lf = self.load_dataset_usage()

    # A Polars/pandas join merges the join columns into a single column, so
    # the "removed" column is what we filter on after the join.
    joined_lf = usage_lf.join(
        datasets_lf, left_on="urn", right_on="entity_urn", how="left"
    )
    active_usage_lf = joined_lf.filter(
        polars.col("removed") == False  # noqa: E712
    ).drop(["removed"])

    # One row per (usage record, user); rows without a user are discarded.
    per_user_lf = active_usage_lf.explode("userCounts").unnest("userCounts")
    per_user_lf = per_user_lf.filter(polars.col("user").is_not_null())

    return self._create_user_dataset_usage_map(per_user_lf)
|
|
1446
|
+
|
|
1447
|
+
@staticmethod
|
|
1448
|
+
def _convert_top_datasets_to_dict(
|
|
1449
|
+
top_datasets_list: Optional[List[Dict[str, Any]]],
|
|
1450
|
+
) -> Optional[Dict[str, float]]:
|
|
1451
|
+
"""
|
|
1452
|
+
Convert list of top datasets structs to dictionary as expected by CorpUserUsageFeatures schema.
|
|
1453
|
+
|
|
1454
|
+
Args:
|
|
1455
|
+
top_datasets_list: List of dictionaries with 'dataset_urn' and 'count' keys
|
|
1456
|
+
|
|
1457
|
+
Returns:
|
|
1458
|
+
Dictionary mapping dataset URN to usage count, or None if input is empty
|
|
1459
|
+
"""
|
|
1460
|
+
if not top_datasets_list:
|
|
1461
|
+
return None
|
|
1462
|
+
|
|
1463
|
+
top_datasets_dict = {
|
|
1464
|
+
item["dataset_urn"]: float(item["count"])
|
|
1465
|
+
for item in top_datasets_list
|
|
1466
|
+
if isinstance(item, dict) and "dataset_urn" in item and "count" in item
|
|
1467
|
+
}
|
|
1468
|
+
|
|
1469
|
+
return top_datasets_dict if top_datasets_dict else None
|
|
1470
|
+
|
|
1471
|
+
def _create_user_dataset_usage_map(
    self, users_lf: polars.LazyFrame, top_n: int = 25
) -> polars.LazyFrame:
    """
    Aggregate per-user usage rows into one row per user.

    Args:
        users_lf: LazyFrame containing user usage data with columns: user, urn, platform, count
        top_n: Number of top entities (by summed count) to keep per user (default: 25)

    Returns:
        LazyFrame with columns:
            - user: the user identifier
            - top_datasets_map: list of structs with dataset_urn, count and platform_urn,
              sorted by count in descending order
            - userUsageTotalPast30Days: summed count for the user across all urns
            - platform_usage_pairs: list of structs with platform_urn and
              platform_total (cast to Float64)
    """
    summed_count = polars.col("total_count")
    platform_urn = polars.col("platform_urn")

    # Keep only users whose identifier contains "@", sum counts per
    # (user, urn, platform) and derive the platform URN from the platform name.
    per_user_entity = (
        users_lf.filter(polars.col("user").str.contains("@"))
        .group_by("user", "urn", "platform")
        .agg(polars.col("count").sum().alias("total_count"))
        .with_columns(
            polars.format("urn:li:dataPlatform:{}", polars.col("platform")).alias(
                "platform_urn"
            )
        )
    )

    # Total usage per user.
    usage_per_user = per_user_entity.group_by("user").agg(
        summed_count.sum().alias("userUsageTotalPast30Days")
    )

    # Usage per (user, platform), collected into a list of structs per user.
    platform_pair = polars.struct(
        [
            platform_urn,
            polars.col("platform_total").cast(polars.Float64),
        ]
    )
    usage_per_platform = (
        per_user_entity.group_by("user", "platform_urn")
        .agg(summed_count.sum().alias("platform_total"))
        .filter(platform_urn.is_not_null())
        .group_by("user")
        .agg(platform_pair.alias("platform_usage_pairs"))
    )

    # Rank each user's entities by usage and keep the top_n of them.
    rank_within_user = (
        summed_count.rank(descending=True, method="ordinal")
        .over("user")
        .alias("dataset_rank")
    )
    top_entry = polars.struct(
        [
            polars.col("urn").alias("dataset_urn"),
            summed_count.alias("count"),
            platform_urn,
        ]
    )
    top_per_user = (
        per_user_entity.with_columns(rank_within_user)
        .filter(polars.col("dataset_rank") <= top_n)
        .group_by("user")
        .agg(
            top_entry.sort_by("total_count", descending=True).alias(
                "top_datasets_map"
            )
        )
    )

    # Stitch the three per-user frames together.
    with_totals = top_per_user.join(usage_per_user, on="user", how="left")
    return with_totals.join(usage_per_platform, on="user", how="left")
|
|
1550
|
+
|
|
1551
|
+
def _combine_user_usage_data(
    self,
    dataset_usage_lf: polars.LazyFrame,
    dashboard_usage_lf: polars.LazyFrame,
    chart_usage_lf: polars.LazyFrame,
) -> polars.LazyFrame:
    """
    Merge the dataset, dashboard and chart user usage frames into one frame.

    Args:
        dataset_usage_lf: LazyFrame with dataset usage data containing top_datasets_map
        dashboard_usage_lf: LazyFrame with dashboard usage data
        chart_usage_lf: LazyFrame with chart usage data

    Returns:
        The combined user totals left-joined with the combined platform pairs
        on ``user``; a null ``platform_usage_pairs`` is replaced by an empty list
    """
    sources = (dataset_usage_lf, dashboard_usage_lf, chart_usage_lf)

    totals_lf = self._combine_user_totals(*sources)
    pairs_lf = self._combine_platform_pairs(*sources)

    combined_lf = totals_lf.join(pairs_lf, on="user", how="left")

    # Users without any platform pairs get an empty list rather than null.
    non_null_pairs = polars.col("platform_usage_pairs").fill_null(polars.lit([]))
    return combined_lf.with_columns(non_null_pairs)
|
|
1581
|
+
|
|
1582
|
+
def _filter_users(self, users_lf: polars.LazyFrame) -> polars.LazyFrame:
    """Keep rows whose ``user`` contains "@" and matches no excluded pattern.

    The excluded patterns come from ``self.config.excluded_platforms`` and are
    each tested against the ``user`` column with ``str.contains``.
    """
    user_col = polars.col("user")

    keep = user_col.str.contains("@")
    for excluded_pattern in self.config.excluded_platforms:
        keep = keep & user_col.str.contains(excluded_pattern).not_()

    return users_lf.filter(keep)
|
|
1590
|
+
|
|
1591
|
+
def _combine_user_totals(
    self,
    dataset_usage_lf: polars.LazyFrame,
    dashboard_usage_lf: polars.LazyFrame,
    chart_usage_lf: polars.LazyFrame,
) -> polars.LazyFrame:
    """Sum each user's usage totals across the three sources.

    The result has one row per user that passes ``_filter_users`` and the
    columns ``user``, ``top_datasets_map`` (taken from the dataset source) and
    ``userUsageTotalPast30Days`` (dataset + dashboard + chart, nulls as 0).
    """
    # Every distinct user seen in any source, minus the filtered-out ones.
    user_columns = [
        source.select("user")
        for source in (dataset_usage_lf, dashboard_usage_lf, chart_usage_lf)
    ]
    eligible_users = self._filter_users(polars.concat(user_columns).unique())

    dataset_part = dataset_usage_lf.select(
        ["user", "top_datasets_map", "userUsageTotalPast30Days"]
    )
    dashboard_part = dashboard_usage_lf.select(["user", "userUsageTotalPast30Days"])
    chart_part = chart_usage_lf.select(["user", "userUsageTotalPast30Days"])

    # The suffixes keep the dashboard/chart totals apart from the dataset one.
    joined = (
        eligible_users.join(dataset_part, on="user", how="left")
        .join(dashboard_part, on="user", how="left", suffix="_dashboard")
        .join(chart_part, on="user", how="left", suffix="_chart")
    )

    # A user missing from a source contributes 0 for that source.
    grand_total = (
        polars.col("userUsageTotalPast30Days").fill_null(0)
        + polars.col("userUsageTotalPast30Days_dashboard").fill_null(0)
        + polars.col("userUsageTotalPast30Days_chart").fill_null(0)
    ).alias("userUsageTotalPast30Days")

    return joined.with_columns([grand_total]).select(
        ["user", "top_datasets_map", "userUsageTotalPast30Days"]
    )
|
|
1643
|
+
|
|
1644
|
+
def _combine_platform_pairs(
    self,
    dataset_usage_lf: polars.LazyFrame,
    dashboard_usage_lf: polars.LazyFrame,
    chart_usage_lf: polars.LazyFrame,
) -> polars.LazyFrame:
    """Merge the ``platform_usage_pairs`` of all three sources per user.

    Platform totals are summed per (user, platform_urn) and regrouped into one
    ``platform_usage_pairs`` list of structs per user.
    """
    extracted = (
        self._extract_platforms_from_source(source_lf, "platform_usage_pairs")
        for source_lf in (dataset_usage_lf, dashboard_usage_lf, chart_usage_lf)
    )
    all_platforms = [platforms for platforms in extracted if platforms is not None]

    if not all_platforms:
        # No source yielded platform data: hand back an empty frame.
        return polars.LazyFrame({"user": [], "platform_usage_pairs": []})

    # Stack the per-source rows and total them per user and platform.
    stacked = polars.concat(all_platforms, how="vertical_relaxed")
    per_user_platform = stacked.group_by("user", "platform_urn").agg(
        polars.col("platform_total").sum().alias("platform_total")
    )

    # Fold the rows back into a list of structs per user.
    pair_struct = polars.struct(
        [polars.col("platform_urn"), polars.col("platform_total")]
    )
    return per_user_platform.group_by("user").agg(
        pair_struct.alias("platform_usage_pairs")
    )
|
|
1679
|
+
|
|
1680
|
+
def _extract_platforms_from_source(
    self, source_lf: polars.LazyFrame, col_name: str
) -> Optional[polars.LazyFrame]:
    """Flatten the list-of-structs column ``col_name`` into one row per platform.

    Args:
        source_lf: Frame with a ``user`` column and a list-of-structs column
            named ``col_name`` whose structs carry ``platform_urn`` and
            ``platform_total`` fields.
        col_name: Name of the list-of-structs column to flatten.

    Returns:
        A LazyFrame with columns ``user``, ``platform_urn`` and
        ``platform_total`` (null/empty lists and null platform URNs removed),
        or ``None`` when ``source_lf`` has no column named ``col_name``.
    """
    # Building a lazy plan generally does not validate column names, so a
    # missing column would only surface later when the plan is resolved.
    # Check the resolved schema up front so the documented ``None`` result is
    # actually produced for sources that lack the column.
    if col_name not in source_lf.collect_schema().names():
        return None

    try:
        return (
            source_lf.select(["user", col_name])
            .filter(polars.col(col_name).is_not_null())
            .filter(polars.col(col_name).list.len() > 0)
            .explode(col_name)
            .unnest(col_name)
            .filter(polars.col("platform_urn").is_not_null())
            .select(["user", "platform_urn", "platform_total"])
        )
    except polars.exceptions.ColumnNotFoundError:
        # Kept for polars versions that validate columns eagerly.
        return None
|
|
1696
|
+
|
|
1697
|
+
def add_platform_usage_percentiles(
    self, user_usage_lf: polars.LazyFrame
) -> polars.LazyFrame:
    """
    Attach a ``platform_usage_percentiles`` column to the user usage frame.

    Args:
        user_usage_lf: LazyFrame with user usage data containing platform_usage_pairs column

    Returns:
        ``user_usage_lf`` left-joined on ``user`` with a list of structs
        (platform_urn, platform_rank_percentile as Float64) per user
    """
    # One row per (user, platform) pair; pairs without a platform are dropped.
    per_platform_rows = user_usage_lf.explode("platform_usage_pairs").unnest(
        "platform_usage_pairs"
    )
    per_platform_rows = per_platform_rows.filter(
        polars.col("platform_urn").is_not_null()
    )

    # Reuse the shared rank/percentile helper, with the platform URN passed as
    # the platform field and the "platform_" prefix on the generated columns.
    ranked_rows = self.gen_rank_and_percentile(
        lf=per_platform_rows,
        count_field="platform_total",
        urn_field="user",
        platform_field="platform_urn",
        prefix="platform_",
        use_exp_cdf=False,
    )

    # Collapse back to one row per user holding the list of percentile structs.
    percentile_struct = polars.struct(
        [
            polars.col("platform_urn"),
            polars.col("platform_rank_percentile").cast(polars.Float64),
        ]
    )
    percentiles_per_user = ranked_rows.group_by("user").agg(
        percentile_struct.alias("platform_usage_percentiles")
    )

    return user_usage_lf.join(percentiles_per_user, on="user", how="left")
|
|
1738
|
+
|
|
1739
|
+
def _generate_user_usage_for_dashboard_charts(
    self, entity_index: str, usage_index: str
) -> polars.LazyFrame:
    """Build the per-user usage frame for dashboards or charts.

    Args:
        entity_index: Index passed to ``_generate_dashboard_chart_entities``.
        usage_index: Index passed to ``_generate_dashboard_chart_usage``.

    Returns:
        The frame produced by ``_create_user_dataset_usage_map`` from the
        exploded ``userCounts`` of the loaded usage data.
    """
    entities_lf = self._generate_dashboard_chart_entities(entity_index)
    usage_lf = self._generate_dashboard_chart_usage(entities_lf, usage_index)

    # Flatten userCounts into one row per user and drop rows without a user.
    per_user_lf = usage_lf.explode("userCounts").unnest("userCounts")
    per_user_lf = per_user_lf.filter(polars.col("user").is_not_null())

    # The aggregation helper sums a "count" column, so rename usageCount.
    per_user_lf = per_user_lf.rename({"usageCount": "count"})

    return self._create_user_dataset_usage_map(per_user_lf)
|
|
1755
|
+
|
|
1756
|
+
def generate_user_usage(self) -> polars.LazyFrame:
    """Build the combined user usage frame from datasets, dashboards and charts.

    The three per-source frames are combined, per-platform percentiles are
    added, and an overall percentile is computed and exposed as
    ``userUsagePercentilePast30Days``.
    """
    dataset_usage_lf = self._generate_user_usage_for_dataset()
    dashboard_usage_lf = self._generate_user_usage_for_dashboard_charts(
        entity_index="dashboardindex_v2",
        usage_index="dashboard_dashboardusagestatisticsaspect_v1",
    )
    chart_usage_lf = self._generate_user_usage_for_dashboard_charts(
        entity_index="chartindex_v2",
        usage_index="chart_chartusagestatisticsaspect_v1",
    )

    combined_lf = self._combine_user_usage_data(
        dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
    )
    combined_lf = self.add_platform_usage_percentiles(combined_lf)

    # The rank helper takes a platform field; give every row the same
    # placeholder value so all users are ranked together.
    with_placeholder = combined_lf.with_columns(
        polars.lit("all_users").alias("temp_platform")
    )
    ranked_lf = self.gen_rank_and_percentile(
        lf=with_placeholder,
        count_field="userUsageTotalPast30Days",
        urn_field="user",
        platform_field="temp_platform",
        prefix="userUsage",
        use_exp_cdf=False,
    )

    # Expose the percentile under the schema field name and drop the placeholder.
    renamed_lf = ranked_lf.rename(
        {"userUsagerank_percentile": "userUsagePercentilePast30Days"}
    )
    return renamed_lf.drop("temp_platform")
|
|
1797
|
+
|
|
1370
1798
|
def generate_dataset_usage(self) -> polars.LazyFrame:
|
|
1371
1799
|
datasets_lf = self.get_datasets()
|
|
1372
1800
|
if self.config.set_upstream_table_max_modification_time_for_views:
|