acryl-datahub-cloud 0.3.11rc0__py3-none-any.whl → 0.3.16.1rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic. Click here for more details.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/acryl_cs_issues/models.py +5 -3
- acryl_datahub_cloud/action_request/action_request_owner_source.py +36 -6
- acryl_datahub_cloud/datahub_forms_notifications/__init__.py +0 -0
- acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +569 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_feature_flag.gql +7 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
- acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
- acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
- acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
- acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +37 -13
- acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +55 -24
- acryl_datahub_cloud/datahub_reporting/extract_graph.py +4 -3
- acryl_datahub_cloud/datahub_reporting/extract_sql.py +242 -51
- acryl_datahub_cloud/datahub_reporting/forms.py +1 -1
- acryl_datahub_cloud/datahub_reporting/forms_config.py +3 -2
- acryl_datahub_cloud/datahub_restore/source.py +3 -2
- acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
- acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +518 -77
- acryl_datahub_cloud/elasticsearch/graph_service.py +76 -14
- acryl_datahub_cloud/graphql_utils.py +64 -0
- acryl_datahub_cloud/lineage_features/source.py +555 -49
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +2296 -1900
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionworkflow/__init__.py +53 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/anomaly/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +4 -2
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/conversation/__init__.py +29 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +8 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +8 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/knowledge/__init__.py +33 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +12 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +28 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- acryl_datahub_cloud/metadata/schema.avsc +25091 -20557
- acryl_datahub_cloud/metadata/schema_classes.py +29269 -23863
- acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +235 -2
- acryl_datahub_cloud/metadata/schemas/ActionWorkflowInfo.avsc +683 -0
- acryl_datahub_cloud/metadata/schemas/ActionWorkflowKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
- acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
- acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +75 -0
- acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +353 -215
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +147 -20
- acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +166 -21
- acryl_datahub_cloud/metadata/schemas/{AssertionSummary.avsc → AssertionRunSummary.avsc} +15 -2
- acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -0
- acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
- acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +20 -6
- acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +16 -5
- acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +18 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
- acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +304 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +86 -0
- acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +11 -5
- acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +15 -5
- acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataHubAiConversationInfo.avsc +256 -0
- acryl_datahub_cloud/metadata/schemas/DataHubAiConversationKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +234 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +308 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +13 -4
- acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +6 -3
- acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +10 -2
- acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +12 -5
- acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DocumentInfo.avsc +407 -0
- acryl_datahub_cloud/metadata/schemas/DocumentKey.avsc +35 -0
- acryl_datahub_cloud/metadata/schemas/DocumentSettings.avsc +79 -0
- acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
- acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
- acryl_datahub_cloud/metadata/schemas/FormKey.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +30 -0
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +416 -0
- acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +71 -1
- acryl_datahub_cloud/metadata/schemas/InputFields.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
- acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
- acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +145 -0
- acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +7 -1
- acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +418 -97
- acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +62 -44
- acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +54 -9
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +163 -23
- acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +128 -3
- acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +91 -4
- acryl_datahub_cloud/metadata/schemas/Operation.avsc +17 -0
- acryl_datahub_cloud/metadata/schemas/Ownership.avsc +71 -1
- acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +2 -13
- acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/SemanticContent.avsc +123 -0
- acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +136 -5
- acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +61 -0
- acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +10 -0
- acryl_datahub_cloud/notifications/__init__.py +0 -0
- acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
- acryl_datahub_cloud/sdk/__init__.py +69 -0
- acryl_datahub_cloud/sdk/assertion/__init__.py +58 -0
- acryl_datahub_cloud/sdk/assertion/assertion_base.py +779 -0
- acryl_datahub_cloud/sdk/assertion/column_metric_assertion.py +191 -0
- acryl_datahub_cloud/sdk/assertion/column_value_assertion.py +431 -0
- acryl_datahub_cloud/sdk/assertion/freshness_assertion.py +201 -0
- acryl_datahub_cloud/sdk/assertion/schema_assertion.py +268 -0
- acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +212 -0
- acryl_datahub_cloud/sdk/assertion/smart_freshness_assertion.py +165 -0
- acryl_datahub_cloud/sdk/assertion/smart_sql_assertion.py +156 -0
- acryl_datahub_cloud/sdk/assertion/smart_volume_assertion.py +162 -0
- acryl_datahub_cloud/sdk/assertion/sql_assertion.py +273 -0
- acryl_datahub_cloud/sdk/assertion/types.py +20 -0
- acryl_datahub_cloud/sdk/assertion/volume_assertion.py +156 -0
- acryl_datahub_cloud/sdk/assertion_client/__init__.py +0 -0
- acryl_datahub_cloud/sdk/assertion_client/column_metric.py +545 -0
- acryl_datahub_cloud/sdk/assertion_client/column_value.py +617 -0
- acryl_datahub_cloud/sdk/assertion_client/freshness.py +371 -0
- acryl_datahub_cloud/sdk/assertion_client/helpers.py +166 -0
- acryl_datahub_cloud/sdk/assertion_client/schema.py +358 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_column_metric.py +540 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_freshness.py +373 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_sql.py +411 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_volume.py +380 -0
- acryl_datahub_cloud/sdk/assertion_client/sql.py +410 -0
- acryl_datahub_cloud/sdk/assertion_client/volume.py +446 -0
- acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
- acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +1470 -0
- acryl_datahub_cloud/sdk/assertion_input/column_assertion_constants.py +114 -0
- acryl_datahub_cloud/sdk/assertion_input/column_assertion_utils.py +284 -0
- acryl_datahub_cloud/sdk/assertion_input/column_metric_assertion_input.py +759 -0
- acryl_datahub_cloud/sdk/assertion_input/column_metric_constants.py +109 -0
- acryl_datahub_cloud/sdk/assertion_input/column_value_assertion_input.py +810 -0
- acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +305 -0
- acryl_datahub_cloud/sdk/assertion_input/schema_assertion_input.py +413 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +793 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_freshness_assertion_input.py +218 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_sql_assertion_input.py +181 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_volume_assertion_input.py +189 -0
- acryl_datahub_cloud/sdk/assertion_input/sql_assertion_input.py +320 -0
- acryl_datahub_cloud/sdk/assertion_input/volume_assertion_input.py +635 -0
- acryl_datahub_cloud/sdk/assertions_client.py +1074 -0
- acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
- acryl_datahub_cloud/sdk/entities/assertion.py +439 -0
- acryl_datahub_cloud/sdk/entities/monitor.py +291 -0
- acryl_datahub_cloud/sdk/entities/subscription.py +100 -0
- acryl_datahub_cloud/sdk/errors.py +34 -0
- acryl_datahub_cloud/sdk/resolver_client.py +42 -0
- acryl_datahub_cloud/sdk/subscription_client.py +737 -0
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/METADATA +55 -49
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/RECORD +235 -142
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/WHEEL +1 -1
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/entry_points.txt +1 -0
- acryl_datahub_cloud/_sdk_extras/__init__.py +0 -4
- acryl_datahub_cloud/_sdk_extras/assertion.py +0 -15
- acryl_datahub_cloud/_sdk_extras/assertions_client.py +0 -23
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/top_level.txt +0 -0
|
@@ -22,12 +22,14 @@ from polars.datatypes import DataTypeClass
|
|
|
22
22
|
from pydantic import Field
|
|
23
23
|
from scipy.stats import expon
|
|
24
24
|
|
|
25
|
+
from acryl_datahub_cloud.datahub_usage_reporting.excluded import EXCLUDED_PATTERNS
|
|
25
26
|
from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder
|
|
26
27
|
from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
|
|
27
28
|
UsageFeaturePatchBuilder,
|
|
28
29
|
)
|
|
29
30
|
from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
|
|
30
31
|
from acryl_datahub_cloud.metadata.schema_classes import (
|
|
32
|
+
CorpUserUsageFeaturesClass,
|
|
31
33
|
QueryUsageFeaturesClass,
|
|
32
34
|
UsageFeaturesClass,
|
|
33
35
|
)
|
|
@@ -40,7 +42,7 @@ from datahub.ingestion.api.decorators import (
|
|
|
40
42
|
platform_name,
|
|
41
43
|
support_status,
|
|
42
44
|
)
|
|
43
|
-
from datahub.ingestion.api.source import MetadataWorkUnitProcessor
|
|
45
|
+
from datahub.ingestion.api.source import MetadataWorkUnitProcessor
|
|
44
46
|
from datahub.ingestion.api.source_helpers import auto_workunit_reporter
|
|
45
47
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
46
48
|
from datahub.ingestion.graph.client import DatahubClientConfig
|
|
@@ -114,12 +116,12 @@ class DataHubUsageFeatureReportingSourceConfig(
|
|
|
114
116
|
30, description="Timeout in seconds for the search queries."
|
|
115
117
|
)
|
|
116
118
|
extract_batch_size: int = Field(
|
|
117
|
-
|
|
119
|
+
5000,
|
|
118
120
|
description="The number of documents to retrieve in each batch from ElasticSearch or OpenSearch.",
|
|
119
121
|
)
|
|
120
122
|
|
|
121
123
|
extract_delay: Optional[float] = Field(
|
|
122
|
-
0
|
|
124
|
+
0,
|
|
123
125
|
description="The delay in seconds between each batch extraction from ElasticSearch or OpenSearch.",
|
|
124
126
|
)
|
|
125
127
|
|
|
@@ -135,6 +137,10 @@ class DataHubUsageFeatureReportingSourceConfig(
|
|
|
135
137
|
None,
|
|
136
138
|
description="Optional configuration for stateful ingestion, including stale metadata removal.",
|
|
137
139
|
)
|
|
140
|
+
user_usage_enabled: bool = Field(
|
|
141
|
+
True,
|
|
142
|
+
description="Flag to enable or disable user usage statistics collection.",
|
|
143
|
+
)
|
|
138
144
|
dataset_usage_enabled: bool = Field(
|
|
139
145
|
True,
|
|
140
146
|
description="Flag to enable or disable dataset usage statistics collection.",
|
|
@@ -177,7 +183,7 @@ class DataHubUsageFeatureReportingSourceConfig(
|
|
|
177
183
|
# This option is only needed here until we are sure that the streaming mode is stable.
|
|
178
184
|
# then we can remove it and control it with the streaming_mode option.
|
|
179
185
|
experimental_full_streaming: bool = Field(
|
|
180
|
-
|
|
186
|
+
True,
|
|
181
187
|
description="Flag to enable full streaming mode.'",
|
|
182
188
|
)
|
|
183
189
|
|
|
@@ -191,6 +197,11 @@ class DataHubUsageFeatureReportingSourceConfig(
|
|
|
191
197
|
description="Flag to generate MCP patch for usage features.'",
|
|
192
198
|
)
|
|
193
199
|
|
|
200
|
+
excluded_platforms: List[str] = Field(
|
|
201
|
+
EXCLUDED_PATTERNS,
|
|
202
|
+
description="List of platforms to exclude from usage statistics collection. This is done to avoid invite user functionality to be filled with service accounts.",
|
|
203
|
+
)
|
|
204
|
+
|
|
194
205
|
|
|
195
206
|
def exp_cdf(series: polars.Series) -> polars.Series:
|
|
196
207
|
with PerfTimer() as timer:
|
|
@@ -228,7 +239,7 @@ def exp_cdf(series: polars.Series) -> polars.Series:
|
|
|
228
239
|
|
|
229
240
|
|
|
230
241
|
@dataclass
|
|
231
|
-
class DatahubUsageFeatureReport(
|
|
242
|
+
class DatahubUsageFeatureReport(StatefulIngestionReport, IngestionStageReport):
|
|
232
243
|
dataset_platforms_count: Dict[str, int] = field(
|
|
233
244
|
default_factory=lambda: defaultdict(lambda: 0)
|
|
234
245
|
)
|
|
@@ -241,10 +252,6 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
|
|
|
241
252
|
default_factory=lambda: defaultdict(lambda: PerfTimer())
|
|
242
253
|
)
|
|
243
254
|
|
|
244
|
-
dataset_usage_processing_time: PerfTimer = PerfTimer()
|
|
245
|
-
dashboard_usage_processing_time: PerfTimer = PerfTimer()
|
|
246
|
-
chart_usage_processing_time: PerfTimer = PerfTimer()
|
|
247
|
-
query_usage_processing_time: PerfTimer = PerfTimer()
|
|
248
255
|
query_platforms_count: Dict[str, int] = field(
|
|
249
256
|
default_factory=lambda: defaultdict(lambda: 0)
|
|
250
257
|
)
|
|
@@ -617,11 +624,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
617
624
|
),
|
|
618
625
|
)
|
|
619
626
|
|
|
620
|
-
response = server.create_pit(index, keep_alive="10m")
|
|
627
|
+
# response = server.create_pit(index, keep_alive="10m")
|
|
621
628
|
|
|
622
629
|
# TODO: Save PIT, we can resume processing based on <pit, search_after> tuple
|
|
623
|
-
pit = response.get("pit_id")
|
|
624
|
-
query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
|
|
630
|
+
# pit = response.get("pit_id")
|
|
631
|
+
# query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
|
|
625
632
|
else:
|
|
626
633
|
server = Elasticsearch(
|
|
627
634
|
[endpoint],
|
|
@@ -731,17 +738,20 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
731
738
|
return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
|
|
732
739
|
elif isinstance(polars_dtype, polars.Struct):
|
|
733
740
|
return pa.struct(
|
|
734
|
-
|
|
735
|
-
field.name
|
|
741
|
+
[
|
|
742
|
+
pa.field(field.name, convert_dtype(field.dtype))
|
|
736
743
|
for field in polars_dtype.fields
|
|
737
|
-
|
|
744
|
+
]
|
|
738
745
|
)
|
|
739
746
|
elif isinstance(polars_dtype, polars.List):
|
|
740
747
|
return pa.list_(convert_dtype(polars_dtype.inner))
|
|
741
748
|
else:
|
|
742
749
|
raise ValueError(f"Unsupported Polars dtype: {polars_dtype}")
|
|
743
750
|
|
|
744
|
-
fields = [
|
|
751
|
+
fields = [
|
|
752
|
+
pa.field(name, convert_dtype(dtype))
|
|
753
|
+
for name, dtype in polars_schema.items()
|
|
754
|
+
]
|
|
745
755
|
return pa.schema(fields)
|
|
746
756
|
|
|
747
757
|
def batch_write_parquet(
|
|
@@ -834,7 +844,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
834
844
|
.drop(["removed"])
|
|
835
845
|
)
|
|
836
846
|
|
|
837
|
-
return wdf
|
|
847
|
+
return wdf
|
|
838
848
|
|
|
839
849
|
def load_write_usage_server_side_aggregation(
|
|
840
850
|
self, soft_deleted_entities_df: polars.LazyFrame
|
|
@@ -923,6 +933,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
923
933
|
|
|
924
934
|
return dataset_df
|
|
925
935
|
|
|
936
|
+
def generate_user_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
|
|
937
|
+
with polars.StringCache():
|
|
938
|
+
user_usage_lf = self.generate_user_usage()
|
|
939
|
+
yield from self.generate_user_usage_mcp_from_lazyframe(user_usage_lf)
|
|
940
|
+
|
|
926
941
|
def generate_dataset_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
|
|
927
942
|
with polars.StringCache():
|
|
928
943
|
dataset_usage_df = self.generate_dataset_usage()
|
|
@@ -958,44 +973,34 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
958
973
|
]
|
|
959
974
|
|
|
960
975
|
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
|
|
976
|
+
if self.config.user_usage_enabled:
|
|
977
|
+
with self.report.new_stage("generate user usage"):
|
|
978
|
+
yield from self.generate_user_usage_mcps()
|
|
979
|
+
|
|
961
980
|
if self.config.dataset_usage_enabled:
|
|
962
|
-
with self.report.
|
|
963
|
-
self.report.new_stage("generate dataset usage")
|
|
981
|
+
with self.report.new_stage("generate dataset usage"):
|
|
964
982
|
yield from self.generate_dataset_usage_mcps()
|
|
965
|
-
time_taken = timer.elapsed_seconds()
|
|
966
|
-
logger.info(f"Dataset Usage generation took {time_taken:.3f} seconds")
|
|
967
983
|
|
|
968
984
|
if self.config.dashboard_usage_enabled:
|
|
969
|
-
with self.report.
|
|
970
|
-
self.report.new_stage("generate dashboard usage")
|
|
985
|
+
with self.report.new_stage("generate dashboard usage"):
|
|
971
986
|
yield from self.generate_dashboard_usage_mcps()
|
|
972
987
|
|
|
973
|
-
time_taken = timer.elapsed_seconds()
|
|
974
|
-
logger.info(f"Dashboard Usage generation took {time_taken:.3f}")
|
|
975
|
-
|
|
976
988
|
if self.config.chart_usage_enabled:
|
|
977
|
-
with self.report.
|
|
978
|
-
self.report.new_stage("generate chart usage")
|
|
979
|
-
|
|
989
|
+
with self.report.new_stage("generate chart usage"):
|
|
980
990
|
yield from self.generate_chart_usage_mcps()
|
|
981
991
|
|
|
982
|
-
time_taken = timer.elapsed_seconds()
|
|
983
|
-
logger.info(f"Chart Usage generation took {time_taken:.3f}")
|
|
984
|
-
|
|
985
992
|
if self.config.query_usage_enabled:
|
|
986
|
-
with self.report.
|
|
987
|
-
self.report.new_stage("generate query usage")
|
|
988
|
-
|
|
993
|
+
with self.report.new_stage("generate query usage"):
|
|
989
994
|
yield from self.generate_query_usage_mcps()
|
|
990
995
|
|
|
991
|
-
|
|
992
|
-
|
|
996
|
+
with self.report.new_stage("end so time is calculated for last stage"):
|
|
997
|
+
pass
|
|
993
998
|
|
|
994
999
|
def generate_mcp_from_lazyframe(
|
|
995
1000
|
self, lazy_frame: polars.LazyFrame
|
|
996
1001
|
) -> Iterable[MetadataWorkUnit]:
|
|
997
1002
|
for row in lazy_frame.collect(
|
|
998
|
-
streaming
|
|
1003
|
+
engine="streaming" if self.config.experimental_full_streaming else "auto"
|
|
999
1004
|
).to_struct():
|
|
1000
1005
|
if "siblings" in row and row["siblings"]:
|
|
1001
1006
|
logger.info(f"Siblings found for urn: {row['urn']} -> row['siblings']")
|
|
@@ -1052,7 +1057,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1052
1057
|
uniqueUserPercentileLast30Days=int(
|
|
1053
1058
|
row.get("distinct_user_rank_percentile", 0) or 0
|
|
1054
1059
|
),
|
|
1055
|
-
writeCountLast30Days=int(row.get("
|
|
1060
|
+
writeCountLast30Days=int(row.get("write_count", 0) or 0)
|
|
1056
1061
|
if not self.config.disable_write_usage
|
|
1057
1062
|
else None,
|
|
1058
1063
|
writeCountPercentileLast30Days=int(
|
|
@@ -1086,7 +1091,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1086
1091
|
def generate_query_usage_mcp_from_lazyframe(
|
|
1087
1092
|
self, lazy_frame: polars.LazyFrame
|
|
1088
1093
|
) -> Iterable[MetadataWorkUnit]:
|
|
1089
|
-
for row in lazy_frame.collect(
|
|
1094
|
+
for row in lazy_frame.collect(
|
|
1095
|
+
engine="streaming" if self.config.experimental_full_streaming else "auto"
|
|
1096
|
+
).iter_rows(named=True):
|
|
1090
1097
|
query_usage_features = QueryUsageFeaturesClass(
|
|
1091
1098
|
queryCountLast30Days=int(row.get("totalSqlQueries", 0) or 0),
|
|
1092
1099
|
queryCountTotal=None, # This is not implemented
|
|
@@ -1106,6 +1113,47 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1106
1113
|
row["urn"], query_usage_features
|
|
1107
1114
|
)
|
|
1108
1115
|
|
|
1116
|
+
def _convert_platform_pairs_to_dict(
|
|
1117
|
+
self,
|
|
1118
|
+
platform_pairs: Optional[List[Dict[str, Any]]],
|
|
1119
|
+
value_key: str = "platform_total",
|
|
1120
|
+
) -> Optional[Dict[str, Any]]:
|
|
1121
|
+
"""Convert list of platform usage structs to dictionary."""
|
|
1122
|
+
if not platform_pairs:
|
|
1123
|
+
return None
|
|
1124
|
+
|
|
1125
|
+
return {
|
|
1126
|
+
pair["platform_urn"]: pair[value_key]
|
|
1127
|
+
for pair in platform_pairs
|
|
1128
|
+
if pair["platform_urn"] is not None
|
|
1129
|
+
}
|
|
1130
|
+
|
|
1131
|
+
def generate_user_usage_mcp_from_lazyframe(
|
|
1132
|
+
self, lazy_frame: polars.LazyFrame
|
|
1133
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
1134
|
+
for row in lazy_frame.collect(
|
|
1135
|
+
engine="streaming" if self.config.experimental_full_streaming else "auto"
|
|
1136
|
+
).iter_rows(named=True):
|
|
1137
|
+
user_usage_features = CorpUserUsageFeaturesClass(
|
|
1138
|
+
userUsageTotalPast30Days=int(
|
|
1139
|
+
row.get("userUsageTotalPast30Days", 0) or 0
|
|
1140
|
+
),
|
|
1141
|
+
userPlatformUsageTotalsPast30Days=self._convert_platform_pairs_to_dict(
|
|
1142
|
+
row.get("platform_usage_pairs", [])
|
|
1143
|
+
),
|
|
1144
|
+
userPlatformUsagePercentilePast30Days=self._convert_platform_pairs_to_dict(
|
|
1145
|
+
row.get("platform_usage_percentiles", []),
|
|
1146
|
+
"platform_rank_percentile",
|
|
1147
|
+
),
|
|
1148
|
+
userUsagePercentilePast30Days=row.get("userUsagePercentilePast30Days"),
|
|
1149
|
+
userTopDatasetsByUsage=self._convert_top_datasets_to_dict(
|
|
1150
|
+
row.get("top_datasets_map", [])
|
|
1151
|
+
),
|
|
1152
|
+
)
|
|
1153
|
+
yield MetadataChangeProposalWrapper(
|
|
1154
|
+
entityUrn=row["user"], aspect=user_usage_features
|
|
1155
|
+
).as_workunit(is_primary_source=False)
|
|
1156
|
+
|
|
1109
1157
|
def generate_usage_feature_mcp(
|
|
1110
1158
|
self, urn: str, usage_feature: UsageFeaturesClass
|
|
1111
1159
|
) -> Iterable[MetadataWorkUnit]:
|
|
@@ -1140,9 +1188,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1140
1188
|
|
|
1141
1189
|
return self.generate_dashboard_chart_usage(entity_index, usage_index)
|
|
1142
1190
|
|
|
1143
|
-
def
|
|
1144
|
-
self, entity_index: str, usage_index: str
|
|
1145
|
-
) -> polars.LazyFrame:
|
|
1191
|
+
def _generate_dashboard_chart_entities(self, entity_index: str) -> polars.LazyFrame:
|
|
1146
1192
|
entity_schema = {
|
|
1147
1193
|
"entity_urn": polars.Categorical,
|
|
1148
1194
|
"removed": polars.Boolean,
|
|
@@ -1159,7 +1205,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1159
1205
|
process_function=self.soft_deleted_batch,
|
|
1160
1206
|
)
|
|
1161
1207
|
|
|
1162
|
-
|
|
1208
|
+
return entities_df
|
|
1209
|
+
|
|
1210
|
+
def _generate_dashboard_chart_usage(
|
|
1211
|
+
self, entities_df: polars.LazyFrame, usage_index: str
|
|
1212
|
+
) -> polars.LazyFrame:
|
|
1213
|
+
entities_usage_schema = {
|
|
1163
1214
|
"timestampMillis": polars.Int64,
|
|
1164
1215
|
"lastObserved": polars.Int64,
|
|
1165
1216
|
"urn": polars.Categorical,
|
|
@@ -1177,7 +1228,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1177
1228
|
}
|
|
1178
1229
|
|
|
1179
1230
|
lf = self.load_data_from_es_to_lf(
|
|
1180
|
-
schema=
|
|
1231
|
+
schema=entities_usage_schema,
|
|
1181
1232
|
index=usage_index,
|
|
1182
1233
|
query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
|
|
1183
1234
|
process_function=self.process_dashboard_usage,
|
|
@@ -1196,6 +1247,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1196
1247
|
.alias("row_num")
|
|
1197
1248
|
).filter(polars.col("row_num") == 1)
|
|
1198
1249
|
|
|
1250
|
+
return lf
|
|
1251
|
+
|
|
1252
|
+
def generate_dashboard_chart_usage(
|
|
1253
|
+
self, entity_index: str, usage_index: str
|
|
1254
|
+
) -> polars.LazyFrame:
|
|
1255
|
+
entities_df = self._generate_dashboard_chart_entities(entity_index)
|
|
1256
|
+
|
|
1257
|
+
lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
|
|
1258
|
+
|
|
1199
1259
|
# lf = lf.filter(polars.col("urn") == "urn:li:dashboard:(looker,dashboards.8)")
|
|
1200
1260
|
# "urn:li:dashboard:(looker,dashboards.8)"
|
|
1201
1261
|
|
|
@@ -1308,7 +1368,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1308
1368
|
query_entities = self.load_data_from_es_to_lf(
|
|
1309
1369
|
schema=query_entities_schema,
|
|
1310
1370
|
index=entity_index,
|
|
1311
|
-
query=QueryBuilder.get_query_entities_query(),
|
|
1371
|
+
query=QueryBuilder.get_query_entities_query(self.config.lookback_days),
|
|
1312
1372
|
process_function=self.queries_entities_batch,
|
|
1313
1373
|
)
|
|
1314
1374
|
|
|
@@ -1365,6 +1425,380 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1365
1425
|
|
|
1366
1426
|
return usage_with_top_users_with_ranks
|
|
1367
1427
|
|
|
1428
|
+
def _generate_user_usage_for_dataset(self) -> polars.LazyFrame:
    """
    Build the per-user usage frame from dataset usage statistics.

    Loads the dataset entities (optionally post-processed by
    ``set_table_modification_time_for_views`` when the matching config flag
    is set), left-joins the dataset usage rows onto them, keeps only rows
    whose ``removed`` flag is False, expands ``userCounts`` into one row per
    user and hands the result to ``_create_user_dataset_usage_map``.

    Returns:
        The LazyFrame produced by ``_create_user_dataset_usage_map``.
    """
    dataset_entities = self.get_datasets()
    if self.config.set_upstream_table_max_modification_time_for_views:
        dataset_entities = self.set_table_modification_time_for_views(
            dataset_entities
        )

    usage = self.load_dataset_usage()

    # The join folds the right-hand key (entity_urn) into "urn", so the
    # "removed" flag is the column used to filter the joined rows. Rows
    # without a matching entity have a null flag and are dropped as well.
    joined = usage.join(
        dataset_entities, left_on="urn", right_on="entity_urn", how="left"
    )
    not_removed = polars.col("removed") == False  # noqa: E712
    live_usage = joined.filter(not_removed).drop(["removed"])

    # One row per (usage record, user); records without a user are discarded.
    per_user_rows = live_usage.explode("userCounts").unnest("userCounts")
    per_user_rows = per_user_rows.filter(polars.col("user").is_not_null())

    return self._create_user_dataset_usage_map(per_user_rows)
|
|
1450
|
+
|
|
1451
|
+
@staticmethod
|
|
1452
|
+
def _convert_top_datasets_to_dict(
|
|
1453
|
+
top_datasets_list: Optional[List[Dict[str, Any]]],
|
|
1454
|
+
) -> Optional[Dict[str, float]]:
|
|
1455
|
+
"""
|
|
1456
|
+
Convert list of top datasets structs to dictionary as expected by CorpUserUsageFeatures schema.
|
|
1457
|
+
|
|
1458
|
+
Args:
|
|
1459
|
+
top_datasets_list: List of dictionaries with 'dataset_urn' and 'count' keys
|
|
1460
|
+
|
|
1461
|
+
Returns:
|
|
1462
|
+
Dictionary mapping dataset URN to usage count, or None if input is empty
|
|
1463
|
+
"""
|
|
1464
|
+
if not top_datasets_list:
|
|
1465
|
+
return None
|
|
1466
|
+
|
|
1467
|
+
top_datasets_dict = {
|
|
1468
|
+
item["dataset_urn"]: float(item["count"])
|
|
1469
|
+
for item in top_datasets_list
|
|
1470
|
+
if isinstance(item, dict) and "dataset_urn" in item and "count" in item
|
|
1471
|
+
}
|
|
1472
|
+
|
|
1473
|
+
return top_datasets_dict if top_datasets_dict else None
|
|
1474
|
+
|
|
1475
|
+
def _create_user_dataset_usage_map(
    self, users_lf: polars.LazyFrame, top_n: int = 25
) -> polars.LazyFrame:
    """
    Aggregate per-user usage rows into totals and a top-N entity list.

    Args:
        users_lf: LazyFrame of user usage rows; the columns read here are
            user, urn, platform and count
        top_n: Maximum number of entities kept per user (default: 25)

    Returns:
        LazyFrame with one row per user and the columns:
        - user: the user identifier (only users containing "@" are kept)
        - top_datasets_map: list of structs (dataset_urn, count, platform_urn),
          ordered by count descending
        - userUsageTotalPast30Days: sum of all counts for the user
        - platform_usage_pairs: list of structs (platform_urn, platform_total),
          with platform_total cast to Float64
    """
    # Totals per (user, entity, platform), restricted to users containing "@".
    email_like_users = users_lf.filter(polars.col("user").str.contains("@"))
    per_entity_counts = email_like_users.group_by("user", "urn", "platform").agg(
        polars.col("count").sum().alias("total_count")
    )
    # Vectorized string formatting of the platform into a platform URN.
    platform_urn_expr = polars.format(
        "urn:li:dataPlatform:{}", polars.col("platform")
    ).alias("platform_urn")
    user_dataset_aggregated = per_entity_counts.with_columns(platform_urn_expr)

    # Overall total per user.
    user_totals = user_dataset_aggregated.group_by("user").agg(
        polars.col("total_count").sum().alias("userUsageTotalPast30Days")
    )

    # Per-user list of (platform_urn, platform_total) structs.
    per_platform = (
        user_dataset_aggregated.group_by("user", "platform_urn")
        .agg(polars.col("total_count").sum().alias("platform_total"))
        .filter(polars.col("platform_urn").is_not_null())
    )
    platform_pair = polars.struct(
        [
            polars.col("platform_urn"),
            polars.col("platform_total").cast(polars.Float64),
        ]
    ).alias("platform_usage_pairs")
    platform_totals = per_platform.group_by("user").agg(platform_pair)

    # Rank entities within each user and keep the top_n highest counts.
    rank_within_user = (
        polars.col("total_count")
        .rank(descending=True, method="ordinal")
        .over("user")
        .alias("dataset_rank")
    )
    top_entry = (
        polars.struct(
            [
                polars.col("urn").alias("dataset_urn"),
                polars.col("total_count").alias("count"),
                polars.col("platform_urn"),
            ]
        )
        .sort_by("total_count", descending=True)
        .alias("top_datasets_map")
    )
    top_datasets = (
        user_dataset_aggregated.with_columns(rank_within_user)
        .filter(polars.col("dataset_rank") <= top_n)
        .group_by("user")
        .agg(top_entry)
    )

    # Attach both kinds of totals to the top-N frame.
    with_user_totals = top_datasets.join(user_totals, on="user", how="left")
    return with_user_totals.join(platform_totals, on="user", how="left")
|
|
1554
|
+
|
|
1555
|
+
def _combine_user_usage_data(
    self,
    dataset_usage_lf: polars.LazyFrame,
    dashboard_usage_lf: polars.LazyFrame,
    chart_usage_lf: polars.LazyFrame,
) -> polars.LazyFrame:
    """
    Merge the dataset, dashboard and chart user-usage frames into one.

    Args:
        dataset_usage_lf: LazyFrame with dataset usage data containing top_datasets_map
        dashboard_usage_lf: LazyFrame with dashboard usage data
        chart_usage_lf: LazyFrame with chart usage data

    Returns:
        The output of ``_combine_user_totals`` left-joined on "user" with the
        output of ``_combine_platform_pairs``; a null ``platform_usage_pairs``
        value is replaced by an empty list.
    """
    sources = (dataset_usage_lf, dashboard_usage_lf, chart_usage_lf)

    totals_per_user = self._combine_user_totals(*sources)
    pairs_per_user = self._combine_platform_pairs(*sources)

    combined = totals_per_user.join(pairs_per_user, on="user", how="left")

    # Users without any platform rows come out of the left join as null.
    pairs_or_empty = polars.col("platform_usage_pairs").fill_null(polars.lit([]))
    return combined.with_columns(pairs_or_empty)
|
|
1585
|
+
|
|
1586
|
+
def _filter_users(self, users_lf: polars.LazyFrame) -> polars.LazyFrame:
    """
    Keep only rows whose ``user`` contains "@" and matches no excluded pattern.

    Each entry of ``self.config.excluded_platforms`` is passed to
    ``str.contains`` on the ``user`` column; a row matching any of them is
    dropped.
    """
    keep = polars.col("user").str.contains("@")
    for excluded in self.config.excluded_platforms:
        matches_excluded = polars.col("user").str.contains(excluded)
        keep = keep & ~matches_excluded

    return users_lf.filter(keep)
|
|
1594
|
+
|
|
1595
|
+
def _combine_user_totals(
    self,
    dataset_usage_lf: polars.LazyFrame,
    dashboard_usage_lf: polars.LazyFrame,
    chart_usage_lf: polars.LazyFrame,
) -> polars.LazyFrame:
    """
    Combine user totals and top_datasets_map from all sources.

    The user set is the de-duplicated union of the ``user`` columns of the
    three frames, passed through ``_filter_users``. ``top_datasets_map`` is
    taken from the dataset frame only; ``userUsageTotalPast30Days`` is the
    sum of the three per-source totals, with a missing total counted as 0.

    Returns:
        LazyFrame with columns user, top_datasets_map, userUsageTotalPast30Days
    """
    # Every user seen in any source, once, after user filtering.
    user_columns = [
        frame.select("user")
        for frame in (dataset_usage_lf, dashboard_usage_lf, chart_usage_lf)
    ]
    all_users_lf = self._filter_users(polars.concat(user_columns).unique())

    dataset_part = dataset_usage_lf.select(
        ["user", "top_datasets_map", "userUsageTotalPast30Days"]
    )
    dashboard_part = dashboard_usage_lf.select(["user", "userUsageTotalPast30Days"])
    chart_part = chart_usage_lf.select(["user", "userUsageTotalPast30Days"])

    # The suffixes keep the dashboard/chart totals apart from the dataset total.
    joined = all_users_lf.join(dataset_part, on="user", how="left")
    joined = joined.join(dashboard_part, on="user", how="left", suffix="_dashboard")
    joined = joined.join(chart_part, on="user", how="left", suffix="_chart")

    # A user absent from a source has a null total there; treat it as 0.
    combined_total = (
        polars.col("userUsageTotalPast30Days").fill_null(0)
        + polars.col("userUsageTotalPast30Days_dashboard").fill_null(0)
        + polars.col("userUsageTotalPast30Days_chart").fill_null(0)
    ).alias("userUsageTotalPast30Days")

    return joined.with_columns([combined_total]).select(
        ["user", "top_datasets_map", "userUsageTotalPast30Days"]
    )
|
|
1647
|
+
|
|
1648
|
+
def _combine_platform_pairs(
    self,
    dataset_usage_lf: polars.LazyFrame,
    dashboard_usage_lf: polars.LazyFrame,
    chart_usage_lf: polars.LazyFrame,
) -> polars.LazyFrame:
    """
    Combine platform usage pairs from all sources.

    Each source's ``platform_usage_pairs`` column is flattened through
    ``_extract_platforms_from_source``; the flattened rows are stacked,
    summed per (user, platform_urn) and regrouped into one list of
    (platform_urn, platform_total) structs per user.

    Returns:
        LazyFrame with columns user and platform_usage_pairs; an empty frame
        with those two columns when no source yielded platform rows.
    """
    extracted = [
        self._extract_platforms_from_source(frame, "platform_usage_pairs")
        for frame in (dataset_usage_lf, dashboard_usage_lf, chart_usage_lf)
    ]
    all_platforms = [frame for frame in extracted if frame is not None]

    if not all_platforms:
        # Return empty result if no platforms found
        return polars.LazyFrame({"user": [], "platform_usage_pairs": []})

    # Stack every source, then total per user + platform.
    stacked = polars.concat(all_platforms, how="vertical_relaxed")
    summed_total = polars.col("platform_total").sum().alias("platform_total")
    aggregated = stacked.group_by("user", "platform_urn").agg(summed_total)

    # Rebuild the platform_usage_pairs list-of-structs shape.
    pair = polars.struct(
        [polars.col("platform_urn"), polars.col("platform_total")]
    ).alias("platform_usage_pairs")
    return aggregated.group_by("user").agg(pair)
|
|
1683
|
+
|
|
1684
|
+
def _extract_platforms_from_source(
    self, source_lf: polars.LazyFrame, col_name: str
) -> Optional[polars.LazyFrame]:
    """
    Flatten a list-of-structs platform column into one row per platform.

    Args:
        source_lf: LazyFrame holding a ``user`` column and the column ``col_name``
        col_name: Name of the list column whose structs carry
            ``platform_urn`` and ``platform_total`` fields

    Returns:
        LazyFrame with columns user, platform_urn, platform_total, skipping
        null/empty lists and null platform URNs; None when polars raises
        ColumnNotFoundError while the query is being built.
    """
    pairs = polars.col(col_name)
    try:
        return (
            source_lf.select(["user", col_name])
            .filter(pairs.is_not_null())
            .filter(pairs.list.len() > 0)
            .explode(col_name)
            .unnest(col_name)
            .filter(polars.col("platform_urn").is_not_null())
            .select(["user", "platform_urn", "platform_total"])
        )
    except polars.exceptions.ColumnNotFoundError:
        # NOTE(review): these are lazy operations, so a missing column is
        # presumably only reported when the plan is collected and this
        # branch may never trigger - confirm against the polars version in use.
        return None
|
|
1700
|
+
|
|
1701
|
+
def add_platform_usage_percentiles(
    self, user_usage_lf: polars.LazyFrame
) -> polars.LazyFrame:
    """
    Attach per-platform usage percentiles to the user usage frame.

    Args:
        user_usage_lf: LazyFrame with user usage data containing platform_usage_pairs column

    Returns:
        ``user_usage_lf`` left-joined on "user" with a new
        platform_usage_percentiles column: a list of structs holding
        platform_urn and platform_rank_percentile (cast to Float64).
    """
    # One row per (user, platform); rows without a platform URN are dropped.
    per_platform_rows = user_usage_lf.explode("platform_usage_pairs").unnest(
        "platform_usage_pairs"
    )
    per_platform_rows = per_platform_rows.filter(
        polars.col("platform_urn").is_not_null()
    )

    # Reuse the shared ranking helper; the code below reads its
    # "platform_rank_percentile" output column.
    ranked = self.gen_rank_and_percentile(
        lf=per_platform_rows,
        count_field="platform_total",
        urn_field="user",
        platform_field="platform_urn",
        prefix="platform_",
        use_exp_cdf=False,
    )

    # Fold the per-platform rows back into one list of structs per user.
    percentile_entry = polars.struct(
        [
            polars.col("platform_urn"),
            polars.col("platform_rank_percentile").cast(polars.Float64),
        ]
    ).alias("platform_usage_percentiles")
    percentiles_per_user = ranked.group_by("user").agg(percentile_entry)

    return user_usage_lf.join(percentiles_per_user, on="user", how="left")
|
|
1742
|
+
|
|
1743
|
+
def _generate_user_usage_for_dashboard_charts(
    self, entity_index: str, usage_index: str
) -> polars.LazyFrame:
    """
    Build the per-user usage frame for dashboards or charts.

    Args:
        entity_index: Index name passed to ``_generate_dashboard_chart_entities``
        usage_index: Index name passed to ``_generate_dashboard_chart_usage``

    Returns:
        The LazyFrame produced by ``_create_user_dataset_usage_map`` for the
        exploded per-user usage rows.
    """
    entities = self._generate_dashboard_chart_entities(entity_index)
    usage = self._generate_dashboard_chart_usage(entities, usage_index)

    # One row per (usage record, user), dropping records without a user.
    per_user_rows = usage.explode("userCounts").unnest("userCounts")
    per_user_rows = per_user_rows.filter(polars.col("user").is_not_null())

    # The shared aggregation reads a "count" column, as the dataset path does.
    per_user_rows = per_user_rows.rename({"usageCount": "count"})

    return self._create_user_dataset_usage_map(per_user_rows)
|
|
1759
|
+
|
|
1760
|
+
def generate_user_usage(self) -> polars.LazyFrame:
    """
    Produce the combined per-user usage frame across datasets, dashboards and charts.

    The three per-source frames are merged with ``_combine_user_usage_data``,
    extended by ``add_platform_usage_percentiles`` and then ranked across
    all users on ``userUsageTotalPast30Days``. The resulting
    ``userUsagerank_percentile`` column is renamed to
    ``userUsagePercentilePast30Days``.
    """
    dataset_usage_lf = self._generate_user_usage_for_dataset()
    dashboard_usage_lf = self._generate_user_usage_for_dashboard_charts(
        entity_index="dashboardindex_v2",
        usage_index="dashboard_dashboardusagestatisticsaspect_v1",
    )
    chart_usage_lf = self._generate_user_usage_for_dashboard_charts(
        entity_index="chartindex_v2",
        usage_index="chart_chartusagestatisticsaspect_v1",
    )

    combined = self._combine_user_usage_data(
        dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
    )
    with_platform_percentiles = self.add_platform_usage_percentiles(combined)

    # The ranking helper takes a platform field; a constant column puts
    # every user in the same group so the ranking spans all users.
    single_group = with_platform_percentiles.with_columns(
        polars.lit("all_users").alias("temp_platform")
    )
    ranked = self.gen_rank_and_percentile(
        lf=single_group,
        count_field="userUsageTotalPast30Days",
        urn_field="user",
        platform_field="temp_platform",
        prefix="userUsage",
        use_exp_cdf=False,
    )

    # Expose the percentile under its schema field name, drop the helper column.
    renamed = ranked.rename(
        {"userUsagerank_percentile": "userUsagePercentilePast30Days"}
    )
    return renamed.drop("temp_platform")
|
|
1801
|
+
|
|
1368
1802
|
def generate_dataset_usage(self) -> polars.LazyFrame:
|
|
1369
1803
|
datasets_lf = self.get_datasets()
|
|
1370
1804
|
if self.config.set_upstream_table_max_modification_time_for_views:
|
|
@@ -1485,11 +1919,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1485
1919
|
# called `Option::unwrap()` on a `None` value
|
|
1486
1920
|
# Which only happens if we don't collect immediately
|
|
1487
1921
|
# return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True).collect().lazy()
|
|
1488
|
-
return (
|
|
1489
|
-
polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
|
|
1490
|
-
.collect()
|
|
1491
|
-
.lazy()
|
|
1492
|
-
)
|
|
1922
|
+
return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
|
|
1493
1923
|
|
|
1494
1924
|
def load_dataset_usage(self) -> polars.LazyFrame:
|
|
1495
1925
|
index = "dataset_datasetusagestatisticsaspect_v1"
|
|
@@ -1606,23 +2036,40 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1606
2036
|
delay: Optional[float] = None,
|
|
1607
2037
|
) -> Iterable[Dict[str, Any]]:
|
|
1608
2038
|
processed_count = 0
|
|
2039
|
+
scroll_id = None
|
|
1609
2040
|
while True:
|
|
1610
2041
|
with PerfTimer() as timer:
|
|
1611
2042
|
logger.debug(f"ES query: {query}")
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
|
|
1616
|
-
|
|
1617
|
-
|
|
1618
|
-
|
|
1619
|
-
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
2043
|
+
if not scroll_id:
|
|
2044
|
+
logger.debug(
|
|
2045
|
+
f"Getting inital data from index {index} without scroll id"
|
|
2046
|
+
)
|
|
2047
|
+
results = server.search(
|
|
2048
|
+
body=query,
|
|
2049
|
+
size=batch_size,
|
|
2050
|
+
scroll="2m",
|
|
2051
|
+
index=index,
|
|
2052
|
+
params=(
|
|
2053
|
+
{"timeout": self.config.query_timeout}
|
|
2054
|
+
if self.config.search_index.opensearch_dialect
|
|
2055
|
+
else {"request_timeout": self.config.query_timeout}
|
|
2056
|
+
),
|
|
2057
|
+
)
|
|
2058
|
+
else:
|
|
2059
|
+
logger.debug(
|
|
2060
|
+
f"Getting data from index {index} using scroll_id: {scroll_id}"
|
|
2061
|
+
)
|
|
2062
|
+
results = server.scroll(
|
|
2063
|
+
scroll_id=scroll_id,
|
|
2064
|
+
scroll="2m",
|
|
2065
|
+
params=(
|
|
2066
|
+
{"timeout": self.config.query_timeout}
|
|
2067
|
+
if self.config.search_index.opensearch_dialect
|
|
2068
|
+
else {"request_timeout": self.config.query_timeout}
|
|
2069
|
+
),
|
|
2070
|
+
)
|
|
2071
|
+
scroll_id = results["_scroll_id"]
|
|
2072
|
+
|
|
1626
2073
|
if not aggregation_key:
|
|
1627
2074
|
yield from process_function(results["hits"]["hits"])
|
|
1628
2075
|
|
|
@@ -1633,7 +2080,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1633
2080
|
)
|
|
1634
2081
|
if len(results["hits"]["hits"]) < batch_size:
|
|
1635
2082
|
break
|
|
1636
|
-
query.update({"search_after": results["hits"]["hits"][-1]["sort"]})
|
|
1637
2083
|
else:
|
|
1638
2084
|
yield from process_function(
|
|
1639
2085
|
results["aggregations"][aggregation_key]["buckets"]
|
|
@@ -1643,16 +2089,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1643
2089
|
< batch_size
|
|
1644
2090
|
):
|
|
1645
2091
|
break
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
if delay:
|
|
1652
|
-
logger.debug(
|
|
1653
|
-
f"Sleeping for {delay} seconds before getting next batch from ES"
|
|
1654
|
-
)
|
|
1655
|
-
time.sleep(delay)
|
|
2092
|
+
if delay:
|
|
2093
|
+
logger.debug(
|
|
2094
|
+
f"Sleeping for {delay} seconds before getting next batch from ES"
|
|
2095
|
+
)
|
|
2096
|
+
time.sleep(delay)
|
|
1656
2097
|
|
|
1657
|
-
def get_report(self) ->
|
|
2098
|
+
def get_report(self) -> "DatahubUsageFeatureReport":
    """Return the report object stored on this source as ``self.report``."""
    report = self.report
    return report
|