acryl-datahub-cloud 0.3.10rc4__py3-none-any.whl → 0.3.16.1rc0__py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py +1 -1
- acryl_datahub_cloud/acryl_cs_issues/models.py +5 -3
- acryl_datahub_cloud/action_request/action_request_owner_source.py +37 -8
- acryl_datahub_cloud/datahub_forms_notifications/__init__.py +0 -0
- acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +569 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_feature_flag.gql +7 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
- acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
- acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
- acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
- acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +39 -19
- acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +60 -25
- acryl_datahub_cloud/datahub_reporting/extract_graph.py +9 -3
- acryl_datahub_cloud/datahub_reporting/extract_sql.py +248 -52
- acryl_datahub_cloud/datahub_reporting/forms.py +1 -1
- acryl_datahub_cloud/datahub_reporting/forms_config.py +3 -2
- acryl_datahub_cloud/datahub_restore/source.py +3 -2
- acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
- acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +532 -109
- acryl_datahub_cloud/elasticsearch/graph_service.py +76 -14
- acryl_datahub_cloud/graphql_utils.py +64 -0
- acryl_datahub_cloud/lineage_features/source.py +555 -49
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +2390 -1938
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionworkflow/__init__.py +53 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/anomaly/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +6 -2
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/conversation/__init__.py +29 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +8 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +8 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/knowledge/__init__.py +33 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +14 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/monitor/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +28 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- acryl_datahub_cloud/metadata/schema.avsc +27843 -23200
- acryl_datahub_cloud/metadata/schema_classes.py +29901 -24310
- acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +235 -2
- acryl_datahub_cloud/metadata/schemas/ActionWorkflowInfo.avsc +683 -0
- acryl_datahub_cloud/metadata/schemas/ActionWorkflowKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
- acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
- acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +75 -0
- acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +375 -212
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +147 -20
- acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +191 -21
- acryl_datahub_cloud/metadata/schemas/{AssertionSummary.avsc → AssertionRunSummary.avsc} +15 -2
- acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -0
- acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
- acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +20 -6
- acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +16 -5
- acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +18 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
- acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +304 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +86 -0
- acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +11 -5
- acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataContractKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +15 -5
- acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataHubAiConversationInfo.avsc +256 -0
- acryl_datahub_cloud/metadata/schemas/DataHubAiConversationKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +234 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +308 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +13 -4
- acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +6 -3
- acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +10 -2
- acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +12 -5
- acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DocumentInfo.avsc +407 -0
- acryl_datahub_cloud/metadata/schemas/DocumentKey.avsc +35 -0
- acryl_datahub_cloud/metadata/schemas/DocumentSettings.avsc +79 -0
- acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
- acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
- acryl_datahub_cloud/metadata/schemas/FormKey.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +30 -0
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +416 -0
- acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +71 -1
- acryl_datahub_cloud/metadata/schemas/InputFields.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
- acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
- acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +145 -0
- acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +7 -1
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +424 -97
- acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +65 -44
- acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +84 -29
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +221 -23
- acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +128 -3
- acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +91 -4
- acryl_datahub_cloud/metadata/schemas/Operation.avsc +17 -0
- acryl_datahub_cloud/metadata/schemas/Ownership.avsc +71 -1
- acryl_datahub_cloud/metadata/schemas/QueryProperties.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +2 -13
- acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/SemanticContent.avsc +123 -0
- acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +136 -5
- acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +147 -0
- acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +10 -0
- acryl_datahub_cloud/metadata/schemas/__init__.py +3 -3
- acryl_datahub_cloud/notifications/__init__.py +0 -0
- acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
- acryl_datahub_cloud/sdk/__init__.py +69 -0
- acryl_datahub_cloud/sdk/assertion/__init__.py +58 -0
- acryl_datahub_cloud/sdk/assertion/assertion_base.py +779 -0
- acryl_datahub_cloud/sdk/assertion/column_metric_assertion.py +191 -0
- acryl_datahub_cloud/sdk/assertion/column_value_assertion.py +431 -0
- acryl_datahub_cloud/sdk/assertion/freshness_assertion.py +201 -0
- acryl_datahub_cloud/sdk/assertion/schema_assertion.py +268 -0
- acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +212 -0
- acryl_datahub_cloud/sdk/assertion/smart_freshness_assertion.py +165 -0
- acryl_datahub_cloud/sdk/assertion/smart_sql_assertion.py +156 -0
- acryl_datahub_cloud/sdk/assertion/smart_volume_assertion.py +162 -0
- acryl_datahub_cloud/sdk/assertion/sql_assertion.py +273 -0
- acryl_datahub_cloud/sdk/assertion/types.py +20 -0
- acryl_datahub_cloud/sdk/assertion/volume_assertion.py +156 -0
- acryl_datahub_cloud/sdk/assertion_client/__init__.py +0 -0
- acryl_datahub_cloud/sdk/assertion_client/column_metric.py +545 -0
- acryl_datahub_cloud/sdk/assertion_client/column_value.py +617 -0
- acryl_datahub_cloud/sdk/assertion_client/freshness.py +371 -0
- acryl_datahub_cloud/sdk/assertion_client/helpers.py +166 -0
- acryl_datahub_cloud/sdk/assertion_client/schema.py +358 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_column_metric.py +540 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_freshness.py +373 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_sql.py +411 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_volume.py +380 -0
- acryl_datahub_cloud/sdk/assertion_client/sql.py +410 -0
- acryl_datahub_cloud/sdk/assertion_client/volume.py +446 -0
- acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
- acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +1470 -0
- acryl_datahub_cloud/sdk/assertion_input/column_assertion_constants.py +114 -0
- acryl_datahub_cloud/sdk/assertion_input/column_assertion_utils.py +284 -0
- acryl_datahub_cloud/sdk/assertion_input/column_metric_assertion_input.py +759 -0
- acryl_datahub_cloud/sdk/assertion_input/column_metric_constants.py +109 -0
- acryl_datahub_cloud/sdk/assertion_input/column_value_assertion_input.py +810 -0
- acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +305 -0
- acryl_datahub_cloud/sdk/assertion_input/schema_assertion_input.py +413 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +793 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_freshness_assertion_input.py +218 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_sql_assertion_input.py +181 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_volume_assertion_input.py +189 -0
- acryl_datahub_cloud/sdk/assertion_input/sql_assertion_input.py +320 -0
- acryl_datahub_cloud/sdk/assertion_input/volume_assertion_input.py +635 -0
- acryl_datahub_cloud/sdk/assertions_client.py +1074 -0
- acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
- acryl_datahub_cloud/sdk/entities/assertion.py +439 -0
- acryl_datahub_cloud/sdk/entities/monitor.py +291 -0
- acryl_datahub_cloud/sdk/entities/subscription.py +100 -0
- acryl_datahub_cloud/sdk/errors.py +34 -0
- acryl_datahub_cloud/sdk/resolver_client.py +42 -0
- acryl_datahub_cloud/sdk/subscription_client.py +737 -0
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/METADATA +49 -43
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/RECORD +243 -145
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/WHEEL +1 -1
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/entry_points.txt +1 -0
- {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/top_level.txt +0 -0
acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py
@@ -22,12 +22,14 @@ from polars.datatypes import DataTypeClass
 from pydantic import Field
 from scipy.stats import expon

+from acryl_datahub_cloud.datahub_usage_reporting.excluded import EXCLUDED_PATTERNS
 from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder
 from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
     UsageFeaturePatchBuilder,
 )
 from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
 from acryl_datahub_cloud.metadata.schema_classes import (
+    CorpUserUsageFeaturesClass,
     QueryUsageFeaturesClass,
     UsageFeaturesClass,
 )
@@ -40,7 +42,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DatahubClientConfig
@@ -114,12 +116,12 @@ class DataHubUsageFeatureReportingSourceConfig(
         30, description="Timeout in seconds for the search queries."
     )
     extract_batch_size: int = Field(
-
+        5000,
         description="The number of documents to retrieve in each batch from ElasticSearch or OpenSearch.",
     )

     extract_delay: Optional[float] = Field(
-        0
+        0,
         description="The delay in seconds between each batch extraction from ElasticSearch or OpenSearch.",
     )

@@ -135,6 +137,10 @@ class DataHubUsageFeatureReportingSourceConfig(
         None,
         description="Optional configuration for stateful ingestion, including stale metadata removal.",
     )
+    user_usage_enabled: bool = Field(
+        True,
+        description="Flag to enable or disable user usage statistics collection.",
+    )
     dataset_usage_enabled: bool = Field(
         True,
         description="Flag to enable or disable dataset usage statistics collection.",
@@ -177,7 +183,7 @@ class DataHubUsageFeatureReportingSourceConfig(
     # This option is only needed here until we are sure that the streaming mode is stable.
     # then we can remove it and control it with the streaming_mode option.
     experimental_full_streaming: bool = Field(
-
+        True,
         description="Flag to enable full streaming mode.'",
     )

@@ -191,6 +197,11 @@ class DataHubUsageFeatureReportingSourceConfig(
         description="Flag to generate MCP patch for usage features.'",
     )

+    excluded_platforms: List[str] = Field(
+        EXCLUDED_PATTERNS,
+        description="List of platforms to exclude from usage statistics collection. This is done to avoid invite user functionality to be filled with service accounts.",
+    )
+

 def exp_cdf(series: polars.Series) -> polars.Series:
     with PerfTimer() as timer:
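A minimal sketch of how the new config knobs compose. The import path matches the file list above and the field names come from the hunks; that the class can be constructed this way (i.e., that all remaining fields have workable defaults) is an assumption, not something the diff confirms:

# Hypothetical usage sketch; construction details are assumed.
from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_reporter import (
    DataHubUsageFeatureReportingSourceConfig,
)

config = DataHubUsageFeatureReportingSourceConfig(
    user_usage_enabled=True,          # new in this release
    excluded_platforms=["airflow"],   # overrides the EXCLUDED_PATTERNS default
    extract_batch_size=5000,
    extract_delay=0,
)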
@@ -228,7 +239,7 @@ def exp_cdf(series: polars.Series) -> polars.Series:


 @dataclass
-class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
+class DatahubUsageFeatureReport(StatefulIngestionReport, IngestionStageReport):
     dataset_platforms_count: Dict[str, int] = field(
         default_factory=lambda: defaultdict(lambda: 0)
     )
@@ -241,10 +252,6 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
         default_factory=lambda: defaultdict(lambda: PerfTimer())
     )

-    dataset_usage_processing_time: PerfTimer = PerfTimer()
-    dashboard_usage_processing_time: PerfTimer = PerfTimer()
-    chart_usage_processing_time: PerfTimer = PerfTimer()
-    query_usage_processing_time: PerfTimer = PerfTimer()
     query_platforms_count: Dict[str, int] = field(
         default_factory=lambda: defaultdict(lambda: 0)
     )
@@ -395,18 +402,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 "last_modified_at": (
                     doc["_source"]["lastModifiedAt"]
                     if "lastModifiedAt" in doc["_source"]
-                    else (
-                        doc["_source"]["lastModifiedAt"]
-                        if "lastModifiedAt" in doc["_source"]
-                        else None
-                    )
+                    else (doc["_source"].get("lastModifiedAt", None))
                 ),
                 "platform": doc["_source"]["platform"],
-                "removed": (
-                    doc["_source"]["removed"]
-                    if "removed" in doc["_source"]
-                    else False
-                ),
+                "removed": (doc["_source"].get("removed", False)),
             }

             time_taken = timer.elapsed_seconds()
@@ -509,11 +508,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 "eventGranularity": doc["_source"].get("eventGranularity"),
                 "totalSqlQueries": doc["_source"].get("totalSqlQueries", 0),
                 "uniqueUserCount": doc["_source"].get("uniqueUserCount", 0),
-                "userCounts": (
-                    doc["_source"]["event"]["userCounts"]
-                    if "userCounts" in doc["_source"]["event"]
-                    else None
-                ),
+                "userCounts": (doc["_source"]["event"].get("userCounts", None)),
                 "platform": platform,
             }
         except KeyError as e:
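This hunk and the previous one are pure simplifications: for any dict, `d[k] if k in d else default` is equivalent to `d.get(k, default)`. A quick standalone check on a toy document:

source = {"platform": "snowflake"}  # no "removed" or "lastModifiedAt" key
assert (source["removed"] if "removed" in source else False) == source.get("removed", False)
assert (source["lastModifiedAt"] if "lastModifiedAt" in source else None) == source.get("lastModifiedAt", None)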
@@ -525,7 +520,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         time_taken = timer.elapsed_seconds()
         logger.info(f"DatasetUsage processing took {time_taken:.3f} seconds")

-    def search_score(
+    def search_score(  # noqa: C901
         self, urn: str, last_update_time: int, usage_percentile: int
     ) -> SearchRankingMultipliers:
         usage_search_score_multiplier = 1.0
@@ -622,27 +617,27 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 [endpoint],
                 http_auth=(user, password),
                 use_ssl=(
-
-
-
-
+                    bool(
+                        self.config.search_index
+                        and self.config.search_index.use_ssl
+                    )
                 ),
             )

-            response = server.create_pit(index, keep_alive="10m")
+            # response = server.create_pit(index, keep_alive="10m")

             # TODO: Save PIT, we can resume processing based on <pit, search_after> tuple
-            pit = response.get("pit_id")
-            query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
+            # pit = response.get("pit_id")
+            # query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
         else:
             server = Elasticsearch(
                 [endpoint],
                 http_auth=(user, password),
                 use_ssl=(
-
-
-
-
+                    bool(
+                        self.config.search_index
+                        and self.config.search_index.use_ssl
+                    )
                 ),
             )

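Wrapping the optional config chain in `bool(...)` keeps `use_ssl` a strict boolean even when `search_index` is unset. A standalone illustration (the `_Index` class is a hypothetical stand-in for the search-index config object):

class _Index:  # stand-in for the search_index config object
    use_ssl = True

for search_index in (None, _Index()):
    # Without bool(), `None and ...` would pass None to the client.
    print(bool(search_index and search_index.use_ssl))  # False, then True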
@@ -737,23 +732,26 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             polars.Duration(): pa.duration("ns"),
         }

-        if polars_dtype in [type(key) for key in type_mapping
+        if polars_dtype in [type(key) for key in type_mapping]:
             return type_mapping[polars_dtype]
         elif polars_dtype == polars.Categorical:
             return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
         elif isinstance(polars_dtype, polars.Struct):
             return pa.struct(
-
-                field.name
+                [
+                    pa.field(field.name, convert_dtype(field.dtype))
                     for field in polars_dtype.fields
-
+                ]
             )
         elif isinstance(polars_dtype, polars.List):
             return pa.list_(convert_dtype(polars_dtype.inner))
         else:
             raise ValueError(f"Unsupported Polars dtype: {polars_dtype}")

-        fields = [
+        fields = [
+            pa.field(name, convert_dtype(dtype))
+            for name, dtype in polars_schema.items()
+        ]
         return pa.schema(fields)

     def batch_write_parquet(
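The comprehension fixes let `convert_dtype` recurse cleanly through nested Struct and List types. A trimmed, runnable sketch of that recursion, reduced to the two nested cases plus one scalar (the real mapping table covers many more dtypes):

import polars
import pyarrow as pa

def convert_dtype(dtype):
    # Mirrors the recursion in the hunk above, reduced to three cases.
    if isinstance(dtype, polars.Struct):
        return pa.struct([pa.field(f.name, convert_dtype(f.dtype)) for f in dtype.fields])
    if isinstance(dtype, polars.List):
        return pa.list_(convert_dtype(dtype.inner))
    if dtype == polars.Int64():
        return pa.int64()
    raise ValueError(f"Unsupported Polars dtype: {dtype}")

print(convert_dtype(polars.List(polars.Struct({"n": polars.Int64}))))
# expected: list<item: struct<n: int64>>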
@@ -846,7 +844,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .drop(["removed"])
         )

-        return wdf
+        return wdf

     def load_write_usage_server_side_aggregation(
         self, soft_deleted_entities_df: polars.LazyFrame
@@ -935,6 +933,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

         return dataset_df

+    def generate_user_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
+        with polars.StringCache():
+            user_usage_lf = self.generate_user_usage()
+            yield from self.generate_user_usage_mcp_from_lazyframe(user_usage_lf)
+
     def generate_dataset_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
         with polars.StringCache():
             dataset_usage_df = self.generate_dataset_usage()
@@ -970,48 +973,35 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         ]

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if self.config.user_usage_enabled:
+            with self.report.new_stage("generate user usage"):
+                yield from self.generate_user_usage_mcps()
+
         if self.config.dataset_usage_enabled:
-            with self.report.
-            self.report.new_stage("generate dataset usage")
+            with self.report.new_stage("generate dataset usage"):
                 yield from self.generate_dataset_usage_mcps()
-                time_taken = timer.elapsed_seconds()
-                logger.info(f"Dataset Usage generation took {time_taken:.3f} seconds")

         if self.config.dashboard_usage_enabled:
-            with self.report.
-            self.report.new_stage("generate dashboard usage")
+            with self.report.new_stage("generate dashboard usage"):
                 yield from self.generate_dashboard_usage_mcps()

-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Dashboard Usage generation took {time_taken:.3f}")
-
         if self.config.chart_usage_enabled:
-            with self.report.
-            self.report.new_stage("generate chart usage")
-
+            with self.report.new_stage("generate chart usage"):
                 yield from self.generate_chart_usage_mcps()

-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Chart Usage generation took {time_taken:.3f}")
-
         if self.config.query_usage_enabled:
-            with self.report.
-            self.report.new_stage("generate query usage")
-
+            with self.report.new_stage("generate query usage"):
                 yield from self.generate_query_usage_mcps()

-
-
+        with self.report.new_stage("end so time is calculated for last stage"):
+            pass

     def generate_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-        num = 0
         for row in lazy_frame.collect(
-            streaming
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
         ).to_struct():
-            num += 1
-
             if "siblings" in row and row["siblings"]:
                 logger.info(f"Siblings found for urn: {row['urn']} -> row['siblings']")

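get_workunits_internal now wraps each phase in `with self.report.new_stage(...)` instead of hand-rolled timers, and the trailing no-op stage suggests a stage's elapsed time is recorded when the next stage begins. A hedged stand-in reproducing that assumed behavior (the real `new_stage` belongs to DataHub's IngestionStageReport and may differ):

import time
from contextlib import contextmanager

class _StageReport:  # hypothetical stand-in; next-stage-start timing is assumed
    def __init__(self):
        self._name = None
        self._start = 0.0

    @contextmanager
    def new_stage(self, name):
        if self._name is not None:
            print(f"{self._name} took {time.perf_counter() - self._start:.3f} seconds")
        self._name, self._start = name, time.perf_counter()
        yield

report = _StageReport()
with report.new_stage("generate dataset usage"):
    time.sleep(0.01)
with report.new_stage("end so time is calculated for last stage"):
    pass  # flushes the timing of the previous stage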
@@ -1067,7 +1057,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 uniqueUserPercentileLast30Days=int(
                     row.get("distinct_user_rank_percentile", 0) or 0
                 ),
-                writeCountLast30Days=int(row.get("
+                writeCountLast30Days=int(row.get("write_count", 0) or 0)
                 if not self.config.disable_write_usage
                 else None,
                 writeCountPercentileLast30Days=int(
@@ -1101,10 +1091,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_query_usage_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-
-
-
-
+        for row in lazy_frame.collect(
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
+        ).iter_rows(named=True):
             query_usage_features = QueryUsageFeaturesClass(
                 queryCountLast30Days=int(row.get("totalSqlQueries", 0) or 0),
                 queryCountTotal=None,  # This is not implemented
@@ -1124,6 +1113,47 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 row["urn"], query_usage_features
             )

+    def _convert_platform_pairs_to_dict(
+        self,
+        platform_pairs: Optional[List[Dict[str, Any]]],
+        value_key: str = "platform_total",
+    ) -> Optional[Dict[str, Any]]:
+        """Convert list of platform usage structs to dictionary."""
+        if not platform_pairs:
+            return None
+
+        return {
+            pair["platform_urn"]: pair[value_key]
+            for pair in platform_pairs
+            if pair["platform_urn"] is not None
+        }
+
+    def generate_user_usage_mcp_from_lazyframe(
+        self, lazy_frame: polars.LazyFrame
+    ) -> Iterable[MetadataWorkUnit]:
+        for row in lazy_frame.collect(
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
+        ).iter_rows(named=True):
+            user_usage_features = CorpUserUsageFeaturesClass(
+                userUsageTotalPast30Days=int(
+                    row.get("userUsageTotalPast30Days", 0) or 0
+                ),
+                userPlatformUsageTotalsPast30Days=self._convert_platform_pairs_to_dict(
+                    row.get("platform_usage_pairs", [])
+                ),
+                userPlatformUsagePercentilePast30Days=self._convert_platform_pairs_to_dict(
+                    row.get("platform_usage_percentiles", []),
+                    "platform_rank_percentile",
+                ),
+                userUsagePercentilePast30Days=row.get("userUsagePercentilePast30Days"),
+                userTopDatasetsByUsage=self._convert_top_datasets_to_dict(
+                    row.get("top_datasets_map", [])
+                ),
+            )
+            yield MetadataChangeProposalWrapper(
+                entityUrn=row["user"], aspect=user_usage_features
+            ).as_workunit(is_primary_source=False)
+
     def generate_usage_feature_mcp(
         self, urn: str, usage_feature: UsageFeaturesClass
     ) -> Iterable[MetadataWorkUnit]:
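The struct-list to dict conversion is simple enough to check by hand; a standalone re-run of the comprehension used in `_convert_platform_pairs_to_dict`, on invented data:

pairs = [
    {"platform_urn": "urn:li:dataPlatform:snowflake", "platform_total": 42.0},
    {"platform_urn": "urn:li:dataPlatform:looker", "platform_total": 7.0},
    {"platform_urn": None, "platform_total": 3.0},  # dropped: no platform urn
]
converted = {
    pair["platform_urn"]: pair["platform_total"]
    for pair in pairs
    if pair["platform_urn"] is not None
}
print(converted)
# {'urn:li:dataPlatform:snowflake': 42.0, 'urn:li:dataPlatform:looker': 7.0}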
@@ -1158,9 +1188,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

         return self.generate_dashboard_chart_usage(entity_index, usage_index)

-    def generate_dashboard_chart_usage(
-        self, entity_index: str, usage_index: str
-    ) -> polars.LazyFrame:
+    def _generate_dashboard_chart_entities(self, entity_index: str) -> polars.LazyFrame:
         entity_schema = {
             "entity_urn": polars.Categorical,
             "removed": polars.Boolean,
@@ -1177,7 +1205,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             process_function=self.soft_deleted_batch,
         )

-        entities_usage_schema = {
+        return entities_df
+
+    def _generate_dashboard_chart_usage(
+        self, entities_df: polars.LazyFrame, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_usage_schema = {
             "timestampMillis": polars.Int64,
             "lastObserved": polars.Int64,
             "urn": polars.Categorical,
@@ -1195,7 +1228,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         }

         lf = self.load_data_from_es_to_lf(
-            schema=
+            schema=entities_usage_schema,
             index=usage_index,
             query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
             process_function=self.process_dashboard_usage,
@@ -1214,6 +1247,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .alias("row_num")
         ).filter(polars.col("row_num") == 1)

+        return lf
+
+    def generate_dashboard_chart_usage(
+        self, entity_index: str, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_df = self._generate_dashboard_chart_entities(entity_index)
+
+        lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
+
         # lf = lf.filter(polars.col("urn") == "urn:li:dashboard:(looker,dashboards.8)")
         # "urn:li:dashboard:(looker,dashboards.8)"

@@ -1287,7 +1329,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 .is_not_null()
                 # We only want to downrank datasets that have a search score multiplier greater than 1. 1 is the minimum score of a dataset
                 .and_(polars.col("combinedSearchRankingMultiplier").ne(1))
-            )
+            )
             .filter(polars.col("removed") == False)  # noqa: E712
             .drop(["removed"])
             .drop(["last_modified_at"])
@@ -1326,7 +1368,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         query_entities = self.load_data_from_es_to_lf(
             schema=query_entities_schema,
             index=entity_index,
-            query=QueryBuilder.get_query_entities_query(),
+            query=QueryBuilder.get_query_entities_query(self.config.lookback_days),
             process_function=self.queries_entities_batch,
         )

@@ -1383,6 +1425,380 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

         return usage_with_top_users_with_ranks

+    def _generate_user_usage_for_dataset(self) -> polars.LazyFrame:
+        datasets_lf = self.get_datasets()
+        if self.config.set_upstream_table_max_modification_time_for_views:
+            datasets_lf = self.set_table_modification_time_for_views(datasets_lf)
+
+        lf = self.load_dataset_usage()
+
+        # Polaris/pandas join merges the join column into one column and that's why we need to filter based on the removed column
+        lf = (
+            lf.join(datasets_lf, left_on="urn", right_on="entity_urn", how="left")
+            .filter(polars.col("removed") == False)  # noqa: E712
+            .drop(["removed"])
+        )
+
+        users_lf = (
+            lf.explode("userCounts")
+            .unnest("userCounts")
+            .filter(polars.col("user").is_not_null())
+        )
+
+        user_dataset_usage_lf = self._create_user_dataset_usage_map(users_lf)
+        return user_dataset_usage_lf
+
+    @staticmethod
+    def _convert_top_datasets_to_dict(
+        top_datasets_list: Optional[List[Dict[str, Any]]],
+    ) -> Optional[Dict[str, float]]:
+        """
+        Convert list of top datasets structs to dictionary as expected by CorpUserUsageFeatures schema.
+
+        Args:
+            top_datasets_list: List of dictionaries with 'dataset_urn' and 'count' keys
+
+        Returns:
+            Dictionary mapping dataset URN to usage count, or None if input is empty
+        """
+        if not top_datasets_list:
+            return None
+
+        top_datasets_dict = {
+            item["dataset_urn"]: float(item["count"])
+            for item in top_datasets_list
+            if isinstance(item, dict) and "dataset_urn" in item and "count" in item
+        }
+
+        return top_datasets_dict if top_datasets_dict else None
+
+    def _create_user_dataset_usage_map(
+        self, users_lf: polars.LazyFrame, top_n: int = 25
+    ) -> polars.LazyFrame:
+        """
+        Creates a lazyframe with user string and map of top N datasets by usage.
+
+        Args:
+            users_lf: LazyFrame containing user usage data with columns: user, urn, platform, count
+            top_n: Number of top datasets to include per user (default: 25)
+
+        Returns:
+            LazyFrame with columns:
+            - user: string column containing the user identifier
+            - top_datasets_map: list of structs with dataset_urn (string), count (int), and platform_urn (string)
+            - userUsageTotalPast30Days: total usage count for the user across all datasets
+            - userPlatformUsageTotalsPast30Days: map from platform URN to usage totals
+        """
+
+        # Create intermediate lazy frame with filtered users and aggregated counts
+        user_dataset_aggregated = (
+            users_lf.filter(polars.col("user").str.contains("@"))
+            .group_by("user", "urn", "platform")
+            .agg(polars.col("count").sum().alias("total_count"))
+            .with_columns(
+                # Direct string formatting - vectorized operation
+                polars.format("urn:li:dataPlatform:{}", polars.col("platform")).alias(
+                    "platform_urn"
+                )
+            )
+        )
+
+        # Calculate user totals
+        user_totals = user_dataset_aggregated.group_by("user").agg(
+            polars.col("total_count").sum().alias("userUsageTotalPast30Days")
+        )
+
+        # Calculate platform totals for each user - keep as list of structs
+        platform_totals = (
+            user_dataset_aggregated.group_by("user", "platform_urn")
+            .agg(polars.col("total_count").sum().alias("platform_total"))
+            .filter(polars.col("platform_urn").is_not_null())
+            .group_by("user")
+            .agg(
+                polars.struct(
+                    [
+                        polars.col("platform_urn"),
+                        polars.col("platform_total").cast(polars.Float64),
+                    ]
+                ).alias("platform_usage_pairs")
+            )
+        )
+
+        # Calculate top datasets
+        top_datasets = (
+            user_dataset_aggregated.with_columns(
+                polars.col("total_count")
+                .rank(descending=True, method="ordinal")
+                .over("user")
+                .alias("dataset_rank")
+            )
+            .filter(polars.col("dataset_rank") <= top_n)
+            .group_by("user")
+            .agg(
+                polars.struct(
+                    [
+                        polars.col("urn").alias("dataset_urn"),
+                        polars.col("total_count").alias("count"),
+                        polars.col("platform_urn"),
+                    ]
+                )
+                .sort_by("total_count", descending=True)
+                .alias("top_datasets_map")
+            )
+        )
+
+        # Join all results
+        return top_datasets.join(user_totals, on="user", how="left").join(
+            platform_totals, on="user", how="left"
+        )
+
+    def _combine_user_usage_data(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """
+        Combines user usage data from dataset, dashboard, and chart sources.
+
+        Args:
+            dataset_usage_lf: LazyFrame with dataset usage data containing top_datasets_map
+            dashboard_usage_lf: LazyFrame with dashboard usage data
+            chart_usage_lf: LazyFrame with chart usage data
+
+        Returns:
+            Combined LazyFrame with aggregated usage data per user
+        """
+        user_totals = self._combine_user_totals(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        platform_pairs = self._combine_platform_pairs(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        result = user_totals.join(platform_pairs, on="user", how="left")
+
+        return result.with_columns(
+            polars.col("platform_usage_pairs").fill_null(polars.lit([]))
+        )
+
+    def _filter_users(self, users_lf: polars.LazyFrame) -> polars.LazyFrame:
+        filter_condition = polars.col("user").str.contains("@")
+        for pattern in self.config.excluded_platforms:
+            filter_condition = filter_condition & ~polars.col("user").str.contains(
+                pattern
+            )
+
+        return users_lf.filter(filter_condition)
+
+    def _combine_user_totals(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """Combine user totals and top_datasets_map from all sources."""
+        # Collect all unique users in one operation
+        all_users_lf = (
+            polars.concat(
+                [
+                    dataset_usage_lf.select("user"),
+                    dashboard_usage_lf.select("user"),
+                    chart_usage_lf.select("user"),
+                ]
+            )
+            .unique()
+            .pipe(self._filter_users)
+        )
+
+        return (
+            all_users_lf.join(
+                dataset_usage_lf.select(
+                    ["user", "top_datasets_map", "userUsageTotalPast30Days"]
+                ),
+                on="user",
+                how="left",
+            )
+            .join(
+                dashboard_usage_lf.select(["user", "userUsageTotalPast30Days"]),
+                on="user",
+                how="left",
+                suffix="_dashboard",
+            )
+            .join(
+                chart_usage_lf.select(["user", "userUsageTotalPast30Days"]),
+                on="user",
+                how="left",
+                suffix="_chart",
+            )
+            .with_columns(
+                [
+                    # Sum with explicit null handling
+                    (
+                        polars.col("userUsageTotalPast30Days").fill_null(0)
+                        + polars.col("userUsageTotalPast30Days_dashboard").fill_null(0)
+                        + polars.col("userUsageTotalPast30Days_chart").fill_null(0)
+                    ).alias("userUsageTotalPast30Days")
+                ]
+            )
+            .select(["user", "top_datasets_map", "userUsageTotalPast30Days"])
+        )
+
+    def _combine_platform_pairs(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """Combine platform usage pairs from all sources."""
+        all_platforms = []
+
+        # Extract platforms from each source
+        for source_lf, col_name in [
+            (dataset_usage_lf, "platform_usage_pairs"),
+            (dashboard_usage_lf, "platform_usage_pairs"),
+            (chart_usage_lf, "platform_usage_pairs"),
+        ]:
+            platforms = self._extract_platforms_from_source(source_lf, col_name)
+            if platforms is not None:
+                all_platforms.append(platforms)
+
+        if not all_platforms:
+            # Return empty result if no platforms found
+            return polars.LazyFrame({"user": [], "platform_usage_pairs": []})
+
+        # Combine all platforms and aggregate by user + platform
+        combined_platforms = polars.concat(all_platforms, how="vertical_relaxed")
+        aggregated = combined_platforms.group_by("user", "platform_urn").agg(
+            polars.col("platform_total").sum().alias("platform_total")
+        )
+
+        # Rebuild platform_usage_pairs structure
+        return aggregated.group_by("user").agg(
+            polars.struct(
+                [polars.col("platform_urn"), polars.col("platform_total")]
+            ).alias("platform_usage_pairs")
+        )
+
+    def _extract_platforms_from_source(
+        self, source_lf: polars.LazyFrame, col_name: str
+    ) -> polars.LazyFrame | None:
+        """Extract platform data from a source LazyFrame."""
+        try:
+            return (
+                source_lf.select(["user", col_name])
+                .filter(polars.col(col_name).is_not_null())
+                .filter(polars.col(col_name).list.len() > 0)
+                .explode(col_name)
+                .unnest(col_name)
+                .filter(polars.col("platform_urn").is_not_null())
+                .select(["user", "platform_urn", "platform_total"])
+            )
+        except polars.exceptions.ColumnNotFoundError:
+            return None
+
+    def add_platform_usage_percentiles(
+        self, user_usage_lf: polars.LazyFrame
+    ) -> polars.LazyFrame:
+        """
+        Add platform usage percentiles to user usage data.
+
+        Args:
+            user_usage_lf: LazyFrame with user usage data containing platform_usage_pairs column
+
+        Returns:
+            LazyFrame with additional platform_usage_percentiles column
+        """
+        # First explode the platform_usage_pairs to work with individual platform usage records
+        platform_usage_exploded = (
+            user_usage_lf.explode("platform_usage_pairs")
+            .unnest("platform_usage_pairs")
+            .filter(polars.col("platform_urn").is_not_null())
+        )
+
+        # Use the existing gen_rank_and_percentile method to calculate percentiles
+        platform_percentiles_with_ranks = self.gen_rank_and_percentile(
+            lf=platform_usage_exploded,
+            count_field="platform_total",
+            urn_field="user",
+            platform_field="platform_urn",
+            prefix="platform_",
+            use_exp_cdf=False,
+        )
+
+        # Group back by user and create the percentiles structure
+        platform_percentiles = platform_percentiles_with_ranks.group_by("user").agg(
+            polars.struct(
+                [
+                    polars.col("platform_urn"),
+                    polars.col("platform_rank_percentile").cast(polars.Float64),
+                ]
+            ).alias("platform_usage_percentiles")
+        )
+
+        # Join the percentiles back to the original user_usage_lf
+        return user_usage_lf.join(platform_percentiles, on="user", how="left")
+
+    def _generate_user_usage_for_dashboard_charts(
+        self, entity_index: str, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_df = self._generate_dashboard_chart_entities(entity_index)
+        lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
+
+        # Process dashboard usage data into user usage format (similar to dataset version)
+        users_lf = (
+            lf.explode("userCounts")
+            .unnest("userCounts")
+            .filter(polars.col("user").is_not_null())
+            .rename({"usageCount": "count"})  # Rename to match dataset schema
+        )
+
+        user_dashboard_usage_lf = self._create_user_dataset_usage_map(users_lf)
+        return user_dashboard_usage_lf
+
+    def generate_user_usage(self) -> polars.LazyFrame:
+        dataset_usage_lf = self._generate_user_usage_for_dataset()
+
+        usage_index = "dashboard_dashboardusagestatisticsaspect_v1"
+        entity_index = "dashboardindex_v2"
+        dashboard_usage_lf = self._generate_user_usage_for_dashboard_charts(
+            entity_index, usage_index
+        )
+
+        entity_index = "chartindex_v2"
+        usage_index = "chart_chartusagestatisticsaspect_v1"
+        chart_usage_lf = self._generate_user_usage_for_dashboard_charts(
+            entity_index, usage_index
+        )
+
+        # Combine all three usage sources
+        lf = self._combine_user_usage_data(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        lf = self.add_platform_usage_percentiles(lf)
+
+        # Add user usage percentiles across all users (not grouped by platform)
+        # Create a temporary platform field for percentile calculation
+        lf = lf.with_columns(polars.lit("all_users").alias("temp_platform"))
+
+        lf = self.gen_rank_and_percentile(
+            lf=lf,
+            count_field="userUsageTotalPast30Days",
+            urn_field="user",
+            platform_field="temp_platform",
+            prefix="userUsage",
+            use_exp_cdf=False,
+        )
+
+        # Rename the percentile column to match the schema field name and remove temp field
+        lf = lf.rename(
+            {"userUsagerank_percentile": "userUsagePercentilePast30Days"}
+        ).drop("temp_platform")
+
+        return lf
+
     def generate_dataset_usage(self) -> polars.LazyFrame:
         datasets_lf = self.get_datasets()
         if self.config.set_upstream_table_max_modification_time_for_views:
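The core of `_create_user_dataset_usage_map` is the windowed rank: `rank().over("user")` orders each user's datasets independently, so the top-N filter applies per user rather than globally. A toy, runnable demonstration of that pattern (data invented for illustration):

import polars

lf = polars.LazyFrame(
    {
        "user": ["a@x.com", "a@x.com", "a@x.com", "b@x.com"],
        "urn": ["d1", "d2", "d3", "d1"],
        "total_count": [10, 30, 20, 5],
    }
)
top2 = (
    lf.with_columns(
        polars.col("total_count")
        .rank(descending=True, method="ordinal")
        .over("user")
        .alias("dataset_rank")
    )
    .filter(polars.col("dataset_rank") <= 2)
    .collect()
)
print(top2)  # a@x.com keeps d2 and d3; b@x.com keeps d1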
@@ -1503,11 +1919,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         # called `Option::unwrap()` on a `None` value
         # Which only happens if we don't collect immediately
         # return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True).collect().lazy()
-        return (
-            polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
-            .collect()
-            .lazy()
-        )
+        return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)

     def load_dataset_usage(self) -> polars.LazyFrame:
         index = "dataset_datasetusagestatisticsaspect_v1"
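Dropping the `.collect().lazy()` round trip keeps the frame genuinely lazy: `scan_parquet` returns a LazyFrame whose I/O happens only at the final `collect()`. A self-contained check (temp path chosen for illustration):

import polars

polars.DataFrame({"x": [1, -2, 3]}).write_parquet("/tmp/example.parquet")
lf = polars.scan_parquet("/tmp/example.parquet", low_memory=True)  # no I/O yet
df = lf.filter(polars.col("x") > 0).collect()  # file is read here
print(df["x"].to_list())  # [1, 3]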
@@ -1624,23 +2036,40 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         delay: Optional[float] = None,
     ) -> Iterable[Dict[str, Any]]:
         processed_count = 0
+        scroll_id = None
         while True:
             with PerfTimer() as timer:
                 logger.debug(f"ES query: {query}")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                if not scroll_id:
+                    logger.debug(
+                        f"Getting inital data from index {index} without scroll id"
+                    )
+                    results = server.search(
+                        body=query,
+                        size=batch_size,
+                        scroll="2m",
+                        index=index,
+                        params=(
+                            {"timeout": self.config.query_timeout}
+                            if self.config.search_index.opensearch_dialect
+                            else {"request_timeout": self.config.query_timeout}
+                        ),
+                    )
+                else:
+                    logger.debug(
+                        f"Getting data from index {index} using scroll_id: {scroll_id}"
+                    )
+                    results = server.scroll(
+                        scroll_id=scroll_id,
+                        scroll="2m",
+                        params=(
+                            {"timeout": self.config.query_timeout}
+                            if self.config.search_index.opensearch_dialect
+                            else {"request_timeout": self.config.query_timeout}
+                        ),
+                    )
+                scroll_id = results["_scroll_id"]
+
                 if not aggregation_key:
                     yield from process_function(results["hits"]["hits"])

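The rewritten loop switches from `search_after` pagination to the scroll API: the first iteration issues a search that opens a scroll context, every later iteration resumes it, and the loop ends when a page comes back short. A generic sketch of that shape (the `server` client is a stand-in; the real code above also branches on the OpenSearch dialect for the timeout parameter name):

def scan_index(server, index, query, batch_size):
    # Hypothetical helper mirroring the control flow of the hunk above.
    scroll_id = None
    while True:
        if scroll_id is None:
            results = server.search(body=query, index=index, size=batch_size, scroll="2m")
        else:
            results = server.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = results["_scroll_id"]
        hits = results["hits"]["hits"]
        yield from hits
        if len(hits) < batch_size:
            break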
@@ -1651,7 +2080,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                     )
                 if len(results["hits"]["hits"]) < batch_size:
                     break
-                query.update({"search_after": results["hits"]["hits"][-1]["sort"]})
             else:
                 yield from process_function(
                     results["aggregations"][aggregation_key]["buckets"]
@@ -1661,16 +2089,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                     < batch_size
                 ):
                     break
-
-
-
-
-
-            if delay:
-                logger.debug(
-                    f"Sleeping for {delay} seconds before getting next batch from ES"
-                )
-                time.sleep(delay)
+            if delay:
+                logger.debug(
+                    f"Sleeping for {delay} seconds before getting next batch from ES"
+                )
+                time.sleep(delay)

-    def get_report(self) ->
+    def get_report(self) -> "DatahubUsageFeatureReport":
         return self.report