acryl-datahub-cloud 0.3.8rc3__py3-none-any.whl → 0.3.8rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic. Click here for more details.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/datahub_metadata_sharing/scroll_shared_entities.gql +204 -0
- acryl_datahub_cloud/datahub_metadata_sharing/share_entity.gql +9 -0
- acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +79 -57
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +268 -213
- acryl_datahub_cloud/metadata/schema_classes.py +2 -2
- {acryl_datahub_cloud-0.3.8rc3.dist-info → acryl_datahub_cloud-0.3.8rc6.dist-info}/METADATA +38 -38
- acryl_datahub_cloud-0.3.8rc6.dist-info/RECORD +133 -0
- acryl_datahub_cloud/metadata/schema.avsc +0 -26607
- acryl_datahub_cloud/metadata/schemas/Access.avsc +0 -55
- acryl_datahub_cloud/metadata/schemas/ActionRequestArchived.avsc +0 -68
- acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +0 -524
- acryl_datahub_cloud/metadata/schemas/ActionRequestKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/ActionRequestStatus.avsc +0 -85
- acryl_datahub_cloud/metadata/schemas/Actors.avsc +0 -48
- acryl_datahub_cloud/metadata/schemas/AiInferenceMetadata.avsc +0 -42
- acryl_datahub_cloud/metadata/schemas/AnomaliesSummary.avsc +0 -126
- acryl_datahub_cloud/metadata/schemas/AnomalyInfo.avsc +0 -342
- acryl_datahub_cloud/metadata/schemas/AnomalyKey.avsc +0 -22
- acryl_datahub_cloud/metadata/schemas/AssertionActions.avsc +0 -53
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +0 -3506
- acryl_datahub_cloud/metadata/schemas/AssertionDryRunEvent.avsc +0 -309
- acryl_datahub_cloud/metadata/schemas/AssertionInferenceDetails.avsc +0 -105
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +0 -2579
- acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +0 -32
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +0 -3374
- acryl_datahub_cloud/metadata/schemas/AssertionSummary.avsc +0 -50
- acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +0 -189
- acryl_datahub_cloud/metadata/schemas/BatchTestRunEvent.avsc +0 -286
- acryl_datahub_cloud/metadata/schemas/BrowsePaths.avsc +0 -25
- acryl_datahub_cloud/metadata/schemas/BrowsePathsV2.avsc +0 -50
- acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +0 -601
- acryl_datahub_cloud/metadata/schemas/BusinessAttributeKey.avsc +0 -24
- acryl_datahub_cloud/metadata/schemas/BusinessAttributes.avsc +0 -51
- acryl_datahub_cloud/metadata/schemas/CaveatsAndRecommendations.avsc +0 -78
- acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +0 -346
- acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +0 -58
- acryl_datahub_cloud/metadata/schemas/ChartQuery.avsc +0 -39
- acryl_datahub_cloud/metadata/schemas/ChartUsageStatistics.avsc +0 -195
- acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +0 -182
- acryl_datahub_cloud/metadata/schemas/ConstraintKey.avsc +0 -20
- acryl_datahub_cloud/metadata/schemas/Container.avsc +0 -36
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +0 -47
- acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +0 -189
- acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +0 -52
- acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +0 -177
- acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +0 -39
- acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +0 -106
- acryl_datahub_cloud/metadata/schemas/CorpUserCredentials.avsc +0 -42
- acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +0 -169
- acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +0 -171
- acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +0 -43
- acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +0 -165
- acryl_datahub_cloud/metadata/schemas/CorpUserStatus.avsc +0 -73
- acryl_datahub_cloud/metadata/schemas/Cost.avsc +0 -64
- acryl_datahub_cloud/metadata/schemas/CostFeatures.avsc +0 -36
- acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +0 -403
- acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +0 -57
- acryl_datahub_cloud/metadata/schemas/DashboardUsageStatistics.avsc +0 -255
- acryl_datahub_cloud/metadata/schemas/DataContractKey.avsc +0 -23
- acryl_datahub_cloud/metadata/schemas/DataContractProperties.avsc +0 -201
- acryl_datahub_cloud/metadata/schemas/DataContractStatus.avsc +0 -44
- acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +0 -188
- acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +0 -63
- acryl_datahub_cloud/metadata/schemas/DataHubAccessTokenInfo.avsc +0 -74
- acryl_datahub_cloud/metadata/schemas/DataHubAccessTokenKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/DataHubActionInfo.avsc +0 -121
- acryl_datahub_cloud/metadata/schemas/DataHubActionKey.avsc +0 -22
- acryl_datahub_cloud/metadata/schemas/DataHubActionStatus.avsc +0 -181
- acryl_datahub_cloud/metadata/schemas/DataHubConnectionDetails.avsc +0 -62
- acryl_datahub_cloud/metadata/schemas/DataHubConnectionKey.avsc +0 -23
- acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceInfo.avsc +0 -157
- acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/DataHubPersonaInfo.avsc +0 -24
- acryl_datahub_cloud/metadata/schemas/DataHubPersonaKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +0 -302
- acryl_datahub_cloud/metadata/schemas/DataHubPolicyKey.avsc +0 -22
- acryl_datahub_cloud/metadata/schemas/DataHubRetentionConfig.avsc +0 -59
- acryl_datahub_cloud/metadata/schemas/DataHubRetentionKey.avsc +0 -26
- acryl_datahub_cloud/metadata/schemas/DataHubRoleInfo.avsc +0 -33
- acryl_datahub_cloud/metadata/schemas/DataHubRoleKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/DataHubSecretKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/DataHubSecretValue.avsc +0 -91
- acryl_datahub_cloud/metadata/schemas/DataHubStepStateKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/DataHubStepStateProperties.avsc +0 -68
- acryl_datahub_cloud/metadata/schemas/DataHubUpgradeKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/DataHubUpgradeRequest.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/DataHubUpgradeResult.avsc +0 -53
- acryl_datahub_cloud/metadata/schemas/DataHubViewInfo.avsc +0 -265
- acryl_datahub_cloud/metadata/schemas/DataHubViewKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +0 -254
- acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +0 -462
- acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +0 -75
- acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +0 -93
- acryl_datahub_cloud/metadata/schemas/DataPlatformInstance.avsc +0 -44
- acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceKey.avsc +0 -35
- acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +0 -72
- acryl_datahub_cloud/metadata/schemas/DataPlatformKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/DataProcessInfo.avsc +0 -73
- acryl_datahub_cloud/metadata/schemas/DataProcessInstanceInput.avsc +0 -38
- acryl_datahub_cloud/metadata/schemas/DataProcessInstanceKey.avsc +0 -29
- acryl_datahub_cloud/metadata/schemas/DataProcessInstanceOutput.avsc +0 -38
- acryl_datahub_cloud/metadata/schemas/DataProcessInstanceProperties.avsc +0 -131
- acryl_datahub_cloud/metadata/schemas/DataProcessInstanceRelationships.avsc +0 -99
- acryl_datahub_cloud/metadata/schemas/DataProcessInstanceRunEvent.avsc +0 -229
- acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +0 -84
- acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +0 -32
- acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +0 -211
- acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +0 -63
- acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +0 -33
- acryl_datahub_cloud/metadata/schemas/DataTypeKey.avsc +0 -23
- acryl_datahub_cloud/metadata/schemas/DatahubIngestionCheckpoint.avsc +0 -188
- acryl_datahub_cloud/metadata/schemas/DatahubIngestionRunSummary.avsc +0 -365
- acryl_datahub_cloud/metadata/schemas/DatasetDeprecation.avsc +0 -50
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +0 -135
- acryl_datahub_cloud/metadata/schemas/DatasetProfile.avsc +0 -539
- acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +0 -165
- acryl_datahub_cloud/metadata/schemas/DatasetUpstreamLineage.avsc +0 -129
- acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +0 -247
- acryl_datahub_cloud/metadata/schemas/Deprecation.avsc +0 -57
- acryl_datahub_cloud/metadata/schemas/DisplayProperties.avsc +0 -62
- acryl_datahub_cloud/metadata/schemas/Documentation.avsc +0 -152
- acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +0 -30
- acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +0 -137
- acryl_datahub_cloud/metadata/schemas/Domains.avsc +0 -38
- acryl_datahub_cloud/metadata/schemas/DynamicFormAssignment.avsc +0 -150
- acryl_datahub_cloud/metadata/schemas/ERModelRelationshipKey.avsc +0 -28
- acryl_datahub_cloud/metadata/schemas/ERModelRelationshipProperties.avsc +0 -196
- acryl_datahub_cloud/metadata/schemas/EditableChartProperties.avsc +0 -98
- acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +0 -24
- acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +0 -98
- acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +0 -98
- acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +0 -98
- acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +0 -111
- acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +0 -111
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +0 -24
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +0 -24
- acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +0 -24
- acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +0 -24
- acryl_datahub_cloud/metadata/schemas/EditableMLPrimaryKeyProperties.avsc +0 -24
- acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +0 -98
- acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +0 -431
- acryl_datahub_cloud/metadata/schemas/Embed.avsc +0 -20
- acryl_datahub_cloud/metadata/schemas/EntityChangeEvent.avsc +0 -112
- acryl_datahub_cloud/metadata/schemas/EntityInferenceMetadata.avsc +0 -47
- acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +0 -33
- acryl_datahub_cloud/metadata/schemas/EntityTypeKey.avsc +0 -24
- acryl_datahub_cloud/metadata/schemas/EthicalConsiderations.avsc +0 -71
- acryl_datahub_cloud/metadata/schemas/EvaluationData.avsc +0 -56
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestInput.avsc +0 -134
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +0 -23
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestResult.avsc +0 -97
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestSignal.avsc +0 -73
- acryl_datahub_cloud/metadata/schemas/Filter.avsc +0 -126
- acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +0 -517
- acryl_datahub_cloud/metadata/schemas/FormKey.avsc +0 -23
- acryl_datahub_cloud/metadata/schemas/Forms.avsc +0 -447
- acryl_datahub_cloud/metadata/schemas/GenericEntityKey.avsc +0 -16
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +0 -524
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsKey.avsc +0 -22
- acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +0 -132
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +0 -89
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +0 -33
- acryl_datahub_cloud/metadata/schemas/GlossaryRelatedTerms.avsc +0 -125
- acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +0 -131
- acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +0 -39
- acryl_datahub_cloud/metadata/schemas/GlossaryTerms.avsc +0 -190
- acryl_datahub_cloud/metadata/schemas/GroupMembership.avsc +0 -28
- acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +0 -605
- acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +0 -376
- acryl_datahub_cloud/metadata/schemas/IncidentKey.avsc +0 -25
- acryl_datahub_cloud/metadata/schemas/IncidentNotificationDetails.avsc +0 -62
- acryl_datahub_cloud/metadata/schemas/IncidentSource.avsc +0 -48
- acryl_datahub_cloud/metadata/schemas/IncidentsSummary.avsc +0 -160
- acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +0 -398
- acryl_datahub_cloud/metadata/schemas/InferredNeighbors.avsc +0 -112
- acryl_datahub_cloud/metadata/schemas/InputFields.avsc +0 -678
- acryl_datahub_cloud/metadata/schemas/InstitutionalMemory.avsc +0 -88
- acryl_datahub_cloud/metadata/schemas/IntendedUse.avsc +0 -56
- acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +0 -34
- acryl_datahub_cloud/metadata/schemas/InviteTokenKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +0 -76
- acryl_datahub_cloud/metadata/schemas/LinkPreviewInfo.avsc +0 -38
- acryl_datahub_cloud/metadata/schemas/LinkPreviewKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +0 -57
- acryl_datahub_cloud/metadata/schemas/MLFeatureProperties.avsc +0 -189
- acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +0 -66
- acryl_datahub_cloud/metadata/schemas/MLFeatureTableProperties.avsc +0 -95
- acryl_datahub_cloud/metadata/schemas/MLHyperParam.avsc +0 -43
- acryl_datahub_cloud/metadata/schemas/MLMetric.avsc +0 -43
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +0 -92
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +0 -173
- acryl_datahub_cloud/metadata/schemas/MLModelFactorPrompts.avsc +0 -78
- acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +0 -102
- acryl_datahub_cloud/metadata/schemas/MLModelGroupProperties.avsc +0 -123
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +0 -117
- acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +0 -414
- acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +0 -53
- acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyProperties.avsc +0 -185
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +0 -8710
- acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +0 -360
- acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +0 -290
- acryl_datahub_cloud/metadata/schemas/Metrics.avsc +0 -35
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +0 -3238
- acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +0 -43
- acryl_datahub_cloud/metadata/schemas/MonitorTimeseriesState.avsc +0 -159
- acryl_datahub_cloud/metadata/schemas/NativeGroupMembership.avsc +0 -28
- acryl_datahub_cloud/metadata/schemas/NotebookContent.avsc +0 -252
- acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +0 -154
- acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +0 -44
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +0 -427
- acryl_datahub_cloud/metadata/schemas/Operation.avsc +0 -381
- acryl_datahub_cloud/metadata/schemas/Origin.avsc +0 -157
- acryl_datahub_cloud/metadata/schemas/Ownership.avsc +0 -255
- acryl_datahub_cloud/metadata/schemas/OwnershipTypeInfo.avsc +0 -103
- acryl_datahub_cloud/metadata/schemas/OwnershipTypeKey.avsc +0 -23
- acryl_datahub_cloud/metadata/schemas/PartitionsSummary.avsc +0 -59
- acryl_datahub_cloud/metadata/schemas/PlatformEvent.avsc +0 -52
- acryl_datahub_cloud/metadata/schemas/PlatformResourceInfo.avsc +0 -109
- acryl_datahub_cloud/metadata/schemas/PlatformResourceKey.avsc +0 -24
- acryl_datahub_cloud/metadata/schemas/PostInfo.avsc +0 -262
- acryl_datahub_cloud/metadata/schemas/PostKey.avsc +0 -22
- acryl_datahub_cloud/metadata/schemas/Proposals.avsc +0 -53
- acryl_datahub_cloud/metadata/schemas/QuantitativeAnalyses.avsc +0 -29
- acryl_datahub_cloud/metadata/schemas/QueryKey.avsc +0 -28
- acryl_datahub_cloud/metadata/schemas/QueryProperties.avsc +0 -171
- acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +0 -50
- acryl_datahub_cloud/metadata/schemas/QueryUsageFeatures.avsc +0 -94
- acryl_datahub_cloud/metadata/schemas/QueryUsageStatistics.avsc +0 -221
- acryl_datahub_cloud/metadata/schemas/RecommendationModule.avsc +0 -259
- acryl_datahub_cloud/metadata/schemas/RecommendationModuleKey.avsc +0 -26
- acryl_datahub_cloud/metadata/schemas/RemoteExecutorKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/RemoteExecutorStatus.avsc +0 -80
- acryl_datahub_cloud/metadata/schemas/RoleKey.avsc +0 -22
- acryl_datahub_cloud/metadata/schemas/RoleMembership.avsc +0 -28
- acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +0 -99
- acryl_datahub_cloud/metadata/schemas/SchemaFieldAliases.avsc +0 -29
- acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +0 -42
- acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +0 -46
- acryl_datahub_cloud/metadata/schemas/SchemaFieldProfile.avsc +0 -474
- acryl_datahub_cloud/metadata/schemas/SchemaFieldsInferredMetadata.avsc +0 -222
- acryl_datahub_cloud/metadata/schemas/SchemaFieldsInferredNeighbors.avsc +0 -136
- acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +0 -1045
- acryl_datahub_cloud/metadata/schemas/SchemaProposals.avsc +0 -73
- acryl_datahub_cloud/metadata/schemas/Share.avsc +0 -211
- acryl_datahub_cloud/metadata/schemas/Siblings.avsc +0 -41
- acryl_datahub_cloud/metadata/schemas/SlackUserInfo.avsc +0 -160
- acryl_datahub_cloud/metadata/schemas/SourceCode.avsc +0 -49
- acryl_datahub_cloud/metadata/schemas/Status.avsc +0 -20
- acryl_datahub_cloud/metadata/schemas/StorageFeatures.avsc +0 -76
- acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +0 -106
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +0 -390
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyKey.avsc +0 -26
- acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +0 -114
- acryl_datahub_cloud/metadata/schemas/SubTypes.avsc +0 -27
- acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +0 -355
- acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/TagKey.avsc +0 -33
- acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +0 -43
- acryl_datahub_cloud/metadata/schemas/TelemetryClientId.avsc +0 -16
- acryl_datahub_cloud/metadata/schemas/TelemetryKey.avsc +0 -21
- acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +0 -300
- acryl_datahub_cloud/metadata/schemas/TestKey.avsc +0 -24
- acryl_datahub_cloud/metadata/schemas/TestResults.avsc +0 -163
- acryl_datahub_cloud/metadata/schemas/TrainingData.avsc +0 -56
- acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +0 -286
- acryl_datahub_cloud/metadata/schemas/UsageAggregation.avsc +0 -153
- acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +0 -243
- acryl_datahub_cloud/metadata/schemas/VersionInfo.avsc +0 -52
- acryl_datahub_cloud/metadata/schemas/VersionProperties.avsc +0 -216
- acryl_datahub_cloud/metadata/schemas/VersionSetKey.avsc +0 -26
- acryl_datahub_cloud/metadata/schemas/VersionSetProperties.avsc +0 -49
- acryl_datahub_cloud/metadata/schemas/ViewProperties.avsc +0 -41
- acryl_datahub_cloud-0.3.8rc3.dist-info/RECORD +0 -396
- {acryl_datahub_cloud-0.3.8rc3.dist-info → acryl_datahub_cloud-0.3.8rc6.dist-info}/WHEEL +0 -0
- {acryl_datahub_cloud-0.3.8rc3.dist-info → acryl_datahub_cloud-0.3.8rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_cloud-0.3.8rc3.dist-info → acryl_datahub_cloud-0.3.8rc6.dist-info}/top_level.txt +0 -0
|
@@ -9,7 +9,8 @@ from dataclasses import dataclass, field
|
|
|
9
9
|
from datetime import datetime
|
|
10
10
|
from functools import partial
|
|
11
11
|
from itertools import chain
|
|
12
|
-
from
|
|
12
|
+
from tempfile import TemporaryDirectory
|
|
13
|
+
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union
|
|
13
14
|
|
|
14
15
|
import numpy
|
|
15
16
|
import polars
|
|
@@ -17,6 +18,7 @@ import pyarrow as pa
|
|
|
17
18
|
import pyarrow.parquet as pq
|
|
18
19
|
from elasticsearch.client import Elasticsearch
|
|
19
20
|
from opensearchpy import OpenSearch
|
|
21
|
+
from polars.datatypes import DataTypeClass
|
|
20
22
|
from pydantic import Field
|
|
21
23
|
from scipy.stats import expon
|
|
22
24
|
|
|
@@ -171,7 +173,7 @@ class DataHubUsageFeatureReportingSourceConfig(
|
|
|
171
173
|
)
|
|
172
174
|
|
|
173
175
|
disable_write_usage: bool = Field(
|
|
174
|
-
|
|
176
|
+
True,
|
|
175
177
|
description="Flag to disable write usage statistics collection.'",
|
|
176
178
|
)
|
|
177
179
|
|
|
@@ -245,6 +247,7 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
|
|
|
245
247
|
class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
246
248
|
platform = "datahub"
|
|
247
249
|
temp_files_to_clean: List[str] = []
|
|
250
|
+
temp_dir: Optional[TemporaryDirectory] = None
|
|
248
251
|
|
|
249
252
|
def __init__(
|
|
250
253
|
self, ctx: PipelineContext, config: DataHubUsageFeatureReportingSourceConfig
|
|
@@ -267,6 +270,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
267
270
|
if num > 0:
|
|
268
271
|
logger.info(f"Compiled {num} regexp factors")
|
|
269
272
|
|
|
273
|
+
if self.config.streaming_mode:
|
|
274
|
+
self.temp_dir = tempfile.TemporaryDirectory(prefix="datahub-usage-")
|
|
275
|
+
logger.info(f"Using temp dir: {self.temp_dir.name}")
|
|
276
|
+
|
|
270
277
|
def soft_deleted_batch(self, results: Iterable) -> Iterable[Dict]:
|
|
271
278
|
with PerfTimer() as timer:
|
|
272
279
|
for doc in results:
|
|
@@ -397,7 +404,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
397
404
|
if "eventGranularity" in doc["_source"]
|
|
398
405
|
else None
|
|
399
406
|
),
|
|
400
|
-
"partitionSpec": doc["_source"]["partitionSpec"],
|
|
401
407
|
"viewsCount": (
|
|
402
408
|
doc["_source"]["viewsCount"]
|
|
403
409
|
if "viewsCount" in doc["_source"]
|
|
@@ -410,7 +416,8 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
410
416
|
),
|
|
411
417
|
"userCounts": (
|
|
412
418
|
doc["_source"]["event"]["userCounts"]
|
|
413
|
-
if "
|
|
419
|
+
if "event" in doc["_source"]
|
|
420
|
+
and "userCounts" in doc["_source"]["event"]
|
|
414
421
|
else []
|
|
415
422
|
),
|
|
416
423
|
"platform": platform,
|
|
@@ -427,7 +434,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
427
434
|
if "eventGranularity" in doc["_source"]
|
|
428
435
|
else None
|
|
429
436
|
),
|
|
430
|
-
"partitionSpec": doc["_source"]["partitionSpec"],
|
|
431
437
|
"queryCount": (
|
|
432
438
|
doc["_source"]["queryCount"]
|
|
433
439
|
if "queryCount" in doc["_source"]
|
|
@@ -497,7 +503,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
497
503
|
"timestampMillis": doc["_source"]["timestampMillis"],
|
|
498
504
|
"urn": doc["_source"]["urn"],
|
|
499
505
|
"eventGranularity": doc["_source"]["eventGranularity"],
|
|
500
|
-
"partitionSpec": doc["_source"]["partitionSpec"],
|
|
501
506
|
"totalSqlQueries": doc["_source"]["totalSqlQueries"],
|
|
502
507
|
"uniqueUserCount": doc["_source"]["uniqueUserCount"],
|
|
503
508
|
"userCounts": (
|
|
@@ -695,9 +700,13 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
695
700
|
return lf
|
|
696
701
|
|
|
697
702
|
@staticmethod
|
|
698
|
-
def polars_to_arrow_schema(
|
|
699
|
-
|
|
700
|
-
|
|
703
|
+
def polars_to_arrow_schema(
|
|
704
|
+
polars_schema: Dict[str, Union[DataTypeClass, polars.DataType]]
|
|
705
|
+
) -> pa.Schema:
|
|
706
|
+
def convert_dtype(
|
|
707
|
+
polars_dtype: Union[DataTypeClass, polars.DataType]
|
|
708
|
+
) -> pa.DataType:
|
|
709
|
+
type_mapping: Dict[Union[DataTypeClass, polars.DataType], pa.DataType] = {
|
|
701
710
|
polars.Boolean(): pa.bool_(),
|
|
702
711
|
polars.Int8(): pa.int8(),
|
|
703
712
|
polars.Int16(): pa.int16(),
|
|
@@ -710,6 +719,8 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
710
719
|
polars.Float32(): pa.float32(),
|
|
711
720
|
polars.Float64(): pa.float64(),
|
|
712
721
|
polars.Utf8(): pa.string(),
|
|
722
|
+
polars.Utf8(): pa.utf8(),
|
|
723
|
+
polars.String(): pa.string(),
|
|
713
724
|
polars.Date(): pa.date32(),
|
|
714
725
|
polars.Datetime(): pa.timestamp("ns"),
|
|
715
726
|
polars.Time(): pa.time64("ns"),
|
|
@@ -718,85 +729,97 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
718
729
|
|
|
719
730
|
if polars_dtype in [type(key) for key in type_mapping.keys()]:
|
|
720
731
|
return type_mapping[polars_dtype]
|
|
721
|
-
elif polars_dtype == polars.Categorical
|
|
732
|
+
elif polars_dtype == polars.Categorical:
|
|
722
733
|
return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
|
|
734
|
+
elif isinstance(polars_dtype, polars.Struct):
|
|
735
|
+
return pa.struct(
|
|
736
|
+
{
|
|
737
|
+
field.name: convert_dtype(field.dtype)
|
|
738
|
+
for field in polars_dtype.fields
|
|
739
|
+
}
|
|
740
|
+
)
|
|
741
|
+
elif isinstance(polars_dtype, polars.List):
|
|
742
|
+
return pa.list_(convert_dtype(polars_dtype.inner))
|
|
723
743
|
else:
|
|
724
744
|
raise ValueError(f"Unsupported Polars dtype: {polars_dtype}")
|
|
725
745
|
|
|
726
746
|
fields = [(name, convert_dtype(dtype)) for name, dtype in polars_schema.items()]
|
|
727
747
|
return pa.schema(fields)
|
|
728
748
|
|
|
729
|
-
def
|
|
730
|
-
self,
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
current_batch = []
|
|
754
|
-
|
|
755
|
-
for row in es_data:
|
|
756
|
-
current_batch.append(row)
|
|
757
|
-
|
|
758
|
-
if len(current_batch) >= batch_size:
|
|
759
|
-
# Convert the batch to a PyArrow Table
|
|
760
|
-
table = pa.Table.from_pylist(current_batch, schema=pa_schema)
|
|
749
|
+
def batch_write_parquet(
|
|
750
|
+
self,
|
|
751
|
+
data_iterator: Iterable[Dict[Any, Any]],
|
|
752
|
+
pl_schema: Dict,
|
|
753
|
+
output_path: str,
|
|
754
|
+
batch_size: int = 50000,
|
|
755
|
+
append: bool = False,
|
|
756
|
+
parquet_writer: Optional[pq.ParquetWriter] = None,
|
|
757
|
+
) -> None:
|
|
758
|
+
"""
|
|
759
|
+
Write data in batches to a file with support for appending to existing files.
|
|
760
|
+
|
|
761
|
+
Args:
|
|
762
|
+
data_iterator: Iterator of dictionaries containing the data
|
|
763
|
+
pa_schema: PyArrow schema for the data
|
|
764
|
+
output_path: Path for the output file
|
|
765
|
+
format_type: One of "ipc", "feather", "csv", "parquet", "pl_parquet"
|
|
766
|
+
batch_size: Number of rows per batch
|
|
767
|
+
append: If True, append to existing file. If False, create new file.
|
|
768
|
+
parquet_writer: Parquet doesn't let to append to existing file, so we need to pass the writer object
|
|
769
|
+
Returns:
|
|
770
|
+
LazyFrame pointing to the written data
|
|
771
|
+
"""
|
|
772
|
+
arrow_schema = self.polars_to_arrow_schema(pl_schema)
|
|
761
773
|
|
|
762
|
-
|
|
763
|
-
|
|
774
|
+
total_rows = 0
|
|
775
|
+
total_batches = 0
|
|
764
776
|
|
|
765
|
-
|
|
766
|
-
|
|
777
|
+
try:
|
|
778
|
+
if parquet_writer:
|
|
779
|
+
writer = parquet_writer
|
|
780
|
+
else:
|
|
781
|
+
writer = pq.ParquetWriter(output_path, arrow_schema)
|
|
767
782
|
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
table = pa.Table.from_pylist(
|
|
783
|
+
try:
|
|
784
|
+
for batch in self._get_batches(data_iterator, batch_size):
|
|
785
|
+
table = pa.Table.from_pylist(batch, schema=arrow_schema)
|
|
771
786
|
writer.write_table(table)
|
|
787
|
+
total_rows += len(batch)
|
|
788
|
+
total_batches += 1
|
|
789
|
+
logger.debug(f"Wrote batch {total_batches} ({len(batch)} rows)")
|
|
790
|
+
finally:
|
|
791
|
+
if not parquet_writer:
|
|
792
|
+
writer.close()
|
|
793
|
+
except Exception as e:
|
|
794
|
+
logger.exception(f"Error during batch writing: {str(e)}", exc_info=True)
|
|
795
|
+
raise
|
|
796
|
+
|
|
797
|
+
def _get_batches(
|
|
798
|
+
self, iterator: Iterable[Dict], batch_size: int
|
|
799
|
+
) -> Iterator[List[Dict]]:
|
|
800
|
+
"""Helper generator to create batches from an iterator."""
|
|
801
|
+
current_batch = []
|
|
802
|
+
for item in iterator:
|
|
803
|
+
current_batch.append(item)
|
|
804
|
+
if len(current_batch) >= batch_size:
|
|
805
|
+
yield current_batch
|
|
806
|
+
current_batch = []
|
|
772
807
|
|
|
773
|
-
|
|
808
|
+
if current_batch:
|
|
809
|
+
yield current_batch
|
|
774
810
|
|
|
775
811
|
def load_write_usage(
|
|
776
812
|
self, soft_deleted_entities_df: polars.LazyFrame
|
|
777
813
|
) -> polars.LazyFrame:
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
wdf = wdf.cast({polars.String: polars.Categorical})
|
|
788
|
-
else:
|
|
789
|
-
wdf = polars.LazyFrame(
|
|
790
|
-
self.load_data_from_es(
|
|
791
|
-
"dataset_operationaspect_v1",
|
|
792
|
-
QueryBuilder.get_dataset_write_usage_raw_query(
|
|
793
|
-
self.config.lookback_days
|
|
794
|
-
),
|
|
795
|
-
self.write_stat_raw_batch,
|
|
796
|
-
),
|
|
797
|
-
schema={"urn": polars.Categorical, "platform": polars.Categorical},
|
|
798
|
-
strict=True,
|
|
799
|
-
)
|
|
814
|
+
wdf = self.load_data_from_es_to_lf(
|
|
815
|
+
index="dataset_operationaspect_v1",
|
|
816
|
+
query=QueryBuilder.get_dataset_write_usage_raw_query(
|
|
817
|
+
self.config.lookback_days
|
|
818
|
+
),
|
|
819
|
+
process_function=self.write_stat_raw_batch,
|
|
820
|
+
schema={"urn": polars.Categorical, "platform": polars.Categorical},
|
|
821
|
+
)
|
|
822
|
+
wdf = wdf.cast({polars.String: polars.Categorical})
|
|
800
823
|
|
|
801
824
|
wdf = wdf.group_by(polars.col("urn"), polars.col("platform")).agg(
|
|
802
825
|
polars.col("urn").count().alias("write_count"),
|
|
@@ -851,18 +874,18 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
851
874
|
def set_table_modification_time_for_views(
|
|
852
875
|
self, datasets_df: polars.LazyFrame
|
|
853
876
|
) -> polars.LazyFrame:
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
schema=
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
strict=True,
|
|
877
|
+
schema = {
|
|
878
|
+
"source_urn": polars.Categorical,
|
|
879
|
+
"destination_urn": polars.Categorical,
|
|
880
|
+
}
|
|
881
|
+
|
|
882
|
+
upstreams_lf = self.load_data_from_es_to_lf(
|
|
883
|
+
schema=schema,
|
|
884
|
+
index="graph_service_v1",
|
|
885
|
+
query=QueryBuilder.get_upstreams_query(),
|
|
886
|
+
process_function=self.upstream_lineage_batch,
|
|
865
887
|
)
|
|
888
|
+
|
|
866
889
|
wdf = (
|
|
867
890
|
(
|
|
868
891
|
upstreams_lf.join(
|
|
@@ -1116,7 +1139,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1116
1139
|
self, lazy_frame: polars.LazyFrame
|
|
1117
1140
|
) -> Iterable[MetadataWorkUnit]:
|
|
1118
1141
|
num = 0
|
|
1119
|
-
for row in lazy_frame.collect().
|
|
1142
|
+
for row in lazy_frame.collect().iter_rows(named=True):
|
|
1120
1143
|
num += 1
|
|
1121
1144
|
|
|
1122
1145
|
query_usage_features = QueryUsageFeaturesClass(
|
|
@@ -1186,49 +1209,43 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1186
1209
|
def generate_dashboard_chart_usage(
|
|
1187
1210
|
self, entity_index: str, usage_index: str
|
|
1188
1211
|
) -> polars.LazyFrame:
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
strict=True,
|
|
1212
|
+
soft_deleted_schema = {
|
|
1213
|
+
"entity_urn": polars.Categorical,
|
|
1214
|
+
"removed": polars.Boolean,
|
|
1215
|
+
"last_modified_at": polars.Int64,
|
|
1216
|
+
"siblings": polars.List(polars.String),
|
|
1217
|
+
"isView": polars.Boolean,
|
|
1218
|
+
}
|
|
1219
|
+
|
|
1220
|
+
soft_deleted_df = self.load_data_from_es_to_lf(
|
|
1221
|
+
schema=soft_deleted_schema,
|
|
1222
|
+
index=entity_index,
|
|
1223
|
+
query=QueryBuilder.get_dataset_entities_query(),
|
|
1224
|
+
process_function=self.soft_deleted_batch,
|
|
1203
1225
|
)
|
|
1204
1226
|
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
"urn": polars.Categorical,
|
|
1215
|
-
"platform": polars.Categorical,
|
|
1216
|
-
"eventGranularity": polars.String,
|
|
1217
|
-
"partitionSpec": polars.Struct(
|
|
1227
|
+
dashboard_usage_schema = {
|
|
1228
|
+
"timestampMillis": polars.Int64,
|
|
1229
|
+
"lastObserved": polars.Int64,
|
|
1230
|
+
"urn": polars.Categorical,
|
|
1231
|
+
"platform": polars.Categorical,
|
|
1232
|
+
"eventGranularity": polars.String,
|
|
1233
|
+
"viewsCount": polars.Int64,
|
|
1234
|
+
"userCounts": polars.List(
|
|
1235
|
+
polars.Struct(
|
|
1218
1236
|
{
|
|
1219
|
-
"
|
|
1237
|
+
"usageCount": polars.Int64,
|
|
1238
|
+
"user": polars.String,
|
|
1220
1239
|
}
|
|
1221
|
-
)
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
),
|
|
1231
|
-
},
|
|
1240
|
+
)
|
|
1241
|
+
),
|
|
1242
|
+
}
|
|
1243
|
+
|
|
1244
|
+
lf = self.load_data_from_es_to_lf(
|
|
1245
|
+
schema=dashboard_usage_schema,
|
|
1246
|
+
index=usage_index,
|
|
1247
|
+
query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
|
|
1248
|
+
process_function=self.process_dashboard_usage,
|
|
1232
1249
|
)
|
|
1233
1250
|
|
|
1234
1251
|
lf = (
|
|
@@ -1301,48 +1318,41 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1301
1318
|
def generate_query_usage(self) -> polars.LazyFrame:
|
|
1302
1319
|
usage_index = "query_queryusagestatisticsaspect_v1"
|
|
1303
1320
|
entity_index = "queryindex_v2"
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
},
|
|
1317
|
-
strict=True,
|
|
1321
|
+
query_entities_schema = {
|
|
1322
|
+
"entity_urn": polars.Categorical,
|
|
1323
|
+
"last_modified_at": polars.Int64,
|
|
1324
|
+
"platform": polars.Categorical,
|
|
1325
|
+
"removed": polars.Boolean,
|
|
1326
|
+
}
|
|
1327
|
+
|
|
1328
|
+
query_entities = self.load_data_from_es_to_lf(
|
|
1329
|
+
schema=query_entities_schema,
|
|
1330
|
+
index=entity_index,
|
|
1331
|
+
query=QueryBuilder.get_query_entities_query(),
|
|
1332
|
+
process_function=self.queries_entities_batch,
|
|
1318
1333
|
)
|
|
1319
1334
|
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
"lastObserved": polars.Int64,
|
|
1329
|
-
"urn": polars.Categorical,
|
|
1330
|
-
"eventGranularity": polars.String,
|
|
1331
|
-
"partitionSpec": polars.Struct(
|
|
1335
|
+
query_usage_schema = {
|
|
1336
|
+
"timestampMillis": polars.Int64,
|
|
1337
|
+
"lastObserved": polars.Int64,
|
|
1338
|
+
"urn": polars.Categorical,
|
|
1339
|
+
"eventGranularity": polars.String,
|
|
1340
|
+
"queryCount": polars.Int64,
|
|
1341
|
+
"userCounts": polars.List(
|
|
1342
|
+
polars.Struct(
|
|
1332
1343
|
{
|
|
1333
|
-
"
|
|
1344
|
+
"usageCount": polars.Int64,
|
|
1345
|
+
"user": polars.String,
|
|
1334
1346
|
}
|
|
1335
|
-
)
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
),
|
|
1345
|
-
},
|
|
1347
|
+
)
|
|
1348
|
+
),
|
|
1349
|
+
}
|
|
1350
|
+
|
|
1351
|
+
lf = self.load_data_from_es_to_lf(
|
|
1352
|
+
schema=query_usage_schema,
|
|
1353
|
+
index=usage_index,
|
|
1354
|
+
query=QueryBuilder.get_query_usage_query(self.config.lookback_days),
|
|
1355
|
+
process_function=self.process_query_usage,
|
|
1346
1356
|
)
|
|
1347
1357
|
|
|
1348
1358
|
lf = query_entities.join(
|
|
@@ -1380,36 +1390,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1380
1390
|
if self.config.set_upstream_table_max_modification_time_for_views:
|
|
1381
1391
|
datasets_lf = self.set_table_modification_time_for_views(datasets_lf)
|
|
1382
1392
|
|
|
1383
|
-
|
|
1384
|
-
lf: polars.LazyFrame = polars.LazyFrame(
|
|
1385
|
-
self.load_data_from_es(
|
|
1386
|
-
index=index,
|
|
1387
|
-
query=QueryBuilder.get_dataset_usage_query(self.config.lookback_days),
|
|
1388
|
-
process_function=self.process_batch,
|
|
1389
|
-
),
|
|
1390
|
-
schema={
|
|
1391
|
-
"timestampMillis": polars.Int64,
|
|
1392
|
-
"urn": polars.Categorical,
|
|
1393
|
-
"platform": polars.Categorical,
|
|
1394
|
-
"eventGranularity": polars.String,
|
|
1395
|
-
"partitionSpec": polars.Struct(
|
|
1396
|
-
{
|
|
1397
|
-
"partition": polars.String,
|
|
1398
|
-
}
|
|
1399
|
-
),
|
|
1400
|
-
"totalSqlQueries": polars.Int64,
|
|
1401
|
-
"uniqueUserCount": polars.Int64,
|
|
1402
|
-
"userCounts": polars.List(
|
|
1403
|
-
polars.Struct(
|
|
1404
|
-
{
|
|
1405
|
-
"count": polars.Int64,
|
|
1406
|
-
"user": polars.String,
|
|
1407
|
-
"userEmail": polars.String,
|
|
1408
|
-
}
|
|
1409
|
-
)
|
|
1410
|
-
),
|
|
1411
|
-
},
|
|
1412
|
-
)
|
|
1393
|
+
lf = self.load_dataset_usage()
|
|
1413
1394
|
|
|
1414
1395
|
# Polaris/pandas join merges the join column into one column and that's why we need to filter based on the removed column
|
|
1415
1396
|
lf = (
|
|
@@ -1472,23 +1453,101 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1472
1453
|
)
|
|
1473
1454
|
return usage_and_write_lf
|
|
1474
1455
|
|
|
1475
|
-
def
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1456
|
+
def load_data_from_es_to_lf(
|
|
1457
|
+
self,
|
|
1458
|
+
index: str,
|
|
1459
|
+
schema: Dict,
|
|
1460
|
+
query: Dict,
|
|
1461
|
+
process_function: Callable,
|
|
1462
|
+
aggregation_key: Optional[str] = None,
|
|
1463
|
+
file_to_load: Optional[str] = None,
|
|
1464
|
+
) -> polars.LazyFrame:
|
|
1465
|
+
data = self.load_data_from_es(
|
|
1466
|
+
index=index,
|
|
1467
|
+
query=query,
|
|
1468
|
+
process_function=process_function,
|
|
1469
|
+
aggregation_key=aggregation_key,
|
|
1470
|
+
)
|
|
1471
|
+
|
|
1472
|
+
if not self.config.streaming_mode:
|
|
1473
|
+
return polars.LazyFrame(data, schema)
|
|
1474
|
+
else:
|
|
1475
|
+
assert (
|
|
1476
|
+
self.temp_dir is not None
|
|
1477
|
+
), "In Streaming mode temp dir should be set. Normally this should not happen..."
|
|
1478
|
+
|
|
1479
|
+
with tempfile.NamedTemporaryFile(
|
|
1480
|
+
delete=False,
|
|
1481
|
+
mode="wb",
|
|
1482
|
+
dir=self.temp_dir.name,
|
|
1483
|
+
prefix=f"{index}_",
|
|
1484
|
+
suffix=".parquet",
|
|
1485
|
+
) as temp_file:
|
|
1486
|
+
tempfile_name = temp_file.name
|
|
1487
|
+
with pq.ParquetWriter(
|
|
1488
|
+
tempfile_name, self.polars_to_arrow_schema(schema)
|
|
1489
|
+
) as writer:
|
|
1490
|
+
logger.debug(f"Creating temporary file {tempfile_name}")
|
|
1491
|
+
|
|
1492
|
+
self.batch_write_parquet(
|
|
1493
|
+
data,
|
|
1494
|
+
schema,
|
|
1495
|
+
temp_file.name,
|
|
1496
|
+
parquet_writer=writer,
|
|
1497
|
+
)
|
|
1498
|
+
# Scan parquet fails in some cases with
|
|
1499
|
+
# thread 'polars-1' panicked at crates/polars-parquet/src/arrow/read/deserialize/dictionary_encoded/required_masked_dense.rs:113:72:
|
|
1500
|
+
# called `Option::unwrap()` on a `None` value
|
|
1501
|
+
# Which only happens if we don't collect immediately
|
|
1502
|
+
# return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True).collect().lazy()
|
|
1503
|
+
return (
|
|
1504
|
+
polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
|
|
1505
|
+
.collect()
|
|
1506
|
+
.lazy()
|
|
1507
|
+
)
|
|
1508
|
+
|
|
1509
|
+
def load_dataset_usage(self) -> polars.LazyFrame:
|
|
1510
|
+
index = "dataset_datasetusagestatisticsaspect_v1"
|
|
1511
|
+
schema = {
|
|
1512
|
+
"timestampMillis": polars.Int64,
|
|
1513
|
+
"urn": polars.Categorical,
|
|
1514
|
+
"platform": polars.Categorical,
|
|
1515
|
+
"eventGranularity": polars.String,
|
|
1516
|
+
"totalSqlQueries": polars.Int64,
|
|
1517
|
+
"uniqueUserCount": polars.Int64,
|
|
1518
|
+
"userCounts": polars.List(
|
|
1519
|
+
polars.Struct(
|
|
1520
|
+
{
|
|
1521
|
+
"count": polars.Int64,
|
|
1522
|
+
"user": polars.String,
|
|
1523
|
+
"userEmail": polars.String,
|
|
1524
|
+
}
|
|
1525
|
+
)
|
|
1481
1526
|
),
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1527
|
+
}
|
|
1528
|
+
|
|
1529
|
+
return self.load_data_from_es_to_lf(
|
|
1530
|
+
schema=schema,
|
|
1531
|
+
index=index,
|
|
1532
|
+
query=QueryBuilder.get_dataset_usage_query(self.config.lookback_days),
|
|
1533
|
+
process_function=self.process_batch,
|
|
1534
|
+
)
|
|
1535
|
+
|
|
1536
|
+
def get_datasets(self) -> polars.LazyFrame:
|
|
1537
|
+
schema = {
|
|
1538
|
+
"entity_urn": polars.Categorical,
|
|
1539
|
+
"removed": polars.Boolean,
|
|
1540
|
+
"last_modified_at": polars.Int64,
|
|
1541
|
+
"siblings": polars.List(polars.String),
|
|
1542
|
+
"isView": polars.Boolean,
|
|
1543
|
+
}
|
|
1544
|
+
|
|
1545
|
+
return self.load_data_from_es_to_lf(
|
|
1546
|
+
schema=schema,
|
|
1547
|
+
index="datasetindex_v2",
|
|
1548
|
+
query=QueryBuilder.get_dataset_entities_query(),
|
|
1549
|
+
process_function=self.soft_deleted_batch,
|
|
1490
1550
|
)
|
|
1491
|
-
return datasets_df
|
|
1492
1551
|
|
|
1493
1552
|
def generate_top_users(
|
|
1494
1553
|
self, lf: polars.LazyFrame, count_field_name: str = "count"
|
|
@@ -1560,6 +1619,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1560
1619
|
batch_size: int = 1000,
|
|
1561
1620
|
delay: Optional[float] = None,
|
|
1562
1621
|
) -> Iterable[Dict[str, Any]]:
|
|
1622
|
+
processed_count = 0
|
|
1563
1623
|
while True:
|
|
1564
1624
|
with PerfTimer() as timer:
|
|
1565
1625
|
logger.debug(f"ES query: {query}")
|
|
@@ -1581,8 +1641,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1581
1641
|
yield from process_function(results["hits"]["hits"])
|
|
1582
1642
|
|
|
1583
1643
|
time_taken = timer.elapsed_seconds()
|
|
1644
|
+
processed_count += len(results["hits"]["hits"])
|
|
1584
1645
|
logger.info(
|
|
1585
|
-
f"Processed {len(results['hits']['hits'''])} data from {index} index in {time_taken:.3f} seconds"
|
|
1646
|
+
f"Processed {len(results['hits']['hits'''])} data from {index} index in {time_taken:.3f} seconds. Total: {processed_count} processed."
|
|
1586
1647
|
)
|
|
1587
1648
|
if len(results["hits"]["hits"]) < batch_size:
|
|
1588
1649
|
break
|
|
@@ -1609,9 +1670,3 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
|
|
|
1609
1670
|
|
|
1610
1671
|
def get_report(self) -> SourceReport:
|
|
1611
1672
|
return self.report
|
|
1612
|
-
|
|
1613
|
-
def __del__(self) -> None:
|
|
1614
|
-
for temp_file in self.temp_files_to_clean:
|
|
1615
|
-
logger.info(f"Cleaning up temp file: {temp_file}")
|
|
1616
|
-
os.remove(temp_file)
|
|
1617
|
-
self.temp_files_to_clean = []
|