acryl-datahub-cloud 0.3.8rc3__py3-none-any.whl → 0.3.8rc6__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub-cloud might be problematic. Click here for more details.

Files changed (277)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/datahub_metadata_sharing/scroll_shared_entities.gql +204 -0
  3. acryl_datahub_cloud/datahub_metadata_sharing/share_entity.gql +9 -0
  4. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +79 -57
  5. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +268 -213
  6. acryl_datahub_cloud/metadata/schema_classes.py +2 -2
  7. {acryl_datahub_cloud-0.3.8rc3.dist-info → acryl_datahub_cloud-0.3.8rc6.dist-info}/METADATA +38 -38
  8. acryl_datahub_cloud-0.3.8rc6.dist-info/RECORD +133 -0
  9. acryl_datahub_cloud/metadata/schema.avsc +0 -26607
  10. acryl_datahub_cloud/metadata/schemas/Access.avsc +0 -55
  11. acryl_datahub_cloud/metadata/schemas/ActionRequestArchived.avsc +0 -68
  12. acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +0 -524
  13. acryl_datahub_cloud/metadata/schemas/ActionRequestKey.avsc +0 -21
  14. acryl_datahub_cloud/metadata/schemas/ActionRequestStatus.avsc +0 -85
  15. acryl_datahub_cloud/metadata/schemas/Actors.avsc +0 -48
  16. acryl_datahub_cloud/metadata/schemas/AiInferenceMetadata.avsc +0 -42
  17. acryl_datahub_cloud/metadata/schemas/AnomaliesSummary.avsc +0 -126
  18. acryl_datahub_cloud/metadata/schemas/AnomalyInfo.avsc +0 -342
  19. acryl_datahub_cloud/metadata/schemas/AnomalyKey.avsc +0 -22
  20. acryl_datahub_cloud/metadata/schemas/AssertionActions.avsc +0 -53
  21. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +0 -3506
  22. acryl_datahub_cloud/metadata/schemas/AssertionDryRunEvent.avsc +0 -309
  23. acryl_datahub_cloud/metadata/schemas/AssertionInferenceDetails.avsc +0 -105
  24. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +0 -2579
  25. acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +0 -32
  26. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +0 -3374
  27. acryl_datahub_cloud/metadata/schemas/AssertionSummary.avsc +0 -50
  28. acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +0 -189
  29. acryl_datahub_cloud/metadata/schemas/BatchTestRunEvent.avsc +0 -286
  30. acryl_datahub_cloud/metadata/schemas/BrowsePaths.avsc +0 -25
  31. acryl_datahub_cloud/metadata/schemas/BrowsePathsV2.avsc +0 -50
  32. acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +0 -601
  33. acryl_datahub_cloud/metadata/schemas/BusinessAttributeKey.avsc +0 -24
  34. acryl_datahub_cloud/metadata/schemas/BusinessAttributes.avsc +0 -51
  35. acryl_datahub_cloud/metadata/schemas/CaveatsAndRecommendations.avsc +0 -78
  36. acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +0 -346
  37. acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +0 -58
  38. acryl_datahub_cloud/metadata/schemas/ChartQuery.avsc +0 -39
  39. acryl_datahub_cloud/metadata/schemas/ChartUsageStatistics.avsc +0 -195
  40. acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +0 -182
  41. acryl_datahub_cloud/metadata/schemas/ConstraintKey.avsc +0 -20
  42. acryl_datahub_cloud/metadata/schemas/Container.avsc +0 -36
  43. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +0 -47
  44. acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +0 -189
  45. acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +0 -52
  46. acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +0 -177
  47. acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +0 -39
  48. acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +0 -106
  49. acryl_datahub_cloud/metadata/schemas/CorpUserCredentials.avsc +0 -42
  50. acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +0 -169
  51. acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +0 -171
  52. acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +0 -43
  53. acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +0 -165
  54. acryl_datahub_cloud/metadata/schemas/CorpUserStatus.avsc +0 -73
  55. acryl_datahub_cloud/metadata/schemas/Cost.avsc +0 -64
  56. acryl_datahub_cloud/metadata/schemas/CostFeatures.avsc +0 -36
  57. acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +0 -403
  58. acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +0 -57
  59. acryl_datahub_cloud/metadata/schemas/DashboardUsageStatistics.avsc +0 -255
  60. acryl_datahub_cloud/metadata/schemas/DataContractKey.avsc +0 -23
  61. acryl_datahub_cloud/metadata/schemas/DataContractProperties.avsc +0 -201
  62. acryl_datahub_cloud/metadata/schemas/DataContractStatus.avsc +0 -44
  63. acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +0 -188
  64. acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +0 -63
  65. acryl_datahub_cloud/metadata/schemas/DataHubAccessTokenInfo.avsc +0 -74
  66. acryl_datahub_cloud/metadata/schemas/DataHubAccessTokenKey.avsc +0 -21
  67. acryl_datahub_cloud/metadata/schemas/DataHubActionInfo.avsc +0 -121
  68. acryl_datahub_cloud/metadata/schemas/DataHubActionKey.avsc +0 -22
  69. acryl_datahub_cloud/metadata/schemas/DataHubActionStatus.avsc +0 -181
  70. acryl_datahub_cloud/metadata/schemas/DataHubConnectionDetails.avsc +0 -62
  71. acryl_datahub_cloud/metadata/schemas/DataHubConnectionKey.avsc +0 -23
  72. acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceInfo.avsc +0 -157
  73. acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +0 -21
  74. acryl_datahub_cloud/metadata/schemas/DataHubPersonaInfo.avsc +0 -24
  75. acryl_datahub_cloud/metadata/schemas/DataHubPersonaKey.avsc +0 -21
  76. acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +0 -302
  77. acryl_datahub_cloud/metadata/schemas/DataHubPolicyKey.avsc +0 -22
  78. acryl_datahub_cloud/metadata/schemas/DataHubRetentionConfig.avsc +0 -59
  79. acryl_datahub_cloud/metadata/schemas/DataHubRetentionKey.avsc +0 -26
  80. acryl_datahub_cloud/metadata/schemas/DataHubRoleInfo.avsc +0 -33
  81. acryl_datahub_cloud/metadata/schemas/DataHubRoleKey.avsc +0 -21
  82. acryl_datahub_cloud/metadata/schemas/DataHubSecretKey.avsc +0 -21
  83. acryl_datahub_cloud/metadata/schemas/DataHubSecretValue.avsc +0 -91
  84. acryl_datahub_cloud/metadata/schemas/DataHubStepStateKey.avsc +0 -21
  85. acryl_datahub_cloud/metadata/schemas/DataHubStepStateProperties.avsc +0 -68
  86. acryl_datahub_cloud/metadata/schemas/DataHubUpgradeKey.avsc +0 -21
  87. acryl_datahub_cloud/metadata/schemas/DataHubUpgradeRequest.avsc +0 -21
  88. acryl_datahub_cloud/metadata/schemas/DataHubUpgradeResult.avsc +0 -53
  89. acryl_datahub_cloud/metadata/schemas/DataHubViewInfo.avsc +0 -265
  90. acryl_datahub_cloud/metadata/schemas/DataHubViewKey.avsc +0 -21
  91. acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +0 -254
  92. acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +0 -462
  93. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +0 -75
  94. acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +0 -93
  95. acryl_datahub_cloud/metadata/schemas/DataPlatformInstance.avsc +0 -44
  96. acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceKey.avsc +0 -35
  97. acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +0 -72
  98. acryl_datahub_cloud/metadata/schemas/DataPlatformKey.avsc +0 -21
  99. acryl_datahub_cloud/metadata/schemas/DataProcessInfo.avsc +0 -73
  100. acryl_datahub_cloud/metadata/schemas/DataProcessInstanceInput.avsc +0 -38
  101. acryl_datahub_cloud/metadata/schemas/DataProcessInstanceKey.avsc +0 -29
  102. acryl_datahub_cloud/metadata/schemas/DataProcessInstanceOutput.avsc +0 -38
  103. acryl_datahub_cloud/metadata/schemas/DataProcessInstanceProperties.avsc +0 -131
  104. acryl_datahub_cloud/metadata/schemas/DataProcessInstanceRelationships.avsc +0 -99
  105. acryl_datahub_cloud/metadata/schemas/DataProcessInstanceRunEvent.avsc +0 -229
  106. acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +0 -84
  107. acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +0 -32
  108. acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +0 -211
  109. acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +0 -63
  110. acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +0 -33
  111. acryl_datahub_cloud/metadata/schemas/DataTypeKey.avsc +0 -23
  112. acryl_datahub_cloud/metadata/schemas/DatahubIngestionCheckpoint.avsc +0 -188
  113. acryl_datahub_cloud/metadata/schemas/DatahubIngestionRunSummary.avsc +0 -365
  114. acryl_datahub_cloud/metadata/schemas/DatasetDeprecation.avsc +0 -50
  115. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +0 -135
  116. acryl_datahub_cloud/metadata/schemas/DatasetProfile.avsc +0 -539
  117. acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +0 -165
  118. acryl_datahub_cloud/metadata/schemas/DatasetUpstreamLineage.avsc +0 -129
  119. acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +0 -247
  120. acryl_datahub_cloud/metadata/schemas/Deprecation.avsc +0 -57
  121. acryl_datahub_cloud/metadata/schemas/DisplayProperties.avsc +0 -62
  122. acryl_datahub_cloud/metadata/schemas/Documentation.avsc +0 -152
  123. acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +0 -30
  124. acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +0 -137
  125. acryl_datahub_cloud/metadata/schemas/Domains.avsc +0 -38
  126. acryl_datahub_cloud/metadata/schemas/DynamicFormAssignment.avsc +0 -150
  127. acryl_datahub_cloud/metadata/schemas/ERModelRelationshipKey.avsc +0 -28
  128. acryl_datahub_cloud/metadata/schemas/ERModelRelationshipProperties.avsc +0 -196
  129. acryl_datahub_cloud/metadata/schemas/EditableChartProperties.avsc +0 -98
  130. acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +0 -24
  131. acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +0 -98
  132. acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +0 -98
  133. acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +0 -98
  134. acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +0 -111
  135. acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +0 -111
  136. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +0 -24
  137. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +0 -24
  138. acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +0 -24
  139. acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +0 -24
  140. acryl_datahub_cloud/metadata/schemas/EditableMLPrimaryKeyProperties.avsc +0 -24
  141. acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +0 -98
  142. acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +0 -431
  143. acryl_datahub_cloud/metadata/schemas/Embed.avsc +0 -20
  144. acryl_datahub_cloud/metadata/schemas/EntityChangeEvent.avsc +0 -112
  145. acryl_datahub_cloud/metadata/schemas/EntityInferenceMetadata.avsc +0 -47
  146. acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +0 -33
  147. acryl_datahub_cloud/metadata/schemas/EntityTypeKey.avsc +0 -24
  148. acryl_datahub_cloud/metadata/schemas/EthicalConsiderations.avsc +0 -71
  149. acryl_datahub_cloud/metadata/schemas/EvaluationData.avsc +0 -56
  150. acryl_datahub_cloud/metadata/schemas/ExecutionRequestInput.avsc +0 -134
  151. acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +0 -23
  152. acryl_datahub_cloud/metadata/schemas/ExecutionRequestResult.avsc +0 -97
  153. acryl_datahub_cloud/metadata/schemas/ExecutionRequestSignal.avsc +0 -73
  154. acryl_datahub_cloud/metadata/schemas/Filter.avsc +0 -126
  155. acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +0 -517
  156. acryl_datahub_cloud/metadata/schemas/FormKey.avsc +0 -23
  157. acryl_datahub_cloud/metadata/schemas/Forms.avsc +0 -447
  158. acryl_datahub_cloud/metadata/schemas/GenericEntityKey.avsc +0 -16
  159. acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +0 -524
  160. acryl_datahub_cloud/metadata/schemas/GlobalSettingsKey.avsc +0 -22
  161. acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +0 -132
  162. acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +0 -89
  163. acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +0 -33
  164. acryl_datahub_cloud/metadata/schemas/GlossaryRelatedTerms.avsc +0 -125
  165. acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +0 -131
  166. acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +0 -39
  167. acryl_datahub_cloud/metadata/schemas/GlossaryTerms.avsc +0 -190
  168. acryl_datahub_cloud/metadata/schemas/GroupMembership.avsc +0 -28
  169. acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +0 -605
  170. acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +0 -376
  171. acryl_datahub_cloud/metadata/schemas/IncidentKey.avsc +0 -25
  172. acryl_datahub_cloud/metadata/schemas/IncidentNotificationDetails.avsc +0 -62
  173. acryl_datahub_cloud/metadata/schemas/IncidentSource.avsc +0 -48
  174. acryl_datahub_cloud/metadata/schemas/IncidentsSummary.avsc +0 -160
  175. acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +0 -398
  176. acryl_datahub_cloud/metadata/schemas/InferredNeighbors.avsc +0 -112
  177. acryl_datahub_cloud/metadata/schemas/InputFields.avsc +0 -678
  178. acryl_datahub_cloud/metadata/schemas/InstitutionalMemory.avsc +0 -88
  179. acryl_datahub_cloud/metadata/schemas/IntendedUse.avsc +0 -56
  180. acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +0 -34
  181. acryl_datahub_cloud/metadata/schemas/InviteTokenKey.avsc +0 -21
  182. acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +0 -76
  183. acryl_datahub_cloud/metadata/schemas/LinkPreviewInfo.avsc +0 -38
  184. acryl_datahub_cloud/metadata/schemas/LinkPreviewKey.avsc +0 -21
  185. acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +0 -57
  186. acryl_datahub_cloud/metadata/schemas/MLFeatureProperties.avsc +0 -189
  187. acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +0 -66
  188. acryl_datahub_cloud/metadata/schemas/MLFeatureTableProperties.avsc +0 -95
  189. acryl_datahub_cloud/metadata/schemas/MLHyperParam.avsc +0 -43
  190. acryl_datahub_cloud/metadata/schemas/MLMetric.avsc +0 -43
  191. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +0 -92
  192. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +0 -173
  193. acryl_datahub_cloud/metadata/schemas/MLModelFactorPrompts.avsc +0 -78
  194. acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +0 -102
  195. acryl_datahub_cloud/metadata/schemas/MLModelGroupProperties.avsc +0 -123
  196. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +0 -117
  197. acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +0 -414
  198. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +0 -53
  199. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyProperties.avsc +0 -185
  200. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +0 -8710
  201. acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +0 -360
  202. acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +0 -290
  203. acryl_datahub_cloud/metadata/schemas/Metrics.avsc +0 -35
  204. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +0 -3238
  205. acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +0 -43
  206. acryl_datahub_cloud/metadata/schemas/MonitorTimeseriesState.avsc +0 -159
  207. acryl_datahub_cloud/metadata/schemas/NativeGroupMembership.avsc +0 -28
  208. acryl_datahub_cloud/metadata/schemas/NotebookContent.avsc +0 -252
  209. acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +0 -154
  210. acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +0 -44
  211. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +0 -427
  212. acryl_datahub_cloud/metadata/schemas/Operation.avsc +0 -381
  213. acryl_datahub_cloud/metadata/schemas/Origin.avsc +0 -157
  214. acryl_datahub_cloud/metadata/schemas/Ownership.avsc +0 -255
  215. acryl_datahub_cloud/metadata/schemas/OwnershipTypeInfo.avsc +0 -103
  216. acryl_datahub_cloud/metadata/schemas/OwnershipTypeKey.avsc +0 -23
  217. acryl_datahub_cloud/metadata/schemas/PartitionsSummary.avsc +0 -59
  218. acryl_datahub_cloud/metadata/schemas/PlatformEvent.avsc +0 -52
  219. acryl_datahub_cloud/metadata/schemas/PlatformResourceInfo.avsc +0 -109
  220. acryl_datahub_cloud/metadata/schemas/PlatformResourceKey.avsc +0 -24
  221. acryl_datahub_cloud/metadata/schemas/PostInfo.avsc +0 -262
  222. acryl_datahub_cloud/metadata/schemas/PostKey.avsc +0 -22
  223. acryl_datahub_cloud/metadata/schemas/Proposals.avsc +0 -53
  224. acryl_datahub_cloud/metadata/schemas/QuantitativeAnalyses.avsc +0 -29
  225. acryl_datahub_cloud/metadata/schemas/QueryKey.avsc +0 -28
  226. acryl_datahub_cloud/metadata/schemas/QueryProperties.avsc +0 -171
  227. acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +0 -50
  228. acryl_datahub_cloud/metadata/schemas/QueryUsageFeatures.avsc +0 -94
  229. acryl_datahub_cloud/metadata/schemas/QueryUsageStatistics.avsc +0 -221
  230. acryl_datahub_cloud/metadata/schemas/RecommendationModule.avsc +0 -259
  231. acryl_datahub_cloud/metadata/schemas/RecommendationModuleKey.avsc +0 -26
  232. acryl_datahub_cloud/metadata/schemas/RemoteExecutorKey.avsc +0 -21
  233. acryl_datahub_cloud/metadata/schemas/RemoteExecutorStatus.avsc +0 -80
  234. acryl_datahub_cloud/metadata/schemas/RoleKey.avsc +0 -22
  235. acryl_datahub_cloud/metadata/schemas/RoleMembership.avsc +0 -28
  236. acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +0 -99
  237. acryl_datahub_cloud/metadata/schemas/SchemaFieldAliases.avsc +0 -29
  238. acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +0 -42
  239. acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +0 -46
  240. acryl_datahub_cloud/metadata/schemas/SchemaFieldProfile.avsc +0 -474
  241. acryl_datahub_cloud/metadata/schemas/SchemaFieldsInferredMetadata.avsc +0 -222
  242. acryl_datahub_cloud/metadata/schemas/SchemaFieldsInferredNeighbors.avsc +0 -136
  243. acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +0 -1045
  244. acryl_datahub_cloud/metadata/schemas/SchemaProposals.avsc +0 -73
  245. acryl_datahub_cloud/metadata/schemas/Share.avsc +0 -211
  246. acryl_datahub_cloud/metadata/schemas/Siblings.avsc +0 -41
  247. acryl_datahub_cloud/metadata/schemas/SlackUserInfo.avsc +0 -160
  248. acryl_datahub_cloud/metadata/schemas/SourceCode.avsc +0 -49
  249. acryl_datahub_cloud/metadata/schemas/Status.avsc +0 -20
  250. acryl_datahub_cloud/metadata/schemas/StorageFeatures.avsc +0 -76
  251. acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +0 -106
  252. acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +0 -390
  253. acryl_datahub_cloud/metadata/schemas/StructuredPropertyKey.avsc +0 -26
  254. acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +0 -114
  255. acryl_datahub_cloud/metadata/schemas/SubTypes.avsc +0 -27
  256. acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +0 -355
  257. acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +0 -21
  258. acryl_datahub_cloud/metadata/schemas/TagKey.avsc +0 -33
  259. acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +0 -43
  260. acryl_datahub_cloud/metadata/schemas/TelemetryClientId.avsc +0 -16
  261. acryl_datahub_cloud/metadata/schemas/TelemetryKey.avsc +0 -21
  262. acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +0 -300
  263. acryl_datahub_cloud/metadata/schemas/TestKey.avsc +0 -24
  264. acryl_datahub_cloud/metadata/schemas/TestResults.avsc +0 -163
  265. acryl_datahub_cloud/metadata/schemas/TrainingData.avsc +0 -56
  266. acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +0 -286
  267. acryl_datahub_cloud/metadata/schemas/UsageAggregation.avsc +0 -153
  268. acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +0 -243
  269. acryl_datahub_cloud/metadata/schemas/VersionInfo.avsc +0 -52
  270. acryl_datahub_cloud/metadata/schemas/VersionProperties.avsc +0 -216
  271. acryl_datahub_cloud/metadata/schemas/VersionSetKey.avsc +0 -26
  272. acryl_datahub_cloud/metadata/schemas/VersionSetProperties.avsc +0 -49
  273. acryl_datahub_cloud/metadata/schemas/ViewProperties.avsc +0 -41
  274. acryl_datahub_cloud-0.3.8rc3.dist-info/RECORD +0 -396
  275. {acryl_datahub_cloud-0.3.8rc3.dist-info → acryl_datahub_cloud-0.3.8rc6.dist-info}/WHEEL +0 -0
  276. {acryl_datahub_cloud-0.3.8rc3.dist-info → acryl_datahub_cloud-0.3.8rc6.dist-info}/entry_points.txt +0 -0
  277. {acryl_datahub_cloud-0.3.8rc3.dist-info → acryl_datahub_cloud-0.3.8rc6.dist-info}/top_level.txt +0 -0
@@ -9,7 +9,8 @@ from dataclasses import dataclass, field
9
9
  from datetime import datetime
10
10
  from functools import partial
11
11
  from itertools import chain
12
- from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
12
+ from tempfile import TemporaryDirectory
13
+ from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union
13
14
 
14
15
  import numpy
15
16
  import polars
@@ -17,6 +18,7 @@ import pyarrow as pa
17
18
  import pyarrow.parquet as pq
18
19
  from elasticsearch.client import Elasticsearch
19
20
  from opensearchpy import OpenSearch
21
+ from polars.datatypes import DataTypeClass
20
22
  from pydantic import Field
21
23
  from scipy.stats import expon
22
24
 
@@ -171,7 +173,7 @@ class DataHubUsageFeatureReportingSourceConfig(
171
173
  )
172
174
 
173
175
  disable_write_usage: bool = Field(
174
- False,
176
+ True,
175
177
  description="Flag to disable write usage statistics collection.'",
176
178
  )
177
179
 
@@ -245,6 +247,7 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
245
247
  class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
246
248
  platform = "datahub"
247
249
  temp_files_to_clean: List[str] = []
250
+ temp_dir: Optional[TemporaryDirectory] = None
248
251
 
249
252
  def __init__(
250
253
  self, ctx: PipelineContext, config: DataHubUsageFeatureReportingSourceConfig
@@ -267,6 +270,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
267
270
  if num > 0:
268
271
  logger.info(f"Compiled {num} regexp factors")
269
272
 
273
+ if self.config.streaming_mode:
274
+ self.temp_dir = tempfile.TemporaryDirectory(prefix="datahub-usage-")
275
+ logger.info(f"Using temp dir: {self.temp_dir.name}")
276
+
270
277
  def soft_deleted_batch(self, results: Iterable) -> Iterable[Dict]:
271
278
  with PerfTimer() as timer:
272
279
  for doc in results:
@@ -397,7 +404,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
397
404
  if "eventGranularity" in doc["_source"]
398
405
  else None
399
406
  ),
400
- "partitionSpec": doc["_source"]["partitionSpec"],
401
407
  "viewsCount": (
402
408
  doc["_source"]["viewsCount"]
403
409
  if "viewsCount" in doc["_source"]
@@ -410,7 +416,8 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
410
416
  ),
411
417
  "userCounts": (
412
418
  doc["_source"]["event"]["userCounts"]
413
- if "userCounts" in doc["_source"]["event"]
419
+ if "event" in doc["_source"]
420
+ and "userCounts" in doc["_source"]["event"]
414
421
  else []
415
422
  ),
416
423
  "platform": platform,
@@ -427,7 +434,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
427
434
  if "eventGranularity" in doc["_source"]
428
435
  else None
429
436
  ),
430
- "partitionSpec": doc["_source"]["partitionSpec"],
431
437
  "queryCount": (
432
438
  doc["_source"]["queryCount"]
433
439
  if "queryCount" in doc["_source"]
@@ -497,7 +503,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
497
503
  "timestampMillis": doc["_source"]["timestampMillis"],
498
504
  "urn": doc["_source"]["urn"],
499
505
  "eventGranularity": doc["_source"]["eventGranularity"],
500
- "partitionSpec": doc["_source"]["partitionSpec"],
501
506
  "totalSqlQueries": doc["_source"]["totalSqlQueries"],
502
507
  "uniqueUserCount": doc["_source"]["uniqueUserCount"],
503
508
  "userCounts": (
@@ -695,9 +700,13 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
695
700
  return lf
696
701
 
697
702
  @staticmethod
698
- def polars_to_arrow_schema(polars_schema: Dict[str, polars.DataType]) -> pa.Schema:
699
- def convert_dtype(polars_dtype: polars.DataType) -> pa.DataType:
700
- type_mapping: Dict[polars.DataType, pa.DataType] = {
703
+ def polars_to_arrow_schema(
704
+ polars_schema: Dict[str, Union[DataTypeClass, polars.DataType]]
705
+ ) -> pa.Schema:
706
+ def convert_dtype(
707
+ polars_dtype: Union[DataTypeClass, polars.DataType]
708
+ ) -> pa.DataType:
709
+ type_mapping: Dict[Union[DataTypeClass, polars.DataType], pa.DataType] = {
701
710
  polars.Boolean(): pa.bool_(),
702
711
  polars.Int8(): pa.int8(),
703
712
  polars.Int16(): pa.int16(),
@@ -710,6 +719,8 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
710
719
  polars.Float32(): pa.float32(),
711
720
  polars.Float64(): pa.float64(),
712
721
  polars.Utf8(): pa.string(),
722
+ polars.Utf8(): pa.utf8(),
723
+ polars.String(): pa.string(),
713
724
  polars.Date(): pa.date32(),
714
725
  polars.Datetime(): pa.timestamp("ns"),
715
726
  polars.Time(): pa.time64("ns"),
@@ -718,85 +729,97 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
718
729
 
719
730
  if polars_dtype in [type(key) for key in type_mapping.keys()]:
720
731
  return type_mapping[polars_dtype]
721
- elif polars_dtype == polars.Categorical():
732
+ elif polars_dtype == polars.Categorical:
722
733
  return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
734
+ elif isinstance(polars_dtype, polars.Struct):
735
+ return pa.struct(
736
+ {
737
+ field.name: convert_dtype(field.dtype)
738
+ for field in polars_dtype.fields
739
+ }
740
+ )
741
+ elif isinstance(polars_dtype, polars.List):
742
+ return pa.list_(convert_dtype(polars_dtype.inner))
723
743
  else:
724
744
  raise ValueError(f"Unsupported Polars dtype: {polars_dtype}")
725
745
 
726
746
  fields = [(name, convert_dtype(dtype)) for name, dtype in polars_schema.items()]
727
747
  return pa.schema(fields)
728
748
 
729
- def load_es_data_to_lf(
730
- self, index: str, query: Dict, read_function: Callable, schema: Dict
731
- ) -> polars.LazyFrame:
732
- es_data = self.load_data_from_es(
733
- index,
734
- query,
735
- read_function,
736
- )
737
-
738
- with tempfile.NamedTemporaryFile(
739
- delete=False, mode="wb", suffix=".parquet"
740
- ) as temp_file:
741
- tempfile_name = temp_file.name
742
- logger.debug(f"Creating temporary file {tempfile_name}")
743
- self.temp_files_to_clean.append(tempfile_name)
744
-
745
- # Create a PyArrow schema from the provided schema dict
746
- pa_schema = self.polars_to_arrow_schema(schema)
747
-
748
- # Initialize the ParquetWriter
749
- with pq.ParquetWriter(tempfile_name, pa_schema) as writer:
750
- batch_size = (
751
- 1000 # Adjust this value based on your data and memory constraints
752
- )
753
- current_batch = []
754
-
755
- for row in es_data:
756
- current_batch.append(row)
757
-
758
- if len(current_batch) >= batch_size:
759
- # Convert the batch to a PyArrow Table
760
- table = pa.Table.from_pylist(current_batch, schema=pa_schema)
749
+ def batch_write_parquet(
750
+ self,
751
+ data_iterator: Iterable[Dict[Any, Any]],
752
+ pl_schema: Dict,
753
+ output_path: str,
754
+ batch_size: int = 50000,
755
+ append: bool = False,
756
+ parquet_writer: Optional[pq.ParquetWriter] = None,
757
+ ) -> None:
758
+ """
759
+ Write data in batches to a file with support for appending to existing files.
760
+
761
+ Args:
762
+ data_iterator: Iterator of dictionaries containing the data
763
+ pa_schema: PyArrow schema for the data
764
+ output_path: Path for the output file
765
+ format_type: One of "ipc", "feather", "csv", "parquet", "pl_parquet"
766
+ batch_size: Number of rows per batch
767
+ append: If True, append to existing file. If False, create new file.
768
+ parquet_writer: Parquet doesn't let to append to existing file, so we need to pass the writer object
769
+ Returns:
770
+ LazyFrame pointing to the written data
771
+ """
772
+ arrow_schema = self.polars_to_arrow_schema(pl_schema)
761
773
 
762
- # Write the batch
763
- writer.write_table(table)
774
+ total_rows = 0
775
+ total_batches = 0
764
776
 
765
- # Clear the current batch
766
- current_batch = []
777
+ try:
778
+ if parquet_writer:
779
+ writer = parquet_writer
780
+ else:
781
+ writer = pq.ParquetWriter(output_path, arrow_schema)
767
782
 
768
- # Write any remaining rows
769
- if current_batch:
770
- table = pa.Table.from_pylist(current_batch, schema=pa_schema)
783
+ try:
784
+ for batch in self._get_batches(data_iterator, batch_size):
785
+ table = pa.Table.from_pylist(batch, schema=arrow_schema)
771
786
  writer.write_table(table)
787
+ total_rows += len(batch)
788
+ total_batches += 1
789
+ logger.debug(f"Wrote batch {total_batches} ({len(batch)} rows)")
790
+ finally:
791
+ if not parquet_writer:
792
+ writer.close()
793
+ except Exception as e:
794
+ logger.exception(f"Error during batch writing: {str(e)}", exc_info=True)
795
+ raise
796
+
797
+ def _get_batches(
798
+ self, iterator: Iterable[Dict], batch_size: int
799
+ ) -> Iterator[List[Dict]]:
800
+ """Helper generator to create batches from an iterator."""
801
+ current_batch = []
802
+ for item in iterator:
803
+ current_batch.append(item)
804
+ if len(current_batch) >= batch_size:
805
+ yield current_batch
806
+ current_batch = []
772
807
 
773
- return polars.scan_parquet(tempfile_name)
808
+ if current_batch:
809
+ yield current_batch
774
810
 
775
811
  def load_write_usage(
776
812
  self, soft_deleted_entities_df: polars.LazyFrame
777
813
  ) -> polars.LazyFrame:
778
- if self.config.streaming_mode:
779
- wdf = self.load_es_data_to_lf(
780
- index="dataset_operationaspect_v1",
781
- query=QueryBuilder.get_dataset_write_usage_raw_query(
782
- self.config.lookback_days
783
- ),
784
- read_function=self.write_stat_raw_batch,
785
- schema={"urn": polars.Categorical, "platform": polars.Categorical},
786
- )
787
- wdf = wdf.cast({polars.String: polars.Categorical})
788
- else:
789
- wdf = polars.LazyFrame(
790
- self.load_data_from_es(
791
- "dataset_operationaspect_v1",
792
- QueryBuilder.get_dataset_write_usage_raw_query(
793
- self.config.lookback_days
794
- ),
795
- self.write_stat_raw_batch,
796
- ),
797
- schema={"urn": polars.Categorical, "platform": polars.Categorical},
798
- strict=True,
799
- )
814
+ wdf = self.load_data_from_es_to_lf(
815
+ index="dataset_operationaspect_v1",
816
+ query=QueryBuilder.get_dataset_write_usage_raw_query(
817
+ self.config.lookback_days
818
+ ),
819
+ process_function=self.write_stat_raw_batch,
820
+ schema={"urn": polars.Categorical, "platform": polars.Categorical},
821
+ )
822
+ wdf = wdf.cast({polars.String: polars.Categorical})
800
823
 
801
824
  wdf = wdf.group_by(polars.col("urn"), polars.col("platform")).agg(
802
825
  polars.col("urn").count().alias("write_count"),
@@ -851,18 +874,18 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
851
874
  def set_table_modification_time_for_views(
852
875
  self, datasets_df: polars.LazyFrame
853
876
  ) -> polars.LazyFrame:
854
- upstreams_lf = polars.LazyFrame(
855
- self.load_data_from_es(
856
- "graph_service_v1",
857
- QueryBuilder.get_upstreams_query(),
858
- self.upstream_lineage_batch,
859
- ),
860
- schema={
861
- "source_urn": polars.Categorical,
862
- "destination_urn": polars.Categorical,
863
- },
864
- strict=True,
877
+ schema = {
878
+ "source_urn": polars.Categorical,
879
+ "destination_urn": polars.Categorical,
880
+ }
881
+
882
+ upstreams_lf = self.load_data_from_es_to_lf(
883
+ schema=schema,
884
+ index="graph_service_v1",
885
+ query=QueryBuilder.get_upstreams_query(),
886
+ process_function=self.upstream_lineage_batch,
865
887
  )
888
+
866
889
  wdf = (
867
890
  (
868
891
  upstreams_lf.join(
@@ -1116,7 +1139,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1116
1139
  self, lazy_frame: polars.LazyFrame
1117
1140
  ) -> Iterable[MetadataWorkUnit]:
1118
1141
  num = 0
1119
- for row in lazy_frame.collect().to_struct():
1142
+ for row in lazy_frame.collect().iter_rows(named=True):
1120
1143
  num += 1
1121
1144
 
1122
1145
  query_usage_features = QueryUsageFeaturesClass(
@@ -1186,49 +1209,43 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1186
1209
  def generate_dashboard_chart_usage(
1187
1210
  self, entity_index: str, usage_index: str
1188
1211
  ) -> polars.LazyFrame:
1189
- soft_deleted_df = polars.LazyFrame(
1190
- self.load_data_from_es(
1191
- index=entity_index,
1192
- query=QueryBuilder.get_soft_deleted_entities_query(),
1193
- process_function=self.soft_deleted_batch,
1194
- ),
1195
- schema={
1196
- "entity_urn": polars.Categorical,
1197
- "removed": bool,
1198
- "last_modified_at": polars.Int64,
1199
- "siblings": polars.List(polars.String),
1200
- "isView": polars.Boolean,
1201
- },
1202
- strict=True,
1212
+ soft_deleted_schema = {
1213
+ "entity_urn": polars.Categorical,
1214
+ "removed": polars.Boolean,
1215
+ "last_modified_at": polars.Int64,
1216
+ "siblings": polars.List(polars.String),
1217
+ "isView": polars.Boolean,
1218
+ }
1219
+
1220
+ soft_deleted_df = self.load_data_from_es_to_lf(
1221
+ schema=soft_deleted_schema,
1222
+ index=entity_index,
1223
+ query=QueryBuilder.get_dataset_entities_query(),
1224
+ process_function=self.soft_deleted_batch,
1203
1225
  )
1204
1226
 
1205
- lf: polars.LazyFrame = polars.LazyFrame(
1206
- self.load_data_from_es(
1207
- index=usage_index,
1208
- query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
1209
- process_function=self.process_dashboard_usage,
1210
- ),
1211
- schema={
1212
- "timestampMillis": polars.Int64,
1213
- "lastObserved": polars.Int64,
1214
- "urn": polars.Categorical,
1215
- "platform": polars.Categorical,
1216
- "eventGranularity": polars.String,
1217
- "partitionSpec": polars.Struct(
1227
+ dashboard_usage_schema = {
1228
+ "timestampMillis": polars.Int64,
1229
+ "lastObserved": polars.Int64,
1230
+ "urn": polars.Categorical,
1231
+ "platform": polars.Categorical,
1232
+ "eventGranularity": polars.String,
1233
+ "viewsCount": polars.Int64,
1234
+ "userCounts": polars.List(
1235
+ polars.Struct(
1218
1236
  {
1219
- "partition": polars.String,
1237
+ "usageCount": polars.Int64,
1238
+ "user": polars.String,
1220
1239
  }
1221
- ),
1222
- "viewsCount": polars.Int64,
1223
- "userCounts": polars.List(
1224
- polars.Struct(
1225
- {
1226
- "usageCount": polars.Int64,
1227
- "user": polars.String,
1228
- }
1229
- )
1230
- ),
1231
- },
1240
+ )
1241
+ ),
1242
+ }
1243
+
1244
+ lf = self.load_data_from_es_to_lf(
1245
+ schema=dashboard_usage_schema,
1246
+ index=usage_index,
1247
+ query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
1248
+ process_function=self.process_dashboard_usage,
1232
1249
  )
1233
1250
 
1234
1251
  lf = (
@@ -1301,48 +1318,41 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1301
1318
  def generate_query_usage(self) -> polars.LazyFrame:
1302
1319
  usage_index = "query_queryusagestatisticsaspect_v1"
1303
1320
  entity_index = "queryindex_v2"
1304
-
1305
- query_entities = polars.LazyFrame(
1306
- self.load_data_from_es(
1307
- index=entity_index,
1308
- query=QueryBuilder.get_query_entities_query(),
1309
- process_function=self.queries_entities_batch,
1310
- ),
1311
- schema={
1312
- "entity_urn": polars.Categorical,
1313
- "last_modified_at": polars.Int64,
1314
- "platform": polars.Categorical,
1315
- "removed": polars.Boolean,
1316
- },
1317
- strict=True,
1321
+ query_entities_schema = {
1322
+ "entity_urn": polars.Categorical,
1323
+ "last_modified_at": polars.Int64,
1324
+ "platform": polars.Categorical,
1325
+ "removed": polars.Boolean,
1326
+ }
1327
+
1328
+ query_entities = self.load_data_from_es_to_lf(
1329
+ schema=query_entities_schema,
1330
+ index=entity_index,
1331
+ query=QueryBuilder.get_query_entities_query(),
1332
+ process_function=self.queries_entities_batch,
1318
1333
  )
1319
1334
 
1320
- lf: polars.LazyFrame = polars.LazyFrame(
1321
- self.load_data_from_es(
1322
- index=usage_index,
1323
- query=QueryBuilder.get_query_usage_query(self.config.lookback_days),
1324
- process_function=self.process_query_usage,
1325
- ),
1326
- schema={
1327
- "timestampMillis": polars.Int64,
1328
- "lastObserved": polars.Int64,
1329
- "urn": polars.Categorical,
1330
- "eventGranularity": polars.String,
1331
- "partitionSpec": polars.Struct(
1335
+ query_usage_schema = {
1336
+ "timestampMillis": polars.Int64,
1337
+ "lastObserved": polars.Int64,
1338
+ "urn": polars.Categorical,
1339
+ "eventGranularity": polars.String,
1340
+ "queryCount": polars.Int64,
1341
+ "userCounts": polars.List(
1342
+ polars.Struct(
1332
1343
  {
1333
- "partition": polars.String,
1344
+ "usageCount": polars.Int64,
1345
+ "user": polars.String,
1334
1346
  }
1335
- ),
1336
- "queryCount": polars.Int64,
1337
- "userCounts": polars.List(
1338
- polars.Struct(
1339
- {
1340
- "usageCount": polars.Int64,
1341
- "user": polars.String,
1342
- }
1343
- )
1344
- ),
1345
- },
1347
+ )
1348
+ ),
1349
+ }
1350
+
1351
+ lf = self.load_data_from_es_to_lf(
1352
+ schema=query_usage_schema,
1353
+ index=usage_index,
1354
+ query=QueryBuilder.get_query_usage_query(self.config.lookback_days),
1355
+ process_function=self.process_query_usage,
1346
1356
  )
1347
1357
 
1348
1358
  lf = query_entities.join(
@@ -1380,36 +1390,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1380
1390
  if self.config.set_upstream_table_max_modification_time_for_views:
1381
1391
  datasets_lf = self.set_table_modification_time_for_views(datasets_lf)
1382
1392
 
1383
- index = "dataset_datasetusagestatisticsaspect_v1"
1384
- lf: polars.LazyFrame = polars.LazyFrame(
1385
- self.load_data_from_es(
1386
- index=index,
1387
- query=QueryBuilder.get_dataset_usage_query(self.config.lookback_days),
1388
- process_function=self.process_batch,
1389
- ),
1390
- schema={
1391
- "timestampMillis": polars.Int64,
1392
- "urn": polars.Categorical,
1393
- "platform": polars.Categorical,
1394
- "eventGranularity": polars.String,
1395
- "partitionSpec": polars.Struct(
1396
- {
1397
- "partition": polars.String,
1398
- }
1399
- ),
1400
- "totalSqlQueries": polars.Int64,
1401
- "uniqueUserCount": polars.Int64,
1402
- "userCounts": polars.List(
1403
- polars.Struct(
1404
- {
1405
- "count": polars.Int64,
1406
- "user": polars.String,
1407
- "userEmail": polars.String,
1408
- }
1409
- )
1410
- ),
1411
- },
1412
- )
1393
+ lf = self.load_dataset_usage()
1413
1394
 
1414
1395
  # Polaris/pandas join merges the join column into one column and that's why we need to filter based on the removed column
1415
1396
  lf = (
@@ -1472,23 +1453,101 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1472
1453
  )
1473
1454
  return usage_and_write_lf
1474
1455
 
1475
- def get_datasets(self) -> polars.LazyFrame:
1476
- datasets_df = polars.LazyFrame(
1477
- self.load_data_from_es(
1478
- index="datasetindex_v2",
1479
- query=QueryBuilder.get_soft_deleted_entities_query(),
1480
- process_function=self.soft_deleted_batch,
1456
+ def load_data_from_es_to_lf(
1457
+ self,
1458
+ index: str,
1459
+ schema: Dict,
1460
+ query: Dict,
1461
+ process_function: Callable,
1462
+ aggregation_key: Optional[str] = None,
1463
+ file_to_load: Optional[str] = None,
1464
+ ) -> polars.LazyFrame:
1465
+ data = self.load_data_from_es(
1466
+ index=index,
1467
+ query=query,
1468
+ process_function=process_function,
1469
+ aggregation_key=aggregation_key,
1470
+ )
1471
+
1472
+ if not self.config.streaming_mode:
1473
+ return polars.LazyFrame(data, schema)
1474
+ else:
1475
+ assert (
1476
+ self.temp_dir is not None
1477
+ ), "In Streaming mode temp dir should be set. Normally this should not happen..."
1478
+
1479
+ with tempfile.NamedTemporaryFile(
1480
+ delete=False,
1481
+ mode="wb",
1482
+ dir=self.temp_dir.name,
1483
+ prefix=f"{index}_",
1484
+ suffix=".parquet",
1485
+ ) as temp_file:
1486
+ tempfile_name = temp_file.name
1487
+ with pq.ParquetWriter(
1488
+ tempfile_name, self.polars_to_arrow_schema(schema)
1489
+ ) as writer:
1490
+ logger.debug(f"Creating temporary file {tempfile_name}")
1491
+
1492
+ self.batch_write_parquet(
1493
+ data,
1494
+ schema,
1495
+ temp_file.name,
1496
+ parquet_writer=writer,
1497
+ )
1498
+ # Scan parquet fails in some cases with
1499
+ # thread 'polars-1' panicked at crates/polars-parquet/src/arrow/read/deserialize/dictionary_encoded/required_masked_dense.rs:113:72:
1500
+ # called `Option::unwrap()` on a `None` value
1501
+ # Which only happens if we don't collect immediately
1502
+ # return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True).collect().lazy()
1503
+ return (
1504
+ polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
1505
+ .collect()
1506
+ .lazy()
1507
+ )
1508
+
1509
+ def load_dataset_usage(self) -> polars.LazyFrame:
1510
+ index = "dataset_datasetusagestatisticsaspect_v1"
1511
+ schema = {
1512
+ "timestampMillis": polars.Int64,
1513
+ "urn": polars.Categorical,
1514
+ "platform": polars.Categorical,
1515
+ "eventGranularity": polars.String,
1516
+ "totalSqlQueries": polars.Int64,
1517
+ "uniqueUserCount": polars.Int64,
1518
+ "userCounts": polars.List(
1519
+ polars.Struct(
1520
+ {
1521
+ "count": polars.Int64,
1522
+ "user": polars.String,
1523
+ "userEmail": polars.String,
1524
+ }
1525
+ )
1481
1526
  ),
1482
- schema={
1483
- "entity_urn": polars.Categorical,
1484
- "removed": bool,
1485
- "last_modified_at": polars.Int64,
1486
- "siblings": polars.List(polars.String),
1487
- "isView": polars.Boolean,
1488
- },
1489
- strict=True,
1527
+ }
1528
+
1529
+ return self.load_data_from_es_to_lf(
1530
+ schema=schema,
1531
+ index=index,
1532
+ query=QueryBuilder.get_dataset_usage_query(self.config.lookback_days),
1533
+ process_function=self.process_batch,
1534
+ )
1535
+
1536
+ def get_datasets(self) -> polars.LazyFrame:
1537
+ schema = {
1538
+ "entity_urn": polars.Categorical,
1539
+ "removed": polars.Boolean,
1540
+ "last_modified_at": polars.Int64,
1541
+ "siblings": polars.List(polars.String),
1542
+ "isView": polars.Boolean,
1543
+ }
1544
+
1545
+ return self.load_data_from_es_to_lf(
1546
+ schema=schema,
1547
+ index="datasetindex_v2",
1548
+ query=QueryBuilder.get_dataset_entities_query(),
1549
+ process_function=self.soft_deleted_batch,
1490
1550
  )
1491
- return datasets_df
1492
1551
 
1493
1552
  def generate_top_users(
1494
1553
  self, lf: polars.LazyFrame, count_field_name: str = "count"
@@ -1560,6 +1619,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1560
1619
  batch_size: int = 1000,
1561
1620
  delay: Optional[float] = None,
1562
1621
  ) -> Iterable[Dict[str, Any]]:
1622
+ processed_count = 0
1563
1623
  while True:
1564
1624
  with PerfTimer() as timer:
1565
1625
  logger.debug(f"ES query: {query}")
@@ -1581,8 +1641,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1581
1641
  yield from process_function(results["hits"]["hits"])
1582
1642
 
1583
1643
  time_taken = timer.elapsed_seconds()
1644
+ processed_count += len(results["hits"]["hits"])
1584
1645
  logger.info(
1585
- f"Processed {len(results['hits']['hits'''])} data from {index} index in {time_taken:.3f} seconds"
1646
+ f"Processed {len(results['hits']['hits'''])} data from {index} index in {time_taken:.3f} seconds. Total: {processed_count} processed."
1586
1647
  )
1587
1648
  if len(results["hits"]["hits"]) < batch_size:
1588
1649
  break
@@ -1609,9 +1670,3 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
1609
1670
 
1610
1671
  def get_report(self) -> SourceReport:
1611
1672
  return self.report
1612
-
1613
- def __del__(self) -> None:
1614
- for temp_file in self.temp_files_to_clean:
1615
- logger.info(f"Cleaning up temp file: {temp_file}")
1616
- os.remove(temp_file)
1617
- self.temp_files_to_clean = []