acryl-datahub-cloud 0.3.10rc4__py3-none-any.whl → 0.3.16.1rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published.

Potentially problematic release: this version of acryl-datahub-cloud has been flagged as possibly problematic.

Files changed (243)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py +1 -1
  3. acryl_datahub_cloud/acryl_cs_issues/models.py +5 -3
  4. acryl_datahub_cloud/action_request/action_request_owner_source.py +37 -8
  5. acryl_datahub_cloud/datahub_forms_notifications/__init__.py +0 -0
  6. acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +569 -0
  7. acryl_datahub_cloud/datahub_forms_notifications/get_feature_flag.gql +7 -0
  8. acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
  9. acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
  10. acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
  11. acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
  12. acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +39 -19
  13. acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +60 -25
  14. acryl_datahub_cloud/datahub_reporting/extract_graph.py +9 -3
  15. acryl_datahub_cloud/datahub_reporting/extract_sql.py +248 -52
  16. acryl_datahub_cloud/datahub_reporting/forms.py +1 -1
  17. acryl_datahub_cloud/datahub_reporting/forms_config.py +3 -2
  18. acryl_datahub_cloud/datahub_restore/source.py +3 -2
  19. acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
  20. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
  21. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +532 -109
  22. acryl_datahub_cloud/elasticsearch/graph_service.py +76 -14
  23. acryl_datahub_cloud/graphql_utils.py +64 -0
  24. acryl_datahub_cloud/lineage_features/source.py +555 -49
  25. acryl_datahub_cloud/metadata/_urns/urn_defs.py +2390 -1938
  26. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionworkflow/__init__.py +53 -0
  27. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/anomaly/__init__.py +2 -0
  28. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  29. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +6 -2
  30. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  31. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/conversation/__init__.py +29 -0
  32. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
  33. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
  34. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  35. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +8 -0
  36. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +8 -0
  37. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/knowledge/__init__.py +33 -0
  38. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  39. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +14 -0
  40. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
  41. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  42. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/monitor/__init__.py +6 -0
  43. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
  44. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  45. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  46. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  47. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +28 -0
  48. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  49. acryl_datahub_cloud/metadata/schema.avsc +27843 -23200
  50. acryl_datahub_cloud/metadata/schema_classes.py +29901 -24310
  51. acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +235 -2
  52. acryl_datahub_cloud/metadata/schemas/ActionWorkflowInfo.avsc +683 -0
  53. acryl_datahub_cloud/metadata/schemas/ActionWorkflowKey.avsc +21 -0
  54. acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
  55. acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
  56. acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +75 -0
  57. acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
  58. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +375 -212
  59. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +147 -20
  60. acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +1 -1
  61. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +191 -21
  62. acryl_datahub_cloud/metadata/schemas/{AssertionSummary.avsc → AssertionRunSummary.avsc} +15 -2
  63. acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -0
  64. acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
  65. acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +7 -3
  66. acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +20 -6
  67. acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
  68. acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
  69. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
  70. acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +16 -5
  71. acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
  72. acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +7 -3
  73. acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
  74. acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
  75. acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  76. acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +18 -2
  77. acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
  78. acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +4 -1
  79. acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +304 -2
  80. acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +86 -0
  81. acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +11 -5
  82. acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
  83. acryl_datahub_cloud/metadata/schemas/DataContractKey.avsc +2 -1
  84. acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +15 -5
  85. acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
  86. acryl_datahub_cloud/metadata/schemas/DataHubAiConversationInfo.avsc +256 -0
  87. acryl_datahub_cloud/metadata/schemas/DataHubAiConversationKey.avsc +22 -0
  88. acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +234 -0
  89. acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +22 -0
  90. acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  91. acryl_datahub_cloud/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  92. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  93. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +308 -0
  94. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  95. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  96. acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  97. acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +13 -4
  98. acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +8 -0
  99. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
  100. acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +3 -1
  101. acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
  102. acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
  103. acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +2 -0
  104. acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +6 -3
  105. acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +4 -2
  106. acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +5 -0
  107. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +10 -2
  108. acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +12 -5
  109. acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  110. acryl_datahub_cloud/metadata/schemas/DocumentInfo.avsc +407 -0
  111. acryl_datahub_cloud/metadata/schemas/DocumentKey.avsc +35 -0
  112. acryl_datahub_cloud/metadata/schemas/DocumentSettings.avsc +79 -0
  113. acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +2 -0
  114. acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +7 -3
  115. acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +2 -1
  116. acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +2 -1
  117. acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
  118. acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +2 -1
  119. acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +2 -1
  120. acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
  121. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
  122. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
  123. acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
  124. acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +2 -1
  125. acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +2 -1
  126. acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +4 -2
  127. acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +5 -0
  128. acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
  129. acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
  130. acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
  131. acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
  132. acryl_datahub_cloud/metadata/schemas/FormKey.avsc +3 -1
  133. acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
  134. acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +30 -0
  135. acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +416 -0
  136. acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +2 -1
  137. acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
  138. acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
  139. acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +3 -1
  140. acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +2 -0
  141. acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
  142. acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
  143. acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
  144. acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +71 -1
  145. acryl_datahub_cloud/metadata/schemas/InputFields.avsc +2 -1
  146. acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
  147. acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
  148. acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +145 -0
  149. acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +4 -1
  150. acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +4 -1
  151. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +7 -1
  152. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  153. acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +9 -1
  154. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +9 -1
  155. acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +4 -2
  156. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +4 -1
  157. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +424 -97
  158. acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +65 -44
  159. acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  160. acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +84 -29
  161. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +221 -23
  162. acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +9 -1
  163. acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +128 -3
  164. acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +5 -2
  165. acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
  166. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +91 -4
  167. acryl_datahub_cloud/metadata/schemas/Operation.avsc +17 -0
  168. acryl_datahub_cloud/metadata/schemas/Ownership.avsc +71 -1
  169. acryl_datahub_cloud/metadata/schemas/QueryProperties.avsc +4 -2
  170. acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +2 -13
  171. acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  172. acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +3 -1
  173. acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +3 -1
  174. acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +3 -0
  175. acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +2 -1
  176. acryl_datahub_cloud/metadata/schemas/SemanticContent.avsc +123 -0
  177. acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
  178. acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
  179. acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  180. acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +136 -5
  181. acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +2 -1
  182. acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +147 -0
  183. acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +3 -1
  184. acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +2 -1
  185. acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +9 -0
  186. acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +10 -0
  187. acryl_datahub_cloud/metadata/schemas/__init__.py +3 -3
  188. acryl_datahub_cloud/notifications/__init__.py +0 -0
  189. acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
  190. acryl_datahub_cloud/sdk/__init__.py +69 -0
  191. acryl_datahub_cloud/sdk/assertion/__init__.py +58 -0
  192. acryl_datahub_cloud/sdk/assertion/assertion_base.py +779 -0
  193. acryl_datahub_cloud/sdk/assertion/column_metric_assertion.py +191 -0
  194. acryl_datahub_cloud/sdk/assertion/column_value_assertion.py +431 -0
  195. acryl_datahub_cloud/sdk/assertion/freshness_assertion.py +201 -0
  196. acryl_datahub_cloud/sdk/assertion/schema_assertion.py +268 -0
  197. acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +212 -0
  198. acryl_datahub_cloud/sdk/assertion/smart_freshness_assertion.py +165 -0
  199. acryl_datahub_cloud/sdk/assertion/smart_sql_assertion.py +156 -0
  200. acryl_datahub_cloud/sdk/assertion/smart_volume_assertion.py +162 -0
  201. acryl_datahub_cloud/sdk/assertion/sql_assertion.py +273 -0
  202. acryl_datahub_cloud/sdk/assertion/types.py +20 -0
  203. acryl_datahub_cloud/sdk/assertion/volume_assertion.py +156 -0
  204. acryl_datahub_cloud/sdk/assertion_client/__init__.py +0 -0
  205. acryl_datahub_cloud/sdk/assertion_client/column_metric.py +545 -0
  206. acryl_datahub_cloud/sdk/assertion_client/column_value.py +617 -0
  207. acryl_datahub_cloud/sdk/assertion_client/freshness.py +371 -0
  208. acryl_datahub_cloud/sdk/assertion_client/helpers.py +166 -0
  209. acryl_datahub_cloud/sdk/assertion_client/schema.py +358 -0
  210. acryl_datahub_cloud/sdk/assertion_client/smart_column_metric.py +540 -0
  211. acryl_datahub_cloud/sdk/assertion_client/smart_freshness.py +373 -0
  212. acryl_datahub_cloud/sdk/assertion_client/smart_sql.py +411 -0
  213. acryl_datahub_cloud/sdk/assertion_client/smart_volume.py +380 -0
  214. acryl_datahub_cloud/sdk/assertion_client/sql.py +410 -0
  215. acryl_datahub_cloud/sdk/assertion_client/volume.py +446 -0
  216. acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
  217. acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +1470 -0
  218. acryl_datahub_cloud/sdk/assertion_input/column_assertion_constants.py +114 -0
  219. acryl_datahub_cloud/sdk/assertion_input/column_assertion_utils.py +284 -0
  220. acryl_datahub_cloud/sdk/assertion_input/column_metric_assertion_input.py +759 -0
  221. acryl_datahub_cloud/sdk/assertion_input/column_metric_constants.py +109 -0
  222. acryl_datahub_cloud/sdk/assertion_input/column_value_assertion_input.py +810 -0
  223. acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +305 -0
  224. acryl_datahub_cloud/sdk/assertion_input/schema_assertion_input.py +413 -0
  225. acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +793 -0
  226. acryl_datahub_cloud/sdk/assertion_input/smart_freshness_assertion_input.py +218 -0
  227. acryl_datahub_cloud/sdk/assertion_input/smart_sql_assertion_input.py +181 -0
  228. acryl_datahub_cloud/sdk/assertion_input/smart_volume_assertion_input.py +189 -0
  229. acryl_datahub_cloud/sdk/assertion_input/sql_assertion_input.py +320 -0
  230. acryl_datahub_cloud/sdk/assertion_input/volume_assertion_input.py +635 -0
  231. acryl_datahub_cloud/sdk/assertions_client.py +1074 -0
  232. acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
  233. acryl_datahub_cloud/sdk/entities/assertion.py +439 -0
  234. acryl_datahub_cloud/sdk/entities/monitor.py +291 -0
  235. acryl_datahub_cloud/sdk/entities/subscription.py +100 -0
  236. acryl_datahub_cloud/sdk/errors.py +34 -0
  237. acryl_datahub_cloud/sdk/resolver_client.py +42 -0
  238. acryl_datahub_cloud/sdk/subscription_client.py +737 -0
  239. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/METADATA +49 -43
  240. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/RECORD +243 -145
  241. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/WHEEL +1 -1
  242. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/entry_points.txt +1 -0
  243. {acryl_datahub_cloud-0.3.10rc4.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/top_level.txt +0 -0
acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py
@@ -22,12 +22,14 @@ from polars.datatypes import DataTypeClass
 from pydantic import Field
 from scipy.stats import expon
 
+from acryl_datahub_cloud.datahub_usage_reporting.excluded import EXCLUDED_PATTERNS
 from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder
 from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
     UsageFeaturePatchBuilder,
 )
 from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
 from acryl_datahub_cloud.metadata.schema_classes import (
+    CorpUserUsageFeaturesClass,
     QueryUsageFeaturesClass,
     UsageFeaturesClass,
 )
@@ -40,7 +42,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DatahubClientConfig
@@ -114,12 +116,12 @@ class DataHubUsageFeatureReportingSourceConfig(
         30, description="Timeout in seconds for the search queries."
     )
     extract_batch_size: int = Field(
-        1000,
+        5000,
         description="The number of documents to retrieve in each batch from ElasticSearch or OpenSearch.",
     )
 
     extract_delay: Optional[float] = Field(
-        0.25,
+        0,
         description="The delay in seconds between each batch extraction from ElasticSearch or OpenSearch.",
     )
 
@@ -135,6 +137,10 @@ class DataHubUsageFeatureReportingSourceConfig(
         None,
         description="Optional configuration for stateful ingestion, including stale metadata removal.",
     )
+    user_usage_enabled: bool = Field(
+        True,
+        description="Flag to enable or disable user usage statistics collection.",
+    )
     dataset_usage_enabled: bool = Field(
         True,
         description="Flag to enable or disable dataset usage statistics collection.",
@@ -177,7 +183,7 @@ class DataHubUsageFeatureReportingSourceConfig(
     # This option is only needed here until we are sure that the streaming mode is stable.
     # then we can remove it and control it with the streaming_mode option.
     experimental_full_streaming: bool = Field(
-        False,
+        True,
         description="Flag to enable full streaming mode.'",
     )
 
@@ -191,6 +197,11 @@ class DataHubUsageFeatureReportingSourceConfig(
         description="Flag to generate MCP patch for usage features.'",
     )
 
+    excluded_platforms: List[str] = Field(
+        EXCLUDED_PATTERNS,
+        description="List of platforms to exclude from usage statistics collection. This is done to avoid invite user functionality to be filled with service accounts.",
+    )
+
 
 def exp_cdf(series: polars.Series) -> polars.Series:
     with PerfTimer() as timer:
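Several ingestion defaults change in this release: extract_batch_size goes from 1000 to 5000, extract_delay from 0.25 to 0, experimental_full_streaming from False to True, and the new user_usage_enabled and excluded_platforms fields appear. A minimal, hypothetical sketch of the overrides (as a plain Python dict) that would restore the old batching behaviour; field names come from the hunks above, and whether other config fields are required depends on the installed version:

    # Hypothetical overrides for DataHubUsageFeatureReportingSourceConfig;
    # the field names are taken from this diff, the values are illustrative.
    source_config_overrides = {
        "extract_batch_size": 1000,  # new default is 5000
        "extract_delay": 0.25,  # new default is 0 (no pause between batches)
        "experimental_full_streaming": False,  # new default is True
        "user_usage_enabled": True,  # new flag in this release
        # "excluded_platforms" defaults to EXCLUDED_PATTERNS from
        # datahub_usage_reporting/excluded.py (patterns matched against user ids)
    }
    print(source_config_overrides)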
@@ -228,7 +239,7 @@ def exp_cdf(series: polars.Series) -> polars.Series:
 
 
 @dataclass
-class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
+class DatahubUsageFeatureReport(StatefulIngestionReport, IngestionStageReport):
     dataset_platforms_count: Dict[str, int] = field(
         default_factory=lambda: defaultdict(lambda: 0)
     )
@@ -241,10 +252,6 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
         default_factory=lambda: defaultdict(lambda: PerfTimer())
     )
 
-    dataset_usage_processing_time: PerfTimer = PerfTimer()
-    dashboard_usage_processing_time: PerfTimer = PerfTimer()
-    chart_usage_processing_time: PerfTimer = PerfTimer()
-    query_usage_processing_time: PerfTimer = PerfTimer()
     query_platforms_count: Dict[str, int] = field(
         default_factory=lambda: defaultdict(lambda: 0)
     )
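The dedicated per-aspect PerfTimer fields are dropped because stage timing now flows through the report's new_stage context manager, as the get_workunits_internal hunk further down shows. A self-contained stand-in sketch of that pattern (not DataHub's actual IngestionStageReport implementation):

    import time
    from contextlib import contextmanager

    @contextmanager
    def new_stage(name: str):
        # Stand-in for report.new_stage(...): time the enclosed block and
        # report the duration when the stage exits.
        start = time.perf_counter()
        try:
            yield
        finally:
            print(f"{name} took {time.perf_counter() - start:.3f} seconds")

    with new_stage("generate dataset usage"):
        time.sleep(0.1)  # placeholder for yielding the stage's workunits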
@@ -395,18 +402,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 "last_modified_at": (
                     doc["_source"]["lastModifiedAt"]
                     if "lastModifiedAt" in doc["_source"]
-                    else (
-                        doc["_source"]["lastModifiedAt"]
-                        if "lastModifiedAt" in doc["_source"]
-                        else None
-                    )
+                    else (doc["_source"].get("lastModifiedAt", None))
                 ),
                 "platform": doc["_source"]["platform"],
-                "removed": (
-                    doc["_source"]["removed"]
-                    if "removed" in doc["_source"]
-                    else False
-                ),
+                "removed": (doc["_source"].get("removed", False)),
             }
 
         time_taken = timer.elapsed_seconds()
@@ -509,11 +508,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 "eventGranularity": doc["_source"].get("eventGranularity"),
                 "totalSqlQueries": doc["_source"].get("totalSqlQueries", 0),
                 "uniqueUserCount": doc["_source"].get("uniqueUserCount", 0),
-                "userCounts": (
-                    doc["_source"]["event"]["userCounts"]
-                    if "userCounts" in doc["_source"]["event"]
-                    else None
-                ),
+                "userCounts": (doc["_source"]["event"].get("userCounts", None)),
                 "platform": platform,
             }
         except KeyError as e:
@@ -525,7 +520,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         time_taken = timer.elapsed_seconds()
         logger.info(f"DatasetUsage processing took {time_taken:.3f} seconds")
 
-    def search_score(
+    def search_score(  # noqa: C901
         self, urn: str, last_update_time: int, usage_percentile: int
     ) -> SearchRankingMultipliers:
         usage_search_score_multiplier = 1.0
@@ -622,27 +617,27 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 [endpoint],
                 http_auth=(user, password),
                 use_ssl=(
-                    True
-                    if self.config.search_index
-                    and self.config.search_index.use_ssl
-                    else False
+                    bool(
+                        self.config.search_index
+                        and self.config.search_index.use_ssl
+                    )
                 ),
             )
 
-            response = server.create_pit(index, keep_alive="10m")
+            # response = server.create_pit(index, keep_alive="10m")
 
             # TODO: Save PIT, we can resume processing based on <pit, search_after> tuple
-            pit = response.get("pit_id")
-            query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
+            # pit = response.get("pit_id")
+            # query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
         else:
             server = Elasticsearch(
                 [endpoint],
                 http_auth=(user, password),
                 use_ssl=(
-                    True
-                    if self.config.search_index
-                    and self.config.search_index.use_ssl
-                    else False
+                    bool(
+                        self.config.search_index
+                        and self.config.search_index.use_ssl
+                    )
                 ),
             )
 
@@ -737,23 +732,26 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 polars.Duration(): pa.duration("ns"),
             }
 
-            if polars_dtype in [type(key) for key in type_mapping.keys()]:
+            if polars_dtype in [type(key) for key in type_mapping]:
                 return type_mapping[polars_dtype]
             elif polars_dtype == polars.Categorical:
                 return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
             elif isinstance(polars_dtype, polars.Struct):
                 return pa.struct(
-                    {
-                        field.name: convert_dtype(field.dtype)
+                    [
+                        pa.field(field.name, convert_dtype(field.dtype))
                         for field in polars_dtype.fields
-                    }
+                    ]
                 )
             elif isinstance(polars_dtype, polars.List):
                 return pa.list_(convert_dtype(polars_dtype.inner))
             else:
                 raise ValueError(f"Unsupported Polars dtype: {polars_dtype}")
 
-        fields = [(name, convert_dtype(dtype)) for name, dtype in polars_schema.items()]
+        fields = [
+            pa.field(name, convert_dtype(dtype))
+            for name, dtype in polars_schema.items()
+        ]
         return pa.schema(fields)
 
     def batch_write_parquet(
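The hunk above changes the Polars-to-Arrow schema conversion to build struct types from explicit pa.field objects instead of a plain dict. A standalone sketch of that construction (not the package's convert_dtype, and only handling two dtypes):

    import polars
    import pyarrow as pa

    inner = polars.Struct({"a": polars.Int64, "b": polars.Utf8})
    # Build the Arrow struct from pa.field(...) entries, mirroring the fix above.
    arrow_struct = pa.struct(
        [
            pa.field(f.name, pa.int64() if f.dtype == polars.Int64 else pa.string())
            for f in inner.fields
        ]
    )
    print(arrow_struct)  # struct<a: int64, b: string>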
@@ -846,7 +844,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .drop(["removed"])
         )
 
-        return wdf.collect(streaming=self.config.streaming_mode).lazy()
+        return wdf
 
     def load_write_usage_server_side_aggregation(
         self, soft_deleted_entities_df: polars.LazyFrame
@@ -935,6 +933,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
         return dataset_df
 
+    def generate_user_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
+        with polars.StringCache():
+            user_usage_lf = self.generate_user_usage()
+            yield from self.generate_user_usage_mcp_from_lazyframe(user_usage_lf)
+
     def generate_dataset_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
         with polars.StringCache():
             dataset_usage_df = self.generate_dataset_usage()
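The new generate_user_usage_mcps, like its siblings, runs under polars.StringCache(). A minimal standalone illustration of why: Categorical columns built under one shared string cache can be combined safely, which is not guaranteed across separate caches:

    import polars

    with polars.StringCache():
        left = polars.DataFrame({"urn": ["a", "b"]}).with_columns(
            polars.col("urn").cast(polars.Categorical)
        )
        right = polars.DataFrame({"urn": ["b"], "count": [1]}).with_columns(
            polars.col("urn").cast(polars.Categorical)
        )
        # Both frames share one string cache, so the categorical join is valid.
        print(left.join(right, on="urn", how="left"))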
@@ -970,48 +973,35 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if self.config.user_usage_enabled:
+            with self.report.new_stage("generate user usage"):
+                yield from self.generate_user_usage_mcps()
+
         if self.config.dataset_usage_enabled:
-            with self.report.dataset_usage_processing_time as timer:
-                self.report.new_stage("generate dataset usage")
+            with self.report.new_stage("generate dataset usage"):
                 yield from self.generate_dataset_usage_mcps()
-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Dataset Usage generation took {time_taken:.3f} seconds")
 
         if self.config.dashboard_usage_enabled:
-            with self.report.dashboard_usage_processing_time as timer:
-                self.report.new_stage("generate dashboard usage")
+            with self.report.new_stage("generate dashboard usage"):
                 yield from self.generate_dashboard_usage_mcps()
 
-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Dashboard Usage generation took {time_taken:.3f}")
-
         if self.config.chart_usage_enabled:
-            with self.report.chart_usage_processing_time as timer:
-                self.report.new_stage("generate chart usage")
-
+            with self.report.new_stage("generate chart usage"):
                 yield from self.generate_chart_usage_mcps()
 
-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Chart Usage generation took {time_taken:.3f}")
-
         if self.config.query_usage_enabled:
-            with self.report.query_usage_processing_time as timer:
-                self.report.new_stage("generate query usage")
-
+            with self.report.new_stage("generate query usage"):
                 yield from self.generate_query_usage_mcps()
 
-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Query Usage generation took {time_taken:.3f}")
+        with self.report.new_stage("end so time is calculated for last stage"):
+            pass
 
     def generate_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-        num = 0
         for row in lazy_frame.collect(
-            streaming=self.config.experimental_full_streaming
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
         ).to_struct():
-            num += 1
-
             if "siblings" in row and row["siblings"]:
                 logger.info(f"Siblings found for urn: {row['urn']} -> row['siblings']")
 
@@ -1067,7 +1057,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 uniqueUserPercentileLast30Days=int(
                     row.get("distinct_user_rank_percentile", 0) or 0
                 ),
-                writeCountLast30Days=int(row.get("write_rank_percentile", 0) or 0)
+                writeCountLast30Days=int(row.get("write_count", 0) or 0)
                 if not self.config.disable_write_usage
                 else None,
                 writeCountPercentileLast30Days=int(
@@ -1101,10 +1091,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_query_usage_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-        num = 0
-        for row in lazy_frame.collect().iter_rows(named=True):
-            num += 1
-
+        for row in lazy_frame.collect(
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
+        ).iter_rows(named=True):
             query_usage_features = QueryUsageFeaturesClass(
                 queryCountLast30Days=int(row.get("totalSqlQueries", 0) or 0),
                 queryCountTotal=None,  # This is not implemented
@@ -1124,6 +1113,47 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 row["urn"], query_usage_features
             )
 
+    def _convert_platform_pairs_to_dict(
+        self,
+        platform_pairs: Optional[List[Dict[str, Any]]],
+        value_key: str = "platform_total",
+    ) -> Optional[Dict[str, Any]]:
+        """Convert list of platform usage structs to dictionary."""
+        if not platform_pairs:
+            return None
+
+        return {
+            pair["platform_urn"]: pair[value_key]
+            for pair in platform_pairs
+            if pair["platform_urn"] is not None
+        }
+
+    def generate_user_usage_mcp_from_lazyframe(
+        self, lazy_frame: polars.LazyFrame
+    ) -> Iterable[MetadataWorkUnit]:
+        for row in lazy_frame.collect(
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
+        ).iter_rows(named=True):
+            user_usage_features = CorpUserUsageFeaturesClass(
+                userUsageTotalPast30Days=int(
+                    row.get("userUsageTotalPast30Days", 0) or 0
+                ),
+                userPlatformUsageTotalsPast30Days=self._convert_platform_pairs_to_dict(
+                    row.get("platform_usage_pairs", [])
+                ),
+                userPlatformUsagePercentilePast30Days=self._convert_platform_pairs_to_dict(
+                    row.get("platform_usage_percentiles", []),
+                    "platform_rank_percentile",
+                ),
+                userUsagePercentilePast30Days=row.get("userUsagePercentilePast30Days"),
+                userTopDatasetsByUsage=self._convert_top_datasets_to_dict(
+                    row.get("top_datasets_map", [])
+                ),
+            )
+            yield MetadataChangeProposalWrapper(
+                entityUrn=row["user"], aspect=user_usage_features
+            ).as_workunit(is_primary_source=False)
+
     def generate_usage_feature_mcp(
         self, urn: str, usage_feature: UsageFeaturesClass
     ) -> Iterable[MetadataWorkUnit]:
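The new generate_user_usage_mcp_from_lazyframe collects with engine="streaming" (the replacement for the older streaming= keyword in recent Polars releases) and flattens list-of-struct columns into the plain dicts that CorpUserUsageFeaturesClass expects. A standalone replica of the helper's logic, to show the assumed row shape:

    from typing import Any, Dict, List, Optional

    def convert_platform_pairs_to_dict(
        platform_pairs: Optional[List[Dict[str, Any]]],
        value_key: str = "platform_total",
    ) -> Optional[Dict[str, Any]]:
        # Replica of _convert_platform_pairs_to_dict from the hunk above.
        if not platform_pairs:
            return None
        return {
            pair["platform_urn"]: pair[value_key]
            for pair in platform_pairs
            if pair["platform_urn"] is not None
        }

    pairs = [
        {"platform_urn": "urn:li:dataPlatform:snowflake", "platform_total": 42.0},
        {"platform_urn": None, "platform_total": 3.0},  # dropped: no platform urn
    ]
    print(convert_platform_pairs_to_dict(pairs))
    # {'urn:li:dataPlatform:snowflake': 42.0}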
@@ -1158,9 +1188,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
         return self.generate_dashboard_chart_usage(entity_index, usage_index)
 
-    def generate_dashboard_chart_usage(
-        self, entity_index: str, usage_index: str
-    ) -> polars.LazyFrame:
+    def _generate_dashboard_chart_entities(self, entity_index: str) -> polars.LazyFrame:
         entity_schema = {
             "entity_urn": polars.Categorical,
             "removed": polars.Boolean,
@@ -1177,7 +1205,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             process_function=self.soft_deleted_batch,
         )
 
-        dashboard_usage_schema = {
+        return entities_df
+
+    def _generate_dashboard_chart_usage(
+        self, entities_df: polars.LazyFrame, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_usage_schema = {
             "timestampMillis": polars.Int64,
             "lastObserved": polars.Int64,
             "urn": polars.Categorical,
@@ -1195,7 +1228,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         }
 
         lf = self.load_data_from_es_to_lf(
-            schema=dashboard_usage_schema,
+            schema=entities_usage_schema,
             index=usage_index,
             query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
             process_function=self.process_dashboard_usage,
@@ -1214,6 +1247,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .alias("row_num")
         ).filter(polars.col("row_num") == 1)
 
+        return lf
+
+    def generate_dashboard_chart_usage(
+        self, entity_index: str, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_df = self._generate_dashboard_chart_entities(entity_index)
+
+        lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
+
         # lf = lf.filter(polars.col("urn") == "urn:li:dashboard:(looker,dashboards.8)")
         # "urn:li:dashboard:(looker,dashboards.8)"
 
@@ -1287,7 +1329,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .is_not_null()
             # We only want to downrank datasets that have a search score multiplier greater than 1. 1 is the minimum score of a dataset
             .and_(polars.col("combinedSearchRankingMultiplier").ne(1))
-        )  # noqa: E712
+        )
         .filter(polars.col("removed") == False)  # noqa: E712
         .drop(["removed"])
         .drop(["last_modified_at"])
@@ -1326,7 +1368,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         query_entities = self.load_data_from_es_to_lf(
             schema=query_entities_schema,
             index=entity_index,
-            query=QueryBuilder.get_query_entities_query(),
+            query=QueryBuilder.get_query_entities_query(self.config.lookback_days),
             process_function=self.queries_entities_batch,
         )
 
@@ -1383,6 +1425,380 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
         return usage_with_top_users_with_ranks
 
+    def _generate_user_usage_for_dataset(self) -> polars.LazyFrame:
+        datasets_lf = self.get_datasets()
+        if self.config.set_upstream_table_max_modification_time_for_views:
+            datasets_lf = self.set_table_modification_time_for_views(datasets_lf)
+
+        lf = self.load_dataset_usage()
+
+        # Polaris/pandas join merges the join column into one column and that's why we need to filter based on the removed column
+        lf = (
+            lf.join(datasets_lf, left_on="urn", right_on="entity_urn", how="left")
+            .filter(polars.col("removed") == False)  # noqa: E712
+            .drop(["removed"])
+        )
+
+        users_lf = (
+            lf.explode("userCounts")
+            .unnest("userCounts")
+            .filter(polars.col("user").is_not_null())
+        )
+
+        user_dataset_usage_lf = self._create_user_dataset_usage_map(users_lf)
+        return user_dataset_usage_lf
+
+    @staticmethod
+    def _convert_top_datasets_to_dict(
+        top_datasets_list: Optional[List[Dict[str, Any]]],
+    ) -> Optional[Dict[str, float]]:
+        """
+        Convert list of top datasets structs to dictionary as expected by CorpUserUsageFeatures schema.
+
+        Args:
+            top_datasets_list: List of dictionaries with 'dataset_urn' and 'count' keys
+
+        Returns:
+            Dictionary mapping dataset URN to usage count, or None if input is empty
+        """
+        if not top_datasets_list:
+            return None
+
+        top_datasets_dict = {
+            item["dataset_urn"]: float(item["count"])
+            for item in top_datasets_list
+            if isinstance(item, dict) and "dataset_urn" in item and "count" in item
+        }
+
+        return top_datasets_dict if top_datasets_dict else None
+
+    def _create_user_dataset_usage_map(
+        self, users_lf: polars.LazyFrame, top_n: int = 25
+    ) -> polars.LazyFrame:
+        """
+        Creates a lazyframe with user string and map of top N datasets by usage.
+
+        Args:
+            users_lf: LazyFrame containing user usage data with columns: user, urn, platform, count
+            top_n: Number of top datasets to include per user (default: 25)
+
+        Returns:
+            LazyFrame with columns:
+            - user: string column containing the user identifier
+            - top_datasets_map: list of structs with dataset_urn (string), count (int), and platform_urn (string)
+            - userUsageTotalPast30Days: total usage count for the user across all datasets
+            - userPlatformUsageTotalsPast30Days: map from platform URN to usage totals
+        """
+
+        # Create intermediate lazy frame with filtered users and aggregated counts
+        user_dataset_aggregated = (
+            users_lf.filter(polars.col("user").str.contains("@"))
+            .group_by("user", "urn", "platform")
+            .agg(polars.col("count").sum().alias("total_count"))
+            .with_columns(
+                # Direct string formatting - vectorized operation
+                polars.format("urn:li:dataPlatform:{}", polars.col("platform")).alias(
+                    "platform_urn"
+                )
+            )
+        )
+
+        # Calculate user totals
+        user_totals = user_dataset_aggregated.group_by("user").agg(
+            polars.col("total_count").sum().alias("userUsageTotalPast30Days")
+        )
+
+        # Calculate platform totals for each user - keep as list of structs
+        platform_totals = (
+            user_dataset_aggregated.group_by("user", "platform_urn")
+            .agg(polars.col("total_count").sum().alias("platform_total"))
+            .filter(polars.col("platform_urn").is_not_null())
+            .group_by("user")
+            .agg(
+                polars.struct(
+                    [
+                        polars.col("platform_urn"),
+                        polars.col("platform_total").cast(polars.Float64),
+                    ]
+                ).alias("platform_usage_pairs")
+            )
+        )
+
+        # Calculate top datasets
+        top_datasets = (
+            user_dataset_aggregated.with_columns(
+                polars.col("total_count")
+                .rank(descending=True, method="ordinal")
+                .over("user")
+                .alias("dataset_rank")
+            )
+            .filter(polars.col("dataset_rank") <= top_n)
+            .group_by("user")
+            .agg(
+                polars.struct(
+                    [
+                        polars.col("urn").alias("dataset_urn"),
+                        polars.col("total_count").alias("count"),
+                        polars.col("platform_urn"),
+                    ]
+                )
+                .sort_by("total_count", descending=True)
+                .alias("top_datasets_map")
+            )
+        )
+
+        # Join all results
+        return top_datasets.join(user_totals, on="user", how="left").join(
+            platform_totals, on="user", how="left"
+        )
+
+    def _combine_user_usage_data(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """
+        Combines user usage data from dataset, dashboard, and chart sources.
+
+        Args:
+            dataset_usage_lf: LazyFrame with dataset usage data containing top_datasets_map
+            dashboard_usage_lf: LazyFrame with dashboard usage data
+            chart_usage_lf: LazyFrame with chart usage data
+
+        Returns:
+            Combined LazyFrame with aggregated usage data per user
+        """
+        user_totals = self._combine_user_totals(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        platform_pairs = self._combine_platform_pairs(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        result = user_totals.join(platform_pairs, on="user", how="left")
+
+        return result.with_columns(
+            polars.col("platform_usage_pairs").fill_null(polars.lit([]))
+        )
+
+    def _filter_users(self, users_lf: polars.LazyFrame) -> polars.LazyFrame:
+        filter_condition = polars.col("user").str.contains("@")
+        for pattern in self.config.excluded_platforms:
+            filter_condition = filter_condition & ~polars.col("user").str.contains(
+                pattern
+            )
+
+        return users_lf.filter(filter_condition)
+
+    def _combine_user_totals(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """Combine user totals and top_datasets_map from all sources."""
+        # Collect all unique users in one operation
+        all_users_lf = (
+            polars.concat(
+                [
+                    dataset_usage_lf.select("user"),
+                    dashboard_usage_lf.select("user"),
+                    chart_usage_lf.select("user"),
+                ]
+            )
+            .unique()
+            .pipe(self._filter_users)
+        )
+
+        return (
+            all_users_lf.join(
+                dataset_usage_lf.select(
+                    ["user", "top_datasets_map", "userUsageTotalPast30Days"]
+                ),
+                on="user",
+                how="left",
+            )
+            .join(
+                dashboard_usage_lf.select(["user", "userUsageTotalPast30Days"]),
+                on="user",
+                how="left",
+                suffix="_dashboard",
+            )
+            .join(
+                chart_usage_lf.select(["user", "userUsageTotalPast30Days"]),
+                on="user",
+                how="left",
+                suffix="_chart",
+            )
+            .with_columns(
+                [
+                    # Sum with explicit null handling
+                    (
+                        polars.col("userUsageTotalPast30Days").fill_null(0)
+                        + polars.col("userUsageTotalPast30Days_dashboard").fill_null(0)
+                        + polars.col("userUsageTotalPast30Days_chart").fill_null(0)
+                    ).alias("userUsageTotalPast30Days")
+                ]
+            )
+            .select(["user", "top_datasets_map", "userUsageTotalPast30Days"])
+        )
+
+    def _combine_platform_pairs(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """Combine platform usage pairs from all sources."""
+        all_platforms = []
+
+        # Extract platforms from each source
+        for source_lf, col_name in [
+            (dataset_usage_lf, "platform_usage_pairs"),
+            (dashboard_usage_lf, "platform_usage_pairs"),
+            (chart_usage_lf, "platform_usage_pairs"),
+        ]:
+            platforms = self._extract_platforms_from_source(source_lf, col_name)
+            if platforms is not None:
+                all_platforms.append(platforms)
+
+        if not all_platforms:
+            # Return empty result if no platforms found
+            return polars.LazyFrame({"user": [], "platform_usage_pairs": []})
+
+        # Combine all platforms and aggregate by user + platform
+        combined_platforms = polars.concat(all_platforms, how="vertical_relaxed")
+        aggregated = combined_platforms.group_by("user", "platform_urn").agg(
+            polars.col("platform_total").sum().alias("platform_total")
+        )
+
+        # Rebuild platform_usage_pairs structure
+        return aggregated.group_by("user").agg(
+            polars.struct(
+                [polars.col("platform_urn"), polars.col("platform_total")]
+            ).alias("platform_usage_pairs")
+        )
+
+    def _extract_platforms_from_source(
+        self, source_lf: polars.LazyFrame, col_name: str
+    ) -> polars.LazyFrame | None:
+        """Extract platform data from a source LazyFrame."""
+        try:
+            return (
+                source_lf.select(["user", col_name])
+                .filter(polars.col(col_name).is_not_null())
+                .filter(polars.col(col_name).list.len() > 0)
+                .explode(col_name)
+                .unnest(col_name)
+                .filter(polars.col("platform_urn").is_not_null())
+                .select(["user", "platform_urn", "platform_total"])
+            )
+        except polars.exceptions.ColumnNotFoundError:
+            return None
+
+    def add_platform_usage_percentiles(
+        self, user_usage_lf: polars.LazyFrame
+    ) -> polars.LazyFrame:
+        """
+        Add platform usage percentiles to user usage data.
+
+        Args:
+            user_usage_lf: LazyFrame with user usage data containing platform_usage_pairs column
+
+        Returns:
+            LazyFrame with additional platform_usage_percentiles column
+        """
+        # First explode the platform_usage_pairs to work with individual platform usage records
+        platform_usage_exploded = (
+            user_usage_lf.explode("platform_usage_pairs")
+            .unnest("platform_usage_pairs")
+            .filter(polars.col("platform_urn").is_not_null())
+        )
+
+        # Use the existing gen_rank_and_percentile method to calculate percentiles
+        platform_percentiles_with_ranks = self.gen_rank_and_percentile(
+            lf=platform_usage_exploded,
+            count_field="platform_total",
+            urn_field="user",
+            platform_field="platform_urn",
+            prefix="platform_",
+            use_exp_cdf=False,
+        )
+
+        # Group back by user and create the percentiles structure
+        platform_percentiles = platform_percentiles_with_ranks.group_by("user").agg(
+            polars.struct(
+                [
+                    polars.col("platform_urn"),
+                    polars.col("platform_rank_percentile").cast(polars.Float64),
+                ]
+            ).alias("platform_usage_percentiles")
+        )
+
+        # Join the percentiles back to the original user_usage_lf
+        return user_usage_lf.join(platform_percentiles, on="user", how="left")
+
+    def _generate_user_usage_for_dashboard_charts(
+        self, entity_index: str, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_df = self._generate_dashboard_chart_entities(entity_index)
+        lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
+
+        # Process dashboard usage data into user usage format (similar to dataset version)
+        users_lf = (
+            lf.explode("userCounts")
+            .unnest("userCounts")
+            .filter(polars.col("user").is_not_null())
+            .rename({"usageCount": "count"})  # Rename to match dataset schema
+        )
+
+        user_dashboard_usage_lf = self._create_user_dataset_usage_map(users_lf)
+        return user_dashboard_usage_lf
+
+    def generate_user_usage(self) -> polars.LazyFrame:
+        dataset_usage_lf = self._generate_user_usage_for_dataset()
+
+        usage_index = "dashboard_dashboardusagestatisticsaspect_v1"
+        entity_index = "dashboardindex_v2"
+        dashboard_usage_lf = self._generate_user_usage_for_dashboard_charts(
+            entity_index, usage_index
+        )
+
+        entity_index = "chartindex_v2"
+        usage_index = "chart_chartusagestatisticsaspect_v1"
+        chart_usage_lf = self._generate_user_usage_for_dashboard_charts(
+            entity_index, usage_index
+        )
+
+        # Combine all three usage sources
+        lf = self._combine_user_usage_data(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        lf = self.add_platform_usage_percentiles(lf)
+
+        # Add user usage percentiles across all users (not grouped by platform)
+        # Create a temporary platform field for percentile calculation
+        lf = lf.with_columns(polars.lit("all_users").alias("temp_platform"))
+
+        lf = self.gen_rank_and_percentile(
+            lf=lf,
+            count_field="userUsageTotalPast30Days",
+            urn_field="user",
+            platform_field="temp_platform",
+            prefix="userUsage",
+            use_exp_cdf=False,
+        )
+
+        # Rename the percentile column to match the schema field name and remove temp field
+        lf = lf.rename(
+            {"userUsagerank_percentile": "userUsagePercentilePast30Days"}
+        ).drop("temp_platform")
+
+        return lf
+
     def generate_dataset_usage(self) -> polars.LazyFrame:
         datasets_lf = self.get_datasets()
         if self.config.set_upstream_table_max_modification_time_for_views:
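The core of _create_user_dataset_usage_map above is a top-N-per-user selection: rank the usage counts within each user's window, then keep only the best-ranked rows. A minimal standalone sketch of that pattern with made-up data:

    import polars

    lf = polars.LazyFrame(
        {
            "user": ["u1", "u1", "u1", "u2"],
            "urn": ["d1", "d2", "d3", "d1"],
            "total_count": [10, 30, 20, 5],
        }
    )
    top2 = (
        lf.with_columns(
            polars.col("total_count")
            .rank(descending=True, method="ordinal")
            .over("user")  # rank within each user's datasets
            .alias("dataset_rank")
        )
        .filter(polars.col("dataset_rank") <= 2)  # keep each user's top 2
        .collect()
    )
    print(top2)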
@@ -1503,11 +1919,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             # called `Option::unwrap()` on a `None` value
             # Which only happens if we don't collect immediately
             # return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True).collect().lazy()
-            return (
-                polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
-                .collect()
-                .lazy()
-            )
+            return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
 
     def load_dataset_usage(self) -> polars.LazyFrame:
         index = "dataset_datasetusagestatisticsaspect_v1"
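The hunk above stops force-collecting the parquet scan and returns the LazyFrame directly, so downstream filters can be pushed into the scan instead of materializing the whole file up front. A small self-contained sketch of the deferred pattern:

    import tempfile

    import polars

    with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp:
        polars.DataFrame({"urn": ["a", "b"], "count": [1, 2]}).write_parquet(tmp.name)
        lf = polars.scan_parquet(tmp.name, low_memory=True)  # no IO yet
        # The filter is pushed down into the parquet scan at collect time.
        print(lf.filter(polars.col("count") > 1).collect())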
@@ -1624,23 +2036,40 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         delay: Optional[float] = None,
     ) -> Iterable[Dict[str, Any]]:
         processed_count = 0
+        scroll_id = None
         while True:
             with PerfTimer() as timer:
                 logger.debug(f"ES query: {query}")
-                results = server.search(
-                    body=query,
-                    size=batch_size,
-                    index=(
-                        index
-                        if not self.config.search_index.opensearch_dialect
-                        else None
-                    ),
-                    params=(
-                        {"timeout": self.config.query_timeout}
-                        if self.config.search_index.opensearch_dialect
-                        else {"request_timeout": self.config.query_timeout}
-                    ),
-                )
+                if not scroll_id:
+                    logger.debug(
+                        f"Getting inital data from index {index} without scroll id"
+                    )
+                    results = server.search(
+                        body=query,
+                        size=batch_size,
+                        scroll="2m",
+                        index=index,
+                        params=(
+                            {"timeout": self.config.query_timeout}
+                            if self.config.search_index.opensearch_dialect
+                            else {"request_timeout": self.config.query_timeout}
+                        ),
+                    )
+                else:
+                    logger.debug(
+                        f"Getting data from index {index} using scroll_id: {scroll_id}"
+                    )
+                    results = server.scroll(
+                        scroll_id=scroll_id,
+                        scroll="2m",
+                        params=(
+                            {"timeout": self.config.query_timeout}
+                            if self.config.search_index.opensearch_dialect
+                            else {"request_timeout": self.config.query_timeout}
+                        ),
+                    )
+                scroll_id = results["_scroll_id"]
+
             if not aggregation_key:
                 yield from process_function(results["hits"]["hits"])
 
@@ -1651,7 +2080,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 )
                 if len(results["hits"]["hits"]) < batch_size:
                     break
-                query.update({"search_after": results["hits"]["hits"][-1]["sort"]})
             else:
                 yield from process_function(
                     results["aggregations"][aggregation_key]["buckets"]
@@ -1661,16 +2089,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                     < batch_size
                 ):
                     break
-                if "after_key" in results["aggregations"][aggregation_key]:
-                    query["aggs"][aggregation_key]["composite"]["after"] = results[
-                        "aggregations"
-                    ][aggregation_key]["after_key"]
-
-            if delay:
-                logger.debug(
-                    f"Sleeping for {delay} seconds before getting next batch from ES"
-                )
-                time.sleep(delay)
+            if delay:
+                logger.debug(
+                    f"Sleeping for {delay} seconds before getting next batch from ES"
+                )
+                time.sleep(delay)
 
-    def get_report(self) -> SourceReport:
+    def get_report(self) -> "DatahubUsageFeatureReport":
         return self.report
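Pagination over Elasticsearch/OpenSearch moves from search_after (and the commented-out point-in-time handle) to the scroll API, with the scroll id threaded through each iteration. A minimal sketch of that loop against a hypothetical local cluster (endpoint and index name are placeholders; error handling and the per-dialect timeout params are omitted):

    from elasticsearch import Elasticsearch

    server = Elasticsearch(["http://localhost:9200"])  # placeholder endpoint
    batch_size = 1000
    scroll_id = None
    while True:
        if not scroll_id:
            # First page: a normal search that opens a 2-minute scroll context.
            results = server.search(
                index="my_index",  # placeholder index
                body={"query": {"match_all": {}}},
                size=batch_size,
                scroll="2m",
            )
        else:
            # Subsequent pages: continue from the scroll id of the last response.
            results = server.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = results["_scroll_id"]
        hits = results["hits"]["hits"]
        # ... process hits ...
        if len(hits) < batch_size:
            break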