acryl-datahub-cloud 0.3.11rc0__py3-none-any.whl → 0.3.16.1rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub-cloud has been flagged as potentially problematic in the registry.
Files changed (238)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/acryl_cs_issues/models.py +5 -3
  3. acryl_datahub_cloud/action_request/action_request_owner_source.py +36 -6
  4. acryl_datahub_cloud/datahub_forms_notifications/__init__.py +0 -0
  5. acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +569 -0
  6. acryl_datahub_cloud/datahub_forms_notifications/get_feature_flag.gql +7 -0
  7. acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
  8. acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
  9. acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
  10. acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
  11. acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +37 -13
  12. acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +55 -24
  13. acryl_datahub_cloud/datahub_reporting/extract_graph.py +4 -3
  14. acryl_datahub_cloud/datahub_reporting/extract_sql.py +242 -51
  15. acryl_datahub_cloud/datahub_reporting/forms.py +1 -1
  16. acryl_datahub_cloud/datahub_reporting/forms_config.py +3 -2
  17. acryl_datahub_cloud/datahub_restore/source.py +3 -2
  18. acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
  19. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
  20. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +518 -77
  21. acryl_datahub_cloud/elasticsearch/graph_service.py +76 -14
  22. acryl_datahub_cloud/graphql_utils.py +64 -0
  23. acryl_datahub_cloud/lineage_features/source.py +555 -49
  24. acryl_datahub_cloud/metadata/_urns/urn_defs.py +2296 -1900
  25. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionworkflow/__init__.py +53 -0
  26. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/anomaly/__init__.py +2 -0
  27. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  28. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +4 -2
  29. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  30. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/conversation/__init__.py +29 -0
  31. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
  32. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
  33. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  34. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +8 -0
  35. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +8 -0
  36. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/knowledge/__init__.py +33 -0
  37. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  38. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +12 -0
  39. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
  40. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  41. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
  42. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  43. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  44. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  45. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +28 -0
  46. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  47. acryl_datahub_cloud/metadata/schema.avsc +25091 -20557
  48. acryl_datahub_cloud/metadata/schema_classes.py +29269 -23863
  49. acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +235 -2
  50. acryl_datahub_cloud/metadata/schemas/ActionWorkflowInfo.avsc +683 -0
  51. acryl_datahub_cloud/metadata/schemas/ActionWorkflowKey.avsc +21 -0
  52. acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
  53. acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
  54. acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +75 -0
  55. acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
  56. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +353 -215
  57. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +147 -20
  58. acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +1 -1
  59. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +166 -21
  60. acryl_datahub_cloud/metadata/schemas/{AssertionSummary.avsc → AssertionRunSummary.avsc} +15 -2
  61. acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -0
  62. acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
  63. acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +7 -3
  64. acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +20 -6
  65. acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
  66. acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
  67. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
  68. acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +16 -5
  69. acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
  70. acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +7 -3
  71. acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
  72. acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
  73. acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  74. acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +18 -2
  75. acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
  76. acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +4 -1
  77. acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +304 -2
  78. acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +86 -0
  79. acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +11 -5
  80. acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
  81. acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +15 -5
  82. acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
  83. acryl_datahub_cloud/metadata/schemas/DataHubAiConversationInfo.avsc +256 -0
  84. acryl_datahub_cloud/metadata/schemas/DataHubAiConversationKey.avsc +22 -0
  85. acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +234 -0
  86. acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +22 -0
  87. acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  88. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  89. acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +308 -0
  90. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  91. acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  92. acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  93. acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +13 -4
  94. acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +8 -0
  95. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
  96. acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +3 -1
  97. acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
  98. acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
  99. acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +2 -0
  100. acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +6 -3
  101. acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +5 -0
  102. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +10 -2
  103. acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +12 -5
  104. acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  105. acryl_datahub_cloud/metadata/schemas/DocumentInfo.avsc +407 -0
  106. acryl_datahub_cloud/metadata/schemas/DocumentKey.avsc +35 -0
  107. acryl_datahub_cloud/metadata/schemas/DocumentSettings.avsc +79 -0
  108. acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +2 -0
  109. acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +7 -3
  110. acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +2 -1
  111. acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +2 -1
  112. acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
  113. acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +2 -1
  114. acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +2 -1
  115. acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
  116. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
  117. acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
  118. acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
  119. acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +2 -1
  120. acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +2 -1
  121. acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +4 -2
  122. acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +5 -0
  123. acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
  124. acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
  125. acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
  126. acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
  127. acryl_datahub_cloud/metadata/schemas/FormKey.avsc +3 -1
  128. acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
  129. acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +30 -0
  130. acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +416 -0
  131. acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +2 -1
  132. acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
  133. acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
  134. acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +3 -1
  135. acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +2 -0
  136. acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
  137. acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
  138. acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
  139. acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +71 -1
  140. acryl_datahub_cloud/metadata/schemas/InputFields.avsc +2 -1
  141. acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
  142. acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
  143. acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +145 -0
  144. acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +4 -1
  145. acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +4 -1
  146. acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +7 -1
  147. acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +9 -1
  148. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +9 -1
  149. acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +4 -2
  150. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +4 -1
  151. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +418 -97
  152. acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +62 -44
  153. acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  154. acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +54 -9
  155. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +163 -23
  156. acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +9 -1
  157. acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +128 -3
  158. acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +5 -2
  159. acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
  160. acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +91 -4
  161. acryl_datahub_cloud/metadata/schemas/Operation.avsc +17 -0
  162. acryl_datahub_cloud/metadata/schemas/Ownership.avsc +71 -1
  163. acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +2 -13
  164. acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  165. acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +3 -1
  166. acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +3 -1
  167. acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +3 -0
  168. acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +2 -1
  169. acryl_datahub_cloud/metadata/schemas/SemanticContent.avsc +123 -0
  170. acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
  171. acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
  172. acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  173. acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +136 -5
  174. acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +2 -1
  175. acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +61 -0
  176. acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +3 -1
  177. acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +2 -1
  178. acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +9 -0
  179. acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +10 -0
  180. acryl_datahub_cloud/notifications/__init__.py +0 -0
  181. acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
  182. acryl_datahub_cloud/sdk/__init__.py +69 -0
  183. acryl_datahub_cloud/sdk/assertion/__init__.py +58 -0
  184. acryl_datahub_cloud/sdk/assertion/assertion_base.py +779 -0
  185. acryl_datahub_cloud/sdk/assertion/column_metric_assertion.py +191 -0
  186. acryl_datahub_cloud/sdk/assertion/column_value_assertion.py +431 -0
  187. acryl_datahub_cloud/sdk/assertion/freshness_assertion.py +201 -0
  188. acryl_datahub_cloud/sdk/assertion/schema_assertion.py +268 -0
  189. acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +212 -0
  190. acryl_datahub_cloud/sdk/assertion/smart_freshness_assertion.py +165 -0
  191. acryl_datahub_cloud/sdk/assertion/smart_sql_assertion.py +156 -0
  192. acryl_datahub_cloud/sdk/assertion/smart_volume_assertion.py +162 -0
  193. acryl_datahub_cloud/sdk/assertion/sql_assertion.py +273 -0
  194. acryl_datahub_cloud/sdk/assertion/types.py +20 -0
  195. acryl_datahub_cloud/sdk/assertion/volume_assertion.py +156 -0
  196. acryl_datahub_cloud/sdk/assertion_client/__init__.py +0 -0
  197. acryl_datahub_cloud/sdk/assertion_client/column_metric.py +545 -0
  198. acryl_datahub_cloud/sdk/assertion_client/column_value.py +617 -0
  199. acryl_datahub_cloud/sdk/assertion_client/freshness.py +371 -0
  200. acryl_datahub_cloud/sdk/assertion_client/helpers.py +166 -0
  201. acryl_datahub_cloud/sdk/assertion_client/schema.py +358 -0
  202. acryl_datahub_cloud/sdk/assertion_client/smart_column_metric.py +540 -0
  203. acryl_datahub_cloud/sdk/assertion_client/smart_freshness.py +373 -0
  204. acryl_datahub_cloud/sdk/assertion_client/smart_sql.py +411 -0
  205. acryl_datahub_cloud/sdk/assertion_client/smart_volume.py +380 -0
  206. acryl_datahub_cloud/sdk/assertion_client/sql.py +410 -0
  207. acryl_datahub_cloud/sdk/assertion_client/volume.py +446 -0
  208. acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
  209. acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +1470 -0
  210. acryl_datahub_cloud/sdk/assertion_input/column_assertion_constants.py +114 -0
  211. acryl_datahub_cloud/sdk/assertion_input/column_assertion_utils.py +284 -0
  212. acryl_datahub_cloud/sdk/assertion_input/column_metric_assertion_input.py +759 -0
  213. acryl_datahub_cloud/sdk/assertion_input/column_metric_constants.py +109 -0
  214. acryl_datahub_cloud/sdk/assertion_input/column_value_assertion_input.py +810 -0
  215. acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +305 -0
  216. acryl_datahub_cloud/sdk/assertion_input/schema_assertion_input.py +413 -0
  217. acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +793 -0
  218. acryl_datahub_cloud/sdk/assertion_input/smart_freshness_assertion_input.py +218 -0
  219. acryl_datahub_cloud/sdk/assertion_input/smart_sql_assertion_input.py +181 -0
  220. acryl_datahub_cloud/sdk/assertion_input/smart_volume_assertion_input.py +189 -0
  221. acryl_datahub_cloud/sdk/assertion_input/sql_assertion_input.py +320 -0
  222. acryl_datahub_cloud/sdk/assertion_input/volume_assertion_input.py +635 -0
  223. acryl_datahub_cloud/sdk/assertions_client.py +1074 -0
  224. acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
  225. acryl_datahub_cloud/sdk/entities/assertion.py +439 -0
  226. acryl_datahub_cloud/sdk/entities/monitor.py +291 -0
  227. acryl_datahub_cloud/sdk/entities/subscription.py +100 -0
  228. acryl_datahub_cloud/sdk/errors.py +34 -0
  229. acryl_datahub_cloud/sdk/resolver_client.py +42 -0
  230. acryl_datahub_cloud/sdk/subscription_client.py +737 -0
  231. {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/METADATA +55 -49
  232. {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/RECORD +235 -142
  233. {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/WHEEL +1 -1
  234. {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/entry_points.txt +1 -0
  235. acryl_datahub_cloud/_sdk_extras/__init__.py +0 -4
  236. acryl_datahub_cloud/_sdk_extras/assertion.py +0 -15
  237. acryl_datahub_cloud/_sdk_extras/assertions_client.py +0 -23
  238. {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/top_level.txt +0 -0
@@ -22,12 +22,14 @@ from polars.datatypes import DataTypeClass
 from pydantic import Field
 from scipy.stats import expon
 
+from acryl_datahub_cloud.datahub_usage_reporting.excluded import EXCLUDED_PATTERNS
 from acryl_datahub_cloud.datahub_usage_reporting.query_builder import QueryBuilder
 from acryl_datahub_cloud.datahub_usage_reporting.usage_feature_patch_builder import (
     UsageFeaturePatchBuilder,
 )
 from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
 from acryl_datahub_cloud.metadata.schema_classes import (
+    CorpUserUsageFeaturesClass,
     QueryUsageFeaturesClass,
     UsageFeaturesClass,
 )
@@ -40,7 +42,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DatahubClientConfig
@@ -114,12 +116,12 @@ class DataHubUsageFeatureReportingSourceConfig(
         30, description="Timeout in seconds for the search queries."
     )
     extract_batch_size: int = Field(
-        1000,
+        5000,
         description="The number of documents to retrieve in each batch from ElasticSearch or OpenSearch.",
     )
 
     extract_delay: Optional[float] = Field(
-        0.25,
+        0,
         description="The delay in seconds between each batch extraction from ElasticSearch or OpenSearch.",
     )
 
@@ -135,6 +137,10 @@ class DataHubUsageFeatureReportingSourceConfig(
         None,
         description="Optional configuration for stateful ingestion, including stale metadata removal.",
     )
+    user_usage_enabled: bool = Field(
+        True,
+        description="Flag to enable or disable user usage statistics collection.",
+    )
     dataset_usage_enabled: bool = Field(
         True,
         description="Flag to enable or disable dataset usage statistics collection.",
@@ -177,7 +183,7 @@ class DataHubUsageFeatureReportingSourceConfig(
     # This option is only needed here until we are sure that the streaming mode is stable.
     # then we can remove it and control it with the streaming_mode option.
     experimental_full_streaming: bool = Field(
-        False,
+        True,
         description="Flag to enable full streaming mode.'",
     )
 
@@ -191,6 +197,11 @@ class DataHubUsageFeatureReportingSourceConfig(
         description="Flag to generate MCP patch for usage features.'",
     )
 
+    excluded_platforms: List[str] = Field(
+        EXCLUDED_PATTERNS,
+        description="List of platforms to exclude from usage statistics collection. This is done to avoid invite user functionality to be filled with service accounts.",
+    )
+
 
 def exp_cdf(series: polars.Series) -> polars.Series:
     with PerfTimer() as timer:
@@ -228,7 +239,7 @@ def exp_cdf(series: polars.Series) -> polars.Series:
 
 
 @dataclass
-class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
+class DatahubUsageFeatureReport(StatefulIngestionReport, IngestionStageReport):
     dataset_platforms_count: Dict[str, int] = field(
         default_factory=lambda: defaultdict(lambda: 0)
     )
@@ -241,10 +252,6 @@ class DatahubUsageFeatureReport(IngestionStageReport, StatefulIngestionReport):
         default_factory=lambda: defaultdict(lambda: PerfTimer())
     )
 
-    dataset_usage_processing_time: PerfTimer = PerfTimer()
-    dashboard_usage_processing_time: PerfTimer = PerfTimer()
-    chart_usage_processing_time: PerfTimer = PerfTimer()
-    query_usage_processing_time: PerfTimer = PerfTimer()
     query_platforms_count: Dict[str, int] = field(
         default_factory=lambda: defaultdict(lambda: 0)
     )
@@ -617,11 +624,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 ),
             )
 
-            response = server.create_pit(index, keep_alive="10m")
+            # response = server.create_pit(index, keep_alive="10m")
 
             # TODO: Save PIT, we can resume processing based on <pit, search_after> tuple
-            pit = response.get("pit_id")
-            query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
+            # pit = response.get("pit_id")
+            # query_copy.update({"pit": {"id": pit, "keep_alive": "10m"}})
         else:
             server = Elasticsearch(
                 [endpoint],
@@ -731,17 +738,20 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
         elif isinstance(polars_dtype, polars.Struct):
             return pa.struct(
-                {
-                    field.name: convert_dtype(field.dtype)
+                [
+                    pa.field(field.name, convert_dtype(field.dtype))
                     for field in polars_dtype.fields
-                }
+                ]
             )
         elif isinstance(polars_dtype, polars.List):
             return pa.list_(convert_dtype(polars_dtype.inner))
         else:
             raise ValueError(f"Unsupported Polars dtype: {polars_dtype}")
 
-        fields = [(name, convert_dtype(dtype)) for name, dtype in polars_schema.items()]
+        fields = [
+            pa.field(name, convert_dtype(dtype))
+            for name, dtype in polars_schema.items()
+        ]
         return pa.schema(fields)
 
     def batch_write_parquet(
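
Note on the hunk above: convert_dtype now builds pa.struct() from an explicit list of pa.field() objects instead of a name-to-type dict, and the top-level schema is assembled the same way. A minimal standalone sketch of the equivalence (assumes any recent pyarrow; the field names are illustrative):

    import pyarrow as pa

    # Both forms describe the same struct type; the new code uses the
    # list-of-fields form so nested fields carry explicit pa.field objects.
    from_dict = pa.struct({"name": pa.string(), "count": pa.int64()})
    from_fields = pa.struct(
        [pa.field("name", pa.string()), pa.field("count", pa.int64())]
    )
    assert from_dict == from_fields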
@@ -834,7 +844,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .drop(["removed"])
         )
 
-        return wdf.collect(streaming=self.config.streaming_mode).lazy()
+        return wdf
 
     def load_write_usage_server_side_aggregation(
         self, soft_deleted_entities_df: polars.LazyFrame
@@ -923,6 +933,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
         return dataset_df
 
+    def generate_user_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
+        with polars.StringCache():
+            user_usage_lf = self.generate_user_usage()
+            yield from self.generate_user_usage_mcp_from_lazyframe(user_usage_lf)
+
     def generate_dataset_usage_mcps(self) -> Iterable[MetadataWorkUnit]:
         with polars.StringCache():
             dataset_usage_df = self.generate_dataset_usage()
@@ -958,44 +973,34 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if self.config.user_usage_enabled:
+            with self.report.new_stage("generate user usage"):
+                yield from self.generate_user_usage_mcps()
+
         if self.config.dataset_usage_enabled:
-            with self.report.dataset_usage_processing_time as timer:
-                self.report.new_stage("generate dataset usage")
+            with self.report.new_stage("generate dataset usage"):
                 yield from self.generate_dataset_usage_mcps()
-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Dataset Usage generation took {time_taken:.3f} seconds")
 
         if self.config.dashboard_usage_enabled:
-            with self.report.dashboard_usage_processing_time as timer:
-                self.report.new_stage("generate dashboard usage")
+            with self.report.new_stage("generate dashboard usage"):
                 yield from self.generate_dashboard_usage_mcps()
 
-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Dashboard Usage generation took {time_taken:.3f}")
-
         if self.config.chart_usage_enabled:
-            with self.report.chart_usage_processing_time as timer:
-                self.report.new_stage("generate chart usage")
-
+            with self.report.new_stage("generate chart usage"):
                 yield from self.generate_chart_usage_mcps()
 
-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Chart Usage generation took {time_taken:.3f}")
-
         if self.config.query_usage_enabled:
-            with self.report.query_usage_processing_time as timer:
-                self.report.new_stage("generate query usage")
-
+            with self.report.new_stage("generate query usage"):
                 yield from self.generate_query_usage_mcps()
 
-            time_taken = timer.elapsed_seconds()
-            logger.info(f"Query Usage generation took {time_taken:.3f}")
+        with self.report.new_stage("end so time is calculated for last stage"):
+            pass
 
     def generate_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
         for row in lazy_frame.collect(
-            streaming=self.config.experimental_full_streaming
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
         ).to_struct():
             if "siblings" in row and row["siblings"]:
                 logger.info(f"Siblings found for urn: {row['urn']} -> row['siblings']")
@@ -1052,7 +1057,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 uniqueUserPercentileLast30Days=int(
                     row.get("distinct_user_rank_percentile", 0) or 0
                 ),
-                writeCountLast30Days=int(row.get("write_rank_percentile", 0) or 0)
+                writeCountLast30Days=int(row.get("write_count", 0) or 0)
                 if not self.config.disable_write_usage
                 else None,
                 writeCountPercentileLast30Days=int(
@@ -1086,7 +1091,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
     def generate_query_usage_mcp_from_lazyframe(
         self, lazy_frame: polars.LazyFrame
     ) -> Iterable[MetadataWorkUnit]:
-        for row in lazy_frame.collect().iter_rows(named=True):
+        for row in lazy_frame.collect(
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
+        ).iter_rows(named=True):
             query_usage_features = QueryUsageFeaturesClass(
                 queryCountLast30Days=int(row.get("totalSqlQueries", 0) or 0),
                 queryCountTotal=None,  # This is not implemented
@@ -1106,6 +1113,47 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 row["urn"], query_usage_features
             )
 
+    def _convert_platform_pairs_to_dict(
+        self,
+        platform_pairs: Optional[List[Dict[str, Any]]],
+        value_key: str = "platform_total",
+    ) -> Optional[Dict[str, Any]]:
+        """Convert list of platform usage structs to dictionary."""
+        if not platform_pairs:
+            return None
+
+        return {
+            pair["platform_urn"]: pair[value_key]
+            for pair in platform_pairs
+            if pair["platform_urn"] is not None
+        }
+
+    def generate_user_usage_mcp_from_lazyframe(
+        self, lazy_frame: polars.LazyFrame
+    ) -> Iterable[MetadataWorkUnit]:
+        for row in lazy_frame.collect(
+            engine="streaming" if self.config.experimental_full_streaming else "auto"
+        ).iter_rows(named=True):
+            user_usage_features = CorpUserUsageFeaturesClass(
+                userUsageTotalPast30Days=int(
+                    row.get("userUsageTotalPast30Days", 0) or 0
+                ),
+                userPlatformUsageTotalsPast30Days=self._convert_platform_pairs_to_dict(
+                    row.get("platform_usage_pairs", [])
+                ),
+                userPlatformUsagePercentilePast30Days=self._convert_platform_pairs_to_dict(
+                    row.get("platform_usage_percentiles", []),
+                    "platform_rank_percentile",
+                ),
+                userUsagePercentilePast30Days=row.get("userUsagePercentilePast30Days"),
+                userTopDatasetsByUsage=self._convert_top_datasets_to_dict(
+                    row.get("top_datasets_map", [])
+                ),
+            )
+            yield MetadataChangeProposalWrapper(
+                entityUrn=row["user"], aspect=user_usage_features
+            ).as_workunit(is_primary_source=False)
+
     def generate_usage_feature_mcp(
         self, urn: str, usage_feature: UsageFeaturesClass
     ) -> Iterable[MetadataWorkUnit]:
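
Note on the hunk above: _convert_platform_pairs_to_dict reshapes the list-of-structs column produced by the Polars aggregations into the map shape that CorpUserUsageFeaturesClass expects. A standalone sketch of the same conversion (the sample URNs and totals are illustrative only):

    from typing import Any, Dict, List, Optional

    def convert_platform_pairs_to_dict(
        platform_pairs: Optional[List[Dict[str, Any]]],
        value_key: str = "platform_total",
    ) -> Optional[Dict[str, Any]]:
        # Drop entries with a null platform and key the rest by platform URN.
        if not platform_pairs:
            return None
        return {
            pair["platform_urn"]: pair[value_key]
            for pair in platform_pairs
            if pair["platform_urn"] is not None
        }

    pairs = [
        {"platform_urn": "urn:li:dataPlatform:snowflake", "platform_total": 42.0},
        {"platform_urn": None, "platform_total": 7.0},
    ]
    assert convert_platform_pairs_to_dict(pairs) == {
        "urn:li:dataPlatform:snowflake": 42.0
    }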
@@ -1140,9 +1188,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
         return self.generate_dashboard_chart_usage(entity_index, usage_index)
 
-    def generate_dashboard_chart_usage(
-        self, entity_index: str, usage_index: str
-    ) -> polars.LazyFrame:
+    def _generate_dashboard_chart_entities(self, entity_index: str) -> polars.LazyFrame:
         entity_schema = {
             "entity_urn": polars.Categorical,
             "removed": polars.Boolean,
@@ -1159,7 +1205,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             process_function=self.soft_deleted_batch,
         )
 
-        dashboard_usage_schema = {
+        return entities_df
+
+    def _generate_dashboard_chart_usage(
+        self, entities_df: polars.LazyFrame, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_usage_schema = {
             "timestampMillis": polars.Int64,
             "lastObserved": polars.Int64,
             "urn": polars.Categorical,
@@ -1177,7 +1228,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         }
 
         lf = self.load_data_from_es_to_lf(
-            schema=dashboard_usage_schema,
+            schema=entities_usage_schema,
             index=usage_index,
             query=QueryBuilder.get_dashboard_usage_query(self.config.lookback_days),
             process_function=self.process_dashboard_usage,
@@ -1196,6 +1247,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
             .alias("row_num")
         ).filter(polars.col("row_num") == 1)
 
+        return lf
+
+    def generate_dashboard_chart_usage(
+        self, entity_index: str, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_df = self._generate_dashboard_chart_entities(entity_index)
+
+        lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
+
         # lf = lf.filter(polars.col("urn") == "urn:li:dashboard:(looker,dashboards.8)")
         # "urn:li:dashboard:(looker,dashboards.8)"
 
@@ -1308,7 +1368,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         query_entities = self.load_data_from_es_to_lf(
             schema=query_entities_schema,
             index=entity_index,
-            query=QueryBuilder.get_query_entities_query(),
+            query=QueryBuilder.get_query_entities_query(self.config.lookback_days),
             process_function=self.queries_entities_batch,
         )
 
@@ -1365,6 +1425,380 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
 
         return usage_with_top_users_with_ranks
 
+    def _generate_user_usage_for_dataset(self) -> polars.LazyFrame:
+        datasets_lf = self.get_datasets()
+        if self.config.set_upstream_table_max_modification_time_for_views:
+            datasets_lf = self.set_table_modification_time_for_views(datasets_lf)
+
+        lf = self.load_dataset_usage()
+
+        # Polaris/pandas join merges the join column into one column and that's why we need to filter based on the removed column
+        lf = (
+            lf.join(datasets_lf, left_on="urn", right_on="entity_urn", how="left")
+            .filter(polars.col("removed") == False)  # noqa: E712
+            .drop(["removed"])
+        )
+
+        users_lf = (
+            lf.explode("userCounts")
+            .unnest("userCounts")
+            .filter(polars.col("user").is_not_null())
+        )
+
+        user_dataset_usage_lf = self._create_user_dataset_usage_map(users_lf)
+        return user_dataset_usage_lf
+
+    @staticmethod
+    def _convert_top_datasets_to_dict(
+        top_datasets_list: Optional[List[Dict[str, Any]]],
+    ) -> Optional[Dict[str, float]]:
+        """
+        Convert list of top datasets structs to dictionary as expected by CorpUserUsageFeatures schema.
+
+        Args:
+            top_datasets_list: List of dictionaries with 'dataset_urn' and 'count' keys
+
+        Returns:
+            Dictionary mapping dataset URN to usage count, or None if input is empty
+        """
+        if not top_datasets_list:
+            return None
+
+        top_datasets_dict = {
+            item["dataset_urn"]: float(item["count"])
+            for item in top_datasets_list
+            if isinstance(item, dict) and "dataset_urn" in item and "count" in item
+        }
+
+        return top_datasets_dict if top_datasets_dict else None
+
+    def _create_user_dataset_usage_map(
+        self, users_lf: polars.LazyFrame, top_n: int = 25
+    ) -> polars.LazyFrame:
+        """
+        Creates a lazyframe with user string and map of top N datasets by usage.
+
+        Args:
+            users_lf: LazyFrame containing user usage data with columns: user, urn, platform, count
+            top_n: Number of top datasets to include per user (default: 25)
+
+        Returns:
+            LazyFrame with columns:
+            - user: string column containing the user identifier
+            - top_datasets_map: list of structs with dataset_urn (string), count (int), and platform_urn (string)
+            - userUsageTotalPast30Days: total usage count for the user across all datasets
+            - userPlatformUsageTotalsPast30Days: map from platform URN to usage totals
+        """
+
+        # Create intermediate lazy frame with filtered users and aggregated counts
+        user_dataset_aggregated = (
+            users_lf.filter(polars.col("user").str.contains("@"))
+            .group_by("user", "urn", "platform")
+            .agg(polars.col("count").sum().alias("total_count"))
+            .with_columns(
+                # Direct string formatting - vectorized operation
+                polars.format("urn:li:dataPlatform:{}", polars.col("platform")).alias(
+                    "platform_urn"
+                )
+            )
+        )
+
+        # Calculate user totals
+        user_totals = user_dataset_aggregated.group_by("user").agg(
+            polars.col("total_count").sum().alias("userUsageTotalPast30Days")
+        )
+
+        # Calculate platform totals for each user - keep as list of structs
+        platform_totals = (
+            user_dataset_aggregated.group_by("user", "platform_urn")
+            .agg(polars.col("total_count").sum().alias("platform_total"))
+            .filter(polars.col("platform_urn").is_not_null())
+            .group_by("user")
+            .agg(
+                polars.struct(
+                    [
+                        polars.col("platform_urn"),
+                        polars.col("platform_total").cast(polars.Float64),
+                    ]
+                ).alias("platform_usage_pairs")
+            )
+        )
+
+        # Calculate top datasets
+        top_datasets = (
+            user_dataset_aggregated.with_columns(
+                polars.col("total_count")
+                .rank(descending=True, method="ordinal")
+                .over("user")
+                .alias("dataset_rank")
+            )
+            .filter(polars.col("dataset_rank") <= top_n)
+            .group_by("user")
+            .agg(
+                polars.struct(
+                    [
+                        polars.col("urn").alias("dataset_urn"),
+                        polars.col("total_count").alias("count"),
+                        polars.col("platform_urn"),
+                    ]
+                )
+                .sort_by("total_count", descending=True)
+                .alias("top_datasets_map")
+            )
+        )
+
+        # Join all results
+        return top_datasets.join(user_totals, on="user", how="left").join(
+            platform_totals, on="user", how="left"
+        )
+
+    def _combine_user_usage_data(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """
+        Combines user usage data from dataset, dashboard, and chart sources.
+
+        Args:
+            dataset_usage_lf: LazyFrame with dataset usage data containing top_datasets_map
+            dashboard_usage_lf: LazyFrame with dashboard usage data
+            chart_usage_lf: LazyFrame with chart usage data
+
+        Returns:
+            Combined LazyFrame with aggregated usage data per user
+        """
+        user_totals = self._combine_user_totals(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        platform_pairs = self._combine_platform_pairs(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        result = user_totals.join(platform_pairs, on="user", how="left")
+
+        return result.with_columns(
+            polars.col("platform_usage_pairs").fill_null(polars.lit([]))
+        )
+
+    def _filter_users(self, users_lf: polars.LazyFrame) -> polars.LazyFrame:
+        filter_condition = polars.col("user").str.contains("@")
+        for pattern in self.config.excluded_platforms:
+            filter_condition = filter_condition & ~polars.col("user").str.contains(
+                pattern
+            )
+
+        return users_lf.filter(filter_condition)
+
+    def _combine_user_totals(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """Combine user totals and top_datasets_map from all sources."""
+        # Collect all unique users in one operation
+        all_users_lf = (
+            polars.concat(
+                [
+                    dataset_usage_lf.select("user"),
+                    dashboard_usage_lf.select("user"),
+                    chart_usage_lf.select("user"),
+                ]
+            )
+            .unique()
+            .pipe(self._filter_users)
+        )
+
+        return (
+            all_users_lf.join(
+                dataset_usage_lf.select(
+                    ["user", "top_datasets_map", "userUsageTotalPast30Days"]
+                ),
+                on="user",
+                how="left",
+            )
+            .join(
+                dashboard_usage_lf.select(["user", "userUsageTotalPast30Days"]),
+                on="user",
+                how="left",
+                suffix="_dashboard",
+            )
+            .join(
+                chart_usage_lf.select(["user", "userUsageTotalPast30Days"]),
+                on="user",
+                how="left",
+                suffix="_chart",
+            )
+            .with_columns(
+                [
+                    # Sum with explicit null handling
+                    (
+                        polars.col("userUsageTotalPast30Days").fill_null(0)
+                        + polars.col("userUsageTotalPast30Days_dashboard").fill_null(0)
+                        + polars.col("userUsageTotalPast30Days_chart").fill_null(0)
+                    ).alias("userUsageTotalPast30Days")
+                ]
+            )
+            .select(["user", "top_datasets_map", "userUsageTotalPast30Days"])
+        )
+
+    def _combine_platform_pairs(
+        self,
+        dataset_usage_lf: polars.LazyFrame,
+        dashboard_usage_lf: polars.LazyFrame,
+        chart_usage_lf: polars.LazyFrame,
+    ) -> polars.LazyFrame:
+        """Combine platform usage pairs from all sources."""
+        all_platforms = []
+
+        # Extract platforms from each source
+        for source_lf, col_name in [
+            (dataset_usage_lf, "platform_usage_pairs"),
+            (dashboard_usage_lf, "platform_usage_pairs"),
+            (chart_usage_lf, "platform_usage_pairs"),
+        ]:
+            platforms = self._extract_platforms_from_source(source_lf, col_name)
+            if platforms is not None:
+                all_platforms.append(platforms)
+
+        if not all_platforms:
+            # Return empty result if no platforms found
+            return polars.LazyFrame({"user": [], "platform_usage_pairs": []})
+
+        # Combine all platforms and aggregate by user + platform
+        combined_platforms = polars.concat(all_platforms, how="vertical_relaxed")
+        aggregated = combined_platforms.group_by("user", "platform_urn").agg(
+            polars.col("platform_total").sum().alias("platform_total")
+        )
+
+        # Rebuild platform_usage_pairs structure
+        return aggregated.group_by("user").agg(
+            polars.struct(
+                [polars.col("platform_urn"), polars.col("platform_total")]
+            ).alias("platform_usage_pairs")
+        )
+
+    def _extract_platforms_from_source(
+        self, source_lf: polars.LazyFrame, col_name: str
+    ) -> polars.LazyFrame | None:
+        """Extract platform data from a source LazyFrame."""
+        try:
+            return (
+                source_lf.select(["user", col_name])
+                .filter(polars.col(col_name).is_not_null())
+                .filter(polars.col(col_name).list.len() > 0)
+                .explode(col_name)
+                .unnest(col_name)
+                .filter(polars.col("platform_urn").is_not_null())
+                .select(["user", "platform_urn", "platform_total"])
+            )
+        except polars.exceptions.ColumnNotFoundError:
+            return None
+
+    def add_platform_usage_percentiles(
+        self, user_usage_lf: polars.LazyFrame
+    ) -> polars.LazyFrame:
+        """
+        Add platform usage percentiles to user usage data.
+
+        Args:
+            user_usage_lf: LazyFrame with user usage data containing platform_usage_pairs column
+
+        Returns:
+            LazyFrame with additional platform_usage_percentiles column
+        """
+        # First explode the platform_usage_pairs to work with individual platform usage records
+        platform_usage_exploded = (
+            user_usage_lf.explode("platform_usage_pairs")
+            .unnest("platform_usage_pairs")
+            .filter(polars.col("platform_urn").is_not_null())
+        )
+
+        # Use the existing gen_rank_and_percentile method to calculate percentiles
+        platform_percentiles_with_ranks = self.gen_rank_and_percentile(
+            lf=platform_usage_exploded,
+            count_field="platform_total",
+            urn_field="user",
+            platform_field="platform_urn",
+            prefix="platform_",
+            use_exp_cdf=False,
+        )
+
+        # Group back by user and create the percentiles structure
+        platform_percentiles = platform_percentiles_with_ranks.group_by("user").agg(
+            polars.struct(
+                [
+                    polars.col("platform_urn"),
+                    polars.col("platform_rank_percentile").cast(polars.Float64),
+                ]
+            ).alias("platform_usage_percentiles")
+        )
+
+        # Join the percentiles back to the original user_usage_lf
+        return user_usage_lf.join(platform_percentiles, on="user", how="left")
+
+    def _generate_user_usage_for_dashboard_charts(
+        self, entity_index: str, usage_index: str
+    ) -> polars.LazyFrame:
+        entities_df = self._generate_dashboard_chart_entities(entity_index)
+        lf = self._generate_dashboard_chart_usage(entities_df, usage_index)
+
+        # Process dashboard usage data into user usage format (similar to dataset version)
+        users_lf = (
+            lf.explode("userCounts")
+            .unnest("userCounts")
+            .filter(polars.col("user").is_not_null())
+            .rename({"usageCount": "count"})  # Rename to match dataset schema
+        )
+
+        user_dashboard_usage_lf = self._create_user_dataset_usage_map(users_lf)
+        return user_dashboard_usage_lf
+
+    def generate_user_usage(self) -> polars.LazyFrame:
+        dataset_usage_lf = self._generate_user_usage_for_dataset()
+
+        usage_index = "dashboard_dashboardusagestatisticsaspect_v1"
+        entity_index = "dashboardindex_v2"
+        dashboard_usage_lf = self._generate_user_usage_for_dashboard_charts(
+            entity_index, usage_index
+        )
+
+        entity_index = "chartindex_v2"
+        usage_index = "chart_chartusagestatisticsaspect_v1"
+        chart_usage_lf = self._generate_user_usage_for_dashboard_charts(
+            entity_index, usage_index
+        )
+
+        # Combine all three usage sources
+        lf = self._combine_user_usage_data(
+            dataset_usage_lf, dashboard_usage_lf, chart_usage_lf
+        )
+
+        lf = self.add_platform_usage_percentiles(lf)
+
+        # Add user usage percentiles across all users (not grouped by platform)
+        # Create a temporary platform field for percentile calculation
+        lf = lf.with_columns(polars.lit("all_users").alias("temp_platform"))
+
+        lf = self.gen_rank_and_percentile(
+            lf=lf,
+            count_field="userUsageTotalPast30Days",
+            urn_field="user",
+            platform_field="temp_platform",
+            prefix="userUsage",
+            use_exp_cdf=False,
+        )
+
+        # Rename the percentile column to match the schema field name and remove temp field
+        lf = lf.rename(
+            {"userUsagerank_percentile": "userUsagePercentilePast30Days"}
+        ).drop("temp_platform")
+
+        return lf
+
     def generate_dataset_usage(self) -> polars.LazyFrame:
         datasets_lf = self.get_datasets()
         if self.config.set_upstream_table_max_modification_time_for_views:
@@ -1485,11 +1919,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         # called `Option::unwrap()` on a `None` value
         # Which only happens if we don't collect immediately
         # return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True).collect().lazy()
-        return (
-            polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
-            .collect()
-            .lazy()
-        )
+        return polars.scan_parquet(temp_file.name, schema=schema, low_memory=True)
 
     def load_dataset_usage(self) -> polars.LazyFrame:
         index = "dataset_datasetusagestatisticsaspect_v1"
@@ -1606,23 +2036,40 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
         delay: Optional[float] = None,
     ) -> Iterable[Dict[str, Any]]:
         processed_count = 0
+        scroll_id = None
         while True:
             with PerfTimer() as timer:
                 logger.debug(f"ES query: {query}")
-                results = server.search(
-                    body=query,
-                    size=batch_size,
-                    index=(
-                        index
-                        if not self.config.search_index.opensearch_dialect
-                        else None
-                    ),
-                    params=(
-                        {"timeout": self.config.query_timeout}
-                        if self.config.search_index.opensearch_dialect
-                        else {"request_timeout": self.config.query_timeout}
-                    ),
-                )
+                if not scroll_id:
+                    logger.debug(
+                        f"Getting inital data from index {index} without scroll id"
+                    )
+                    results = server.search(
+                        body=query,
+                        size=batch_size,
+                        scroll="2m",
+                        index=index,
+                        params=(
+                            {"timeout": self.config.query_timeout}
+                            if self.config.search_index.opensearch_dialect
+                            else {"request_timeout": self.config.query_timeout}
+                        ),
+                    )
+                else:
+                    logger.debug(
+                        f"Getting data from index {index} using scroll_id: {scroll_id}"
+                    )
+                    results = server.scroll(
+                        scroll_id=scroll_id,
+                        scroll="2m",
+                        params=(
+                            {"timeout": self.config.query_timeout}
+                            if self.config.search_index.opensearch_dialect
+                            else {"request_timeout": self.config.query_timeout}
+                        ),
+                    )
+                scroll_id = results["_scroll_id"]
+
             if not aggregation_key:
                 yield from process_function(results["hits"]["hits"])
 
@@ -1633,7 +2080,6 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                 )
                 if len(results["hits"]["hits"]) < batch_size:
                     break
-                query.update({"search_after": results["hits"]["hits"][-1]["sort"]})
             else:
                 yield from process_function(
                     results["aggregations"][aggregation_key]["buckets"]
@@ -1643,16 +2089,11 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
                     < batch_size
                 ):
                     break
-                if "after_key" in results["aggregations"][aggregation_key]:
-                    query["aggs"][aggregation_key]["composite"]["after"] = results[
-                        "aggregations"
-                    ][aggregation_key]["after_key"]
-
-                if delay:
-                    logger.debug(
-                        f"Sleeping for {delay} seconds before getting next batch from ES"
-                    )
-                    time.sleep(delay)
+            if delay:
+                logger.debug(
+                    f"Sleeping for {delay} seconds before getting next batch from ES"
+                )
+                time.sleep(delay)
 
-    def get_report(self) -> SourceReport:
+    def get_report(self) -> "DatahubUsageFeatureReport":
         return self.report
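
Closing note: the Elasticsearch/OpenSearch pagination above moves from search_after (and the commented-out PIT logic) to the scroll API, threading _scroll_id between batches. A simplified sketch of that loop (assumes an elasticsearch-py 7.x-style client; index name, query, and batch size are illustrative, and a production version would also clear the scroll context when finished):

    from typing import Any, Dict, Iterable

    from elasticsearch import Elasticsearch

    def scroll_all(
        server: Elasticsearch,
        index: str,
        query: Dict[str, Any],
        batch_size: int = 5000,
    ) -> Iterable[Dict[str, Any]]:
        scroll_id = None
        while True:
            if scroll_id is None:
                # First page: open a scroll context kept alive for 2 minutes.
                results = server.search(
                    body=query, size=batch_size, scroll="2m", index=index
                )
            else:
                # Subsequent pages: continue from the scroll cursor.
                results = server.scroll(scroll_id=scroll_id, scroll="2m")
            scroll_id = results["_scroll_id"]
            hits = results["hits"]["hits"]
            yield from hits
            if len(hits) < batch_size:
                break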