@oneuptime/common 11.0.0 → 11.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (357):
  1. package/Models/DatabaseModels/Alert.ts +110 -0
  2. package/Models/DatabaseModels/CephCluster.ts +964 -0
  3. package/Models/DatabaseModels/CephClusterLabelRule.ts +514 -0
  4. package/Models/DatabaseModels/CephClusterOwnerRule.ts +596 -0
  5. package/Models/DatabaseModels/CephClusterOwnerTeam.ts +487 -0
  6. package/Models/DatabaseModels/CephClusterOwnerUser.ts +486 -0
  7. package/Models/DatabaseModels/CephResource.ts +809 -0
  8. package/Models/DatabaseModels/Host.ts +64 -0
  9. package/Models/DatabaseModels/Incident.ts +110 -0
  10. package/Models/DatabaseModels/Index.ts +24 -0
  11. package/Models/DatabaseModels/ProxmoxCluster.ts +943 -0
  12. package/Models/DatabaseModels/ProxmoxClusterLabelRule.ts +514 -0
  13. package/Models/DatabaseModels/ProxmoxClusterOwnerRule.ts +596 -0
  14. package/Models/DatabaseModels/ProxmoxClusterOwnerTeam.ts +487 -0
  15. package/Models/DatabaseModels/ProxmoxClusterOwnerUser.ts +486 -0
  16. package/Models/DatabaseModels/ProxmoxResource.ts +726 -0
  17. package/Models/DatabaseModels/ScheduledMaintenance.ts +110 -0
  18. package/Server/API/BillingInvoiceAPI.ts +47 -7
  19. package/Server/API/CephResourceAPI.ts +134 -0
  20. package/Server/API/DashboardAPI.ts +46 -0
  21. package/Server/API/ProjectAPI.ts +15 -0
  22. package/Server/API/ProxmoxResourceAPI.ts +132 -0
  23. package/Server/API/ResellerPlanAPI.ts +17 -0
  24. package/Server/Infrastructure/GlobalCache.ts +8 -2
  25. package/Server/Infrastructure/Postgres/SchemaMigrations/1781500000000-AddProxmoxAndCephClusterTables.ts +163 -0
  26. package/Server/Infrastructure/Postgres/SchemaMigrations/1781600000000-AddProxmoxCephV2Columns.ts +211 -0
  27. package/Server/Infrastructure/Postgres/SchemaMigrations/1781600000001-AddProxmoxCephActivityAndRules.ts +590 -0
  28. package/Server/Infrastructure/Postgres/SchemaMigrations/1781700000000-AddProxmoxCephV3Columns.ts +64 -0
  29. package/Server/Infrastructure/Postgres/SchemaMigrations/Index.ts +8 -0
  30. package/Server/Infrastructure/Redis.ts +40 -12
  31. package/Server/Services/AnalyticsDatabaseService.ts +1 -1
  32. package/Server/Services/BillingService.ts +109 -21
  33. package/Server/Services/CephClusterLabelRuleEngineService.ts +200 -0
  34. package/Server/Services/CephClusterLabelRuleService.ts +14 -0
  35. package/Server/Services/CephClusterOwnerRuleEngineService.ts +218 -0
  36. package/Server/Services/CephClusterOwnerRuleService.ts +14 -0
  37. package/Server/Services/CephClusterOwnerTeamService.ts +10 -0
  38. package/Server/Services/CephClusterOwnerUserService.ts +10 -0
  39. package/Server/Services/CephClusterService.ts +401 -0
  40. package/Server/Services/CephResourceService.ts +383 -0
  41. package/Server/Services/CloudResourceService.ts +11 -3
  42. package/Server/Services/DockerHostService.ts +11 -3
  43. package/Server/Services/ExceptionAggregationService.ts +2 -0
  44. package/Server/Services/HostService.ts +11 -3
  45. package/Server/Services/Index.ts +24 -0
  46. package/Server/Services/KubernetesClusterService.ts +11 -3
  47. package/Server/Services/LogAggregationService.ts +2 -0
  48. package/Server/Services/MetricAggregationService.ts +2 -0
  49. package/Server/Services/OpenTelemetryIngestService.ts +36 -0
  50. package/Server/Services/ProxmoxClusterLabelRuleEngineService.ts +204 -0
  51. package/Server/Services/ProxmoxClusterLabelRuleService.ts +14 -0
  52. package/Server/Services/ProxmoxClusterOwnerRuleEngineService.ts +222 -0
  53. package/Server/Services/ProxmoxClusterOwnerRuleService.ts +14 -0
  54. package/Server/Services/ProxmoxClusterOwnerTeamService.ts +10 -0
  55. package/Server/Services/ProxmoxClusterOwnerUserService.ts +10 -0
  56. package/Server/Services/ProxmoxClusterService.ts +382 -0
  57. package/Server/Services/ProxmoxResourceService.ts +404 -0
  58. package/Server/Services/RumApplicationService.ts +11 -3
  59. package/Server/Services/ServerlessFunctionService.ts +11 -3
  60. package/Server/Services/TelemetryUsageBillingService.ts +41 -3
  61. package/Server/Services/TraceAggregationService.ts +2 -0
  62. package/Server/Types/AnalyticsDatabase/AggregateBy.ts +8 -23
  63. package/Server/Utils/Monitor/MonitorAlert.ts +45 -0
  64. package/Server/Utils/Monitor/MonitorClusterContext.ts +129 -0
  65. package/Server/Utils/Monitor/MonitorCriteriaEvaluator.ts +344 -4
  66. package/Server/Utils/Monitor/MonitorIncident.ts +130 -7
  67. package/Server/Utils/Monitor/MonitorMaintenanceSuppression.ts +39 -6
  68. package/Server/Utils/Monitor/MonitorTemplateUtil.ts +3 -1
  69. package/Server/Utils/Monitor/SeriesResourceLabels.ts +33 -0
  70. package/Server/Utils/Profiling.ts +37 -2
  71. package/Server/Utils/Telemetry/EntityRegistry.ts +4 -0
  72. package/Server/Utils/Telemetry/ProxmoxCephSnapshotScan.ts +1096 -0
  73. package/Server/Utils/Telemetry/TelemetryEntity.ts +85 -0
  74. package/Server/Utils/Telemetry.ts +8 -19
  75. package/Tests/Server/API/BillingInvoiceAPI.test.ts +194 -0
  76. package/Tests/Server/API/ProjectAPI.test.ts +91 -0
  77. package/Tests/Server/API/ResellerPlanAPI.test.ts +207 -0
  78. package/Tests/Server/Infrastructure/GlobalCache.test.ts +100 -0
  79. package/Tests/Server/Services/BillingService.test.ts +323 -0
  80. package/Tests/Server/Services/CephResourceService.test.ts +264 -0
  81. package/Tests/Server/Services/ProxmoxResourceService.test.ts +326 -0
  82. package/Tests/Server/Utils/Monitor/MonitorCriteriaEvaluator.test.ts +322 -0
  83. package/Tests/Server/Utils/Monitor/MonitorMaintenanceSuppression.test.ts +13 -0
  84. package/Tests/Server/Utils/Telemetry/ProxmoxCephSnapshotScan.test.ts +879 -0
  85. package/Tests/Server/Utils/Telemetry/TelemetryEntity.test.ts +196 -0
  86. package/Tests/Types/Monitor/CephAlertTemplates.test.ts +1231 -0
  87. package/Tests/Types/Monitor/ProxmoxAlertTemplates.test.ts +732 -0
  88. package/Tests/Utils/ModelImportExport.test.ts +366 -0
  89. package/Tests/Utils/Telemetry/EntityRelationship.test.ts +49 -0
  90. package/Tests/Utils/Telemetry/HeartbeatAvailability.test.ts +423 -0
  91. package/Types/BaseDatabase/AggregationIntervalUtil.ts +74 -0
  92. package/Types/Dashboard/DashboardComponentType.ts +4 -0
  93. package/Types/Dashboard/DashboardComponents/ComponentArgument.ts +2 -0
  94. package/Types/Dashboard/DashboardComponents/DashboardCephOsdListComponent.ts +15 -0
  95. package/Types/Dashboard/DashboardComponents/DashboardCephPoolListComponent.ts +14 -0
  96. package/Types/Dashboard/DashboardComponents/DashboardProxmoxGuestListComponent.ts +17 -0
  97. package/Types/Dashboard/DashboardComponents/DashboardProxmoxNodeListComponent.ts +16 -0
  98. package/Types/Dashboard/DashboardTemplates.ts +446 -0
  99. package/Types/Icon/IconProp.ts +2 -0
  100. package/Types/Monitor/CephAlertTemplates.ts +1647 -0
  101. package/Types/Monitor/CephMetricCatalog.ts +409 -0
  102. package/Types/Monitor/MetricMonitor/MetricMonitorResponse.ts +44 -0
  103. package/Types/Monitor/MonitorStep.ts +64 -0
  104. package/Types/Monitor/MonitorStepCephMonitor.ts +57 -0
  105. package/Types/Monitor/MonitorStepProxmoxMonitor.ts +81 -0
  106. package/Types/Monitor/MonitorType.ts +29 -1
  107. package/Types/Monitor/ProxmoxAlertTemplates.ts +899 -0
  108. package/Types/Monitor/ProxmoxMetricCatalog.ts +382 -0
  109. package/Types/Permission.ts +464 -0
  110. package/Types/Telemetry/EntityType.ts +11 -0
  111. package/Types/Telemetry/ServiceType.ts +2 -0
  112. package/UI/Components/Icon/Icon.tsx +84 -0
  113. package/UI/Components/ImportExport/ExportModelCard.tsx +90 -0
  114. package/UI/Components/ImportExport/ImportModelsModal.tsx +239 -0
  115. package/UI/Components/ModelTable/ModelTable.tsx +294 -143
  116. package/UI/Components/MonitorTemplateVariables/TemplateVariablesCatalog.ts +9 -5
  117. package/UI/Utils/ModelImportExport.ts +207 -0
  118. package/UI/Utils/Telemetry/Telemetry.ts +16 -21
  119. package/UI/Utils/TelemetryService.ts +7 -3
  120. package/Utils/Dashboard/Components/DashboardCephOsdListComponent.ts +63 -0
  121. package/Utils/Dashboard/Components/DashboardCephPoolListComponent.ts +32 -0
  122. package/Utils/Dashboard/Components/DashboardCephResourceListShared.ts +61 -0
  123. package/Utils/Dashboard/Components/DashboardProxmoxGuestListComponent.ts +69 -0
  124. package/Utils/Dashboard/Components/DashboardProxmoxNodeListComponent.ts +55 -0
  125. package/Utils/Dashboard/Components/DashboardProxmoxResourceListShared.ts +61 -0
  126. package/Utils/Dashboard/Components/Index.ts +28 -0
  127. package/Utils/ModelImportExport.ts +369 -0
  128. package/Utils/Telemetry/EntityKey.ts +35 -0
  129. package/Utils/Telemetry/EntityRelationship.ts +6 -0
  130. package/Utils/Telemetry/HeartbeatAvailability.ts +262 -0
  131. package/build/dist/Models/DatabaseModels/Alert.js +108 -0
  132. package/build/dist/Models/DatabaseModels/Alert.js.map +1 -1
  133. package/build/dist/Models/DatabaseModels/CephCluster.js +992 -0
  134. package/build/dist/Models/DatabaseModels/CephCluster.js.map +1 -0
  135. package/build/dist/Models/DatabaseModels/CephClusterLabelRule.js +522 -0
  136. package/build/dist/Models/DatabaseModels/CephClusterLabelRule.js.map +1 -0
  137. package/build/dist/Models/DatabaseModels/CephClusterOwnerRule.js +603 -0
  138. package/build/dist/Models/DatabaseModels/CephClusterOwnerRule.js.map +1 -0
  139. package/build/dist/Models/DatabaseModels/CephClusterOwnerTeam.js +503 -0
  140. package/build/dist/Models/DatabaseModels/CephClusterOwnerTeam.js.map +1 -0
  141. package/build/dist/Models/DatabaseModels/CephClusterOwnerUser.js +502 -0
  142. package/build/dist/Models/DatabaseModels/CephClusterOwnerUser.js.map +1 -0
  143. package/build/dist/Models/DatabaseModels/CephResource.js +846 -0
  144. package/build/dist/Models/DatabaseModels/CephResource.js.map +1 -0
  145. package/build/dist/Models/DatabaseModels/Host.js +63 -0
  146. package/build/dist/Models/DatabaseModels/Host.js.map +1 -1
  147. package/build/dist/Models/DatabaseModels/Incident.js +108 -0
  148. package/build/dist/Models/DatabaseModels/Incident.js.map +1 -1
  149. package/build/dist/Models/DatabaseModels/Index.js +24 -0
  150. package/build/dist/Models/DatabaseModels/Index.js.map +1 -1
  151. package/build/dist/Models/DatabaseModels/ProxmoxCluster.js +967 -0
  152. package/build/dist/Models/DatabaseModels/ProxmoxCluster.js.map +1 -0
  153. package/build/dist/Models/DatabaseModels/ProxmoxClusterLabelRule.js +522 -0
  154. package/build/dist/Models/DatabaseModels/ProxmoxClusterLabelRule.js.map +1 -0
  155. package/build/dist/Models/DatabaseModels/ProxmoxClusterOwnerRule.js +603 -0
  156. package/build/dist/Models/DatabaseModels/ProxmoxClusterOwnerRule.js.map +1 -0
  157. package/build/dist/Models/DatabaseModels/ProxmoxClusterOwnerTeam.js +503 -0
  158. package/build/dist/Models/DatabaseModels/ProxmoxClusterOwnerTeam.js.map +1 -0
  159. package/build/dist/Models/DatabaseModels/ProxmoxClusterOwnerUser.js +502 -0
  160. package/build/dist/Models/DatabaseModels/ProxmoxClusterOwnerUser.js.map +1 -0
  161. package/build/dist/Models/DatabaseModels/ProxmoxResource.js +761 -0
  162. package/build/dist/Models/DatabaseModels/ProxmoxResource.js.map +1 -0
  163. package/build/dist/Models/DatabaseModels/ScheduledMaintenance.js +108 -0
  164. package/build/dist/Models/DatabaseModels/ScheduledMaintenance.js.map +1 -1
  165. package/build/dist/Server/API/BillingInvoiceAPI.js +35 -5
  166. package/build/dist/Server/API/BillingInvoiceAPI.js.map +1 -1
  167. package/build/dist/Server/API/CephResourceAPI.js +98 -0
  168. package/build/dist/Server/API/CephResourceAPI.js.map +1 -0
  169. package/build/dist/Server/API/DashboardAPI.js +46 -0
  170. package/build/dist/Server/API/DashboardAPI.js.map +1 -1
  171. package/build/dist/Server/API/ProjectAPI.js +11 -0
  172. package/build/dist/Server/API/ProjectAPI.js.map +1 -1
  173. package/build/dist/Server/API/ProxmoxResourceAPI.js +95 -0
  174. package/build/dist/Server/API/ProxmoxResourceAPI.js.map +1 -0
  175. package/build/dist/Server/API/ResellerPlanAPI.js +17 -3
  176. package/build/dist/Server/API/ResellerPlanAPI.js.map +1 -1
  177. package/build/dist/Server/Infrastructure/GlobalCache.js +7 -2
  178. package/build/dist/Server/Infrastructure/GlobalCache.js.map +1 -1
  179. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1781500000000-AddProxmoxAndCephClusterTables.js +76 -0
  180. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1781500000000-AddProxmoxAndCephClusterTables.js.map +1 -0
  181. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1781600000000-AddProxmoxCephV2Columns.js +108 -0
  182. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1781600000000-AddProxmoxCephV2Columns.js.map +1 -0
  183. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1781600000001-AddProxmoxCephActivityAndRules.js +253 -0
  184. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1781600000001-AddProxmoxCephActivityAndRules.js.map +1 -0
  185. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1781700000000-AddProxmoxCephV3Columns.js +43 -0
  186. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/1781700000000-AddProxmoxCephV3Columns.js.map +1 -0
  187. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js +8 -0
  188. package/build/dist/Server/Infrastructure/Postgres/SchemaMigrations/Index.js.map +1 -1
  189. package/build/dist/Server/Infrastructure/Redis.js +31 -8
  190. package/build/dist/Server/Infrastructure/Redis.js.map +1 -1
  191. package/build/dist/Server/Services/AnalyticsDatabaseService.js +1 -1
  192. package/build/dist/Server/Services/AnalyticsDatabaseService.js.map +1 -1
  193. package/build/dist/Server/Services/BillingService.js +85 -23
  194. package/build/dist/Server/Services/BillingService.js.map +1 -1
  195. package/build/dist/Server/Services/CephClusterLabelRuleEngineService.js +166 -0
  196. package/build/dist/Server/Services/CephClusterLabelRuleEngineService.js.map +1 -0
  197. package/build/dist/Server/Services/CephClusterLabelRuleService.js +13 -0
  198. package/build/dist/Server/Services/CephClusterLabelRuleService.js.map +1 -0
  199. package/build/dist/Server/Services/CephClusterOwnerRuleEngineService.js +186 -0
  200. package/build/dist/Server/Services/CephClusterOwnerRuleEngineService.js.map +1 -0
  201. package/build/dist/Server/Services/CephClusterOwnerRuleService.js +13 -0
  202. package/build/dist/Server/Services/CephClusterOwnerRuleService.js.map +1 -0
  203. package/build/dist/Server/Services/CephClusterOwnerTeamService.js +9 -0
  204. package/build/dist/Server/Services/CephClusterOwnerTeamService.js.map +1 -0
  205. package/build/dist/Server/Services/CephClusterOwnerUserService.js +9 -0
  206. package/build/dist/Server/Services/CephClusterOwnerUserService.js.map +1 -0
  207. package/build/dist/Server/Services/CephClusterService.js +353 -0
  208. package/build/dist/Server/Services/CephClusterService.js.map +1 -0
  209. package/build/dist/Server/Services/CephResourceService.js +257 -0
  210. package/build/dist/Server/Services/CephResourceService.js.map +1 -0
  211. package/build/dist/Server/Services/CloudResourceService.js +10 -2
  212. package/build/dist/Server/Services/CloudResourceService.js.map +1 -1
  213. package/build/dist/Server/Services/DockerHostService.js +10 -2
  214. package/build/dist/Server/Services/DockerHostService.js.map +1 -1
  215. package/build/dist/Server/Services/ExceptionAggregationService.js +2 -0
  216. package/build/dist/Server/Services/ExceptionAggregationService.js.map +1 -1
  217. package/build/dist/Server/Services/HostService.js +10 -2
  218. package/build/dist/Server/Services/HostService.js.map +1 -1
  219. package/build/dist/Server/Services/Index.js +24 -0
  220. package/build/dist/Server/Services/Index.js.map +1 -1
  221. package/build/dist/Server/Services/KubernetesClusterService.js +10 -2
  222. package/build/dist/Server/Services/KubernetesClusterService.js.map +1 -1
  223. package/build/dist/Server/Services/LogAggregationService.js +2 -0
  224. package/build/dist/Server/Services/LogAggregationService.js.map +1 -1
  225. package/build/dist/Server/Services/MetricAggregationService.js +2 -0
  226. package/build/dist/Server/Services/MetricAggregationService.js.map +1 -1
  227. package/build/dist/Server/Services/OpenTelemetryIngestService.js +37 -7
  228. package/build/dist/Server/Services/OpenTelemetryIngestService.js.map +1 -1
  229. package/build/dist/Server/Services/ProxmoxClusterLabelRuleEngineService.js +166 -0
  230. package/build/dist/Server/Services/ProxmoxClusterLabelRuleEngineService.js.map +1 -0
  231. package/build/dist/Server/Services/ProxmoxClusterLabelRuleService.js +13 -0
  232. package/build/dist/Server/Services/ProxmoxClusterLabelRuleService.js.map +1 -0
  233. package/build/dist/Server/Services/ProxmoxClusterOwnerRuleEngineService.js +186 -0
  234. package/build/dist/Server/Services/ProxmoxClusterOwnerRuleEngineService.js.map +1 -0
  235. package/build/dist/Server/Services/ProxmoxClusterOwnerRuleService.js +13 -0
  236. package/build/dist/Server/Services/ProxmoxClusterOwnerRuleService.js.map +1 -0
  237. package/build/dist/Server/Services/ProxmoxClusterOwnerTeamService.js +9 -0
  238. package/build/dist/Server/Services/ProxmoxClusterOwnerTeamService.js.map +1 -0
  239. package/build/dist/Server/Services/ProxmoxClusterOwnerUserService.js +9 -0
  240. package/build/dist/Server/Services/ProxmoxClusterOwnerUserService.js.map +1 -0
  241. package/build/dist/Server/Services/ProxmoxClusterService.js +337 -0
  242. package/build/dist/Server/Services/ProxmoxClusterService.js.map +1 -0
  243. package/build/dist/Server/Services/ProxmoxResourceService.js +285 -0
  244. package/build/dist/Server/Services/ProxmoxResourceService.js.map +1 -0
  245. package/build/dist/Server/Services/RumApplicationService.js +10 -2
  246. package/build/dist/Server/Services/RumApplicationService.js.map +1 -1
  247. package/build/dist/Server/Services/ServerlessFunctionService.js +10 -2
  248. package/build/dist/Server/Services/ServerlessFunctionService.js.map +1 -1
  249. package/build/dist/Server/Services/TelemetryUsageBillingService.js +30 -3
  250. package/build/dist/Server/Services/TelemetryUsageBillingService.js.map +1 -1
  251. package/build/dist/Server/Services/TraceAggregationService.js +2 -0
  252. package/build/dist/Server/Services/TraceAggregationService.js.map +1 -1
  253. package/build/dist/Server/Types/AnalyticsDatabase/AggregateBy.js +8 -25
  254. package/build/dist/Server/Types/AnalyticsDatabase/AggregateBy.js.map +1 -1
  255. package/build/dist/Server/Utils/Monitor/MonitorAlert.js +36 -0
  256. package/build/dist/Server/Utils/Monitor/MonitorAlert.js.map +1 -1
  257. package/build/dist/Server/Utils/Monitor/MonitorClusterContext.js +90 -0
  258. package/build/dist/Server/Utils/Monitor/MonitorClusterContext.js.map +1 -0
  259. package/build/dist/Server/Utils/Monitor/MonitorCriteriaEvaluator.js +228 -4
  260. package/build/dist/Server/Utils/Monitor/MonitorCriteriaEvaluator.js.map +1 -1
  261. package/build/dist/Server/Utils/Monitor/MonitorIncident.js +103 -8
  262. package/build/dist/Server/Utils/Monitor/MonitorIncident.js.map +1 -1
  263. package/build/dist/Server/Utils/Monitor/MonitorMaintenanceSuppression.js +23 -6
  264. package/build/dist/Server/Utils/Monitor/MonitorMaintenanceSuppression.js.map +1 -1
  265. package/build/dist/Server/Utils/Monitor/MonitorTemplateUtil.js +3 -1
  266. package/build/dist/Server/Utils/Monitor/MonitorTemplateUtil.js.map +1 -1
  267. package/build/dist/Server/Utils/Monitor/SeriesResourceLabels.js +23 -0
  268. package/build/dist/Server/Utils/Monitor/SeriesResourceLabels.js.map +1 -1
  269. package/build/dist/Server/Utils/Profiling.js +24 -3
  270. package/build/dist/Server/Utils/Profiling.js.map +1 -1
  271. package/build/dist/Server/Utils/Telemetry/EntityRegistry.js +4 -0
  272. package/build/dist/Server/Utils/Telemetry/EntityRegistry.js.map +1 -1
  273. package/build/dist/Server/Utils/Telemetry/ProxmoxCephSnapshotScan.js +854 -0
  274. package/build/dist/Server/Utils/Telemetry/ProxmoxCephSnapshotScan.js.map +1 -0
  275. package/build/dist/Server/Utils/Telemetry/TelemetryEntity.js +62 -0
  276. package/build/dist/Server/Utils/Telemetry/TelemetryEntity.js.map +1 -1
  277. package/build/dist/Server/Utils/Telemetry.js +8 -10
  278. package/build/dist/Server/Utils/Telemetry.js.map +1 -1
  279. package/build/dist/Types/BaseDatabase/AggregationIntervalUtil.js +69 -0
  280. package/build/dist/Types/BaseDatabase/AggregationIntervalUtil.js.map +1 -0
  281. package/build/dist/Types/Dashboard/DashboardComponentType.js +4 -0
  282. package/build/dist/Types/Dashboard/DashboardComponentType.js.map +1 -1
  283. package/build/dist/Types/Dashboard/DashboardComponents/ComponentArgument.js +2 -0
  284. package/build/dist/Types/Dashboard/DashboardComponents/ComponentArgument.js.map +1 -1
  285. package/build/dist/Types/Dashboard/DashboardComponents/DashboardCephOsdListComponent.js +2 -0
  286. package/build/dist/Types/Dashboard/DashboardComponents/DashboardCephOsdListComponent.js.map +1 -0
  287. package/build/dist/Types/Dashboard/DashboardComponents/DashboardCephPoolListComponent.js +2 -0
  288. package/build/dist/Types/Dashboard/DashboardComponents/DashboardCephPoolListComponent.js.map +1 -0
  289. package/build/dist/Types/Dashboard/DashboardComponents/DashboardProxmoxGuestListComponent.js +2 -0
  290. package/build/dist/Types/Dashboard/DashboardComponents/DashboardProxmoxGuestListComponent.js.map +1 -0
  291. package/build/dist/Types/Dashboard/DashboardComponents/DashboardProxmoxNodeListComponent.js +2 -0
  292. package/build/dist/Types/Dashboard/DashboardComponents/DashboardProxmoxNodeListComponent.js.map +1 -0
  293. package/build/dist/Types/Dashboard/DashboardTemplates.js +394 -0
  294. package/build/dist/Types/Dashboard/DashboardTemplates.js.map +1 -1
  295. package/build/dist/Types/Icon/IconProp.js +2 -0
  296. package/build/dist/Types/Icon/IconProp.js.map +1 -1
  297. package/build/dist/Types/Monitor/CephAlertTemplates.js +1379 -0
  298. package/build/dist/Types/Monitor/CephAlertTemplates.js.map +1 -0
  299. package/build/dist/Types/Monitor/CephMetricCatalog.js +353 -0
  300. package/build/dist/Types/Monitor/CephMetricCatalog.js.map +1 -0
  301. package/build/dist/Types/Monitor/MonitorStep.js +46 -0
  302. package/build/dist/Types/Monitor/MonitorStep.js.map +1 -1
  303. package/build/dist/Types/Monitor/MonitorStepCephMonitor.js +34 -0
  304. package/build/dist/Types/Monitor/MonitorStepCephMonitor.js.map +1 -0
  305. package/build/dist/Types/Monitor/MonitorStepProxmoxMonitor.js +36 -0
  306. package/build/dist/Types/Monitor/MonitorStepProxmoxMonitor.js.map +1 -0
  307. package/build/dist/Types/Monitor/MonitorType.js +27 -1
  308. package/build/dist/Types/Monitor/MonitorType.js.map +1 -1
  309. package/build/dist/Types/Monitor/ProxmoxAlertTemplates.js +743 -0
  310. package/build/dist/Types/Monitor/ProxmoxAlertTemplates.js.map +1 -0
  311. package/build/dist/Types/Monitor/ProxmoxMetricCatalog.js +320 -0
  312. package/build/dist/Types/Monitor/ProxmoxMetricCatalog.js.map +1 -0
  313. package/build/dist/Types/Permission.js +408 -0
  314. package/build/dist/Types/Permission.js.map +1 -1
  315. package/build/dist/Types/Telemetry/EntityType.js +11 -0
  316. package/build/dist/Types/Telemetry/EntityType.js.map +1 -1
  317. package/build/dist/Types/Telemetry/ServiceType.js +2 -0
  318. package/build/dist/Types/Telemetry/ServiceType.js.map +1 -1
  319. package/build/dist/UI/Components/Icon/Icon.js +33 -0
  320. package/build/dist/UI/Components/Icon/Icon.js.map +1 -1
  321. package/build/dist/UI/Components/ImportExport/ExportModelCard.js +50 -0
  322. package/build/dist/UI/Components/ImportExport/ExportModelCard.js.map +1 -0
  323. package/build/dist/UI/Components/ImportExport/ImportModelsModal.js +115 -0
  324. package/build/dist/UI/Components/ImportExport/ImportModelsModal.js.map +1 -0
  325. package/build/dist/UI/Components/ModelTable/ModelTable.js +166 -74
  326. package/build/dist/UI/Components/ModelTable/ModelTable.js.map +1 -1
  327. package/build/dist/UI/Components/MonitorTemplateVariables/TemplateVariablesCatalog.js +5 -1
  328. package/build/dist/UI/Components/MonitorTemplateVariables/TemplateVariablesCatalog.js.map +1 -1
  329. package/build/dist/UI/Utils/ModelImportExport.js +142 -0
  330. package/build/dist/UI/Utils/ModelImportExport.js.map +1 -0
  331. package/build/dist/UI/Utils/Telemetry/Telemetry.js +11 -10
  332. package/build/dist/UI/Utils/Telemetry/Telemetry.js.map +1 -1
  333. package/build/dist/UI/Utils/TelemetryService.js +5 -2
  334. package/build/dist/UI/Utils/TelemetryService.js.map +1 -1
  335. package/build/dist/Utils/Dashboard/Components/DashboardCephOsdListComponent.js +50 -0
  336. package/build/dist/Utils/Dashboard/Components/DashboardCephOsdListComponent.js.map +1 -0
  337. package/build/dist/Utils/Dashboard/Components/DashboardCephPoolListComponent.js +27 -0
  338. package/build/dist/Utils/Dashboard/Components/DashboardCephPoolListComponent.js.map +1 -0
  339. package/build/dist/Utils/Dashboard/Components/DashboardCephResourceListShared.js +46 -0
  340. package/build/dist/Utils/Dashboard/Components/DashboardCephResourceListShared.js.map +1 -0
  341. package/build/dist/Utils/Dashboard/Components/DashboardProxmoxGuestListComponent.js +55 -0
  342. package/build/dist/Utils/Dashboard/Components/DashboardProxmoxGuestListComponent.js.map +1 -0
  343. package/build/dist/Utils/Dashboard/Components/DashboardProxmoxNodeListComponent.js +42 -0
  344. package/build/dist/Utils/Dashboard/Components/DashboardProxmoxNodeListComponent.js.map +1 -0
  345. package/build/dist/Utils/Dashboard/Components/DashboardProxmoxResourceListShared.js +46 -0
  346. package/build/dist/Utils/Dashboard/Components/DashboardProxmoxResourceListShared.js.map +1 -0
  347. package/build/dist/Utils/Dashboard/Components/Index.js +16 -0
  348. package/build/dist/Utils/Dashboard/Components/Index.js.map +1 -1
  349. package/build/dist/Utils/ModelImportExport.js +257 -0
  350. package/build/dist/Utils/ModelImportExport.js.map +1 -0
  351. package/build/dist/Utils/Telemetry/EntityKey.js +27 -0
  352. package/build/dist/Utils/Telemetry/EntityKey.js.map +1 -1
  353. package/build/dist/Utils/Telemetry/EntityRelationship.js +3 -0
  354. package/build/dist/Utils/Telemetry/EntityRelationship.js.map +1 -1
  355. package/build/dist/Utils/Telemetry/HeartbeatAvailability.js +174 -0
  356. package/build/dist/Utils/Telemetry/HeartbeatAvailability.js.map +1 -0
  357. package/package.json +29 -21
@@ -0,0 +1,1647 @@
1
+ import ObjectID from "../ObjectID";
2
+ import MonitorStep from "./MonitorStep";
3
+ import MonitorCriteria from "./MonitorCriteria";
4
+ import MonitorCriteriaInstance from "./MonitorCriteriaInstance";
5
+ import FilterCondition from "../Filter/FilterCondition";
6
+ import {
7
+ CheckOn,
8
+ FilterType,
9
+ EvaluateOverTimeType,
10
+ NoDataPolicy,
11
+ } from "./CriteriaFilter";
12
+ import MonitorStepCephMonitor from "./MonitorStepCephMonitor";
13
+ import RollingTime from "../RollingTime/RollingTime";
14
+ import MetricsAggregationType from "../Metrics/MetricsAggregationType";
15
+
/**
 * Category label carried by a Ceph alert template (see
 * `CephAlertTemplate.category`). Exactly one of the four literals below.
 */
export type CephAlertTemplateCategory =
  "Cluster Health" | "OSD" | "PG" | "Capacity";
21
+
/**
 * Severity tier carried by a Ceph alert template (see
 * `CephAlertTemplate.severity`).
 */
export type CephAlertTemplateSeverity = "Critical" | "Warning";
23
+
/**
 * Inputs handed to a template's `getMonitorStep` callback when a concrete
 * monitor step is generated from that template.
 */
export interface CephAlertTemplateArgs {
  /*
   * Identifies the Ceph cluster the generated monitor targets.
   * NOTE(review): the expected format is not visible here — confirm.
   */
  clusterIdentifier: string;

  /*
   * Monitor status ids. Presumably the statuses applied when the online /
   * offline criteria match — verify against the template implementations.
   */
  onlineMonitorStatusId: ObjectID;
  offlineMonitorStatusId: ObjectID;

  /*
   * Severity ids. Presumably the fallback severities for the incidents and
   * alerts a template creates — verify against the template implementations.
   */
  defaultIncidentSeverityId: ObjectID;
  defaultAlertSeverityId: ObjectID;

  // Monitor display name; presumably interpolated into generated titles.
  monitorName: string;
}
32
+
/**
 * A predefined Ceph alert: descriptive metadata plus a factory that turns
 * caller-supplied arguments into a ready-to-use `MonitorStep`.
 */
export interface CephAlertTemplate {
  // Template identifier. NOTE(review): uniqueness is not enforced here.
  id: string;

  // Short template name.
  name: string;

  // Longer explanation of what the template covers.
  description: string;

  // Category label for this template.
  category: CephAlertTemplateCategory;

  // Severity tier of this template.
  severity: CephAlertTemplateSeverity;

  // Builds the monitor step for this template from the supplied arguments.
  getMonitorStep: (args: CephAlertTemplateArgs) => MonitorStep;
}
41
+
42
+ /*
43
+ * Filter contract: ceph-mgr prometheus-module series are keyed by datapoint
44
+ * labels — `ceph_daemon` (e.g. "osd.3", "mon.a") on per-daemon metrics and
45
+ * `pool_id` on per-pool metrics. Pool DATA series carry only `pool_id`; the
46
+ * pool name exists solely on ceph_pool_metadata. Templates group by these
47
+ * labels so one incident fires per daemon/pool. Datapoint labels are NOT
48
+ * `resource.`-prefixed in ClickHouse.
49
+ *
50
+ * Health-check contract: `ceph_health_detail{name,severity}` (Quincy and
51
+ * later) exports one series per ACTIVE health check — equality-filter on
52
+ * the `name` datapoint label, exactly like the `pve_ha_state` state filter.
53
+ * A series exists only while its check fires, so absence = healthy: the
54
+ * fire criteria use Max > 0 (no data never fires under the default Ignore
55
+ * no-data policy) and the recover criteria use = 0 with TreatAsZero so the
56
+ * monitor returns to Healthy when the series disappears.
57
+ * `ceph_daemon_health_metrics{type,ceph_daemon}` follows the same pattern
58
+ * per daemon.
59
+ */
60
+
/**
 * Assembles the `MonitorStep` for a Ceph monitor.
 *
 * The step's criteria list is ordered: the primary offline instance first,
 * then any additional offline instances in the order supplied, and the
 * online instance last. Only `id`, `monitorCriteria`, `requestType` and
 * `cephMonitor` receive values; every other step field is explicitly set to
 * `undefined`.
 *
 * @param args - Ceph monitor configuration plus the criteria instances to
 *   place on the step.
 * @returns A new `MonitorStep` carrying a freshly generated id.
 */
export function buildCephMonitorStep(args: {
  cephMonitor: MonitorStepCephMonitor;
  offlineCriteriaInstance: MonitorCriteriaInstance;
  onlineCriteriaInstance: MonitorCriteriaInstance;
  /*
   * Optional further unhealthy tiers. They are placed after the primary
   * offline instance and ahead of the online instance. Because criteria
   * are first-match-wins, callers should list tiers worst-first (for
   * example ceph-mon-disk-space combines a Critical MON_DISK_CRIT tier
   * with a Warning MON_DISK_LOW tier in a single template).
   */
  additionalOfflineCriteriaInstances?: Array<MonitorCriteriaInstance>;
}): MonitorStep {
  const step: MonitorStep = new MonitorStep();
  const criteria: MonitorCriteria = new MonitorCriteria();

  // An omitted tier list contributes nothing to the criteria order.
  const extraOfflineTiers: Array<MonitorCriteriaInstance> =
    args.additionalOfflineCriteriaInstances || [];

  // Order matters: unhealthy tiers come before the healthy (online) one.
  const orderedInstances: Array<MonitorCriteriaInstance> = [
    args.offlineCriteriaInstance,
    ...extraOfflineTiers,
    args.onlineCriteriaInstance,
  ];

  criteria.data = {
    monitorCriteriaInstanceArray: orderedInstances,
  };

  step.data = {
    id: ObjectID.generate().toString(),
    monitorDestination: undefined,
    doNotFollowRedirects: undefined,
    monitorDestinationPort: undefined,
    monitorCriteria: criteria,
    /*
     * NOTE(review): the `as any` cast sidesteps whatever type `requestType`
     * declares — confirm whether a typed constant should be used instead.
     */
    requestType: "GET" as any,
    requestHeaders: undefined,
    requestBody: undefined,
    customCode: undefined,
    screenSizeTypes: undefined,
    browserTypes: undefined,
    retryCountOnError: undefined,
    logMonitor: undefined,
    traceMonitor: undefined,
    metricMonitor: undefined,
    exceptionMonitor: undefined,
    snmpMonitor: undefined,
    dnsMonitor: undefined,
    domainMonitor: undefined,
    externalStatusPageMonitor: undefined,
    kubernetesMonitor: undefined,
    profileMonitor: undefined,
    dockerMonitor: undefined,
    cephMonitor: args.cephMonitor,
  };

  return step;
}
115
+
/**
 * Describes one additional threshold filter for a criteria instance. The
 * filter points at a different query alias within the same monitor step,
 * which lets a health-check template watch two `ceph_health_detail` names
 * together.
 */
export interface CephCriteriaFilterSpec {
  // Query alias whose metric value this filter checks.
  metricAlias: string;

  // Comparison applied between the metric value and `value`.
  filterType: FilterType;

  // Threshold the metric value is compared against.
  value: number;
}
126
+
127
/**
 * Builds the "unhealthy" criteria instance of a Ceph monitor step.
 *
 * The instance combines its filters with FilterCondition.Any, switches the
 * monitor to `offlineMonitorStatusId`, and creates one auto-resolving
 * incident and one auto-resolving alert that share the same title and
 * description.
 *
 * Defaults applied when the optional arguments are missing or empty:
 *  - incidentTitle       -> `${monitorName} - Alert Triggered`
 *  - incidentDescription -> generic "has triggered an alert condition" text
 *  - criteriaName        -> `${monitorName} - Unhealthy`
 *  - criteriaDescription -> "Criteria for detecting unhealthy state."
 *
 * @returns a new MonitorCriteriaInstance with freshly generated IDs.
 */
export function buildCephOfflineCriteriaInstance(args: {
  offlineMonitorStatusId: ObjectID;
  incidentSeverityId: ObjectID;
  alertSeverityId: ObjectID;
  monitorName: string;
  metricAlias: string;
  filterType: FilterType;
  value: number;
  incidentTitle?: string;
  incidentDescription?: string;
  criteriaName?: string;
  criteriaDescription?: string;
  /*
   * Extra OR'd filters (the instance is FilterCondition.Any) — fires when
   * EITHER the primary alias or any additional alias breaches, e.g.
   * PG_DAMAGED OR OSD_SCRUB_ERRORS.
   */
  additionalFilters?: Array<CephCriteriaFilterSpec> | undefined;
}): MonitorCriteriaInstance {
  const criteriaInstance: MonitorCriteriaInstance =
    new MonitorCriteriaInstance();

  // Incident and alert reuse the same wording.
  const title: string =
    args.incidentTitle || `${args.monitorName} - Alert Triggered`;
  const description: string =
    args.incidentDescription ||
    `${args.monitorName} has triggered an alert condition. See root cause for detailed Ceph cluster information.`;

  // The primary threshold is expressed as a spec so it can share the mapper.
  const primaryFilter: CephCriteriaFilterSpec = {
    metricAlias: args.metricAlias,
    filterType: args.filterType,
    value: args.value,
  };

  criteriaInstance.data = {
    id: ObjectID.generate().toString(),
    monitorStatusId: args.offlineMonitorStatusId,
    filterCondition: FilterCondition.Any,
    // Primary filter first, then the optional extras in caller order.
    filters: [primaryFilter, ...(args.additionalFilters || [])].map(
      (spec: CephCriteriaFilterSpec) => {
        return {
          checkOn: CheckOn.MetricValue,
          filterType: spec.filterType,
          metricMonitorOptions: {
            metricAggregationType: EvaluateOverTimeType.AnyValue,
            metricAlias: spec.metricAlias,
          },
          value: spec.value,
        };
      },
    ),
    incidents: [
      {
        title: title,
        description: description,
        incidentSeverityId: args.incidentSeverityId,
        autoResolveIncident: true,
        id: ObjectID.generate().toString(),
        onCallPolicyIds: [],
      },
    ],
    alerts: [
      {
        title: title,
        description: description,
        alertSeverityId: args.alertSeverityId,
        autoResolveAlert: true,
        id: ObjectID.generate().toString(),
        onCallPolicyIds: [],
      },
    ],
    changeMonitorStatus: true,
    createIncidents: true,
    createAlerts: true,
    name: args.criteriaName || `${args.monitorName} - Unhealthy`,
    description:
      args.criteriaDescription || `Criteria for detecting unhealthy state.`,
  };

  return criteriaInstance;
}
212
+
213
/**
 * Builds the "healthy" (recovery) criteria instance of a Ceph monitor step.
 *
 * The instance switches the monitor to `onlineMonitorStatusId` and never
 * creates incidents or alerts. Filters are combined with
 * `args.filterCondition`, falling back to FilterCondition.Any when it is
 * not supplied. When `treatNoDataAsZero` is truthy every filter carries
 * NoDataPolicy.TreatAsZero; otherwise the policy is left undefined.
 *
 * @returns a new MonitorCriteriaInstance named "Healthy".
 */
export function buildCephOnlineCriteriaInstance(args: {
  onlineMonitorStatusId: ObjectID;
  metricAlias: string;
  filterType: FilterType;
  value: number;
  /*
   * Extra filters for multi-alias recovery. Pass FilterCondition.All with
   * them so the monitor only recovers when EVERY watched health check has
   * cleared (the complement of the offline instance's Any).
   */
  additionalFilters?: Array<CephCriteriaFilterSpec> | undefined;
  filterCondition?: FilterCondition | undefined;
  /*
   * Health-detail series exist only while the check is active, so the
   * recover comparison (= 0) would otherwise see no data and never match.
   * TreatAsZero makes series absence count as 0 — the spec'd
   * "Max > 0 fire / = 0 recover" semantics.
   */
  treatNoDataAsZero?: boolean | undefined;
}): MonitorCriteriaInstance {
  const criteriaInstance: MonitorCriteriaInstance =
    new MonitorCriteriaInstance();

  // Stays undefined unless the caller opted in to "no data counts as zero".
  let noDataPolicy: NoDataPolicy | undefined = undefined;
  if (args.treatNoDataAsZero) {
    noDataPolicy = NoDataPolicy.TreatAsZero;
  }

  // The primary threshold is expressed as a spec so it can share the mapper.
  const primaryFilter: CephCriteriaFilterSpec = {
    metricAlias: args.metricAlias,
    filterType: args.filterType,
    value: args.value,
  };

  criteriaInstance.data = {
    id: ObjectID.generate().toString(),
    monitorStatusId: args.onlineMonitorStatusId,
    filterCondition: args.filterCondition || FilterCondition.Any,
    // Primary filter first, then the optional extras in caller order.
    filters: [primaryFilter, ...(args.additionalFilters || [])].map(
      (spec: CephCriteriaFilterSpec) => {
        return {
          checkOn: CheckOn.MetricValue,
          filterType: spec.filterType,
          metricMonitorOptions: {
            metricAggregationType: EvaluateOverTimeType.AnyValue,
            metricAlias: spec.metricAlias,
            onNoDataPolicy: noDataPolicy,
          },
          value: spec.value,
        };
      },
    ),
    incidents: [],
    alerts: [],
    changeMonitorStatus: true,
    createIncidents: false,
    createAlerts: false,
    name: "Healthy",
    description: "Criteria for healthy state.",
  };

  return criteriaInstance;
}
280
+
281
/**
 * Builds a single-query Ceph monitor configuration with no formulas.
 *
 * The metric alias doubles as the query's variable name, title,
 * description and legend. `attributes` defaults to an empty object, and
 * `groupByAttributeKeys` is only emitted when `groupByAttributeKey` is
 * truthy.
 */
export function buildCephMonitorConfig(args: {
  clusterIdentifier: string;
  metricName: string;
  metricAlias: string;
  rollingTime: RollingTime;
  aggregationType: MetricsAggregationType;
  attributes?: Record<string, string>;
  groupByAttributeKey?: string | undefined;
}): MonitorStepCephMonitor {
  const alias: string = args.metricAlias;

  // Empty fragment when no grouping was requested, so the key is omitted.
  const grouping: { groupByAttributeKeys?: Array<string> } =
    args.groupByAttributeKey
      ? { groupByAttributeKeys: [args.groupByAttributeKey] }
      : {};

  return {
    clusterIdentifier: args.clusterIdentifier,
    resourceFilters: {},
    metricViewConfig: {
      queryConfigs: [
        {
          metricAliasData: {
            metricVariable: alias,
            title: alias,
            description: alias,
            legend: alias,
            legendUnit: undefined,
          },
          metricQueryData: {
            filterData: {
              metricName: args.metricName,
              attributes: args.attributes || {},
              aggegationType: args.aggregationType,
              aggregateBy: {},
            },
            ...grouping,
          },
        },
      ],
      formulaConfigs: [],
    },
    rollingTime: args.rollingTime,
  };
}
321
+
322
/**
 * One metric query of a multi-query Ceph monitor. The alias becomes the
 * query's variable name, which formulas and criteria filters reference.
 */
export interface CephFormulaQuery {
  /** Variable name of this query inside the monitor step. */
  alias: string;
  /** Name of the metric to query. */
  metricName: string;
  /** Optional attribute filter; builders fall back to `{}` when unset. */
  attributes?: Record<string, string> | undefined;
}
327
+
328
/**
 * Build a multi-metric formula monitor, optionally grouped by an
 * OpenTelemetry attribute so one incident fires per group (e.g. per
 * `pool_id`). The formula references the query aliases.
 *
 * Every query shares `args.aggregationType` and, when
 * `args.groupByAttributeKey` is truthy, the same single group-by key. A
 * query's `attributes` default to `{}`. Exactly one formula config is
 * emitted; it is exposed under `args.resultAlias` and uses
 * `args.resultLegend` as title, description and legend.
 *
 * Aggregation contract (see buildKubernetesRatioMonitorConfig for the full
 * derivation): the per-series worker buckets raw rows by (group, minute)
 * and applies the aggregation across both the grouped series AND the
 * scrapes in that minute. `Sum` is only correct for ratios whose numerator
 * and denominator ride the SAME receiver/scrape so the scrape multiple
 * cancels: `(Σnum × scrapes) / (Σden × scrapes)`. Every Ceph metric comes
 * from ONE receiver — the prometheus scrape of the active ceph-mgr — so
 * all Ceph ratios are same-receiver and use `Sum`/`Sum`. (`Avg`/`Avg` is
 * the cross-receiver variant; not needed here.) Difference formulas
 * compared against zero (e.g. pg_total − pg_active > 0) also use
 * `Sum`/`Sum`: the scrape multiple k scales both terms equally
 * (k·Σtotal − k·Σactive = k·Σinactive), so the sign of the difference —
 * and therefore the > 0 fire / = 0 recover thresholds — is preserved
 * exactly. `Max`/`Max` would be WRONG for ungrouped per-pool metrics:
 * each side collapses to the largest pool's value, hiding non-zero
 * differences in every other pool.
 *
 * @param args.queries - queries whose aliases the formula may reference.
 * @param args.formula - expression over the query aliases.
 * @param args.resultLegendUnit - optional unit shown with the result legend.
 * @returns the Ceph monitor configuration for the monitor step.
 */
export function buildCephFormulaMonitorConfig(args: {
  clusterIdentifier: string;
  queries: Array<CephFormulaQuery>;
  formula: string;
  resultAlias: string;
  resultLegend: string;
  resultLegendUnit?: string | undefined;
  rollingTime: RollingTime;
  aggregationType: MetricsAggregationType;
  groupByAttributeKey?: string | undefined;
}): MonitorStepCephMonitor {
  return {
    clusterIdentifier: args.clusterIdentifier,
    resourceFilters: {},
    metricViewConfig: {
      /*
       * Mapped inline (not through an `any`-typed helper) so each query
       * config is checked against the MonitorStepCephMonitor shape.
       */
      queryConfigs: args.queries.map((query: CephFormulaQuery) => {
        return {
          metricAliasData: {
            metricVariable: query.alias,
            title: query.alias,
            description: query.alias,
            legend: query.alias,
            legendUnit: undefined,
          },
          metricQueryData: {
            filterData: {
              metricName: query.metricName,
              attributes: query.attributes || {},
              aggegationType: args.aggregationType,
              aggregateBy: {},
            },
            // Only emit the group-by key when grouping was requested.
            ...(args.groupByAttributeKey
              ? { groupByAttributeKeys: [args.groupByAttributeKey] }
              : {}),
          },
        };
      }),
      formulaConfigs: [
        {
          metricAliasData: {
            metricVariable: args.resultAlias,
            title: args.resultLegend,
            description: args.resultLegend,
            legend: args.resultLegend,
            legendUnit: args.resultLegendUnit,
          },
          metricFormulaData: {
            metricFormula: args.formula,
          },
        },
      ],
    },
    rollingTime: args.rollingTime,
  };
}
409
+
410
/**
 * Build a percentage-ratio monitor: `(numerator / denominator) * 100`.
 *
 * Thin wrapper over buildCephFormulaMonitorConfig: both queries receive
 * the same `attributes`, the result legend unit is fixed to "%", and the
 * aggregation falls back to MetricsAggregationType.Sum when the caller
 * does not provide one.
 */
export function buildCephRatioMonitorConfig(args: {
  clusterIdentifier: string;
  numeratorMetricName: string;
  denominatorMetricName: string;
  numeratorAlias: string;
  denominatorAlias: string;
  resultAlias: string;
  resultLegend: string;
  rollingTime: RollingTime;
  attributes?: Record<string, string> | undefined;
  groupByAttributeKey?: string | undefined;
  aggregationType?: MetricsAggregationType | undefined;
}): MonitorStepCephMonitor {
  // The formula is written in terms of the two query aliases.
  const percentFormula: string = `(${args.numeratorAlias} / ${args.denominatorAlias}) * 100`;

  // Sum unless the caller overrides it.
  const aggregation: MetricsAggregationType =
    args.aggregationType || MetricsAggregationType.Sum;

  return buildCephFormulaMonitorConfig({
    clusterIdentifier: args.clusterIdentifier,
    queries: [
      {
        alias: args.numeratorAlias,
        metricName: args.numeratorMetricName,
        attributes: args.attributes,
      },
      {
        alias: args.denominatorAlias,
        metricName: args.denominatorMetricName,
        attributes: args.attributes,
      },
    ],
    formula: percentFormula,
    resultAlias: args.resultAlias,
    resultLegend: args.resultLegend,
    resultLegendUnit: "%",
    rollingTime: args.rollingTime,
    aggregationType: aggregation,
    groupByAttributeKey: args.groupByAttributeKey,
  });
}
450
+
451
/**
 * Build a multi-query monitor that has NO formula: every query keeps its
 * own alias, and criteria filters reference those aliases independently
 * (combined with FilterCondition.Any/All).
 *
 * Health-check templates that watch two `ceph_health_detail` names at once
 * rely on this: a formula such as `a + b` would yield no result whenever
 * one check is inactive (health-detail series exist only while a check
 * fires), whereas independent filters evaluate each alias on its own.
 *
 * All queries share `args.aggregationType` and, when provided, the single
 * `args.groupByAttributeKey`.
 */
export function buildCephMultiQueryMonitorConfig(args: {
  clusterIdentifier: string;
  queries: Array<CephFormulaQuery>;
  rollingTime: RollingTime;
  aggregationType: MetricsAggregationType;
  groupByAttributeKey?: string | undefined;
}): MonitorStepCephMonitor {
  // Empty fragment when no grouping was requested, so the key is omitted.
  const grouping: { groupByAttributeKeys?: Array<string> } =
    args.groupByAttributeKey
      ? { groupByAttributeKeys: [args.groupByAttributeKey] }
      : {};

  return {
    clusterIdentifier: args.clusterIdentifier,
    resourceFilters: {},
    metricViewConfig: {
      queryConfigs: args.queries.map((entry: CephFormulaQuery) => ({
        metricAliasData: {
          metricVariable: entry.alias,
          title: entry.alias,
          description: entry.alias,
          legend: entry.alias,
          legendUnit: undefined,
        },
        metricQueryData: {
          filterData: {
            metricName: entry.metricName,
            attributes: entry.attributes || {},
            aggegationType: args.aggregationType,
            aggregateBy: {},
          },
          ...grouping,
        },
      })),
      formulaConfigs: [],
    },
    rollingTime: args.rollingTime,
  };
}
498
+
499
+ // --- Template Definitions ---
500
+
501
/*
 * Cluster Health Error template: queries the Max of ceph_health_status
 * over the past minute, goes offline at >= 2 and recovers at < 2.
 */
const healthErrorTemplate: CephAlertTemplate = {
  id: "ceph-health-error",
  name: "Cluster Health Error",
  description:
    "Alert immediately when the Ceph cluster reports HEALTH_ERR (health status >= 2) — data availability or durability is at risk.",
  category: "Cluster Health",
  severity: "Critical",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    // One alias is shared by the query and both criteria filters.
    const alias: string = "ceph_health_error";
    const title: string = `[Ceph] CRITICAL: Cluster Health Error - ${args.monitorName}`;
    const details: string = `The Ceph cluster is reporting HEALTH_ERR. Data availability or durability is at immediate risk — common causes include PGs inactive/incomplete, full OSDs blocking writes, or inconsistent objects failing scrub. Run \`ceph health detail\` and \`ceph -s\` on the cluster and address the failing checks immediately.`;

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier: args.clusterIdentifier,
        metricName: "ceph_health_status",
        metricAlias: alias,
        rollingTime: RollingTime.Past1Minute,
        aggregationType: MetricsAggregationType.Max,
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId: args.offlineMonitorStatusId,
        incidentSeverityId: args.defaultIncidentSeverityId,
        alertSeverityId: args.defaultAlertSeverityId,
        monitorName: args.monitorName,
        metricAlias: alias,
        filterType: FilterType.GreaterThanOrEqualTo,
        value: 2,
        incidentTitle: title,
        incidentDescription: details,
        criteriaName: "Health Error - Status >= 2",
        criteriaDescription:
          "Triggers when the cluster health status is HEALTH_ERR (2).",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId: args.onlineMonitorStatusId,
        metricAlias: alias,
        filterType: FilterType.LessThan,
        value: 2,
      }),
    });
  },
};
542
+
543
/*
 * Cluster Health Warning template: queries the Max of ceph_health_status
 * over the past five minutes, goes offline at >= 1 and recovers at < 1.
 */
const healthWarnTemplate: CephAlertTemplate = {
  id: "ceph-health-warn",
  name: "Cluster Health Warning",
  description:
    "Alert when the Ceph cluster reports HEALTH_WARN or worse (health status >= 1).",
  category: "Cluster Health",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    // One alias is shared by the query and both criteria filters.
    const alias: string = "ceph_health";
    const title: string = `[Ceph] Cluster Health Warning - ${args.monitorName}`;
    const details: string = `The Ceph cluster is reporting HEALTH_WARN or worse. Run \`ceph health detail\` on the cluster to see the active health checks (common causes: OSDs down, PGs degraded, clock skew, nearfull OSDs). The cluster is still serving I/O but needs attention before the condition worsens.`;

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier: args.clusterIdentifier,
        metricName: "ceph_health_status",
        metricAlias: alias,
        rollingTime: RollingTime.Past5Minutes,
        aggregationType: MetricsAggregationType.Max,
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId: args.offlineMonitorStatusId,
        incidentSeverityId: args.defaultIncidentSeverityId,
        alertSeverityId: args.defaultAlertSeverityId,
        monitorName: args.monitorName,
        metricAlias: alias,
        filterType: FilterType.GreaterThanOrEqualTo,
        value: 1,
        incidentTitle: title,
        incidentDescription: details,
        criteriaName: "Health Warning - Status >= 1",
        criteriaDescription:
          "Triggers when the cluster health status is HEALTH_WARN (1) or HEALTH_ERR (2).",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId: args.onlineMonitorStatusId,
        metricAlias: alias,
        filterType: FilterType.LessThan,
        value: 1,
      }),
    });
  },
};
584
+
585
/*
 * OSD Down template: queries ceph_osd_up grouped by ceph_daemon, goes
 * offline when the value drops below 1 and recovers at >= 1.
 */
const osdDownTemplate: CephAlertTemplate = {
  id: "ceph-osd-down",
  name: "OSD Down",
  description:
    "Alert when any OSD daemon reports as down. Down OSDs reduce redundancy and trigger recovery traffic. One incident per OSD.",
  category: "OSD",
  severity: "Critical",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    // One alias is shared by the query and both criteria filters.
    const alias: string = "osd_up";
    const title: string = `[Ceph] OSD Down - ${args.monitorName}`;
    const details: string = `One or more Ceph OSD daemons are down. Redundancy is reduced and the cluster will start recovery/backfill once the OSD is marked out. Run \`ceph osd tree | grep down\` to identify the affected OSDs, check the OSD host and daemon logs, and restart or replace the OSD. Check the root cause for the affected ceph_daemon label.`;

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier: args.clusterIdentifier,
        metricName: "ceph_osd_up",
        metricAlias: alias,
        rollingTime: RollingTime.Past5Minutes,
        /*
         * Min per OSD: one scrape that saw the OSD down is enough to
         * breach the threshold; scrapes where it was still up cannot
         * mask it.
         */
        aggregationType: MetricsAggregationType.Min,
        groupByAttributeKey: "ceph_daemon",
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId: args.offlineMonitorStatusId,
        incidentSeverityId: args.defaultIncidentSeverityId,
        alertSeverityId: args.defaultAlertSeverityId,
        monitorName: args.monitorName,
        metricAlias: alias,
        filterType: FilterType.LessThan,
        value: 1,
        incidentTitle: title,
        incidentDescription: details,
        criteriaName: "OSD Down - ceph_osd_up < 1",
        criteriaDescription:
          "Triggers when any OSD reports ceph_osd_up below 1.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId: args.onlineMonitorStatusId,
        metricAlias: alias,
        filterType: FilterType.GreaterThanOrEqualTo,
        value: 1,
      }),
    });
  },
};
631
+
632
/*
 * OSD Out template: queries ceph_osd_in grouped by ceph_daemon, goes
 * offline when the value drops below 1 and recovers at >= 1.
 */
const osdOutTemplate: CephAlertTemplate = {
  id: "ceph-osd-out",
  name: "OSD Out",
  description:
    "Alert when any OSD is marked out of the cluster. An out OSD no longer stores data and triggers rebalancing. One incident per OSD.",
  category: "OSD",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    // One alias is shared by the query and both criteria filters.
    const alias: string = "osd_in";
    const title: string = `[Ceph] OSD Out - ${args.monitorName}`;
    const details: string = `One or more Ceph OSDs have been marked out of the cluster. Their data is being rebalanced onto the remaining OSDs, which consumes cluster capacity and I/O. If the OSD was marked out automatically after being down (default 10 minutes), bring it back with \`ceph osd in <id>\` once the underlying issue is fixed. Check the root cause for the affected ceph_daemon label.`;

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier: args.clusterIdentifier,
        metricName: "ceph_osd_in",
        metricAlias: alias,
        rollingTime: RollingTime.Past5Minutes,
        /*
         * Min per OSD: one scrape that saw the OSD out is enough to
         * breach the threshold; scrapes where it was still in cannot
         * mask it.
         */
        aggregationType: MetricsAggregationType.Min,
        groupByAttributeKey: "ceph_daemon",
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId: args.offlineMonitorStatusId,
        incidentSeverityId: args.defaultIncidentSeverityId,
        alertSeverityId: args.defaultAlertSeverityId,
        monitorName: args.monitorName,
        metricAlias: alias,
        filterType: FilterType.LessThan,
        value: 1,
        incidentTitle: title,
        incidentDescription: details,
        criteriaName: "OSD Out - ceph_osd_in < 1",
        criteriaDescription:
          "Triggers when any OSD reports ceph_osd_in below 1.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId: args.onlineMonitorStatusId,
        metricAlias: alias,
        filterType: FilterType.GreaterThanOrEqualTo,
        value: 1,
      }),
    });
  },
};
678
+
679
/*
 * OSD High Latency template: queries ceph_osd_apply_latency_ms grouped by
 * ceph_daemon, goes offline above 100 and recovers at <= 100.
 */
const osdHighLatencyTemplate: CephAlertTemplate = {
  id: "ceph-osd-high-latency",
  name: "OSD High Latency",
  description:
    "Alert when any OSD's average apply latency exceeds 100 ms. Slow OSDs drag down client I/O across every PG they host. One incident per OSD.",
  category: "OSD",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    // One alias is shared by the query and both criteria filters.
    const alias: string = "osd_apply_latency";
    const title: string = `[Ceph] OSD High Latency (>100ms) - ${args.monitorName}`;
    const details: string = `A Ceph OSD's average apply latency has exceeded 100 ms. A slow OSD degrades client I/O for every placement group it hosts. Common causes: a failing disk, deep-scrub or backfill load, or an overloaded host. Check the affected OSD's host with \`iostat\`/SMART data and \`ceph osd perf\`, and consider reweighting or replacing the OSD. Check the root cause for the affected ceph_daemon label.`;

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier: args.clusterIdentifier,
        metricName: "ceph_osd_apply_latency_ms",
        metricAlias: alias,
        rollingTime: RollingTime.Past5Minutes,
        /*
         * Avg per OSD: latency is a per-OSD gauge, so the per-minute
         * average reflects sustained latency no matter how many scrapes
         * landed in the minute.
         */
        aggregationType: MetricsAggregationType.Avg,
        groupByAttributeKey: "ceph_daemon",
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId: args.offlineMonitorStatusId,
        incidentSeverityId: args.defaultIncidentSeverityId,
        alertSeverityId: args.defaultAlertSeverityId,
        monitorName: args.monitorName,
        metricAlias: alias,
        filterType: FilterType.GreaterThan,
        value: 100,
        incidentTitle: title,
        incidentDescription: details,
        criteriaName: "OSD High Latency - Apply Latency > 100ms",
        criteriaDescription:
          "Triggers when any OSD's average apply latency exceeds 100 ms over the monitoring window.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId: args.onlineMonitorStatusId,
        metricAlias: alias,
        filterType: FilterType.LessThanOrEqualTo,
        value: 100,
      }),
    });
  },
};
725
+
726
/*
 * Monitor Quorum Degraded template: queries ceph_mon_quorum_status grouped
 * by ceph_daemon over the past minute, goes offline below 1 and recovers
 * at >= 1.
 */
const monQuorumDegradedTemplate: CephAlertTemplate = {
  id: "ceph-mon-quorum-degraded",
  name: "Monitor Quorum Degraded",
  description:
    "Alert immediately when any Ceph monitor daemon falls out of quorum. Losing quorum entirely halts the cluster. One incident per monitor.",
  category: "Cluster Health",
  severity: "Critical",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    // One alias is shared by the query and both criteria filters.
    const alias: string = "mon_quorum";
    const title: string = `[Ceph] CRITICAL: Monitor Quorum Degraded - ${args.monitorName}`;
    const details: string = `A Ceph monitor daemon has fallen out of quorum. The cluster can tolerate losing a minority of monitors, but if quorum is lost entirely all I/O stops. Run \`ceph quorum_status\` to see which monitors are in quorum, then check the affected monitor host for daemon crashes, disk-full conditions, network partitions, or clock skew. Check the root cause for the affected ceph_daemon label.`;

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier: args.clusterIdentifier,
        metricName: "ceph_mon_quorum_status",
        metricAlias: alias,
        rollingTime: RollingTime.Past1Minute,
        /*
         * Min per monitor: one out-of-quorum scrape is enough to breach
         * the threshold; in-quorum scrapes cannot mask it.
         */
        aggregationType: MetricsAggregationType.Min,
        groupByAttributeKey: "ceph_daemon",
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId: args.offlineMonitorStatusId,
        incidentSeverityId: args.defaultIncidentSeverityId,
        alertSeverityId: args.defaultAlertSeverityId,
        monitorName: args.monitorName,
        metricAlias: alias,
        filterType: FilterType.LessThan,
        value: 1,
        incidentTitle: title,
        incidentDescription: details,
        criteriaName: "Quorum Degraded - Member Out of Quorum",
        criteriaDescription:
          "Triggers when any monitor reports ceph_mon_quorum_status below 1.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId: args.onlineMonitorStatusId,
        metricAlias: alias,
        filterType: FilterType.GreaterThanOrEqualTo,
        value: 1,
      }),
    });
  },
};
772
+
773
/*
 * Degraded Placement Groups template: queries the Max of ceph_pg_degraded
 * over the past five minutes, goes offline above 0 and recovers at 0.
 */
const pgDegradedTemplate: CephAlertTemplate = {
  id: "ceph-pg-degraded",
  name: "Degraded Placement Groups",
  description:
    "Alert when any placement groups are degraded — objects have fewer replicas than configured.",
  category: "PG",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    // One alias is shared by the query and both criteria filters.
    const alias: string = "pg_degraded";
    const title: string = `[Ceph] Degraded Placement Groups - ${args.monitorName}`;
    const details: string = `The Ceph cluster has degraded placement groups — some objects currently have fewer replicas than the configured replication factor. This usually follows an OSD failure or restart and should clear as recovery completes. If the count does not trend down, run \`ceph pg dump_stuck degraded\` and \`ceph osd tree\` to find the OSDs blocking recovery.`;

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier: args.clusterIdentifier,
        metricName: "ceph_pg_degraded",
        metricAlias: alias,
        rollingTime: RollingTime.Past5Minutes,
        /*
         * The series are per pool (pool_id label). Taking Max across
         * pools still breaches the > 0 threshold as soon as ANY pool
         * has degraded PGs.
         */
        aggregationType: MetricsAggregationType.Max,
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId: args.offlineMonitorStatusId,
        incidentSeverityId: args.defaultIncidentSeverityId,
        alertSeverityId: args.defaultAlertSeverityId,
        monitorName: args.monitorName,
        metricAlias: alias,
        filterType: FilterType.GreaterThan,
        value: 0,
        incidentTitle: title,
        incidentDescription: details,
        criteriaName: "PG Degraded - Count > 0",
        criteriaDescription:
          "Triggers when the number of degraded placement groups is above zero.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId: args.onlineMonitorStatusId,
        metricAlias: alias,
        filterType: FilterType.EqualTo,
        value: 0,
      }),
    });
  },
};
818
+
819
/*
 * Undersized Placement Groups template: queries the Max of
 * ceph_pg_undersized over the past five minutes, goes offline above 0 and
 * recovers at 0.
 */
const pgUndersizedTemplate: CephAlertTemplate = {
  id: "ceph-pg-undersized",
  name: "Undersized Placement Groups",
  description:
    "Alert when any placement groups are undersized — mapped to fewer OSDs than their replica count.",
  category: "PG",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    // One alias is shared by the query and both criteria filters.
    const alias: string = "pg_undersized";
    const title: string = `[Ceph] Undersized Placement Groups - ${args.monitorName}`;
    const details: string = `The Ceph cluster has undersized placement groups — they are mapped to fewer OSDs than their configured replica count, so full redundancy cannot be restored. Sustained undersized PGs usually mean the cluster lacks enough OSDs (or failure domains) to satisfy the CRUSH rule. Check for down/out OSDs with \`ceph osd tree\` and verify the pool's replication settings against available capacity.`;

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier: args.clusterIdentifier,
        metricName: "ceph_pg_undersized",
        metricAlias: alias,
        rollingTime: RollingTime.Past5Minutes,
        /*
         * The series are per pool (pool_id label). Taking Max across
         * pools still breaches the > 0 threshold as soon as ANY pool
         * has undersized PGs.
         */
        aggregationType: MetricsAggregationType.Max,
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId: args.offlineMonitorStatusId,
        incidentSeverityId: args.defaultIncidentSeverityId,
        alertSeverityId: args.defaultAlertSeverityId,
        monitorName: args.monitorName,
        metricAlias: alias,
        filterType: FilterType.GreaterThan,
        value: 0,
        incidentTitle: title,
        incidentDescription: details,
        criteriaName: "PG Undersized - Count > 0",
        criteriaDescription:
          "Triggers when the number of undersized placement groups is above zero.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId: args.onlineMonitorStatusId,
        metricAlias: alias,
        filterType: FilterType.EqualTo,
        value: 0,
      }),
    });
  },
};
864
+
865
/*
 * Inactive Placement Groups template: evaluates the formula
 * `pg_total - pg_active` (ceph_pg_total minus ceph_pg_active), goes
 * offline above 0 and recovers at 0.
 */
const pgInactiveTemplate: CephAlertTemplate = {
  id: "ceph-pg-inactive",
  name: "Inactive Placement Groups",
  description:
    "Alert when any placement groups are not active (ceph_pg_total − ceph_pg_active > 0). Inactive PGs cannot serve I/O — client requests to them hang.",
  category: "PG",
  severity: "Critical",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    // Alias of the formula result; both criteria filters reference it.
    const alias: string = "pg_inactive";
    const title: string = `[Ceph] CRITICAL: Inactive Placement Groups - ${args.monitorName}`;
    const details: string = `The Ceph cluster has placement groups that are not active. Inactive PGs cannot serve reads or writes — client I/O to them hangs until they recover. This typically follows the loss of too many OSDs in a failure domain. Run \`ceph pg dump_stuck inactive\` and \`ceph health detail\` to identify the stuck PGs and the OSDs they are waiting on, and restore those OSDs immediately.`;

    return buildCephMonitorStep({
      cephMonitor: buildCephFormulaMonitorConfig({
        clusterIdentifier: args.clusterIdentifier,
        queries: [
          { alias: "pg_total", metricName: "ceph_pg_total" },
          { alias: "pg_active", metricName: "ceph_pg_active" },
        ],
        formula: "pg_total - pg_active",
        resultAlias: alias,
        resultLegend: "Inactive PGs",
        rollingTime: RollingTime.Past5Minutes,
        /*
         * Sum/Sum difference. ceph_pg_total and ceph_pg_active are
         * PER-POOL series (pool_id label, since Nautilus), not single
         * cluster-wide gauges, and Sum folds every pool into a
         * cluster-wide count. Both metrics ride the same mgr scrape, so
         * the scrape multiple k scales both terms equally
         * (k·Σtotal − k·Σactive = k·Σinactive) and the > 0 fire /
         * = 0 recover thresholds stay exact. Max would collapse each
         * side to the largest pool's value: inactive PGs in any other
         * pool would yield 0 and this Critical alert would never fire.
         */
        aggregationType: MetricsAggregationType.Sum,
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId: args.offlineMonitorStatusId,
        incidentSeverityId: args.defaultIncidentSeverityId,
        alertSeverityId: args.defaultAlertSeverityId,
        monitorName: args.monitorName,
        metricAlias: alias,
        filterType: FilterType.GreaterThan,
        value: 0,
        incidentTitle: title,
        incidentDescription: details,
        criteriaName: "PG Inactive - Count > 0",
        criteriaDescription:
          "Triggers when the number of inactive placement groups (total minus active) is above zero.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId: args.onlineMonitorStatusId,
        metricAlias: alias,
        filterType: FilterType.EqualTo,
        value: 0,
      }),
    });
  },
};
922
+
923
/*
 * Cluster Near Full template: evaluates the percentage ratio of
 * ceph_cluster_total_used_bytes to ceph_cluster_total_bytes, goes offline
 * above 85 and recovers at <= 85.
 */
const clusterNearFullTemplate: CephAlertTemplate = {
  id: "ceph-cluster-near-full",
  name: "Cluster Near Full",
  description:
    "Alert when raw cluster usage exceeds 85% of total capacity (ceph_cluster_total_used_bytes ÷ ceph_cluster_total_bytes × 100) — Ceph's default nearfull ratio.",
  category: "Capacity",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    // Alias of the ratio result; both criteria filters reference it.
    const alias: string = "cluster_used_percent";
    const title: string = `[Ceph] Cluster Near Full (>85%) - ${args.monitorName}`;
    const details: string = `The Ceph cluster's raw usage has exceeded 85% of total capacity — the default nearfull threshold. Individual OSDs will start reporting nearfull (capacity is rarely perfectly balanced), and at the full ratio (95%) writes stop cluster-wide. Run \`ceph df\` and \`ceph osd df\` to check the distribution, then add OSDs, delete data, or rebalance before the cluster reaches full.`;

    return buildCephMonitorStep({
      // No aggregationType is passed, so the ratio builder's default applies.
      cephMonitor: buildCephRatioMonitorConfig({
        clusterIdentifier: args.clusterIdentifier,
        numeratorMetricName: "ceph_cluster_total_used_bytes",
        denominatorMetricName: "ceph_cluster_total_bytes",
        numeratorAlias: "used_bytes",
        denominatorAlias: "total_bytes",
        resultAlias: alias,
        resultLegend: "Cluster Capacity Used (%)",
        rollingTime: RollingTime.Past5Minutes,
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId: args.offlineMonitorStatusId,
        incidentSeverityId: args.defaultIncidentSeverityId,
        alertSeverityId: args.defaultAlertSeverityId,
        monitorName: args.monitorName,
        metricAlias: alias,
        filterType: FilterType.GreaterThan,
        value: 85,
        incidentTitle: title,
        incidentDescription: details,
        criteriaName: "Cluster Near Full - Used > 85%",
        criteriaDescription:
          "Triggers when raw cluster usage exceeds 85% of total capacity over the monitoring window.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId: args.onlineMonitorStatusId,
        metricAlias: alias,
        filterType: FilterType.LessThanOrEqualTo,
        value: 85,
      }),
    });
  },
};
967
+
968
/**
 * Critical capacity template for the cluster as a whole.
 *
 * Uses the same numerator/denominator metrics as the near-full template
 * (ceph_cluster_total_used_bytes over ceph_cluster_total_bytes, past five
 * minutes) but with a threshold of 95: offline when the result alias is
 * greater than 95, online when it is 95 or lower.
 */
const clusterFullTemplate: CephAlertTemplate = {
  id: "ceph-cluster-full",
  name: "Cluster Full",
  description:
    "Alert when raw cluster usage exceeds 95% of total capacity — Ceph's default full ratio, at which writes stop cluster-wide.",
  category: "Capacity",
  severity: "Critical",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    const {
      clusterIdentifier,
      monitorName,
      offlineMonitorStatusId,
      onlineMonitorStatusId,
      defaultIncidentSeverityId,
      defaultAlertSeverityId,
    } = args;

    // One alias ties the ratio result to both criteria instances.
    const usedPercentAlias: string = "cluster_used_percent";
    // Fire strictly above this percentage; recover at or below it.
    const fullPercent: number = 95;

    return buildCephMonitorStep({
      cephMonitor: buildCephRatioMonitorConfig({
        clusterIdentifier,
        numeratorMetricName: "ceph_cluster_total_used_bytes",
        denominatorMetricName: "ceph_cluster_total_bytes",
        numeratorAlias: "used_bytes",
        denominatorAlias: "total_bytes",
        resultAlias: usedPercentAlias,
        resultLegend: "Cluster Capacity Used (%)",
        rollingTime: RollingTime.Past5Minutes,
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId,
        incidentSeverityId: defaultIncidentSeverityId,
        alertSeverityId: defaultAlertSeverityId,
        monitorName,
        metricAlias: usedPercentAlias,
        filterType: FilterType.GreaterThan,
        value: fullPercent,
        incidentTitle: `[Ceph] CRITICAL: Cluster Full (>95%) - ${monitorName}`,
        incidentDescription: `The Ceph cluster's raw usage has exceeded 95% of total capacity — the default full threshold. OSDs at this ratio refuse writes, which stalls client I/O cluster-wide and can wedge recovery. Free capacity immediately: delete unneeded data or snapshots, add OSDs, or temporarily raise the full ratio (\`ceph osd set-full-ratio\`) with extreme caution to restore write availability.`,
        criteriaName: "Cluster Full - Used > 95%",
        criteriaDescription:
          "Triggers when raw cluster usage exceeds 95% of total capacity over the monitoring window.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId,
        metricAlias: usedPercentAlias,
        filterType: FilterType.LessThanOrEqualTo,
        value: fullPercent,
      }),
    });
  },
};
1012
+
1013
/**
 * Per-pool capacity warning template.
 *
 * Queries ceph_pool_stored and ceph_pool_max_avail (Sum aggregation, past
 * five minutes, grouped by the pool_id attribute) and feeds them through a
 * formula that yields a used-percentage alias. Offline when that alias is
 * greater than 85, online when it is 85 or lower.
 */
const poolNearFullTemplate: CephAlertTemplate = {
  id: "ceph-pool-near-full",
  name: "Pool Near Full",
  description:
    "Alert when any pool's usage exceeds 85% of its writable capacity. Computed per pool as stored ÷ (stored + max_avail) × 100 from ceph_pool_stored and ceph_pool_max_avail. One incident per pool (grouped by pool_id — pool data series carry no name label).",
  category: "Capacity",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    const {
      clusterIdentifier,
      monitorName,
      offlineMonitorStatusId,
      onlineMonitorStatusId,
      defaultIncidentSeverityId,
      defaultAlertSeverityId,
    } = args;

    // Alias of the formula result; both criteria instances filter on it.
    const poolUsedPercentAlias: string = "pool_used_percent";
    // Fire strictly above this percentage; recover at or below it.
    const nearFullPercent: number = 85;

    return buildCephMonitorStep({
      cephMonitor: buildCephFormulaMonitorConfig({
        clusterIdentifier,
        queries: [
          { alias: "pool_stored", metricName: "ceph_pool_stored" },
          { alias: "pool_max_avail", metricName: "ceph_pool_max_avail" },
        ],
        /*
         * Used share = stored / (stored + max_avail), i.e. how much of the
         * space the pool could still grow into is already taken. Both
         * queries are summed over the same window, so a scrape multiple
         * applies to numerator and denominator alike and cancels out.
         */
        formula: "(pool_stored / (pool_stored + pool_max_avail)) * 100",
        resultAlias: poolUsedPercentAlias,
        resultLegend: "Pool Capacity Used (%)",
        resultLegendUnit: "%",
        rollingTime: RollingTime.Past5Minutes,
        aggregationType: MetricsAggregationType.Sum,
        groupByAttributeKey: "pool_id",
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId,
        incidentSeverityId: defaultIncidentSeverityId,
        alertSeverityId: defaultAlertSeverityId,
        monitorName,
        metricAlias: poolUsedPercentAlias,
        filterType: FilterType.GreaterThan,
        value: nearFullPercent,
        incidentTitle: `[Ceph] Pool Near Full (>85%) - ${monitorName}`,
        incidentDescription: `A Ceph pool has used more than 85% of its writable capacity (stored ÷ (stored + max_avail)). When a pool fills up, writes to it stall — and the OSDs backing it may hit their full ratio, blocking writes cluster-wide. Run \`ceph df\` to see per-pool usage (match the pool_id from the root cause to its name), then free space, add OSDs, or adjust pool quotas.`,
        criteriaName: "Pool Near Full - Used > 85%",
        criteriaDescription:
          "Triggers when any pool's usage exceeds 85% of its writable capacity over the monitoring window.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId,
        metricAlias: poolUsedPercentAlias,
        filterType: FilterType.LessThanOrEqualTo,
        value: nearFullPercent,
      }),
    });
  },
};
1066
+
1067
/**
 * Cluster-level slow-operations warning template.
 *
 * Watches the ceph_healthcheck_slow_ops metric with Max aggregation over the
 * past five minutes. Offline when the aliased value is greater than 0,
 * online when it equals 0.
 */
const slowOpsTemplate: CephAlertTemplate = {
  id: "ceph-slow-ops",
  name: "Slow Operations",
  description:
    "Alert when the SLOW_OPS health check is active (ceph_healthcheck_slow_ops > 0) — OSD or monitor requests are taking too long to complete.",
  category: "Cluster Health",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    const {
      clusterIdentifier,
      monitorName,
      offlineMonitorStatusId,
      onlineMonitorStatusId,
      defaultIncidentSeverityId,
      defaultAlertSeverityId,
    } = args;

    // Alias shared by the metric query and both criteria instances.
    const slowOpsAlias: string = "slow_ops";

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier,
        metricName: "ceph_healthcheck_slow_ops",
        metricAlias: slowOpsAlias,
        rollingTime: RollingTime.Past5Minutes,
        aggregationType: MetricsAggregationType.Max,
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId,
        incidentSeverityId: defaultIncidentSeverityId,
        alertSeverityId: defaultAlertSeverityId,
        monitorName,
        metricAlias: slowOpsAlias,
        filterType: FilterType.GreaterThan,
        value: 0,
        incidentTitle: `[Ceph] Slow Operations Detected - ${monitorName}`,
        incidentDescription: `The Ceph SLOW_OPS health check is active — OSD or monitor operations are taking too long to complete, which surfaces as client I/O latency or hangs. Common causes: a failing or saturated disk, network issues between OSDs, or an overloaded daemon. Run \`ceph health detail\` to see which daemons report slow ops, then inspect those hosts' disks and network.`,
        criteriaName: "Slow Ops - Healthcheck Active",
        criteriaDescription:
          "Triggers when the SLOW_OPS healthcheck reports any slow operations.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId,
        metricAlias: slowOpsAlias,
        filterType: FilterType.EqualTo,
        value: 0,
      }),
    });
  },
};
1108
+
1109
+ /*
1110
+ * --- Health-check-driven templates (V3 WI-26) ---
1111
+ *
1112
+ * All of the following watch `ceph_health_detail{name=...}` (or
1113
+ * `ceph_daemon_health_metrics{type=...}`) — see the health-check contract
1114
+ * comment at the top of this file. Series exist only while a check is
1115
+ * active (Quincy+), so every template fires on Max > 0 and recovers on
1116
+ * = 0 with TreatAsZero, staying quiet by default.
1117
+ */
1118
+
1119
/**
 * Critical template for scrub-detected damage.
 *
 * Runs two ceph_health_detail queries (name=PG_DAMAGED and
 * name=OSD_SCRUB_ERRORS) with Max aggregation over the past five minutes.
 * The offline criteria filter each alias on "> 0"; the online criteria
 * require both aliases to equal 0 (FilterCondition.All) and treat missing
 * data as zero.
 */
const pgDamagedTemplate: CephAlertTemplate = {
  id: "ceph-pg-damaged",
  name: "Damaged Placement Groups",
  description:
    "Alert when scrubbing finds data damage — the PG_DAMAGED or OSD_SCRUB_ERRORS health check is active (ceph_health_detail; Quincy and later, absent series = healthy).",
  category: "PG",
  severity: "Critical",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    const {
      clusterIdentifier,
      monitorName,
      offlineMonitorStatusId,
      onlineMonitorStatusId,
      defaultIncidentSeverityId,
      defaultAlertSeverityId,
    } = args;

    // One alias per health check; the criteria reference them by name.
    const damagedAlias: string = "pg_damaged";
    const scrubAlias: string = "scrub_errors";

    return buildCephMonitorStep({
      cephMonitor: buildCephMultiQueryMonitorConfig({
        clusterIdentifier,
        queries: [
          {
            alias: damagedAlias,
            metricName: "ceph_health_detail",
            attributes: { name: "PG_DAMAGED" },
          },
          {
            alias: scrubAlias,
            metricName: "ceph_health_detail",
            attributes: { name: "OSD_SCRUB_ERRORS" },
          },
        ],
        rollingTime: RollingTime.Past5Minutes,
        aggregationType: MetricsAggregationType.Max,
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId,
        incidentSeverityId: defaultIncidentSeverityId,
        alertSeverityId: defaultAlertSeverityId,
        monitorName,
        metricAlias: damagedAlias,
        filterType: FilterType.GreaterThan,
        value: 0,
        additionalFilters: [
          {
            metricAlias: scrubAlias,
            filterType: FilterType.GreaterThan,
            value: 0,
          },
        ],
        incidentTitle: `[Ceph] CRITICAL: Damaged Placement Groups - ${monitorName}`,
        incidentDescription: `Ceph scrubbing has found damaged placement groups or OSD read errors (PG_DAMAGED / OSD_SCRUB_ERRORS health checks). Data integrity is at risk on at least one replica. Run \`ceph health detail\` to see the affected PGs, \`rados list-inconsistent-pg <pool>\` to locate the inconsistencies, and repair with \`ceph pg repair <pg.id>\`. Scrub errors usually indicate failing media — check the backing disks' SMART data before repairing.`,
        criteriaName: "PG Damaged - Scrub Found Errors",
        criteriaDescription:
          "Triggers when the PG_DAMAGED or OSD_SCRUB_ERRORS health check is active.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId,
        metricAlias: damagedAlias,
        filterType: FilterType.EqualTo,
        value: 0,
        additionalFilters: [
          {
            metricAlias: scrubAlias,
            filterType: FilterType.EqualTo,
            value: 0,
          },
        ],
        // Recovery needs both checks clear at once.
        filterCondition: FilterCondition.All,
        treatNoDataAsZero: true,
      }),
    });
  },
};
1187
+
1188
/**
 * Critical template for unacknowledged daemon crashes.
 *
 * Watches ceph_health_detail filtered to name=RECENT_CRASH with Max
 * aggregation over the past five minutes. Offline when the aliased value is
 * greater than 0; online when it equals 0, with missing data treated as
 * zero.
 */
const daemonCrashTemplate: CephAlertTemplate = {
  id: "ceph-daemon-crash",
  name: "Daemon Crash",
  description:
    "Alert when one or more Ceph daemons have recently crashed (RECENT_CRASH health check). This is the only crash signal the mgr exports — there is no ceph_crash_* metric.",
  category: "Cluster Health",
  severity: "Critical",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    const {
      clusterIdentifier,
      monitorName,
      offlineMonitorStatusId,
      onlineMonitorStatusId,
      defaultIncidentSeverityId,
      defaultAlertSeverityId,
    } = args;

    // Health check to select, and the alias its series is exposed under.
    const healthCheckName: string = "RECENT_CRASH";
    const crashAlias: string = "recent_crash";

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier,
        metricName: "ceph_health_detail",
        metricAlias: crashAlias,
        rollingTime: RollingTime.Past5Minutes,
        aggregationType: MetricsAggregationType.Max,
        attributes: { name: healthCheckName },
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId,
        incidentSeverityId: defaultIncidentSeverityId,
        alertSeverityId: defaultAlertSeverityId,
        monitorName,
        metricAlias: crashAlias,
        filterType: FilterType.GreaterThan,
        value: 0,
        incidentTitle: `[Ceph] CRITICAL: Daemon Crash Detected - ${monitorName}`,
        incidentDescription: `One or more Ceph daemons have crashed recently and the crashes have not been acknowledged (RECENT_CRASH health check). Run \`ceph crash ls-new\` to list the new crashes and \`ceph crash info <crash-id>\` to inspect the backtrace. Once investigated, archive them with \`ceph crash archive <crash-id>\` (or \`ceph crash archive-all\`) — the health check clears when all crashes are archived.`,
        criteriaName: "Daemon Crash - RECENT_CRASH Active",
        criteriaDescription:
          "Triggers when the RECENT_CRASH health check reports unacknowledged daemon crashes.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId,
        metricAlias: crashAlias,
        filterType: FilterType.EqualTo,
        value: 0,
        treatNoDataAsZero: true,
      }),
    });
  },
};
1231
+
1232
/**
 * Warning template for slow OSD heartbeats.
 *
 * Runs two ceph_health_detail queries (name=OSD_SLOW_PING_TIME_FRONT and
 * name=OSD_SLOW_PING_TIME_BACK) with Max aggregation over the past five
 * minutes. The offline criteria filter each alias on "> 0"; the online
 * criteria require both aliases to equal 0 (FilterCondition.All) and treat
 * missing data as zero.
 */
const osdSlowHeartbeatsTemplate: CephAlertTemplate = {
  id: "ceph-osd-slow-heartbeats",
  name: "OSD Slow Heartbeats",
  description:
    "Alert when OSD heartbeat pings on the front (public) or back (cluster) network exceed Ceph's grace threshold (OSD_SLOW_PING_TIME_FRONT/BACK health checks — the mgr exports no ping-time gauge, so these are the only signal).",
  category: "OSD",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    const {
      clusterIdentifier,
      monitorName,
      offlineMonitorStatusId,
      onlineMonitorStatusId,
      defaultIncidentSeverityId,
      defaultAlertSeverityId,
    } = args;

    // One alias per network side; the criteria reference them by name.
    const frontPingAlias: string = "slow_ping_front";
    const backPingAlias: string = "slow_ping_back";

    return buildCephMonitorStep({
      cephMonitor: buildCephMultiQueryMonitorConfig({
        clusterIdentifier,
        queries: [
          {
            alias: frontPingAlias,
            metricName: "ceph_health_detail",
            attributes: { name: "OSD_SLOW_PING_TIME_FRONT" },
          },
          {
            alias: backPingAlias,
            metricName: "ceph_health_detail",
            attributes: { name: "OSD_SLOW_PING_TIME_BACK" },
          },
        ],
        rollingTime: RollingTime.Past5Minutes,
        aggregationType: MetricsAggregationType.Max,
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId,
        incidentSeverityId: defaultIncidentSeverityId,
        alertSeverityId: defaultAlertSeverityId,
        monitorName,
        metricAlias: frontPingAlias,
        filterType: FilterType.GreaterThan,
        value: 0,
        additionalFilters: [
          {
            metricAlias: backPingAlias,
            filterType: FilterType.GreaterThan,
            value: 0,
          },
        ],
        incidentTitle: `[Ceph] OSD Slow Heartbeats - ${monitorName}`,
        incidentDescription: `OSD heartbeat pings on the front (client/public) or back (cluster/replication) network are exceeding Ceph's grace threshold (OSD_SLOW_PING_TIME_FRONT / OSD_SLOW_PING_TIME_BACK health checks). Slow heartbeats usually mean network congestion, packet loss, or a saturated NIC — and can escalate to OSDs being wrongly marked down. Run \`ceph health detail\` to see the affected OSD pairs, then inspect the network path between their hosts.`,
        criteriaName: "Slow Heartbeats - Front or Back Network",
        criteriaDescription:
          "Triggers when the OSD_SLOW_PING_TIME_FRONT or OSD_SLOW_PING_TIME_BACK health check is active.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId,
        metricAlias: frontPingAlias,
        filterType: FilterType.EqualTo,
        value: 0,
        additionalFilters: [
          {
            metricAlias: backPingAlias,
            filterType: FilterType.EqualTo,
            value: 0,
          },
        ],
        // Recovery needs both checks clear at once.
        filterCondition: FilterCondition.All,
        treatNoDataAsZero: true,
      }),
    });
  },
};
1300
+
1301
/**
 * Warning template for monitor clock skew.
 *
 * Watches ceph_health_detail filtered to name=MON_CLOCK_SKEW with Max
 * aggregation over the past five minutes. Offline when the aliased value is
 * greater than 0; online when it equals 0, with missing data treated as
 * zero.
 */
const monClockSkewTemplate: CephAlertTemplate = {
  id: "ceph-mon-clock-skew",
  name: "Monitor Clock Skew",
  description:
    "Alert when clock skew between Ceph monitors exceeds the allowed threshold (MON_CLOCK_SKEW health check). Skewed clocks can drop monitors from quorum.",
  category: "Cluster Health",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    const {
      clusterIdentifier,
      monitorName,
      offlineMonitorStatusId,
      onlineMonitorStatusId,
      defaultIncidentSeverityId,
      defaultAlertSeverityId,
    } = args;

    // Health check to select, and the alias its series is exposed under.
    const healthCheckName: string = "MON_CLOCK_SKEW";
    const clockSkewAlias: string = "mon_clock_skew";

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier,
        metricName: "ceph_health_detail",
        metricAlias: clockSkewAlias,
        rollingTime: RollingTime.Past5Minutes,
        aggregationType: MetricsAggregationType.Max,
        attributes: { name: healthCheckName },
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId,
        incidentSeverityId: defaultIncidentSeverityId,
        alertSeverityId: defaultAlertSeverityId,
        monitorName,
        metricAlias: clockSkewAlias,
        filterType: FilterType.GreaterThan,
        value: 0,
        incidentTitle: `[Ceph] Monitor Clock Skew - ${monitorName}`,
        incidentDescription: `Clock skew between Ceph monitor daemons has exceeded the allowed threshold (MON_CLOCK_SKEW health check; default 0.05 s). Monitors need closely synchronized clocks to maintain quorum — sustained skew can drop monitors out and stall the cluster. Run \`ceph time-sync-status\` to see per-monitor offsets and fix time synchronization (chrony/ntpd) on the affected monitor hosts.`,
        criteriaName: "Clock Skew - MON_CLOCK_SKEW Active",
        criteriaDescription:
          "Triggers when the MON_CLOCK_SKEW health check is active.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId,
        metricAlias: clockSkewAlias,
        filterType: FilterType.EqualTo,
        value: 0,
        treatNoDataAsZero: true,
      }),
    });
  },
};
1344
+
1345
/**
 * Warning template for individual OSDs crossing the nearfull threshold.
 *
 * Watches ceph_health_detail filtered to name=OSD_NEARFULL with Max
 * aggregation over the past five minutes. Offline when the aliased value is
 * greater than 0; online when it equals 0, with missing data treated as
 * zero.
 */
const osdNearfullTemplate: CephAlertTemplate = {
  id: "ceph-osd-nearfull",
  name: "OSD Nearfull",
  description:
    "Alert when any individual OSD crosses the nearfull threshold (OSD_NEARFULL health check; default 85%). Single OSDs fill up long before the cluster average does.",
  category: "Capacity",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    const {
      clusterIdentifier,
      monitorName,
      offlineMonitorStatusId,
      onlineMonitorStatusId,
      defaultIncidentSeverityId,
      defaultAlertSeverityId,
    } = args;

    // Health check to select, and the alias its series is exposed under.
    const healthCheckName: string = "OSD_NEARFULL";
    const nearfullAlias: string = "osd_nearfull";

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier,
        metricName: "ceph_health_detail",
        metricAlias: nearfullAlias,
        rollingTime: RollingTime.Past5Minutes,
        aggregationType: MetricsAggregationType.Max,
        attributes: { name: healthCheckName },
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId,
        incidentSeverityId: defaultIncidentSeverityId,
        alertSeverityId: defaultAlertSeverityId,
        monitorName,
        metricAlias: nearfullAlias,
        filterType: FilterType.GreaterThan,
        value: 0,
        incidentTitle: `[Ceph] OSD Nearfull - ${monitorName}`,
        incidentDescription: `One or more OSDs have crossed the nearfull threshold (OSD_NEARFULL health check; default 85%). Capacity is rarely perfectly balanced — individual OSDs fill up before the cluster does, and any single OSD reaching the full ratio blocks writes cluster-wide. Run \`ceph osd df\` to find the affected OSDs, rebalance with the balancer module or \`ceph osd reweight-by-utilization\`, and plan capacity now.`,
        criteriaName: "OSD Nearfull - Health Check Active",
        criteriaDescription:
          "Triggers when the OSD_NEARFULL health check is active.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId,
        metricAlias: nearfullAlias,
        filterType: FilterType.EqualTo,
        value: 0,
        treatNoDataAsZero: true,
      }),
    });
  },
};
1388
+
1389
/**
 * Warning template for OSDs crossing the backfillfull threshold.
 *
 * Watches ceph_health_detail filtered to name=OSD_BACKFILLFULL with Max
 * aggregation over the past five minutes. Offline when the aliased value is
 * greater than 0; online when it equals 0, with missing data treated as
 * zero.
 */
const osdBackfillfullTemplate: CephAlertTemplate = {
  id: "ceph-osd-backfillfull",
  name: "OSD Backfillfull",
  description:
    "Alert when any OSD crosses the backfillfull threshold (OSD_BACKFILLFULL health check; default 90%). Backfill to these OSDs is blocked, stalling recovery and rebalancing.",
  category: "Capacity",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    const {
      clusterIdentifier,
      monitorName,
      offlineMonitorStatusId,
      onlineMonitorStatusId,
      defaultIncidentSeverityId,
      defaultAlertSeverityId,
    } = args;

    // Health check to select, and the alias its series is exposed under.
    const healthCheckName: string = "OSD_BACKFILLFULL";
    const backfillfullAlias: string = "osd_backfillfull";

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier,
        metricName: "ceph_health_detail",
        metricAlias: backfillfullAlias,
        rollingTime: RollingTime.Past5Minutes,
        aggregationType: MetricsAggregationType.Max,
        attributes: { name: healthCheckName },
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId,
        incidentSeverityId: defaultIncidentSeverityId,
        alertSeverityId: defaultAlertSeverityId,
        monitorName,
        metricAlias: backfillfullAlias,
        filterType: FilterType.GreaterThan,
        value: 0,
        incidentTitle: `[Ceph] OSD Backfillfull - ${monitorName}`,
        incidentDescription: `One or more OSDs have crossed the backfillfull threshold (OSD_BACKFILLFULL health check; default 90%). Backfill and rebalance operations onto these OSDs are now refused, which stalls recovery after failures and can leave the cluster degraded. Run \`ceph osd df\` to find the affected OSDs, then free space or add capacity so recovery can proceed before the OSDs reach the full ratio.`,
        criteriaName: "OSD Backfillfull - Health Check Active",
        criteriaDescription:
          "Triggers when the OSD_BACKFILLFULL health check is active.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId,
        metricAlias: backfillfullAlias,
        filterType: FilterType.EqualTo,
        value: 0,
        treatNoDataAsZero: true,
      }),
    });
  },
};
1432
+
1433
/**
 * Critical template for OSDs reaching the full threshold.
 *
 * Watches ceph_health_detail filtered to name=OSD_FULL with Max aggregation.
 * Unlike the sibling health-check templates in this file it uses a
 * one-minute rolling window. Offline when the aliased value is greater than
 * 0; online when it equals 0, with missing data treated as zero.
 */
const osdFullTemplate: CephAlertTemplate = {
  id: "ceph-osd-full",
  name: "OSD Full",
  description:
    "Alert immediately when any OSD reaches the full threshold (OSD_FULL health check; default 95%) — writes to the cluster are refused until space is freed.",
  category: "Capacity",
  severity: "Critical",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    const {
      clusterIdentifier,
      monitorName,
      offlineMonitorStatusId,
      onlineMonitorStatusId,
      defaultIncidentSeverityId,
      defaultAlertSeverityId,
    } = args;

    // Health check to select, and the alias its series is exposed under.
    const healthCheckName: string = "OSD_FULL";
    const fullAlias: string = "osd_full";

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier,
        metricName: "ceph_health_detail",
        metricAlias: fullAlias,
        /*
         * Shortest window on purpose: by the time this check is reported
         * writes are already being refused, so the first scrape that
         * carries it should raise the alert.
         */
        rollingTime: RollingTime.Past1Minute,
        aggregationType: MetricsAggregationType.Max,
        attributes: { name: healthCheckName },
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId,
        incidentSeverityId: defaultIncidentSeverityId,
        alertSeverityId: defaultAlertSeverityId,
        monitorName,
        metricAlias: fullAlias,
        filterType: FilterType.GreaterThan,
        value: 0,
        incidentTitle: `[Ceph] CRITICAL: OSD Full - ${monitorName}`,
        incidentDescription: `One or more OSDs have reached the full threshold (OSD_FULL health check; default 95%) and Ceph is refusing writes to protect data integrity — client I/O is stalling now. Free capacity immediately: delete unneeded data or snapshots, add OSDs, or as a last resort temporarily raise the ratio with \`ceph osd set-full-ratio\` (extreme caution) to restore write availability. Run \`ceph osd df\` to identify the full OSDs.`,
        criteriaName: "OSD Full - Health Check Active",
        criteriaDescription:
          "Triggers when the OSD_FULL health check is active.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId,
        metricAlias: fullAlias,
        filterType: FilterType.EqualTo,
        value: 0,
        treatNoDataAsZero: true,
      }),
    });
  },
};
1480
+
1481
/**
 * Two-tier template for monitor database disk space.
 *
 * Runs two ceph_health_detail queries (name=MON_DISK_CRIT and
 * name=MON_DISK_LOW) with Max aggregation over the past five minutes. The
 * primary offline criteria fire on the critical alias being greater than 0;
 * an additional offline criteria instance fires on the low alias being
 * greater than 0. The online criteria require both aliases to equal 0
 * (FilterCondition.All) and treat missing data as zero.
 */
const monDiskSpaceTemplate: CephAlertTemplate = {
  id: "ceph-mon-disk-space",
  name: "Monitor Disk Space",
  description:
    "Alert when a Ceph monitor's database disk runs low — Critical at the MON_DISK_CRIT threshold (default 5% free), Warning at MON_DISK_LOW (default 30% free), both tiers in one template. A full monitor disk crashes the monitor and risks quorum.",
  category: "Cluster Health",
  severity: "Critical",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    const {
      clusterIdentifier,
      monitorName,
      offlineMonitorStatusId,
      onlineMonitorStatusId,
      defaultIncidentSeverityId,
      defaultAlertSeverityId,
    } = args;

    // One alias per tier; the criteria reference them by name.
    const criticalTierAlias: string = "mon_disk_crit";
    const lowTierAlias: string = "mon_disk_low";

    return buildCephMonitorStep({
      cephMonitor: buildCephMultiQueryMonitorConfig({
        clusterIdentifier,
        queries: [
          {
            alias: criticalTierAlias,
            metricName: "ceph_health_detail",
            attributes: { name: "MON_DISK_CRIT" },
          },
          {
            alias: lowTierAlias,
            metricName: "ceph_health_detail",
            attributes: { name: "MON_DISK_LOW" },
          },
        ],
        rollingTime: RollingTime.Past5Minutes,
        aggregationType: MetricsAggregationType.Max,
      }),
      /*
       * The critical tier is listed ahead of the low tier because criteria
       * are evaluated in order and the first match wins.
       */
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId,
        incidentSeverityId: defaultIncidentSeverityId,
        alertSeverityId: defaultAlertSeverityId,
        monitorName,
        metricAlias: criticalTierAlias,
        filterType: FilterType.GreaterThan,
        value: 0,
        incidentTitle: `[Ceph] CRITICAL: Monitor Disk Critically Low - ${monitorName}`,
        incidentDescription: `A Ceph monitor's database disk has crossed the critical threshold (MON_DISK_CRIT health check; default 5% free). If the disk fills completely the monitor crashes — and losing too many monitors loses quorum and halts the cluster. Free space on the affected monitor host immediately: compact the mon store (\`ceph tell mon.<id> compact\`), remove old logs, or grow the volume. Run \`ceph health detail\` to see which monitor is affected.`,
        criteriaName: "Mon Disk Critical - MON_DISK_CRIT Active",
        criteriaDescription:
          "Triggers when the MON_DISK_CRIT health check is active.",
      }),
      additionalOfflineCriteriaInstances: [
        buildCephOfflineCriteriaInstance({
          offlineMonitorStatusId,
          incidentSeverityId: defaultIncidentSeverityId,
          alertSeverityId: defaultAlertSeverityId,
          monitorName,
          metricAlias: lowTierAlias,
          filterType: FilterType.GreaterThan,
          value: 0,
          incidentTitle: `[Ceph] Monitor Disk Space Low - ${monitorName}`,
          incidentDescription: `A Ceph monitor's database disk is running low on space (MON_DISK_LOW health check; default 30% free). The monitor keeps working, but if the disk keeps filling it will cross the critical threshold and eventually crash, putting quorum at risk. Free space on the affected monitor host: compact the mon store (\`ceph tell mon.<id> compact\`), clean up logs, or grow the volume. Run \`ceph health detail\` to see which monitor is affected.`,
          criteriaName: "Mon Disk Low - MON_DISK_LOW Active",
          criteriaDescription:
            "Triggers when the MON_DISK_LOW health check is active.",
        }),
      ],
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId,
        metricAlias: criticalTierAlias,
        filterType: FilterType.EqualTo,
        value: 0,
        additionalFilters: [
          {
            metricAlias: lowTierAlias,
            filterType: FilterType.EqualTo,
            value: 0,
          },
        ],
        // Recovery needs both tiers clear at once.
        filterCondition: FilterCondition.All,
        treatNoDataAsZero: true,
      }),
    });
  },
};
1559
+
1560
/**
 * Per-daemon slow-operations warning template.
 *
 * Watches ceph_daemon_health_metrics filtered to type=SLOW_OPS with Max
 * aggregation over the past five minutes, grouped by the ceph_daemon
 * attribute. Offline when the aliased value is greater than 0; online when
 * it equals 0, with missing data treated as zero.
 */
const daemonSlowOpsTemplate: CephAlertTemplate = {
  id: "ceph-daemon-slow-ops",
  name: "Daemon Slow Operations",
  description:
    "Alert when a specific OSD or monitor daemon reports slow operations (ceph_daemon_health_metrics, type SLOW_OPS) — the per-daemon complement to the cluster-level Slow Operations template. One incident per daemon.",
  category: "Cluster Health",
  severity: "Warning",
  getMonitorStep: (args: CephAlertTemplateArgs): MonitorStep => {
    const {
      clusterIdentifier,
      monitorName,
      offlineMonitorStatusId,
      onlineMonitorStatusId,
      defaultIncidentSeverityId,
      defaultAlertSeverityId,
    } = args;

    // Alias shared by the metric query and both criteria instances.
    const daemonSlowOpsAlias: string = "daemon_slow_ops";

    return buildCephMonitorStep({
      cephMonitor: buildCephMonitorConfig({
        clusterIdentifier,
        metricName: "ceph_daemon_health_metrics",
        metricAlias: daemonSlowOpsAlias,
        rollingTime: RollingTime.Past5Minutes,
        aggregationType: MetricsAggregationType.Max,
        attributes: { type: "SLOW_OPS" },
        groupByAttributeKey: "ceph_daemon",
      }),
      offlineCriteriaInstance: buildCephOfflineCriteriaInstance({
        offlineMonitorStatusId,
        incidentSeverityId: defaultIncidentSeverityId,
        alertSeverityId: defaultAlertSeverityId,
        monitorName,
        metricAlias: daemonSlowOpsAlias,
        filterType: FilterType.GreaterThan,
        value: 0,
        incidentTitle: `[Ceph] Daemon Slow Operations - ${monitorName}`,
        incidentDescription: `A Ceph daemon is reporting operations that exceed the configured complaint time (ceph_daemon_health_metrics with type SLOW_OPS). Unlike the cluster-level Slow Operations alert, this pinpoints the exact OSD or monitor. Inspect the daemon with \`ceph daemon <ceph_daemon> dump_ops_in_flight\` and check its host for a failing or saturated disk and network problems. Check the root cause for the affected ceph_daemon label.`,
        criteriaName: "Daemon Slow Ops - Count > 0",
        criteriaDescription:
          "Triggers when any daemon reports slow operations via ceph_daemon_health_metrics.",
      }),
      onlineCriteriaInstance: buildCephOnlineCriteriaInstance({
        onlineMonitorStatusId,
        metricAlias: daemonSlowOpsAlias,
        filterType: FilterType.EqualTo,
        value: 0,
        treatNoDataAsZero: true,
      }),
    });
  },
};
1604
+
1605
/**
 * Lists every built-in Ceph alert template.
 *
 * @returns A freshly allocated array on each call: the metric-driven
 * templates first, followed by the health-check-driven ones, in a fixed
 * order.
 */
export function getAllCephAlertTemplates(): Array<CephAlertTemplate> {
  const metricDrivenTemplates: Array<CephAlertTemplate> = [
    healthErrorTemplate,
    healthWarnTemplate,
    osdDownTemplate,
    osdOutTemplate,
    osdHighLatencyTemplate,
    monQuorumDegradedTemplate,
    pgDegradedTemplate,
    pgUndersizedTemplate,
    pgInactiveTemplate,
    clusterNearFullTemplate,
    clusterFullTemplate,
    poolNearFullTemplate,
    slowOpsTemplate,
  ];

  // Health-check-driven templates (V3 WI-26).
  const healthCheckTemplates: Array<CephAlertTemplate> = [
    pgDamagedTemplate,
    daemonCrashTemplate,
    osdSlowHeartbeatsTemplate,
    monClockSkewTemplate,
    osdNearfullTemplate,
    osdBackfillfullTemplate,
    osdFullTemplate,
    monDiskSpaceTemplate,
    daemonSlowOpsTemplate,
  ];

  return [...metricDrivenTemplates, ...healthCheckTemplates];
}
1632
+
1633
/**
 * Selects the built-in templates that belong to one category.
 *
 * @param category - Category to match with strict equality against each
 * template's `category` field.
 * @returns A new array of the matching templates, in the order
 * getAllCephAlertTemplates lists them; empty when nothing matches.
 */
export function getCephAlertTemplatesByCategory(
  category: CephAlertTemplateCategory,
): Array<CephAlertTemplate> {
  const matching: Array<CephAlertTemplate> = [];

  for (const candidate of getAllCephAlertTemplates()) {
    if (candidate.category === category) {
      matching.push(candidate);
    }
  }

  return matching;
}
1640
+
1641
/**
 * Looks up a built-in template by its identifier.
 *
 * @param id - Identifier to match with strict equality against each
 * template's `id` field.
 * @returns The first template whose id matches, or `undefined` when none
 * does.
 */
export function getCephAlertTemplateById(
  id: string,
): CephAlertTemplate | undefined {
  for (const candidate of getAllCephAlertTemplates()) {
    if (candidate.id === id) {
      return candidate;
    }
  }

  return undefined;
}