@unrdf/observability 26.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,269 @@
1
+ # Prometheus Alert Rules for UNRDF
2
+ #
3
+ # Production-grade alerting rules for:
4
+ # - Business metrics (success rates, SLA violations)
5
+ # - Performance (latency, throughput)
6
+ # - Resources (memory, CPU)
7
+ # - Security events
8
+
9
+ groups:
10
+ # Business Metrics Alerts
11
+ - name: business_metrics
12
+ interval: 30s
13
+ rules:
14
+ - alert: LowSuccessRate
15
+ expr: |
16
+ (
17
+ sum(rate(business_operations_total{result="success"}[5m])) by (operation) /
18
+ sum(rate(business_operations_total[5m])) by (operation)
19
+ ) < 0.95
20
+ for: 5m
21
+ labels:
22
+ severity: warning
23
+ category: business
24
+ annotations:
25
+ summary: 'Low success rate for {{ $labels.operation }}'
26
+ description: 'Success rate is {{ $value | humanizePercentage }}, below 95% threshold'
27
+
28
+ - alert: CriticalSuccessRate
29
+ expr: |
30
+ (
31
+ sum(rate(business_operations_total{result="success"}[5m])) by (operation) /
32
+ sum(rate(business_operations_total[5m])) by (operation)
33
+ ) < 0.90
34
+ for: 2m
35
+ labels:
36
+ severity: critical
37
+ category: business
38
+ annotations:
39
+ summary: 'CRITICAL: Success rate for {{ $labels.operation }}'
40
+ description: 'Success rate is {{ $value | humanizePercentage }}, below 90% threshold'
41
+
42
+ - alert: HighSLAViolations
43
+ expr: rate(business_sla_violations[5m]) > 0.1
44
+ for: 5m
45
+ labels:
46
+ severity: warning
47
+ category: business
48
+ annotations:
49
+ summary: 'High SLA violation rate'
50
+ description: 'SLA violations at {{ $value }} violations/sec for operation {{ $labels.operation }}'
51
+
52
+ # Latency Alerts
53
+ - name: latency_metrics
54
+ interval: 30s
55
+ rules:
56
+ - alert: HighP95Latency
57
+ expr: latency_p95_ms > 1000
58
+ for: 5m
59
+ labels:
60
+ severity: warning
61
+ category: performance
62
+ annotations:
63
+ summary: 'High P95 latency for {{ $labels.operation }}'
64
+ description: 'P95 latency is {{ $value }}ms, exceeding 1000ms threshold'
65
+
66
+ - alert: HighP99Latency
67
+ expr: latency_p99_ms > 5000
68
+ for: 2m
69
+ labels:
70
+ severity: critical
71
+ category: performance
72
+ annotations:
73
+ summary: 'CRITICAL: High P99 latency for {{ $labels.operation }}'
74
+ description: 'P99 latency is {{ $value }}ms, exceeding 5000ms threshold'
75
+
76
+ - alert: LatencySpike
77
+ expr: |
78
+ (
79
+ latency_p95_ms - latency_p95_ms offset 5m
80
+ ) / latency_p95_ms offset 5m > 0.5
81
+ for: 2m
82
+ labels:
83
+ severity: warning
84
+ category: performance
85
+ annotations:
86
+ summary: 'Latency spike detected for {{ $labels.operation }}'
87
+ description: 'P95 latency increased by {{ $value | humanizePercentage }} in 5 minutes'
88
+
89
+ # Throughput Alerts
90
+ - name: throughput_metrics
91
+ interval: 30s
92
+ rules:
93
+ - alert: LowThroughput
94
+ expr: throughput_ops_per_second < 10
95
+ for: 10m
96
+ labels:
97
+ severity: warning
98
+ category: performance
99
+ annotations:
100
+ summary: 'Low throughput for {{ $labels.operation }}'
101
+ description: 'Throughput is {{ $value }} ops/sec, below expected rate'
102
+
103
+ - alert: ThroughputDrop
104
+ expr: |
105
+ (
106
+ rate(business_operations_total[5m]) -
107
+ rate(business_operations_total[5m] offset 10m)
108
+ ) / rate(business_operations_total[5m] offset 10m) < -0.5
109
+ for: 5m
110
+ labels:
111
+ severity: warning
112
+ category: performance
113
+ annotations:
114
+ summary: 'Throughput drop detected'
115
+ description: 'Throughput dropped by {{ $value | humanizePercentage }} in 10 minutes'
116
+
117
+ # Resource Alerts
118
+ - name: resource_metrics
119
+ interval: 30s
120
+ rules:
121
+ - alert: HighMemoryUsage
122
+ expr: |
123
+ resource_heap_used_bytes / resource_heap_total_bytes > 0.85
124
+ for: 5m
125
+ labels:
126
+ severity: warning
127
+ category: resource
128
+ annotations:
129
+ summary: 'High memory usage'
130
+ description: 'Memory usage is {{ $value | humanizePercentage }}, above 85% threshold'
131
+
132
+ - alert: CriticalMemoryUsage
133
+ expr: |
134
+ resource_heap_used_bytes / resource_heap_total_bytes > 0.95
135
+ for: 2m
136
+ labels:
137
+ severity: critical
138
+ category: resource
139
+ annotations:
140
+ summary: 'CRITICAL: Memory usage'
141
+ description: 'Memory usage is {{ $value | humanizePercentage }}, above 95% threshold'
142
+
143
+ - alert: HighCPULoad
144
+ expr: resource_cpu_load > 0.80
145
+ for: 5m
146
+ labels:
147
+ severity: warning
148
+ category: resource
149
+ annotations:
150
+ summary: 'High CPU load'
151
+ description: 'CPU load is {{ $value | humanizePercentage }}'
152
+
153
+ - alert: HighEventLoopLag
154
+ expr: |
155
+ histogram_quantile(0.95, rate(resource_event_loop_lag_ms_bucket[5m])) > 100
156
+ for: 5m
157
+ labels:
158
+ severity: warning
159
+ category: resource
160
+ annotations:
161
+ summary: 'High event loop lag'
162
+ description: 'P95 event loop lag is {{ $value }}ms, above 100ms threshold'
163
+
164
+ # Security Alerts
165
+ - name: security_events
166
+ interval: 30s
167
+ rules:
168
+ - alert: HighAuthFailures
169
+ expr: rate(event_total{event_type="security.auth.failure"}[5m]) > 1
170
+ for: 2m
171
+ labels:
172
+ severity: warning
173
+ category: security
174
+ annotations:
175
+ summary: 'High authentication failure rate'
176
+ description: '{{ $value }} auth failures per second'
177
+
178
+ - alert: InjectionAttempt
179
+ expr: increase(event_total{event_type="security.injection.attempt"}[5m]) > 0
180
+ for: 1m
181
+ labels:
182
+ severity: critical
183
+ category: security
184
+ annotations:
185
+ summary: 'Injection attempt detected'
186
+ description: '{{ $value }} injection attempts in last 5 minutes'
187
+
188
+ - alert: RateLimitExceeded
189
+ expr: rate(event_total{event_type="security.rate_limit.exceeded"}[5m]) > 5
190
+ for: 5m
191
+ labels:
192
+ severity: warning
193
+ category: security
194
+ annotations:
195
+ summary: 'High rate limit violations'
196
+ description: '{{ $value }} rate limit violations per second'
197
+
198
+ - alert: UnauthorizedAccess
199
+ expr: increase(event_total{event_type="security.unauthorized_access"}[5m]) > 5
200
+ for: 2m
201
+ labels:
202
+ severity: critical
203
+ category: security
204
+ annotations:
205
+ summary: 'Unauthorized access attempts'
206
+ description: '{{ $value }} unauthorized access attempts in last 5 minutes'
207
+
208
+ # Error Rate Alerts
209
+ - name: error_rates
210
+ interval: 30s
211
+ rules:
212
+ - alert: HighErrorRate
213
+ expr: |
214
+ sum(rate(business_failures_by_type[5m])) by (operation) /
215
+ sum(rate(business_operations_total[5m])) by (operation) > 0.05
216
+ for: 5m
217
+ labels:
218
+ severity: warning
219
+ category: errors
220
+ annotations:
221
+ summary: 'High error rate for {{ $labels.operation }}'
222
+ description: 'Error rate is {{ $value | humanizePercentage }}, above 5% threshold'
223
+
224
+ - alert: CriticalErrorRate
225
+ expr: |
226
+ sum(rate(business_failures_by_type[5m])) by (operation) /
227
+ sum(rate(business_operations_total[5m])) by (operation) > 0.10
228
+ for: 2m
229
+ labels:
230
+ severity: critical
231
+ category: errors
232
+ annotations:
233
+ summary: 'CRITICAL: Error rate for {{ $labels.operation }}'
234
+ description: 'Error rate is {{ $value | humanizePercentage }}, above 10% threshold'
235
+
236
+ # Service Health
237
+ - name: service_health
238
+ interval: 30s
239
+ rules:
240
+ - alert: ServiceDown
241
+ expr: up{job=~"unrdf.*"} == 0
242
+ for: 1m
243
+ labels:
244
+ severity: critical
245
+ category: availability
246
+ annotations:
247
+ summary: 'Service {{ $labels.job }} is down'
248
+ description: 'Service {{ $labels.job }} on {{ $labels.instance }} has been down for 1 minute'
249
+
250
+ - alert: OTELCollectorDown
251
+ expr: up{job="otel-collector"} == 0
252
+ for: 1m
253
+ labels:
254
+ severity: critical
255
+ category: observability
256
+ annotations:
257
+ summary: 'OpenTelemetry Collector is down'
258
+ description: 'OTEL Collector is unreachable, metrics collection impaired'
259
+
260
+ - alert: MetricsStaleness
261
+ expr: |
262
+ time() - timestamp(business_operations_total) > 120
263
+ for: 2m
264
+ labels:
265
+ severity: warning
266
+ category: observability
267
+ annotations:
268
+ summary: 'Metrics are stale'
269
+ description: 'No metrics received for {{ $labels.operation }} in last 2 minutes'
@@ -0,0 +1,136 @@
1
+ # Prometheus Scrape Configuration for UNRDF
2
+ #
3
+ # This configuration collects OpenTelemetry metrics from UNRDF services.
4
+ # It includes scrape jobs, relabeling rules, and service discovery.
5
+
6
+ global:
7
+ scrape_interval: 15s
8
+ scrape_timeout: 10s
9
+ evaluation_interval: 15s
10
+ external_labels:
11
+ cluster: 'unrdf-production'
12
+ environment: 'production'
13
+
14
+ # Alertmanager configuration
15
+ alerting:
16
+ alertmanagers:
17
+ - static_configs:
18
+ - targets:
19
+ - 'alertmanager:9093'
20
+
21
+ # Load alert rules
22
+ rule_files:
23
+ - 'alert-rules.yml'
24
+
25
+ # Scrape configurations
26
+ scrape_configs:
27
+ # UNRDF Core Service
28
+ - job_name: 'unrdf-core'
29
+ scrape_interval: 10s
30
+ metrics_path: '/metrics'
31
+ static_configs:
32
+ - targets:
33
+ - 'localhost:9464' # OTEL collector endpoint
34
+ relabel_configs:
35
+ - source_labels: [__address__]
36
+ target_label: instance
37
+ - source_labels: [__scheme__]
38
+ target_label: scheme
39
+ metric_relabel_configs:
40
+ # Keep only UNRDF metrics
41
+ - source_labels: [__name__]
42
+ regex: '(business|latency|throughput|resource|event)_.*'
43
+ action: keep
44
+ # Add service label
45
+ - target_label: service
46
+ replacement: 'unrdf-core'
47
+
48
+ # UNRDF Sidecar
49
+ - job_name: 'unrdf-sidecar'
50
+ scrape_interval: 10s
51
+ metrics_path: '/api/metrics'
52
+ static_configs:
53
+ - targets:
54
+ - 'localhost:3000'
55
+ relabel_configs:
56
+ - source_labels: [__address__]
57
+ target_label: instance
58
+ metric_relabel_configs:
59
+ - source_labels: [__name__]
60
+ regex: '(rate_limit|ddos|query|backpressure)_.*'
61
+ action: keep
62
+ - target_label: service
63
+ replacement: 'unrdf-sidecar'
64
+
65
+ # UNRDF Federation Nodes
66
+ - job_name: 'unrdf-federation'
67
+ scrape_interval: 15s
68
+ metrics_path: '/metrics'
69
+ static_configs:
70
+ - targets:
71
+ - 'federation-node-1:9464'
72
+ - 'federation-node-2:9464'
73
+ - 'federation-node-3:9464'
74
+ relabel_configs:
75
+ - source_labels: [__address__]
76
+ regex: '(.*):.*'
77
+ target_label: node
78
+ replacement: '$1'
79
+ metric_relabel_configs:
80
+ - target_label: service
81
+ replacement: 'unrdf-federation'
82
+
83
+ # OpenTelemetry Collector
84
+ - job_name: 'otel-collector'
85
+ scrape_interval: 10s
86
+ static_configs:
87
+ - targets:
88
+ - 'otel-collector:8888' # Collector self-metrics
89
+ metric_relabel_configs:
90
+ - source_labels: [__name__]
91
+ regex: 'otelcol_.*'
92
+ action: keep
93
+
94
+ # Kubernetes Service Discovery (optional)
95
+ - job_name: 'kubernetes-pods'
96
+ kubernetes_sd_configs:
97
+ - role: pod
98
+ relabel_configs:
99
+ # Only scrape pods with annotation prometheus.io/scrape: "true"
100
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
101
+ action: keep
102
+ regex: "true"
103
+ # Use custom scrape path if specified
104
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
105
+ action: replace
106
+ target_label: __metrics_path__
107
+ regex: (.+)
108
+ # Use custom port if specified
109
+ - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
110
+ action: replace
111
+ regex: ([^:]+)(?::\d+)?;(\d+)
112
+ replacement: $1:$2
113
+ target_label: __address__
114
+ # Add pod labels
115
+ - action: labelmap
116
+ regex: __meta_kubernetes_pod_label_(.+)
117
+ - source_labels: [__meta_kubernetes_namespace]
118
+ target_label: kubernetes_namespace
119
+ - source_labels: [__meta_kubernetes_pod_name]
120
+ target_label: kubernetes_pod_name
121
+ - source_labels: [__meta_kubernetes_pod_node_name]
122
+ target_label: kubernetes_node
123
+
124
+ # Remote write configuration (for long-term storage)
125
+ remote_write:
126
+ - url: 'http://remote-storage:9090/api/v1/write'
127
+ queue_config:
128
+ max_samples_per_send: 10000
129
+ batch_send_deadline: 5s
130
+ max_shards: 10
131
+ min_shards: 1
132
+ write_relabel_configs:
133
+ # Drop high-cardinality metrics
134
+ - source_labels: [__name__]
135
+ regex: '.*_bucket|.*_count|.*_sum'
136
+ action: drop