@unrdf/observability 26.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.cjs +10 -0
- package/IMPLEMENTATION-SUMMARY.md +478 -0
- package/LICENSE +21 -0
- package/README.md +482 -0
- package/capability-map.md +90 -0
- package/config/alert-rules.yml +269 -0
- package/config/prometheus.yml +136 -0
- package/dashboards/grafana-unrdf.json +798 -0
- package/dashboards/unrdf-workflow-dashboard.json +295 -0
- package/docs/OBSERVABILITY-PATTERNS.md +681 -0
- package/docs/OBSERVABILITY-RUNBOOK.md +554 -0
- package/examples/observability-demo.mjs +334 -0
- package/package.json +46 -0
- package/src/advanced-metrics.mjs +413 -0
- package/src/alerts/alert-manager.mjs +436 -0
- package/src/custom-events.mjs +558 -0
- package/src/distributed-tracing.mjs +352 -0
- package/src/exporters/grafana-exporter.mjs +415 -0
- package/src/index.mjs +61 -0
- package/src/metrics/workflow-metrics.mjs +346 -0
- package/src/receipts/anchor.mjs +155 -0
- package/src/receipts/index.mjs +62 -0
- package/src/receipts/merkle-tree.mjs +188 -0
- package/src/receipts/receipt-chain.mjs +209 -0
- package/src/receipts/receipt-schema.mjs +128 -0
- package/src/receipts/tamper-detection.mjs +219 -0
- package/test/advanced-metrics.test.mjs +302 -0
- package/test/custom-events.test.mjs +387 -0
- package/test/distributed-tracing.test.mjs +314 -0
- package/validation/observability-validation.mjs +366 -0
- package/vitest.config.mjs +25 -0
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
# Prometheus Alert Rules for UNRDF
|
|
2
|
+
#
|
|
3
|
+
# Production-grade alerting rules for:
|
|
4
|
+
# - Business metrics (success rates, SLA violations)
|
|
5
|
+
# - Performance (latency, throughput)
|
|
6
|
+
# - Resources (memory, CPU)
|
|
7
|
+
# - Security events
|
|
8
|
+
|
|
9
|
+
groups:
|
|
10
|
+
# Business Metrics Alerts
|
|
11
|
+
- name: business_metrics
|
|
12
|
+
interval: 30s
|
|
13
|
+
rules:
|
|
14
|
+
- alert: LowSuccessRate
|
|
15
|
+
expr: |
|
|
16
|
+
(
|
|
17
|
+
# Aggregate both sides by operation. Without aggregation, one-to-one vector
# matching includes the `result` label, so the success series is only ever
# divided by itself and the ratio is always 1 (the alert could never fire).
sum by (operation) (rate(business_operations_total{result="success"}[5m])) /
|
|
18
|
+
sum by (operation) (rate(business_operations_total[5m]))
|
|
19
|
+
) < 0.95
|
|
20
|
+
for: 5m
|
|
21
|
+
labels:
|
|
22
|
+
severity: warning
|
|
23
|
+
category: business
|
|
24
|
+
annotations:
|
|
25
|
+
summary: 'Low success rate for {{ $labels.operation }}'
|
|
26
|
+
description: 'Success rate is {{ $value | humanizePercentage }}, below 95% threshold'
|
|
27
|
+
|
|
28
|
+
- alert: CriticalSuccessRate
|
|
29
|
+
expr: |
|
|
30
|
+
(
|
|
31
|
+
# Aggregate both sides by operation. Without aggregation, one-to-one vector
# matching includes the `result` label, so the success series is only ever
# divided by itself and the ratio is always 1 (the alert could never fire).
sum by (operation) (rate(business_operations_total{result="success"}[5m])) /
|
|
32
|
+
sum by (operation) (rate(business_operations_total[5m]))
|
|
33
|
+
) < 0.90
|
|
34
|
+
for: 2m
|
|
35
|
+
labels:
|
|
36
|
+
severity: critical
|
|
37
|
+
category: business
|
|
38
|
+
annotations:
|
|
39
|
+
summary: 'CRITICAL: Success rate for {{ $labels.operation }}'
|
|
40
|
+
description: 'Success rate is {{ $value | humanizePercentage }}, below 90% threshold'
|
|
41
|
+
|
|
42
|
+
- alert: HighSLAViolations
|
|
43
|
+
expr: rate(business_sla_violations[5m]) > 0.1
|
|
44
|
+
for: 5m
|
|
45
|
+
labels:
|
|
46
|
+
severity: warning
|
|
47
|
+
category: business
|
|
48
|
+
annotations:
|
|
49
|
+
summary: 'High SLA violation rate'
|
|
50
|
+
description: 'SLA violations at {{ $value }} violations/sec for operation {{ $labels.operation }}'
|
|
51
|
+
|
|
52
|
+
# Latency Alerts
|
|
53
|
+
- name: latency_metrics
|
|
54
|
+
interval: 30s
|
|
55
|
+
rules:
|
|
56
|
+
- alert: HighP95Latency
|
|
57
|
+
expr: latency_p95_ms > 1000
|
|
58
|
+
for: 5m
|
|
59
|
+
labels:
|
|
60
|
+
severity: warning
|
|
61
|
+
category: performance
|
|
62
|
+
annotations:
|
|
63
|
+
summary: 'High P95 latency for {{ $labels.operation }}'
|
|
64
|
+
description: 'P95 latency is {{ $value }}ms, exceeding 1000ms threshold'
|
|
65
|
+
|
|
66
|
+
- alert: HighP99Latency
|
|
67
|
+
expr: latency_p99_ms > 5000
|
|
68
|
+
for: 2m
|
|
69
|
+
labels:
|
|
70
|
+
severity: critical
|
|
71
|
+
category: performance
|
|
72
|
+
annotations:
|
|
73
|
+
summary: 'CRITICAL: High P99 latency for {{ $labels.operation }}'
|
|
74
|
+
description: 'P99 latency is {{ $value }}ms, exceeding 5000ms threshold'
|
|
75
|
+
|
|
76
|
+
- alert: LatencySpike
|
|
77
|
+
expr: |
|
|
78
|
+
(
|
|
79
|
+
latency_p95_ms - latency_p95_ms offset 5m
|
|
80
|
+
) / latency_p95_ms offset 5m > 0.5
|
|
81
|
+
for: 2m
|
|
82
|
+
labels:
|
|
83
|
+
severity: warning
|
|
84
|
+
category: performance
|
|
85
|
+
annotations:
|
|
86
|
+
summary: 'Latency spike detected for {{ $labels.operation }}'
|
|
87
|
+
description: 'P95 latency increased by {{ $value | humanizePercentage }} in 5 minutes'
|
|
88
|
+
|
|
89
|
+
# Throughput Alerts
|
|
90
|
+
- name: throughput_metrics
|
|
91
|
+
interval: 30s
|
|
92
|
+
rules:
|
|
93
|
+
- alert: LowThroughput
|
|
94
|
+
expr: throughput_ops_per_second < 10
|
|
95
|
+
for: 10m
|
|
96
|
+
labels:
|
|
97
|
+
severity: warning
|
|
98
|
+
category: performance
|
|
99
|
+
annotations:
|
|
100
|
+
summary: 'Low throughput for {{ $labels.operation }}'
|
|
101
|
+
description: 'Throughput is {{ $value }} ops/sec, below expected rate'
|
|
102
|
+
|
|
103
|
+
- alert: ThroughputDrop
|
|
104
|
+
expr: |
|
|
105
|
+
(
|
|
106
|
+
rate(business_operations_total[5m]) -
|
|
107
|
+
rate(business_operations_total[5m] offset 10m)
|
|
108
|
+
) / rate(business_operations_total[5m] offset 10m) < -0.5
|
|
109
|
+
for: 5m
|
|
110
|
+
labels:
|
|
111
|
+
severity: warning
|
|
112
|
+
category: performance
|
|
113
|
+
annotations:
|
|
114
|
+
summary: 'Throughput drop detected'
|
|
115
|
+
description: 'Throughput dropped by {{ $value | humanizePercentage }} in 10 minutes'
|
|
116
|
+
|
|
117
|
+
# Resource Alerts
|
|
118
|
+
- name: resource_metrics
|
|
119
|
+
interval: 30s
|
|
120
|
+
rules:
|
|
121
|
+
- alert: HighMemoryUsage
|
|
122
|
+
expr: |
|
|
123
|
+
resource_heap_used_bytes / resource_heap_total_bytes > 0.85
|
|
124
|
+
for: 5m
|
|
125
|
+
labels:
|
|
126
|
+
severity: warning
|
|
127
|
+
category: resource
|
|
128
|
+
annotations:
|
|
129
|
+
summary: 'High memory usage'
|
|
130
|
+
description: 'Memory usage is {{ $value | humanizePercentage }}, above 85% threshold'
|
|
131
|
+
|
|
132
|
+
- alert: CriticalMemoryUsage
|
|
133
|
+
expr: |
|
|
134
|
+
resource_heap_used_bytes / resource_heap_total_bytes > 0.95
|
|
135
|
+
for: 2m
|
|
136
|
+
labels:
|
|
137
|
+
severity: critical
|
|
138
|
+
category: resource
|
|
139
|
+
annotations:
|
|
140
|
+
summary: 'CRITICAL: Memory usage'
|
|
141
|
+
description: 'Memory usage is {{ $value | humanizePercentage }}, above 95% threshold'
|
|
142
|
+
|
|
143
|
+
- alert: HighCPULoad
|
|
144
|
+
expr: resource_cpu_load > 0.80
|
|
145
|
+
for: 5m
|
|
146
|
+
labels:
|
|
147
|
+
severity: warning
|
|
148
|
+
category: resource
|
|
149
|
+
annotations:
|
|
150
|
+
summary: 'High CPU load'
|
|
151
|
+
description: 'CPU load is {{ $value | humanizePercentage }}'
|
|
152
|
+
|
|
153
|
+
- alert: HighEventLoopLag
|
|
154
|
+
expr: |
|
|
155
|
+
histogram_quantile(0.95, rate(resource_event_loop_lag_ms_bucket[5m])) > 100
|
|
156
|
+
for: 5m
|
|
157
|
+
labels:
|
|
158
|
+
severity: warning
|
|
159
|
+
category: resource
|
|
160
|
+
annotations:
|
|
161
|
+
summary: 'High event loop lag'
|
|
162
|
+
description: 'P95 event loop lag is {{ $value }}ms, above 100ms threshold'
|
|
163
|
+
|
|
164
|
+
# Security Alerts
|
|
165
|
+
- name: security_events
|
|
166
|
+
interval: 30s
|
|
167
|
+
rules:
|
|
168
|
+
- alert: HighAuthFailures
|
|
169
|
+
expr: rate(event_total{event_type="security.auth.failure"}[5m]) > 1
|
|
170
|
+
for: 2m
|
|
171
|
+
labels:
|
|
172
|
+
severity: warning
|
|
173
|
+
category: security
|
|
174
|
+
annotations:
|
|
175
|
+
summary: 'High authentication failure rate'
|
|
176
|
+
description: '{{ $value }} auth failures per second'
|
|
177
|
+
|
|
178
|
+
- alert: InjectionAttempt
|
|
179
|
+
expr: increase(event_total{event_type="security.injection.attempt"}[5m]) > 0
|
|
180
|
+
for: 1m
|
|
181
|
+
labels:
|
|
182
|
+
severity: critical
|
|
183
|
+
category: security
|
|
184
|
+
annotations:
|
|
185
|
+
summary: 'Injection attempt detected'
|
|
186
|
+
description: '{{ $value }} injection attempts in last 5 minutes'
|
|
187
|
+
|
|
188
|
+
- alert: RateLimitExceeded
|
|
189
|
+
expr: rate(event_total{event_type="security.rate_limit.exceeded"}[5m]) > 5
|
|
190
|
+
for: 5m
|
|
191
|
+
labels:
|
|
192
|
+
severity: warning
|
|
193
|
+
category: security
|
|
194
|
+
annotations:
|
|
195
|
+
summary: 'High rate limit violations'
|
|
196
|
+
description: '{{ $value }} rate limit violations per second'
|
|
197
|
+
|
|
198
|
+
- alert: UnauthorizedAccess
|
|
199
|
+
expr: increase(event_total{event_type="security.unauthorized_access"}[5m]) > 5
|
|
200
|
+
for: 2m
|
|
201
|
+
labels:
|
|
202
|
+
severity: critical
|
|
203
|
+
category: security
|
|
204
|
+
annotations:
|
|
205
|
+
summary: 'Unauthorized access attempts'
|
|
206
|
+
description: '{{ $value }} unauthorized access attempts in last 5 minutes'
|
|
207
|
+
|
|
208
|
+
# Error Rate Alerts
|
|
209
|
+
- name: error_rates
|
|
210
|
+
interval: 30s
|
|
211
|
+
rules:
|
|
212
|
+
- alert: HighErrorRate
|
|
213
|
+
expr: |
|
|
214
|
+
sum(rate(business_failures_by_type[5m])) by (operation) /
|
|
215
|
+
sum(rate(business_operations_total[5m])) by (operation) > 0.05
|
|
216
|
+
for: 5m
|
|
217
|
+
labels:
|
|
218
|
+
severity: warning
|
|
219
|
+
category: errors
|
|
220
|
+
annotations:
|
|
221
|
+
summary: 'High error rate for {{ $labels.operation }}'
|
|
222
|
+
description: 'Error rate is {{ $value | humanizePercentage }}, above 5% threshold'
|
|
223
|
+
|
|
224
|
+
- alert: CriticalErrorRate
|
|
225
|
+
expr: |
|
|
226
|
+
sum(rate(business_failures_by_type[5m])) by (operation) /
|
|
227
|
+
sum(rate(business_operations_total[5m])) by (operation) > 0.10
|
|
228
|
+
for: 2m
|
|
229
|
+
labels:
|
|
230
|
+
severity: critical
|
|
231
|
+
category: errors
|
|
232
|
+
annotations:
|
|
233
|
+
summary: 'CRITICAL: Error rate for {{ $labels.operation }}'
|
|
234
|
+
description: 'Error rate is {{ $value | humanizePercentage }}, above 10% threshold'
|
|
235
|
+
|
|
236
|
+
# Service Health
|
|
237
|
+
- name: service_health
|
|
238
|
+
interval: 30s
|
|
239
|
+
rules:
|
|
240
|
+
- alert: ServiceDown
|
|
241
|
+
expr: up{job=~"unrdf.*"} == 0
|
|
242
|
+
for: 1m
|
|
243
|
+
labels:
|
|
244
|
+
severity: critical
|
|
245
|
+
category: availability
|
|
246
|
+
annotations:
|
|
247
|
+
summary: 'Service {{ $labels.job }} is down'
|
|
248
|
+
description: 'Service {{ $labels.job }} on {{ $labels.instance }} has been down for 1 minute'
|
|
249
|
+
|
|
250
|
+
- alert: OTELCollectorDown
|
|
251
|
+
expr: up{job="otel-collector"} == 0
|
|
252
|
+
for: 1m
|
|
253
|
+
labels:
|
|
254
|
+
severity: critical
|
|
255
|
+
category: observability
|
|
256
|
+
annotations:
|
|
257
|
+
summary: 'OpenTelemetry Collector is down'
|
|
258
|
+
description: 'OTEL Collector is unreachable, metrics collection impaired'
|
|
259
|
+
|
|
260
|
+
- alert: MetricsStaleness
|
|
261
|
+
expr: |
|
|
262
|
+
time() - timestamp(business_operations_total) > 120
|
|
263
|
+
for: 2m
|
|
264
|
+
labels:
|
|
265
|
+
severity: warning
|
|
266
|
+
category: observability
|
|
267
|
+
annotations:
|
|
268
|
+
summary: 'Metrics are stale'
|
|
269
|
+
description: 'No metrics received for {{ $labels.operation }} in last 2 minutes'
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# Prometheus Scrape Configuration for UNRDF
|
|
2
|
+
#
|
|
3
|
+
# This configuration collects OpenTelemetry metrics from UNRDF services.
|
|
4
|
+
# It includes scrape jobs, relabeling rules, and service discovery.
|
|
5
|
+
|
|
6
|
+
global:
|
|
7
|
+
scrape_interval: 15s
|
|
8
|
+
scrape_timeout: 10s
|
|
9
|
+
evaluation_interval: 15s
|
|
10
|
+
external_labels:
|
|
11
|
+
cluster: 'unrdf-production'
|
|
12
|
+
environment: 'production'
|
|
13
|
+
|
|
14
|
+
# Alertmanager configuration
|
|
15
|
+
alerting:
|
|
16
|
+
alertmanagers:
|
|
17
|
+
- static_configs:
|
|
18
|
+
- targets:
|
|
19
|
+
- 'alertmanager:9093'
|
|
20
|
+
|
|
21
|
+
# Load alert rules
|
|
22
|
+
rule_files:
|
|
23
|
+
- 'alert-rules.yml'
|
|
24
|
+
|
|
25
|
+
# Scrape configurations
|
|
26
|
+
scrape_configs:
|
|
27
|
+
# UNRDF Core Service
|
|
28
|
+
- job_name: 'unrdf-core'
|
|
29
|
+
scrape_interval: 10s
|
|
30
|
+
metrics_path: '/metrics'
|
|
31
|
+
static_configs:
|
|
32
|
+
- targets:
|
|
33
|
+
- 'localhost:9464' # OTEL collector endpoint
|
|
34
|
+
relabel_configs:
|
|
35
|
+
- source_labels: [__address__]
|
|
36
|
+
target_label: instance
|
|
37
|
+
- source_labels: [__scheme__]
|
|
38
|
+
target_label: scheme
|
|
39
|
+
metric_relabel_configs:
|
|
40
|
+
# Keep only UNRDF metrics
|
|
41
|
+
- source_labels: [__name__]
|
|
42
|
+
regex: '(business|latency|throughput|resource|event)_.*'
|
|
43
|
+
action: keep
|
|
44
|
+
# Add service label
|
|
45
|
+
- target_label: service
|
|
46
|
+
replacement: 'unrdf-core'
|
|
47
|
+
|
|
48
|
+
# UNRDF Sidecar
|
|
49
|
+
- job_name: 'unrdf-sidecar'
|
|
50
|
+
scrape_interval: 10s
|
|
51
|
+
metrics_path: '/api/metrics'
|
|
52
|
+
static_configs:
|
|
53
|
+
- targets:
|
|
54
|
+
- 'localhost:3000'
|
|
55
|
+
relabel_configs:
|
|
56
|
+
- source_labels: [__address__]
|
|
57
|
+
target_label: instance
|
|
58
|
+
metric_relabel_configs:
|
|
59
|
+
- source_labels: [__name__]
|
|
60
|
+
regex: '(rate_limit|ddos|query|backpressure)_.*'
|
|
61
|
+
action: keep
|
|
62
|
+
- target_label: service
|
|
63
|
+
replacement: 'unrdf-sidecar'
|
|
64
|
+
|
|
65
|
+
# UNRDF Federation Nodes
|
|
66
|
+
- job_name: 'unrdf-federation'
|
|
67
|
+
scrape_interval: 15s
|
|
68
|
+
metrics_path: '/metrics'
|
|
69
|
+
static_configs:
|
|
70
|
+
- targets:
|
|
71
|
+
- 'federation-node-1:9464'
|
|
72
|
+
- 'federation-node-2:9464'
|
|
73
|
+
- 'federation-node-3:9464'
|
|
74
|
+
relabel_configs:
|
|
75
|
+
- source_labels: [__address__]
|
|
76
|
+
regex: '(.*):.*'
|
|
77
|
+
target_label: node
|
|
78
|
+
replacement: '$1'
|
|
79
|
+
metric_relabel_configs:
|
|
80
|
+
- target_label: service
|
|
81
|
+
replacement: 'unrdf-federation'
|
|
82
|
+
|
|
83
|
+
# OpenTelemetry Collector
|
|
84
|
+
- job_name: 'otel-collector'
|
|
85
|
+
scrape_interval: 10s
|
|
86
|
+
static_configs:
|
|
87
|
+
- targets:
|
|
88
|
+
- 'otel-collector:8888' # Collector self-metrics
|
|
89
|
+
metric_relabel_configs:
|
|
90
|
+
- source_labels: [__name__]
|
|
91
|
+
regex: 'otelcol_.*'
|
|
92
|
+
action: keep
|
|
93
|
+
|
|
94
|
+
# Kubernetes Service Discovery (optional)
|
|
95
|
+
- job_name: 'kubernetes-pods'
|
|
96
|
+
kubernetes_sd_configs:
|
|
97
|
+
- role: pod
|
|
98
|
+
relabel_configs:
|
|
99
|
+
# Only scrape pods with annotation prometheus.io/scrape: "true"
|
|
100
|
+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
|
|
101
|
+
action: keep
|
|
102
|
+
regex: 'true'  # quoted: the annotation value is matched as a string, not a YAML boolean
|
|
103
|
+
# Use custom scrape path if specified
|
|
104
|
+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
|
|
105
|
+
action: replace
|
|
106
|
+
target_label: __metrics_path__
|
|
107
|
+
regex: (.+)
|
|
108
|
+
# Use custom port if specified
|
|
109
|
+
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
|
110
|
+
action: replace
|
|
111
|
+
regex: ([^:]+)(?::\d+)?;(\d+)
|
|
112
|
+
replacement: $1:$2
|
|
113
|
+
target_label: __address__
|
|
114
|
+
# Add pod labels
|
|
115
|
+
- action: labelmap
|
|
116
|
+
regex: __meta_kubernetes_pod_label_(.+)
|
|
117
|
+
- source_labels: [__meta_kubernetes_namespace]
|
|
118
|
+
target_label: kubernetes_namespace
|
|
119
|
+
- source_labels: [__meta_kubernetes_pod_name]
|
|
120
|
+
target_label: kubernetes_pod_name
|
|
121
|
+
- source_labels: [__meta_kubernetes_pod_node_name]
|
|
122
|
+
target_label: kubernetes_node
|
|
123
|
+
|
|
124
|
+
# Remote write configuration (for long-term storage)
|
|
125
|
+
remote_write:
|
|
126
|
+
- url: 'http://remote-storage:9090/api/v1/write'
|
|
127
|
+
queue_config:
|
|
128
|
+
max_samples_per_send: 10000
|
|
129
|
+
batch_send_deadline: 5s
|
|
130
|
+
max_shards: 10
|
|
131
|
+
min_shards: 1
|
|
132
|
+
write_relabel_configs:
|
|
133
|
+
# Drop high-cardinality metrics
|
|
134
|
+
- source_labels: [__name__]
|
|
135
|
+
regex: '.*_bucket|.*_count|.*_sum'
|
|
136
|
+
action: drop
|