predicate-claw 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. package/.github/workflows/release.yml +76 -0
  2. package/.github/workflows/tests.yml +34 -0
  3. package/.markdownlint.yaml +5 -0
  4. package/.pre-commit-config.yaml +100 -0
  5. package/README.md +405 -0
  6. package/dist/src/adapter.d.ts +17 -0
  7. package/dist/src/adapter.js +36 -0
  8. package/dist/src/authority-client.d.ts +21 -0
  9. package/dist/src/authority-client.js +22 -0
  10. package/dist/src/circuit-breaker.d.ts +86 -0
  11. package/dist/src/circuit-breaker.js +174 -0
  12. package/dist/src/config.d.ts +8 -0
  13. package/dist/src/config.js +7 -0
  14. package/dist/src/control-plane-sync.d.ts +57 -0
  15. package/dist/src/control-plane-sync.js +99 -0
  16. package/dist/src/errors.d.ts +6 -0
  17. package/dist/src/errors.js +6 -0
  18. package/dist/src/index.d.ts +12 -0
  19. package/dist/src/index.js +12 -0
  20. package/dist/src/non-web-evidence.d.ts +46 -0
  21. package/dist/src/non-web-evidence.js +54 -0
  22. package/dist/src/openclaw-hooks.d.ts +27 -0
  23. package/dist/src/openclaw-hooks.js +54 -0
  24. package/dist/src/openclaw-plugin-api.d.ts +18 -0
  25. package/dist/src/openclaw-plugin-api.js +17 -0
  26. package/dist/src/provider.d.ts +48 -0
  27. package/dist/src/provider.js +154 -0
  28. package/dist/src/runtime-integration.d.ts +20 -0
  29. package/dist/src/runtime-integration.js +43 -0
  30. package/dist/src/web-evidence.d.ts +48 -0
  31. package/dist/src/web-evidence.js +49 -0
  32. package/dist/tests/adapter.test.d.ts +1 -0
  33. package/dist/tests/adapter.test.js +63 -0
  34. package/dist/tests/audit-event-e2e.test.d.ts +1 -0
  35. package/dist/tests/audit-event-e2e.test.js +209 -0
  36. package/dist/tests/authority-client.test.d.ts +1 -0
  37. package/dist/tests/authority-client.test.js +46 -0
  38. package/dist/tests/circuit-breaker.test.d.ts +1 -0
  39. package/dist/tests/circuit-breaker.test.js +200 -0
  40. package/dist/tests/control-plane-sync.test.d.ts +1 -0
  41. package/dist/tests/control-plane-sync.test.js +90 -0
  42. package/dist/tests/hack-vs-fix-demo.test.d.ts +1 -0
  43. package/dist/tests/hack-vs-fix-demo.test.js +36 -0
  44. package/dist/tests/jwks-rotation.test.d.ts +1 -0
  45. package/dist/tests/jwks-rotation.test.js +232 -0
  46. package/dist/tests/load-latency.test.d.ts +1 -0
  47. package/dist/tests/load-latency.test.js +175 -0
  48. package/dist/tests/multi-tenant-isolation.test.d.ts +1 -0
  49. package/dist/tests/multi-tenant-isolation.test.js +146 -0
  50. package/dist/tests/non-web-evidence.test.d.ts +1 -0
  51. package/dist/tests/non-web-evidence.test.js +139 -0
  52. package/dist/tests/openclaw-hooks.test.d.ts +1 -0
  53. package/dist/tests/openclaw-hooks.test.js +38 -0
  54. package/dist/tests/openclaw-plugin-api.test.d.ts +1 -0
  55. package/dist/tests/openclaw-plugin-api.test.js +40 -0
  56. package/dist/tests/provider.test.d.ts +1 -0
  57. package/dist/tests/provider.test.js +190 -0
  58. package/dist/tests/runtime-integration.test.d.ts +1 -0
  59. package/dist/tests/runtime-integration.test.js +57 -0
  60. package/dist/tests/web-evidence.test.d.ts +1 -0
  61. package/dist/tests/web-evidence.test.js +89 -0
  62. package/docs/MIGRATION_GUIDE.md +405 -0
  63. package/docs/OPERATIONAL_RUNBOOK.md +389 -0
  64. package/docs/PRODUCTION_READINESS.md +134 -0
  65. package/docs/SLO_THRESHOLDS.md +193 -0
  66. package/examples/README.md +171 -0
  67. package/examples/docker/Dockerfile.test +16 -0
  68. package/examples/docker/README.md +48 -0
  69. package/examples/docker/docker-compose.test.yml +16 -0
  70. package/examples/non-web-evidence-demo.ts +184 -0
  71. package/examples/openclaw-plugin-smoke/index.ts +30 -0
  72. package/examples/openclaw-plugin-smoke/openclaw.plugin.json +11 -0
  73. package/examples/openclaw-plugin-smoke/package.json +9 -0
  74. package/examples/openclaw_integration_example.py +41 -0
  75. package/examples/policy/README.md +165 -0
  76. package/examples/policy/approved-hosts.yaml +137 -0
  77. package/examples/policy/dev-workflow.yaml +206 -0
  78. package/examples/policy/policy.example.yaml +17 -0
  79. package/examples/policy/production-strict.yaml +97 -0
  80. package/examples/policy/sensitive-paths.yaml +114 -0
  81. package/examples/policy/source-trust.yaml +129 -0
  82. package/examples/policy/workspace-isolation.yaml +51 -0
  83. package/examples/runtime_registry_example.py +75 -0
  84. package/package.json +27 -0
  85. package/pyproject.toml +41 -0
  86. package/src/adapter.ts +45 -0
  87. package/src/authority-client.ts +50 -0
  88. package/src/circuit-breaker.ts +245 -0
  89. package/src/config.ts +15 -0
  90. package/src/control-plane-sync.ts +159 -0
  91. package/src/errors.ts +5 -0
  92. package/src/index.ts +12 -0
  93. package/src/non-web-evidence.ts +116 -0
  94. package/src/openclaw-hooks.ts +76 -0
  95. package/src/openclaw-plugin-api.ts +51 -0
  96. package/src/openclaw_predicate_provider/__init__.py +16 -0
  97. package/src/openclaw_predicate_provider/__main__.py +5 -0
  98. package/src/openclaw_predicate_provider/adapter.py +84 -0
  99. package/src/openclaw_predicate_provider/agentidentity_backend.py +78 -0
  100. package/src/openclaw_predicate_provider/cli.py +160 -0
  101. package/src/openclaw_predicate_provider/config.py +42 -0
  102. package/src/openclaw_predicate_provider/errors.py +13 -0
  103. package/src/openclaw_predicate_provider/integrations/__init__.py +5 -0
  104. package/src/openclaw_predicate_provider/integrations/openclaw_runtime.py +74 -0
  105. package/src/openclaw_predicate_provider/models.py +19 -0
  106. package/src/openclaw_predicate_provider/openclaw_hooks.py +75 -0
  107. package/src/openclaw_predicate_provider/provider.py +69 -0
  108. package/src/openclaw_predicate_provider/py.typed +1 -0
  109. package/src/openclaw_predicate_provider/sidecar.py +59 -0
  110. package/src/provider.ts +220 -0
  111. package/src/runtime-integration.ts +68 -0
  112. package/src/web-evidence.ts +95 -0
  113. package/tests/adapter.test.ts +76 -0
  114. package/tests/audit-event-e2e.test.ts +258 -0
  115. package/tests/authority-client.test.ts +52 -0
  116. package/tests/circuit-breaker.test.ts +266 -0
  117. package/tests/conftest.py +9 -0
  118. package/tests/control-plane-sync.test.ts +114 -0
  119. package/tests/hack-vs-fix-demo.test.ts +44 -0
  120. package/tests/jwks-rotation.test.ts +274 -0
  121. package/tests/load-latency.test.ts +214 -0
  122. package/tests/multi-tenant-isolation.test.ts +183 -0
  123. package/tests/non-web-evidence.test.ts +168 -0
  124. package/tests/openclaw-hooks.test.ts +46 -0
  125. package/tests/openclaw-plugin-api.test.ts +50 -0
  126. package/tests/provider.test.ts +227 -0
  127. package/tests/runtime-integration.test.ts +70 -0
  128. package/tests/test_adapter.py +46 -0
  129. package/tests/test_cli.py +26 -0
  130. package/tests/test_openclaw_hooks.py +53 -0
  131. package/tests/test_provider.py +59 -0
  132. package/tests/test_runtime_integration.py +77 -0
  133. package/tests/test_sidecar_client.py +198 -0
  134. package/tests/web-evidence.test.ts +113 -0
  135. package/tsconfig.json +14 -0
  136. package/vitest.config.ts +7 -0
@@ -0,0 +1,389 @@
1
+ # Operational Runbook
2
+
3
+ This runbook provides step-by-step procedures for operating and troubleshooting
4
+ the OpenClaw Predicate Provider in production environments.
5
+
6
+ ## Quick Reference
7
+
8
+ | Incident Type | Severity | First Response |
9
+ |---------------|----------|----------------|
10
+ | Circuit breaker open | P1 | Check sidecar health |
11
+ | Elevated deny rate | P2 | Compare to policy changes |
12
+ | High latency | P3 | Check sidecar resources |
13
+ | Audit export failures | P4 | Check control plane connectivity |
14
+
15
+ ## Prerequisites
16
+
17
+ Before using this runbook, ensure you have:
18
+
19
+ - Access to provider logs and metrics dashboards
20
+ - Access to sidecar logs (`predicate-authorityd`)
21
+ - Ability to restart provider/sidecar processes
22
+ - Contact information for on-call escalation
23
+
24
+ ## Incident Response Procedures
25
+
26
+ ### P1: Circuit Breaker Stuck Open
27
+
28
+ **Symptoms:**
29
+ - All authorization requests failing immediately
30
+ - `CircuitOpenError` in provider logs
31
+ - Metrics showing `predicate_circuit_state = open`
32
+
33
+ **Diagnosis Steps:**
34
+
35
+ 1. **Check sidecar health**
36
+ ```bash
37
+ curl -s http://localhost:8787/health | jq .
38
+ ```
39
+ Expected: a JSON body containing `"status": "healthy"` (alongside `policy_version` and `last_sync`)
40
+
41
+ 2. **Check sidecar logs for errors**
42
+ ```bash
43
+ journalctl -u predicate-authorityd -n 100 --no-pager
44
+ # or
45
+ docker logs predicate-authorityd --tail 100
46
+ ```
47
+
48
+ 3. **Verify network connectivity**
49
+ ```bash
50
+ curl -s -o /dev/null -w "http_code=%{http_code} connect=%{time_connect}s total=%{time_total}s\n" http://localhost:8787/health
51
+ ```
52
+
53
+ 4. **Check control plane sync status**
54
+ ```bash
55
+ curl -s http://localhost:8787/v1/sync/status | jq .
56
+ ```
57
+
58
+ **Resolution Steps:**
59
+
60
+ 1. **If sidecar is unhealthy:**
61
+ ```bash
62
+ # Restart sidecar
63
+ systemctl restart predicate-authorityd
64
+ # or
65
+ docker restart predicate-authorityd
66
+ ```
67
+
68
+ 2. **If sidecar is healthy but circuit is still open:**
69
+ - Circuit will auto-recover after `resetTimeoutMs` (default: 30s)
70
+ - For immediate recovery, restart the provider process
71
+
72
+ 3. **If control plane sync is failing:**
73
+ - Check control plane endpoint accessibility
74
+ - Verify API credentials are valid
75
+ - Check for control plane service incidents
76
+
77
+ **Escalation:**
78
+ - If not resolved in 5 minutes, page on-call engineer
79
+ - If sidecar restart doesn't help, escalate to platform team
80
+
81
+ ---
82
+
83
+ ### P2: Elevated Deny Rate
84
+
85
+ **Symptoms:**
86
+ - Sudden increase in deny decisions (>2x baseline)
87
+ - User reports of blocked actions
88
+ - `denied_by_policy` reason code spike
89
+
90
+ **Diagnosis Steps:**
91
+
92
+ 1. **Check deny rate trend**
93
+ ```bash
94
+ # Query recent deny events
95
+ curl -s "http://localhost:8787/v1/audit/decisions?outcome=deny&limit=50" | jq .
96
+ ```
97
+
98
+ 2. **Compare to recent policy changes**
99
+ - Check control plane for recent policy deployments
100
+ - Review policy version in metrics
101
+
102
+ 3. **Identify affected actions/resources**
103
+ ```bash
104
+ # Group denials by action
105
+ curl -s "http://localhost:8787/v1/audit/decisions?outcome=deny" | \
106
+ jq -r '.items | group_by(.action) | map({action: .[0].action, count: length})'
107
+ ```
108
+
109
+ 4. **Check for attack patterns**
110
+ - Look for repeated denials from same principal
111
+ - Check for unusual resource patterns (path traversal, etc.)
112
+
113
+ **Resolution Steps:**
114
+
115
+ 1. **If caused by policy change:**
116
+ - Rollback to previous policy version via control plane
117
+ - Or fix policy and redeploy
118
+
119
+ 2. **If attack attempt:**
120
+ - Document attack patterns
121
+ - Consider adding rate limiting
122
+ - Report to security team
123
+
124
+ 3. **If false positives:**
125
+ - Review policy rules for overly broad denials
126
+ - Add specific allow rules for legitimate use cases
127
+
128
+ **Escalation:**
129
+ - If attack suspected, notify security team immediately
130
+ - If policy rollback needed, coordinate with policy owners
131
+
132
+ ---
133
+
134
+ ### P3: High Authorization Latency
135
+
136
+ **Symptoms:**
137
+ - p95 latency > 150ms
138
+ - Slow tool execution reported by users
139
+ - Timeout errors in logs
140
+
141
+ **Diagnosis Steps:**
142
+
143
+ 1. **Check current latency percentiles**
144
+ ```bash
145
+ curl -s http://localhost:8787/metrics | grep predicate_auth_latency
146
+ ```
147
+
148
+ 2. **Check sidecar resource usage**
149
+ ```bash
150
+ # CPU and memory
151
+ top -p "$(pgrep -d, -f predicate-authorityd)"
152
+ # or
153
+ docker stats predicate-authorityd --no-stream
154
+ ```
155
+
156
+ 3. **Check control plane sync load**
157
+ ```bash
158
+ curl -s http://localhost:8787/v1/sync/status | jq '.last_sync_duration_ms'
159
+ ```
160
+
161
+ 4. **Check concurrent request volume**
162
+ ```bash
163
+ curl -s http://localhost:8787/metrics | grep predicate_auth_concurrent
164
+ ```
165
+
166
+ **Resolution Steps:**
167
+
168
+ 1. **If sidecar CPU is high:**
169
+ - Check for runaway policy evaluation
170
+ - Consider scaling sidecar resources
171
+ - Review policy complexity
172
+
173
+ 2. **If sync is slow:**
174
+ - Check control plane latency
175
+ - Consider increasing sync interval
176
+ - Review policy size
177
+
178
+ 3. **If high concurrent load:**
179
+ - Consider horizontal scaling
180
+ - Review request batching options
181
+ - Check for retry storms
182
+
183
+ **Escalation:**
184
+ - If resources are maxed, request capacity increase
185
+ - If policy is too complex, work with policy team to optimize
186
+
187
+ ---
188
+
189
+ ### P4: Audit Export Failures
190
+
191
+ **Symptoms:**
192
+ - Missing audit events in control plane
193
+ - `audit_export_failure` in logs
194
+ - Non-zero `predicate_audit_exports{status="failure"}` counter
195
+
196
+ **Diagnosis Steps:**
197
+
198
+ 1. **Check export error logs**
199
+ ```bash
200
+ grep -E "audit.*(error|failure)" /var/log/openclaw-provider/provider.log | tail -20
201
+ ```
202
+
203
+ 2. **Verify control plane connectivity**
204
+ ```bash
205
+ curl -s https://control-plane.example.com/health
206
+ ```
207
+
208
+ 3. **Check export queue depth**
209
+ ```bash
210
+ curl -s http://localhost:8787/metrics | grep predicate_audit_queue
211
+ ```
212
+
213
+ **Resolution Steps:**
214
+
215
+ 1. **If control plane unreachable:**
216
+ - Check network/firewall rules
217
+ - Verify TLS certificates
218
+ - Check for control plane incidents
219
+
220
+ 2. **If queue is backed up:**
221
+ - Audit export is best-effort; auth continues working
222
+ - Events will retry automatically
223
+ - Check disk space for local buffer
224
+
225
+ 3. **If credentials expired:**
226
+ - Rotate API credentials
227
+ - Update provider configuration
228
+ - Restart provider
229
+
230
+ **Escalation:**
231
+ - Audit failures are P4 (non-blocking)
232
+ - Escalate only if prolonged (>1 hour) or compliance-critical
233
+
234
+ ---
235
+
236
+ ## Routine Operations
237
+
238
+ ### Restarting the Provider
239
+
240
+ ```bash
241
+ # Graceful restart (allows in-flight requests to complete)
242
+ systemctl reload openclaw-provider
243
+
244
+ # Full restart
245
+ systemctl restart openclaw-provider
246
+ ```
247
+
248
+ ### Rotating Credentials
249
+
250
+ 1. Generate new credentials in control plane
251
+ 2. Update provider configuration
252
+ 3. Restart provider
253
+ 4. Verify connectivity
254
+ 5. Revoke old credentials
255
+
256
+ ### Updating Policy
257
+
258
+ 1. Deploy new policy to control plane
259
+ 2. Monitor sync status on sidecars
260
+ 3. Watch deny rate for anomalies
261
+ 4. Rollback if issues detected
262
+
263
+ ### Scaling Sidecars
264
+
265
+ For high-load environments:
266
+
267
+ 1. Deploy additional sidecar instances
268
+ 2. Configure load balancer
269
+ 3. Update provider `baseUrl` to load balancer
270
+ 4. Verify even distribution
271
+
272
+ ---
273
+
274
+ ## Health Checks
275
+
276
+ ### Provider Health
277
+
278
+ ```bash
279
+ # Local provider health
280
+ curl -s http://localhost:3000/health
281
+
282
+ # Expected response
283
+ {
284
+ "status": "healthy",
285
+ "sidecar": "connected",
286
+ "circuit": "closed"
287
+ }
288
+ ```
289
+
290
+ ### Sidecar Health
291
+
292
+ ```bash
293
+ # Sidecar health
294
+ curl -s http://localhost:8787/health
295
+
296
+ # Expected response
297
+ {
298
+ "status": "healthy",
299
+ "policy_version": "v1.2.3",
300
+ "last_sync": "2026-02-20T12:00:00Z"
301
+ }
302
+ ```
303
+
304
+ ### End-to-End Check
305
+
306
+ ```bash
307
+ # Test authorization flow
308
+ curl -X POST http://localhost:8787/v1/authorize \
309
+ -H "Content-Type: application/json" \
310
+ -d '{
311
+ "principal": "test:health-check",
312
+ "action": "health.check",
313
+ "resource": "system"
314
+ }'
315
+
316
+ # Expected: allow decision for health check action
317
+ ```
318
+
319
+ ---
320
+
321
+ ## Monitoring Checklist
322
+
323
+ ### Daily
324
+
325
+ - [ ] Review deny rate trends
326
+ - [ ] Check circuit breaker state
327
+ - [ ] Verify audit export completeness
328
+
329
+ ### Weekly
330
+
331
+ - [ ] Review latency percentiles
332
+ - [ ] Check policy sync freshness
333
+ - [ ] Audit access logs
334
+
335
+ ### Monthly
336
+
337
+ - [ ] Review and update SLO thresholds
338
+ - [ ] Test incident response procedures
339
+ - [ ] Update runbook with learnings
340
+
341
+ ---
342
+
343
+ ## Contact Information
344
+
345
+ | Role | Contact |
346
+ |------|---------|
347
+ | On-call engineer | PagerDuty: `predicate-oncall` |
348
+ | Platform team | Slack: `#predicate-platform` |
349
+ | Security team | Slack: `#security-incidents` |
350
+ | Control plane status | https://status.predicatesystems.ai |
351
+
352
+ ---
353
+
354
+ ## Appendix
355
+
356
+ ### Useful Commands
357
+
358
+ ```bash
359
+ # View real-time logs
360
+ journalctl -u predicate-authorityd -f
361
+
362
+ # Check process status
363
+ systemctl status predicate-authorityd
364
+
365
+ # View metrics
366
+ curl -s http://localhost:8787/metrics
367
+
368
+ # Force policy sync
369
+ curl -X POST http://localhost:8787/v1/sync/trigger
370
+
371
+ # Get current policy version
372
+ curl -s http://localhost:8787/v1/policy/version
373
+ ```
374
+
375
+ ### Log Locations
376
+
377
+ | Component | Log Path |
378
+ |-----------|----------|
379
+ | Provider | `/var/log/openclaw-provider/provider.log` |
380
+ | Sidecar | `/var/log/predicate-authorityd/sidecar.log` |
381
+ | Audit events | `/var/log/predicate-authorityd/audit.jsonl` |
382
+
383
+ ### Configuration Files
384
+
385
+ | Component | Config Path |
386
+ |-----------|-------------|
387
+ | Provider | `/etc/openclaw-provider/config.yaml` |
388
+ | Sidecar | `/etc/predicate-authorityd/config.yaml` |
389
+ | Policy | Managed via control plane |
@@ -0,0 +1,134 @@
1
+ # Production Readiness Checklist
2
+
3
+ This document tracks production readiness criteria for the OpenClaw Predicate
4
+ Provider. All items must be verified before GA release.
5
+
6
+ **Status:** Ready for review
7
+ **Last Updated:** 2026-02-20
8
+ **Owner:** Platform Security
9
+
10
+ ## 1. Security Posture
11
+
12
+ | Criteria | Status | Evidence |
13
+ |----------|--------|----------|
14
+ | Fail-closed default for high-risk actions | ✓ | `provider.ts` throws on sidecar errors |
15
+ | No embedded signing keys in plugin | ✓ | Keys remain in sidecar/control plane |
16
+ | Log redaction for sensitive values | ✓ | Tests in `audit-event-e2e.test.ts` |
17
+ | SecurityError returns redacted reasons | ✓ | `errors.ts` implementation |
18
+ | Path traversal protection | ✓ | Tests in `hack-vs-fix-demo.test.ts` |
19
+ | Prompt injection blocking | ✓ | Tests in `hack-vs-fix-demo.test.ts` |
20
+
21
+ **Security Signoff:** _________________ Date: _________
22
+
23
+ ## 2. Reliability
24
+
25
+ | Criteria | Status | Evidence |
26
+ |----------|--------|----------|
27
+ | Circuit breaker for sidecar outages | ✓ | `circuit-breaker.ts` |
28
+ | Exponential backoff with jitter | ✓ | `calculateBackoff()` function |
29
+ | Configurable timeouts | ✓ | `config.ts` (300ms default) |
30
+ | Graceful degradation on sync failure | ✓ | Local policy evaluation continues |
31
+ | Load tested (100 sequential, 50 concurrent) | ✓ | `load-latency.test.ts` |
32
+
33
+ ## 3. Observability
34
+
35
+ | Criteria | Status | Evidence |
36
+ |----------|--------|----------|
37
+ | Decision telemetry (allow/deny/error) | ✓ | `provider.ts` telemetry hooks |
38
+ | Latency metrics | ✓ | `load-latency.test.ts` p50/p95 |
39
+ | Circuit breaker state metrics | ✓ | `CircuitBreaker.getMetrics()` |
40
+ | Audit export integration | ✓ | `audit-event-e2e.test.ts` |
41
+ | Correlation IDs (session, tenant, trace) | ✓ | `multi-tenant-isolation.test.ts` |
42
+
43
+ ## 4. SLOs and Alerting
44
+
45
+ | Criteria | Status | Evidence |
46
+ |----------|--------|----------|
47
+ | Latency SLOs defined (p50 <25ms, p95 <75ms) | ✓ | `docs/SLO_THRESHOLDS.md` |
48
+ | Availability SLOs defined (99.9%) | ✓ | `docs/SLO_THRESHOLDS.md` |
49
+ | Alert thresholds documented | ✓ | `docs/SLO_THRESHOLDS.md` |
50
+ | Circuit breaker alert thresholds | ✓ | `docs/SLO_THRESHOLDS.md` |
51
+ | Deny spike detection criteria | ✓ | `docs/SLO_THRESHOLDS.md` |
52
+
53
+ ## 5. Operations
54
+
55
+ | Criteria | Status | Evidence |
56
+ |----------|--------|----------|
57
+ | Operational runbook | ✓ | `docs/OPERATIONAL_RUNBOOK.md` |
58
+ | P1-P4 incident procedures | ✓ | `docs/OPERATIONAL_RUNBOOK.md` |
59
+ | Health check endpoints | ✓ | Documented in runbook |
60
+ | Restart/recovery procedures | ✓ | `docs/OPERATIONAL_RUNBOOK.md` |
61
+ | Credential rotation procedures | ✓ | `docs/OPERATIONAL_RUNBOOK.md` |
62
+ | Scaling guidance | ✓ | `docs/OPERATIONAL_RUNBOOK.md` |
63
+
64
+ ## 6. Testing
65
+
66
+ | Criteria | Status | Evidence |
67
+ |----------|--------|----------|
68
+ | Unit tests | ✓ | 15 test files, all passing |
69
+ | Integration tests (sidecar wire format) | ✓ | `provider.test.ts` |
70
+ | Load/latency tests | ✓ | `load-latency.test.ts` |
71
+ | Multi-tenant isolation tests | ✓ | `multi-tenant-isolation.test.ts` |
72
+ | JWKS/key rotation tests | ✓ | `jwks-rotation.test.ts` |
73
+ | Adversarial/security tests | ✓ | `hack-vs-fix-demo.test.ts` |
74
+ | CI pipeline (Node 20/22) | ✓ | `.github/workflows/tests.yml` |
75
+
76
+ ## 7. Documentation
77
+
78
+ | Criteria | Status | Evidence |
79
+ |----------|--------|----------|
80
+ | API contract documented | ✓ | Design doc action/resource mapping |
81
+ | Fail-open/fail-closed policy table | ✓ | Design doc |
82
+ | SLO documentation | ✓ | `docs/SLO_THRESHOLDS.md` |
83
+ | Operational runbook | ✓ | `docs/OPERATIONAL_RUNBOOK.md` |
84
+ | Docker adversarial test guide | ✓ | `examples/README.md` |
85
+
86
+ ## 8. Control Plane Integration
87
+
88
+ | Criteria | Status | Evidence |
89
+ |----------|--------|----------|
90
+ | Policy sync client | ✓ | `control-plane-sync.ts` |
91
+ | Revocation propagation | ✓ | `control-plane-sync.ts` |
92
+ | Stale-sync observability | ✓ | `ControlPlaneSyncStatusTracker` |
93
+ | Audit export wiring | ✓ | `audit-event-e2e.test.ts` |
94
+
95
+ ## 9. Known Limitations
96
+
97
+ | Limitation | Impact | Planned Fix |
98
+ |------------|--------|-------------|
99
+ | `state_hash` not integrated into auth flow | Limits pre-execution state verification | Post-Phase 4 |
100
+ | No automatic sidecar discovery | Requires manual `baseUrl` config | Future enhancement |
101
+
102
+ ## Sign-off
103
+
104
+ ### Engineering Review
105
+
106
+ - [ ] All test suites passing
107
+ - [ ] Code review completed
108
+ - [ ] Performance benchmarks acceptable
109
+
110
+ **Engineering Lead:** _________________ Date: _________
111
+
112
+ ### Security Review
113
+
114
+ - [ ] Fail-closed behavior verified
115
+ - [ ] Log redaction verified
116
+ - [ ] No credential exposure risks
117
+
118
+ **Security Lead:** _________________ Date: _________
119
+
120
+ ### Operations Review
121
+
122
+ - [ ] Runbook reviewed and validated
123
+ - [ ] Alerting configured
124
+ - [ ] On-call procedures documented
125
+
126
+ **Operations Lead:** _________________ Date: _________
127
+
128
+ ### Final Approval
129
+
130
+ - [ ] All sections signed off
131
+ - [ ] No blocking issues
132
+ - [ ] Ready for GA release
133
+
134
+ **Release Manager:** _________________ Date: _________
@@ -0,0 +1,193 @@
1
+ # SLOs and Alert Thresholds
2
+
3
+ This document defines Service Level Objectives (SLOs) and alert thresholds for
4
+ the OpenClaw Predicate Provider in production deployments.
5
+
6
+ ## Latency SLOs
7
+
8
+ ### Authorization Call Latency
9
+
10
+ | Percentile | Target | Alert Threshold |
11
+ |------------|--------|-----------------|
12
+ | p50 | < 25 ms | > 50 ms |
13
+ | p95 | < 75 ms | > 150 ms |
14
+ | p99 | < 150 ms | > 300 ms |
15
+
16
+ These targets assume local sidecar deployment. For remote sidecar deployments,
17
+ add network RTT to each target.
18
+
19
+ ### Sidecar Timeout
20
+
21
+ - **Default timeout:** 300 ms
22
+ - **Hard timeout (fail-closed):** 500 ms
23
+
24
+ If the sidecar does not respond within the timeout, the provider fails closed
25
+ (denies the action) for high-risk operations.
26
+
27
+ ## Availability SLOs
28
+
29
+ ### Provider Availability
30
+
31
+ | Metric | Target | Alert Threshold |
32
+ |--------|--------|-----------------|
33
+ | Uptime | 99.9% | < 99.5% |
34
+ | Error rate | < 0.1% | > 1% |
35
+
36
+ ### Sidecar Availability
37
+
38
+ | Metric | Target | Alert Threshold |
39
+ |--------|--------|-----------------|
40
+ | Uptime | 99.95% | < 99.9% |
41
+ | Circuit breaker open rate | < 0.5% | > 2% |
42
+
43
+ ## Decision Quality SLOs
44
+
45
+ ### Deny Spike Detection
46
+
47
+ | Metric | Baseline | Alert Threshold |
48
+ |--------|----------|-----------------|
49
+ | Deny rate | ~5% (varies by policy) | > 2x baseline over 5 min |
50
+ | Deny rate spike | N/A | > 50% increase in 1 min |
51
+
52
+ A sudden spike in deny rates may indicate:
53
+ - Misconfigured policy rollout
54
+ - Attack attempt (should trigger investigation)
55
+ - Sidecar sync failure
56
+
57
+ ### Reason Code Distribution
58
+
59
+ Monitor reason code distribution for anomalies:
60
+
61
+ | Reason Code | Expected Range | Alert if |
62
+ |-------------|----------------|----------|
63
+ | `denied_by_policy` | 80-95% of denials | < 70% |
64
+ | `sidecar_timeout` | < 1% | > 5% |
65
+ | `circuit_open` | < 0.5% | > 2% |
66
+ | `missing_context` | < 0.1% | > 1% |
67
+
68
+ ## Circuit Breaker Thresholds
69
+
70
+ ### Default Configuration
71
+
72
+ ```typescript
73
+ {
74
+ failureThreshold: 5, // Opens after 5 consecutive failures
75
+ resetTimeoutMs: 30_000, // Attempts recovery after 30 seconds
76
+ successThreshold: 2, // Closes after 2 successful calls in half-open
77
+ }
78
+ ```
79
+
80
+ ### Alert Thresholds
81
+
82
+ | Event | Alert Level | Action |
83
+ |-------|-------------|--------|
84
+ | Circuit opened | Warning | Investigate sidecar health |
85
+ | Circuit open > 1 min | Critical | Page on-call |
86
+ | Circuit open > 5 min | Critical | Consider manual intervention |
87
+
88
+ ## Telemetry and Audit SLOs
89
+
90
+ ### Audit Export Latency
91
+
92
+ | Metric | Target | Alert Threshold |
93
+ |--------|--------|-----------------|
94
+ | Export delay (best-effort) | < 5 seconds | > 30 seconds |
95
+ | Export failure rate | < 1% | > 5% |
96
+
97
+ Note: Audit export is best-effort and should never block the authorization path.
98
+
99
+ ### Telemetry Completeness
100
+
101
+ | Metric | Target | Alert Threshold |
102
+ |--------|--------|-----------------|
103
+ | Decision events captured | > 99.9% | < 99% |
104
+ | Context fields present | > 99% | < 95% |
105
+
106
+ ## Control Plane Sync SLOs
107
+
108
+ ### Policy Sync
109
+
110
+ | Metric | Target | Alert Threshold |
111
+ |--------|--------|-----------------|
112
+ | Sync interval | < 60 seconds | > 5 minutes |
113
+ | Stale policy age | < 5 minutes | > 15 minutes |
114
+
115
+ ### Revocation Propagation
116
+
117
+ | Metric | Target | Alert Threshold |
118
+ |--------|--------|-----------------|
119
+ | Revocation latency | < 30 seconds | > 2 minutes |
120
+ | Revocation completeness | 100% | Any missed revocation |
121
+
122
+ ## Monitoring Implementation
123
+
124
+ ### Required Metrics
125
+
126
+ ```typescript
127
+ // Authorization metrics
128
+ counter("predicate_auth_total", { outcome: "allow" | "deny" | "error" });
129
+ histogram("predicate_auth_latency_ms", { action: string });
130
+
131
+ // Circuit breaker metrics
132
+ gauge("predicate_circuit_state", { state: "closed" | "open" | "half_open" });
133
+ counter("predicate_circuit_transitions", { from: string, to: string });
134
+
135
+ // Sync metrics
136
+ gauge("predicate_policy_age_seconds");
137
+ counter("predicate_sync_failures");
138
+
139
+ // Audit metrics
140
+ counter("predicate_audit_exports", { status: "success" | "failure" });
141
+ histogram("predicate_audit_latency_ms");
142
+ ```
143
+
144
+ ### Dashboard Panels
145
+
146
+ 1. **Authorization Overview**
147
+ - Request rate by action
148
+ - Allow/deny/error distribution
149
+ - p50/p95/p99 latency
150
+
151
+ 2. **Circuit Breaker Status**
152
+ - Current state per sidecar
153
+ - Transition history
154
+ - Recovery time
155
+
156
+ 3. **Sync Health**
157
+ - Policy version timeline
158
+ - Sync lag
159
+ - Revocation propagation
160
+
161
+ 4. **Deny Analysis**
162
+ - Deny rate over time
163
+ - Top deny reasons
164
+ - Deny by tenant/action
165
+
166
+ ## Incident Response
167
+
168
+ ### P1: Circuit Breaker Stuck Open
169
+
170
+ 1. Check sidecar health and logs
171
+ 2. Verify network connectivity
172
+ 3. Check control plane status
173
+ 4. Consider manual circuit reset if sidecar is healthy
174
+
175
+ ### P2: Elevated Deny Rate
176
+
177
+ 1. Compare to policy change timeline
178
+ 2. Check for attack patterns
179
+ 3. Review deny reasons distribution
180
+ 4. Validate policy sync status
181
+
182
+ ### P3: Elevated Latency
183
+
184
+ 1. Check sidecar resource usage
185
+ 2. Review concurrent request load
186
+ 3. Check control plane sync load
187
+ 4. Consider scaling sidecars
188
+
189
+ ## Review Cadence
190
+
191
+ - **Weekly:** Review latency percentiles and deny trends
192
+ - **Monthly:** Audit SLO compliance and adjust thresholds
193
+ - **Quarterly:** Review and update SLO targets based on operational learnings