predicate-claw 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/release.yml +76 -0
- package/.github/workflows/tests.yml +34 -0
- package/.markdownlint.yaml +5 -0
- package/.pre-commit-config.yaml +100 -0
- package/README.md +405 -0
- package/dist/src/adapter.d.ts +17 -0
- package/dist/src/adapter.js +36 -0
- package/dist/src/authority-client.d.ts +21 -0
- package/dist/src/authority-client.js +22 -0
- package/dist/src/circuit-breaker.d.ts +86 -0
- package/dist/src/circuit-breaker.js +174 -0
- package/dist/src/config.d.ts +8 -0
- package/dist/src/config.js +7 -0
- package/dist/src/control-plane-sync.d.ts +57 -0
- package/dist/src/control-plane-sync.js +99 -0
- package/dist/src/errors.d.ts +6 -0
- package/dist/src/errors.js +6 -0
- package/dist/src/index.d.ts +12 -0
- package/dist/src/index.js +12 -0
- package/dist/src/non-web-evidence.d.ts +46 -0
- package/dist/src/non-web-evidence.js +54 -0
- package/dist/src/openclaw-hooks.d.ts +27 -0
- package/dist/src/openclaw-hooks.js +54 -0
- package/dist/src/openclaw-plugin-api.d.ts +18 -0
- package/dist/src/openclaw-plugin-api.js +17 -0
- package/dist/src/provider.d.ts +48 -0
- package/dist/src/provider.js +154 -0
- package/dist/src/runtime-integration.d.ts +20 -0
- package/dist/src/runtime-integration.js +43 -0
- package/dist/src/web-evidence.d.ts +48 -0
- package/dist/src/web-evidence.js +49 -0
- package/dist/tests/adapter.test.d.ts +1 -0
- package/dist/tests/adapter.test.js +63 -0
- package/dist/tests/audit-event-e2e.test.d.ts +1 -0
- package/dist/tests/audit-event-e2e.test.js +209 -0
- package/dist/tests/authority-client.test.d.ts +1 -0
- package/dist/tests/authority-client.test.js +46 -0
- package/dist/tests/circuit-breaker.test.d.ts +1 -0
- package/dist/tests/circuit-breaker.test.js +200 -0
- package/dist/tests/control-plane-sync.test.d.ts +1 -0
- package/dist/tests/control-plane-sync.test.js +90 -0
- package/dist/tests/hack-vs-fix-demo.test.d.ts +1 -0
- package/dist/tests/hack-vs-fix-demo.test.js +36 -0
- package/dist/tests/jwks-rotation.test.d.ts +1 -0
- package/dist/tests/jwks-rotation.test.js +232 -0
- package/dist/tests/load-latency.test.d.ts +1 -0
- package/dist/tests/load-latency.test.js +175 -0
- package/dist/tests/multi-tenant-isolation.test.d.ts +1 -0
- package/dist/tests/multi-tenant-isolation.test.js +146 -0
- package/dist/tests/non-web-evidence.test.d.ts +1 -0
- package/dist/tests/non-web-evidence.test.js +139 -0
- package/dist/tests/openclaw-hooks.test.d.ts +1 -0
- package/dist/tests/openclaw-hooks.test.js +38 -0
- package/dist/tests/openclaw-plugin-api.test.d.ts +1 -0
- package/dist/tests/openclaw-plugin-api.test.js +40 -0
- package/dist/tests/provider.test.d.ts +1 -0
- package/dist/tests/provider.test.js +190 -0
- package/dist/tests/runtime-integration.test.d.ts +1 -0
- package/dist/tests/runtime-integration.test.js +57 -0
- package/dist/tests/web-evidence.test.d.ts +1 -0
- package/dist/tests/web-evidence.test.js +89 -0
- package/docs/MIGRATION_GUIDE.md +405 -0
- package/docs/OPERATIONAL_RUNBOOK.md +389 -0
- package/docs/PRODUCTION_READINESS.md +134 -0
- package/docs/SLO_THRESHOLDS.md +193 -0
- package/examples/README.md +171 -0
- package/examples/docker/Dockerfile.test +16 -0
- package/examples/docker/README.md +48 -0
- package/examples/docker/docker-compose.test.yml +16 -0
- package/examples/non-web-evidence-demo.ts +184 -0
- package/examples/openclaw-plugin-smoke/index.ts +30 -0
- package/examples/openclaw-plugin-smoke/openclaw.plugin.json +11 -0
- package/examples/openclaw-plugin-smoke/package.json +9 -0
- package/examples/openclaw_integration_example.py +41 -0
- package/examples/policy/README.md +165 -0
- package/examples/policy/approved-hosts.yaml +137 -0
- package/examples/policy/dev-workflow.yaml +206 -0
- package/examples/policy/policy.example.yaml +17 -0
- package/examples/policy/production-strict.yaml +97 -0
- package/examples/policy/sensitive-paths.yaml +114 -0
- package/examples/policy/source-trust.yaml +129 -0
- package/examples/policy/workspace-isolation.yaml +51 -0
- package/examples/runtime_registry_example.py +75 -0
- package/package.json +27 -0
- package/pyproject.toml +41 -0
- package/src/adapter.ts +45 -0
- package/src/authority-client.ts +50 -0
- package/src/circuit-breaker.ts +245 -0
- package/src/config.ts +15 -0
- package/src/control-plane-sync.ts +159 -0
- package/src/errors.ts +5 -0
- package/src/index.ts +12 -0
- package/src/non-web-evidence.ts +116 -0
- package/src/openclaw-hooks.ts +76 -0
- package/src/openclaw-plugin-api.ts +51 -0
- package/src/openclaw_predicate_provider/__init__.py +16 -0
- package/src/openclaw_predicate_provider/__main__.py +5 -0
- package/src/openclaw_predicate_provider/adapter.py +84 -0
- package/src/openclaw_predicate_provider/agentidentity_backend.py +78 -0
- package/src/openclaw_predicate_provider/cli.py +160 -0
- package/src/openclaw_predicate_provider/config.py +42 -0
- package/src/openclaw_predicate_provider/errors.py +13 -0
- package/src/openclaw_predicate_provider/integrations/__init__.py +5 -0
- package/src/openclaw_predicate_provider/integrations/openclaw_runtime.py +74 -0
- package/src/openclaw_predicate_provider/models.py +19 -0
- package/src/openclaw_predicate_provider/openclaw_hooks.py +75 -0
- package/src/openclaw_predicate_provider/provider.py +69 -0
- package/src/openclaw_predicate_provider/py.typed +1 -0
- package/src/openclaw_predicate_provider/sidecar.py +59 -0
- package/src/provider.ts +220 -0
- package/src/runtime-integration.ts +68 -0
- package/src/web-evidence.ts +95 -0
- package/tests/adapter.test.ts +76 -0
- package/tests/audit-event-e2e.test.ts +258 -0
- package/tests/authority-client.test.ts +52 -0
- package/tests/circuit-breaker.test.ts +266 -0
- package/tests/conftest.py +9 -0
- package/tests/control-plane-sync.test.ts +114 -0
- package/tests/hack-vs-fix-demo.test.ts +44 -0
- package/tests/jwks-rotation.test.ts +274 -0
- package/tests/load-latency.test.ts +214 -0
- package/tests/multi-tenant-isolation.test.ts +183 -0
- package/tests/non-web-evidence.test.ts +168 -0
- package/tests/openclaw-hooks.test.ts +46 -0
- package/tests/openclaw-plugin-api.test.ts +50 -0
- package/tests/provider.test.ts +227 -0
- package/tests/runtime-integration.test.ts +70 -0
- package/tests/test_adapter.py +46 -0
- package/tests/test_cli.py +26 -0
- package/tests/test_openclaw_hooks.py +53 -0
- package/tests/test_provider.py +59 -0
- package/tests/test_runtime_integration.py +77 -0
- package/tests/test_sidecar_client.py +198 -0
- package/tests/web-evidence.test.ts +113 -0
- package/tsconfig.json +14 -0
- package/vitest.config.ts +7 -0
|
@@ -0,0 +1,389 @@
|
|
|
1
|
+
# Operational Runbook
|
|
2
|
+
|
|
3
|
+
This runbook provides step-by-step procedures for operating and troubleshooting
|
|
4
|
+
the OpenClaw Predicate Provider in production environments.
|
|
5
|
+
|
|
6
|
+
## Quick Reference
|
|
7
|
+
|
|
8
|
+
| Incident Type | Severity | First Response |
|
|
9
|
+
|---------------|----------|----------------|
|
|
10
|
+
| Circuit breaker open | P1 | Check sidecar health |
|
|
11
|
+
| Elevated deny rate | P2 | Compare to policy changes |
|
|
12
|
+
| High latency | P3 | Check sidecar resources |
|
|
13
|
+
| Audit export failures | P4 | Check control plane connectivity |
|
|
14
|
+
|
|
15
|
+
## Prerequisites
|
|
16
|
+
|
|
17
|
+
Before using this runbook, ensure you have:
|
|
18
|
+
|
|
19
|
+
- Access to provider logs and metrics dashboards
|
|
20
|
+
- Access to sidecar logs (`predicate-authorityd`)
|
|
21
|
+
- Ability to restart provider/sidecar processes
|
|
22
|
+
- Contact information for on-call escalation
|
|
23
|
+
|
|
24
|
+
## Incident Response Procedures
|
|
25
|
+
|
|
26
|
+
### P1: Circuit Breaker Stuck Open
|
|
27
|
+
|
|
28
|
+
**Symptoms:**
|
|
29
|
+
- All authorization requests failing immediately
|
|
30
|
+
- `CircuitOpenError` in provider logs
|
|
31
|
+
- Metrics showing `predicate_circuit_state = open`
|
|
32
|
+
|
|
33
|
+
**Diagnosis Steps:**
|
|
34
|
+
|
|
35
|
+
1. **Check sidecar health**
|
|
36
|
+
```bash
|
|
37
|
+
curl -s http://localhost:8787/health | jq .
|
|
38
|
+
```
|
|
39
|
+
Expected: `{"status": "healthy"}`
|
|
40
|
+
|
|
41
|
+
2. **Check sidecar logs for errors**
|
|
42
|
+
```bash
|
|
43
|
+
journalctl -u predicate-authorityd -n 100 --no-pager
|
|
44
|
+
# or
|
|
45
|
+
docker logs predicate-authorityd --tail 100
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
3. **Verify network connectivity**
|
|
49
|
+
```bash
|
|
50
|
+
curl -w "connect=%{time_connect}s total=%{time_total}s http=%{http_code}\n" -s -o /dev/null http://localhost:8787/health
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
4. **Check control plane sync status**
|
|
54
|
+
```bash
|
|
55
|
+
curl -s http://localhost:8787/v1/sync/status | jq .
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
**Resolution Steps:**
|
|
59
|
+
|
|
60
|
+
1. **If sidecar is unhealthy:**
|
|
61
|
+
```bash
|
|
62
|
+
# Restart sidecar
|
|
63
|
+
systemctl restart predicate-authorityd
|
|
64
|
+
# or
|
|
65
|
+
docker restart predicate-authorityd
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
2. **If sidecar is healthy but circuit is still open:**
|
|
69
|
+
- Circuit moves to half-open after `resetTimeoutMs` (default: 30s) and closes again after `successThreshold` (default: 2) successful calls
|
|
70
|
+
- For immediate recovery, restart the provider process
|
|
71
|
+
|
|
72
|
+
3. **If control plane sync is failing:**
|
|
73
|
+
- Check control plane endpoint accessibility
|
|
74
|
+
- Verify API credentials are valid
|
|
75
|
+
- Check for control plane service incidents
|
|
76
|
+
|
|
77
|
+
**Escalation:**
|
|
78
|
+
- If not resolved in 5 minutes, page on-call engineer
|
|
79
|
+
- If sidecar restart doesn't help, escalate to platform team
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
### P2: Elevated Deny Rate
|
|
84
|
+
|
|
85
|
+
**Symptoms:**
|
|
86
|
+
- Sudden increase in deny decisions (>2x baseline)
|
|
87
|
+
- User reports of blocked actions
|
|
88
|
+
- `denied_by_policy` reason code spike
|
|
89
|
+
|
|
90
|
+
**Diagnosis Steps:**
|
|
91
|
+
|
|
92
|
+
1. **Check deny rate trend**
|
|
93
|
+
```bash
|
|
94
|
+
# Query recent deny events
|
|
95
|
+
curl -s "http://localhost:8787/v1/audit/decisions?outcome=deny&limit=50" | jq .
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
2. **Compare to recent policy changes**
|
|
99
|
+
- Check control plane for recent policy deployments
|
|
100
|
+
- Review policy version in metrics
|
|
101
|
+
|
|
102
|
+
3. **Identify affected actions/resources**
|
|
103
|
+
```bash
|
|
104
|
+
# Group denials by action
|
|
105
|
+
curl -s "http://localhost:8787/v1/audit/decisions?outcome=deny" | \
|
|
106
|
+
jq -r '.items | group_by(.action) | map({action: .[0].action, count: length})'
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
4. **Check for attack patterns**
|
|
110
|
+
- Look for repeated denials from same principal
|
|
111
|
+
- Check for unusual resource patterns (path traversal, etc.)
|
|
112
|
+
|
|
113
|
+
**Resolution Steps:**
|
|
114
|
+
|
|
115
|
+
1. **If caused by policy change:**
|
|
116
|
+
- Rollback to previous policy version via control plane
|
|
117
|
+
- Or fix policy and redeploy
|
|
118
|
+
|
|
119
|
+
2. **If attack attempt:**
|
|
120
|
+
- Document attack patterns
|
|
121
|
+
- Consider adding rate limiting
|
|
122
|
+
- Report to security team
|
|
123
|
+
|
|
124
|
+
3. **If false positives:**
|
|
125
|
+
- Review policy rules for overly broad denials
|
|
126
|
+
- Add specific allow rules for legitimate use cases
|
|
127
|
+
|
|
128
|
+
**Escalation:**
|
|
129
|
+
- If attack suspected, notify security team immediately
|
|
130
|
+
- If policy rollback needed, coordinate with policy owners
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
### P3: High Authorization Latency
|
|
135
|
+
|
|
136
|
+
**Symptoms:**
|
|
137
|
+
- p95 latency > 150ms
|
|
138
|
+
- Slow tool execution reported by users
|
|
139
|
+
- Timeout errors in logs
|
|
140
|
+
|
|
141
|
+
**Diagnosis Steps:**
|
|
142
|
+
|
|
143
|
+
1. **Check current latency percentiles**
|
|
144
|
+
```bash
|
|
145
|
+
curl -s http://localhost:8787/metrics | grep predicate_auth_latency
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
2. **Check sidecar resource usage**
|
|
149
|
+
```bash
|
|
150
|
+
# CPU and memory
|
|
151
|
+
top -p "$(pgrep -d, -f predicate-authorityd)"
|
|
152
|
+
# or
|
|
153
|
+
docker stats predicate-authorityd --no-stream
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
3. **Check control plane sync load**
|
|
157
|
+
```bash
|
|
158
|
+
curl -s http://localhost:8787/v1/sync/status | jq '.last_sync_duration_ms'
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
4. **Check concurrent request volume**
|
|
162
|
+
```bash
|
|
163
|
+
curl -s http://localhost:8787/metrics | grep predicate_auth_concurrent
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**Resolution Steps:**
|
|
167
|
+
|
|
168
|
+
1. **If sidecar CPU is high:**
|
|
169
|
+
- Check for runaway policy evaluation
|
|
170
|
+
- Consider scaling sidecar resources
|
|
171
|
+
- Review policy complexity
|
|
172
|
+
|
|
173
|
+
2. **If sync is slow:**
|
|
174
|
+
- Check control plane latency
|
|
175
|
+
- Consider increasing sync interval
|
|
176
|
+
- Review policy size
|
|
177
|
+
|
|
178
|
+
3. **If high concurrent load:**
|
|
179
|
+
- Consider horizontal scaling
|
|
180
|
+
- Review request batching options
|
|
181
|
+
- Check for retry storms
|
|
182
|
+
|
|
183
|
+
**Escalation:**
|
|
184
|
+
- If resources are maxed, request capacity increase
|
|
185
|
+
- If policy is too complex, work with policy team to optimize
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
### P4: Audit Export Failures
|
|
190
|
+
|
|
191
|
+
**Symptoms:**
|
|
192
|
+
- Missing audit events in control plane
|
|
193
|
+
- `audit_export_failure` in logs
|
|
194
|
+
- Non-zero `predicate_audit_exports{status="failure"}` counter
|
|
195
|
+
|
|
196
|
+
**Diagnosis Steps:**
|
|
197
|
+
|
|
198
|
+
1. **Check export error logs**
|
|
199
|
+
```bash
|
|
200
|
+
grep -E "audit.*(error|failure)" /var/log/openclaw-provider/provider.log | tail -20
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
2. **Verify control plane connectivity**
|
|
204
|
+
```bash
|
|
205
|
+
curl -s https://control-plane.example.com/health
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
3. **Check export queue depth**
|
|
209
|
+
```bash
|
|
210
|
+
curl -s http://localhost:8787/metrics | grep predicate_audit_queue
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
**Resolution Steps:**
|
|
214
|
+
|
|
215
|
+
1. **If control plane unreachable:**
|
|
216
|
+
- Check network/firewall rules
|
|
217
|
+
- Verify TLS certificates
|
|
218
|
+
- Check for control plane incidents
|
|
219
|
+
|
|
220
|
+
2. **If queue is backed up:**
|
|
221
|
+
- Audit export is best-effort; auth continues working
|
|
222
|
+
- Events will retry automatically
|
|
223
|
+
- Check disk space for local buffer
|
|
224
|
+
|
|
225
|
+
3. **If credentials expired:**
|
|
226
|
+
- Rotate API credentials
|
|
227
|
+
- Update provider configuration
|
|
228
|
+
- Restart provider
|
|
229
|
+
|
|
230
|
+
**Escalation:**
|
|
231
|
+
- Audit failures are P4 (non-blocking)
|
|
232
|
+
- Escalate only if prolonged (>1 hour) or compliance-critical
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## Routine Operations
|
|
237
|
+
|
|
238
|
+
### Restarting the Provider
|
|
239
|
+
|
|
240
|
+
```bash
|
|
241
|
+
# Graceful restart (allows in-flight requests to complete)
|
|
242
|
+
systemctl reload openclaw-provider
|
|
243
|
+
|
|
244
|
+
# Full restart
|
|
245
|
+
systemctl restart openclaw-provider
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
### Rotating Credentials
|
|
249
|
+
|
|
250
|
+
1. Generate new credentials in control plane
|
|
251
|
+
2. Update provider configuration
|
|
252
|
+
3. Restart provider
|
|
253
|
+
4. Verify connectivity
|
|
254
|
+
5. Revoke old credentials
|
|
255
|
+
|
|
256
|
+
### Updating Policy
|
|
257
|
+
|
|
258
|
+
1. Deploy new policy to control plane
|
|
259
|
+
2. Monitor sync status on sidecars
|
|
260
|
+
3. Watch deny rate for anomalies
|
|
261
|
+
4. Rollback if issues detected
|
|
262
|
+
|
|
263
|
+
### Scaling Sidecars
|
|
264
|
+
|
|
265
|
+
For high-load environments:
|
|
266
|
+
|
|
267
|
+
1. Deploy additional sidecar instances
|
|
268
|
+
2. Configure load balancer
|
|
269
|
+
3. Update provider `baseUrl` to load balancer
|
|
270
|
+
4. Verify even distribution
|
|
271
|
+
|
|
272
|
+
---
|
|
273
|
+
|
|
274
|
+
## Health Checks
|
|
275
|
+
|
|
276
|
+
### Provider Health
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
# Local provider health
|
|
280
|
+
curl -s http://localhost:3000/health
|
|
281
|
+
|
|
282
|
+
# Expected response
|
|
283
|
+
{
|
|
284
|
+
"status": "healthy",
|
|
285
|
+
"sidecar": "connected",
|
|
286
|
+
"circuit": "closed"
|
|
287
|
+
}
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
### Sidecar Health
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
# Sidecar health
|
|
294
|
+
curl -s http://localhost:8787/health
|
|
295
|
+
|
|
296
|
+
# Expected response
|
|
297
|
+
{
|
|
298
|
+
"status": "healthy",
|
|
299
|
+
"policy_version": "v1.2.3",
|
|
300
|
+
"last_sync": "2026-02-20T12:00:00Z"
|
|
301
|
+
}
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
### End-to-End Check
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
# Test authorization flow
|
|
308
|
+
curl -X POST http://localhost:8787/v1/authorize \
|
|
309
|
+
-H "Content-Type: application/json" \
|
|
310
|
+
-d '{
|
|
311
|
+
"principal": "test:health-check",
|
|
312
|
+
"action": "health.check",
|
|
313
|
+
"resource": "system"
|
|
314
|
+
}'
|
|
315
|
+
|
|
316
|
+
# Expected: allow decision for health check action
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
---
|
|
320
|
+
|
|
321
|
+
## Monitoring Checklist
|
|
322
|
+
|
|
323
|
+
### Daily
|
|
324
|
+
|
|
325
|
+
- [ ] Review deny rate trends
|
|
326
|
+
- [ ] Check circuit breaker state
|
|
327
|
+
- [ ] Verify audit export completeness
|
|
328
|
+
|
|
329
|
+
### Weekly
|
|
330
|
+
|
|
331
|
+
- [ ] Review latency percentiles
|
|
332
|
+
- [ ] Check policy sync freshness
|
|
333
|
+
- [ ] Audit access logs
|
|
334
|
+
|
|
335
|
+
### Monthly
|
|
336
|
+
|
|
337
|
+
- [ ] Review and update SLO thresholds
|
|
338
|
+
- [ ] Test incident response procedures
|
|
339
|
+
- [ ] Update runbook with learnings
|
|
340
|
+
|
|
341
|
+
---
|
|
342
|
+
|
|
343
|
+
## Contact Information
|
|
344
|
+
|
|
345
|
+
| Role | Contact |
|
|
346
|
+
|------|---------|
|
|
347
|
+
| On-call engineer | PagerDuty: `predicate-oncall` |
|
|
348
|
+
| Platform team | Slack: `#predicate-platform` |
|
|
349
|
+
| Security team | Slack: `#security-incidents` |
|
|
350
|
+
| Control plane status | https://status.predicatesystems.ai |
|
|
351
|
+
|
|
352
|
+
---
|
|
353
|
+
|
|
354
|
+
## Appendix
|
|
355
|
+
|
|
356
|
+
### Useful Commands
|
|
357
|
+
|
|
358
|
+
```bash
|
|
359
|
+
# View real-time logs
|
|
360
|
+
journalctl -u predicate-authorityd -f
|
|
361
|
+
|
|
362
|
+
# Check process status
|
|
363
|
+
systemctl status predicate-authorityd
|
|
364
|
+
|
|
365
|
+
# View metrics
|
|
366
|
+
curl -s http://localhost:8787/metrics
|
|
367
|
+
|
|
368
|
+
# Force policy sync
|
|
369
|
+
curl -X POST http://localhost:8787/v1/sync/trigger
|
|
370
|
+
|
|
371
|
+
# Get current policy version
|
|
372
|
+
curl -s http://localhost:8787/v1/policy/version
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
### Log Locations
|
|
376
|
+
|
|
377
|
+
| Component | Log Path |
|
|
378
|
+
|-----------|----------|
|
|
379
|
+
| Provider | `/var/log/openclaw-provider/provider.log` |
|
|
380
|
+
| Sidecar | `/var/log/predicate-authorityd/sidecar.log` |
|
|
381
|
+
| Audit events | `/var/log/predicate-authorityd/audit.jsonl` |
|
|
382
|
+
|
|
383
|
+
### Configuration Files
|
|
384
|
+
|
|
385
|
+
| Component | Config Path |
|
|
386
|
+
|-----------|-------------|
|
|
387
|
+
| Provider | `/etc/openclaw-provider/config.yaml` |
|
|
388
|
+
| Sidecar | `/etc/predicate-authorityd/config.yaml` |
|
|
389
|
+
| Policy | Managed via control plane |
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# Production Readiness Checklist
|
|
2
|
+
|
|
3
|
+
This document tracks production readiness criteria for the OpenClaw Predicate
|
|
4
|
+
Provider. All items must be verified before GA release.
|
|
5
|
+
|
|
6
|
+
**Status:** Ready for review
|
|
7
|
+
**Last Updated:** 2026-02-20
|
|
8
|
+
**Owner:** Platform Security
|
|
9
|
+
|
|
10
|
+
## 1. Security Posture
|
|
11
|
+
|
|
12
|
+
| Criteria | Status | Evidence |
|
|
13
|
+
|----------|--------|----------|
|
|
14
|
+
| Fail-closed default for high-risk actions | ✓ | `provider.ts` throws on sidecar errors |
|
|
15
|
+
| No embedded signing keys in plugin | ✓ | Keys remain in sidecar/control plane |
|
|
16
|
+
| Log redaction for sensitive values | ✓ | Tests in `audit-event-e2e.test.ts` |
|
|
17
|
+
| SecurityError returns redacted reasons | ✓ | `errors.ts` implementation |
|
|
18
|
+
| Path traversal protection | ✓ | Tests in `hack-vs-fix-demo.test.ts` |
|
|
19
|
+
| Prompt injection blocking | ✓ | Tests in `hack-vs-fix-demo.test.ts` |
|
|
20
|
+
|
|
21
|
+
**Security Signoff:** _________________ Date: _________
|
|
22
|
+
|
|
23
|
+
## 2. Reliability
|
|
24
|
+
|
|
25
|
+
| Criteria | Status | Evidence |
|
|
26
|
+
|----------|--------|----------|
|
|
27
|
+
| Circuit breaker for sidecar outages | ✓ | `circuit-breaker.ts` |
|
|
28
|
+
| Exponential backoff with jitter | ✓ | `calculateBackoff()` function |
|
|
29
|
+
| Configurable timeouts | ✓ | `config.ts` (300ms default) |
|
|
30
|
+
| Graceful degradation on sync failure | ✓ | Local policy evaluation continues |
|
|
31
|
+
| Load tested (100 sequential, 50 concurrent) | ✓ | `load-latency.test.ts` |
|
|
32
|
+
|
|
33
|
+
## 3. Observability
|
|
34
|
+
|
|
35
|
+
| Criteria | Status | Evidence |
|
|
36
|
+
|----------|--------|----------|
|
|
37
|
+
| Decision telemetry (allow/deny/error) | ✓ | `provider.ts` telemetry hooks |
|
|
38
|
+
| Latency metrics | ✓ | `load-latency.test.ts` p50/p95 |
|
|
39
|
+
| Circuit breaker state metrics | ✓ | `CircuitBreaker.getMetrics()` |
|
|
40
|
+
| Audit export integration | ✓ | `audit-event-e2e.test.ts` |
|
|
41
|
+
| Correlation IDs (session, tenant, trace) | ✓ | `multi-tenant-isolation.test.ts` |
|
|
42
|
+
|
|
43
|
+
## 4. SLOs and Alerting
|
|
44
|
+
|
|
45
|
+
| Criteria | Status | Evidence |
|
|
46
|
+
|----------|--------|----------|
|
|
47
|
+
| Latency SLOs defined (p50 <25ms, p95 <75ms) | ✓ | `docs/SLO_THRESHOLDS.md` |
|
|
48
|
+
| Availability SLOs defined (99.9%) | ✓ | `docs/SLO_THRESHOLDS.md` |
|
|
49
|
+
| Alert thresholds documented | ✓ | `docs/SLO_THRESHOLDS.md` |
|
|
50
|
+
| Circuit breaker alert thresholds | ✓ | `docs/SLO_THRESHOLDS.md` |
|
|
51
|
+
| Deny spike detection criteria | ✓ | `docs/SLO_THRESHOLDS.md` |
|
|
52
|
+
|
|
53
|
+
## 5. Operations
|
|
54
|
+
|
|
55
|
+
| Criteria | Status | Evidence |
|
|
56
|
+
|----------|--------|----------|
|
|
57
|
+
| Operational runbook | ✓ | `docs/OPERATIONAL_RUNBOOK.md` |
|
|
58
|
+
| P1-P4 incident procedures | ✓ | `docs/OPERATIONAL_RUNBOOK.md` |
|
|
59
|
+
| Health check endpoints | ✓ | Documented in runbook |
|
|
60
|
+
| Restart/recovery procedures | ✓ | `docs/OPERATIONAL_RUNBOOK.md` |
|
|
61
|
+
| Credential rotation procedures | ✓ | `docs/OPERATIONAL_RUNBOOK.md` |
|
|
62
|
+
| Scaling guidance | ✓ | `docs/OPERATIONAL_RUNBOOK.md` |
|
|
63
|
+
|
|
64
|
+
## 6. Testing
|
|
65
|
+
|
|
66
|
+
| Criteria | Status | Evidence |
|
|
67
|
+
|----------|--------|----------|
|
|
68
|
+
| Unit tests | ✓ | 15 test files, all passing |
|
|
69
|
+
| Integration tests (sidecar wire format) | ✓ | `provider.test.ts` |
|
|
70
|
+
| Load/latency tests | ✓ | `load-latency.test.ts` |
|
|
71
|
+
| Multi-tenant isolation tests | ✓ | `multi-tenant-isolation.test.ts` |
|
|
72
|
+
| JWKS/key rotation tests | ✓ | `jwks-rotation.test.ts` |
|
|
73
|
+
| Adversarial/security tests | ✓ | `hack-vs-fix-demo.test.ts` |
|
|
74
|
+
| CI pipeline (Node 20/22) | ✓ | `.github/workflows/tests.yml` |
|
|
75
|
+
|
|
76
|
+
## 7. Documentation
|
|
77
|
+
|
|
78
|
+
| Criteria | Status | Evidence |
|
|
79
|
+
|----------|--------|----------|
|
|
80
|
+
| API contract documented | ✓ | Design doc action/resource mapping |
|
|
81
|
+
| Fail-open/fail-closed policy table | ✓ | Design doc |
|
|
82
|
+
| SLO documentation | ✓ | `docs/SLO_THRESHOLDS.md` |
|
|
83
|
+
| Operational runbook | ✓ | `docs/OPERATIONAL_RUNBOOK.md` |
|
|
84
|
+
| Docker adversarial test guide | ✓ | `examples/README.md` |
|
|
85
|
+
|
|
86
|
+
## 8. Control Plane Integration
|
|
87
|
+
|
|
88
|
+
| Criteria | Status | Evidence |
|
|
89
|
+
|----------|--------|----------|
|
|
90
|
+
| Policy sync client | ✓ | `control-plane-sync.ts` |
|
|
91
|
+
| Revocation propagation | ✓ | `control-plane-sync.ts` |
|
|
92
|
+
| Stale-sync observability | ✓ | `ControlPlaneSyncStatusTracker` |
|
|
93
|
+
| Audit export wiring | ✓ | `audit-event-e2e.test.ts` |
|
|
94
|
+
|
|
95
|
+
## 9. Known Limitations
|
|
96
|
+
|
|
97
|
+
| Limitation | Impact | Planned Fix |
|
|
98
|
+
|------------|--------|-------------|
|
|
99
|
+
| `state_hash` not integrated into auth flow | Limits pre-execution state verification | Post-Phase 4 |
|
|
100
|
+
| No automatic sidecar discovery | Requires manual `baseUrl` config | Future enhancement |
|
|
101
|
+
|
|
102
|
+
## Sign-off
|
|
103
|
+
|
|
104
|
+
### Engineering Review
|
|
105
|
+
|
|
106
|
+
- [ ] All test suites passing
|
|
107
|
+
- [ ] Code review completed
|
|
108
|
+
- [ ] Performance benchmarks acceptable
|
|
109
|
+
|
|
110
|
+
**Engineering Lead:** _________________ Date: _________
|
|
111
|
+
|
|
112
|
+
### Security Review
|
|
113
|
+
|
|
114
|
+
- [ ] Fail-closed behavior verified
|
|
115
|
+
- [ ] Log redaction verified
|
|
116
|
+
- [ ] No credential exposure risks
|
|
117
|
+
|
|
118
|
+
**Security Lead:** _________________ Date: _________
|
|
119
|
+
|
|
120
|
+
### Operations Review
|
|
121
|
+
|
|
122
|
+
- [ ] Runbook reviewed and validated
|
|
123
|
+
- [ ] Alerting configured
|
|
124
|
+
- [ ] On-call procedures documented
|
|
125
|
+
|
|
126
|
+
**Operations Lead:** _________________ Date: _________
|
|
127
|
+
|
|
128
|
+
### Final Approval
|
|
129
|
+
|
|
130
|
+
- [ ] All sections signed off
|
|
131
|
+
- [ ] No blocking issues
|
|
132
|
+
- [ ] Ready for GA release
|
|
133
|
+
|
|
134
|
+
**Release Manager:** _________________ Date: _________
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# SLOs and Alert Thresholds
|
|
2
|
+
|
|
3
|
+
This document defines Service Level Objectives (SLOs) and alert thresholds for
|
|
4
|
+
the OpenClaw Predicate Provider in production deployments.
|
|
5
|
+
|
|
6
|
+
## Latency SLOs
|
|
7
|
+
|
|
8
|
+
### Authorization Call Latency
|
|
9
|
+
|
|
10
|
+
| Percentile | Target | Alert Threshold |
|
|
11
|
+
|------------|--------|-----------------|
|
|
12
|
+
| p50 | < 25 ms | > 50 ms |
|
|
13
|
+
| p95 | < 75 ms | > 150 ms |
|
|
14
|
+
| p99 | < 150 ms | > 300 ms |
|
|
15
|
+
|
|
16
|
+
These targets assume local sidecar deployment. For remote sidecar deployments,
|
|
17
|
+
add network RTT to each target.
|
|
18
|
+
|
|
19
|
+
### Sidecar Timeout
|
|
20
|
+
|
|
21
|
+
- **Default timeout:** 300 ms
|
|
22
|
+
- **Hard timeout (fail-closed):** 500 ms
|
|
23
|
+
|
|
24
|
+
If the sidecar does not respond within the timeout, the provider fails closed
|
|
25
|
+
(denies the action) for high-risk operations.
|
|
26
|
+
|
|
27
|
+
## Availability SLOs
|
|
28
|
+
|
|
29
|
+
### Provider Availability
|
|
30
|
+
|
|
31
|
+
| Metric | Target | Alert Threshold |
|
|
32
|
+
|--------|--------|-----------------|
|
|
33
|
+
| Uptime | 99.9% | < 99.5% |
|
|
34
|
+
| Error rate | < 0.1% | > 1% |
|
|
35
|
+
|
|
36
|
+
### Sidecar Availability
|
|
37
|
+
|
|
38
|
+
| Metric | Target | Alert Threshold |
|
|
39
|
+
|--------|--------|-----------------|
|
|
40
|
+
| Uptime | 99.95% | < 99.9% |
|
|
41
|
+
| Circuit breaker open rate | < 0.5% | > 2% |
|
|
42
|
+
|
|
43
|
+
## Decision Quality SLOs
|
|
44
|
+
|
|
45
|
+
### Deny Spike Detection
|
|
46
|
+
|
|
47
|
+
| Metric | Baseline | Alert Threshold |
|
|
48
|
+
|--------|----------|-----------------|
|
|
49
|
+
| Deny rate | ~5% (varies by policy) | > 2x baseline over 5 min |
|
|
50
|
+
| Deny rate spike | N/A | > 50% increase in 1 min |
|
|
51
|
+
|
|
52
|
+
A sudden spike in deny rates may indicate:
|
|
53
|
+
- Misconfigured policy rollout
|
|
54
|
+
- Attack attempt (should trigger investigation)
|
|
55
|
+
- Sidecar sync failure
|
|
56
|
+
|
|
57
|
+
### Reason Code Distribution
|
|
58
|
+
|
|
59
|
+
Monitor reason code distribution for anomalies:
|
|
60
|
+
|
|
61
|
+
| Reason Code | Expected Range | Alert if |
|
|
62
|
+
|-------------|----------------|----------|
|
|
63
|
+
| `denied_by_policy` | 80-95% of denials | < 70% |
|
|
64
|
+
| `sidecar_timeout` | < 1% | > 5% |
|
|
65
|
+
| `circuit_open` | < 0.5% | > 2% |
|
|
66
|
+
| `missing_context` | < 0.1% | > 1% |
|
|
67
|
+
|
|
68
|
+
## Circuit Breaker Thresholds
|
|
69
|
+
|
|
70
|
+
### Default Configuration
|
|
71
|
+
|
|
72
|
+
```typescript
|
|
73
|
+
{
|
|
74
|
+
failureThreshold: 5, // Opens after 5 consecutive failures
|
|
75
|
+
resetTimeoutMs: 30_000, // Attempts recovery after 30 seconds
|
|
76
|
+
successThreshold: 2, // Closes after 2 successful calls in half-open
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Alert Thresholds
|
|
81
|
+
|
|
82
|
+
| Event | Alert Level | Action |
|
|
83
|
+
|-------|-------------|--------|
|
|
84
|
+
| Circuit opened | Warning | Investigate sidecar health |
|
|
85
|
+
| Circuit open > 1 min | Critical | Page on-call |
|
|
86
|
+
| Circuit open > 5 min | Critical | Consider manual intervention |
|
|
87
|
+
|
|
88
|
+
## Telemetry and Audit SLOs
|
|
89
|
+
|
|
90
|
+
### Audit Export Latency
|
|
91
|
+
|
|
92
|
+
| Metric | Target | Alert Threshold |
|
|
93
|
+
|--------|--------|-----------------|
|
|
94
|
+
| Export delay (best-effort) | < 5 seconds | > 30 seconds |
|
|
95
|
+
| Export failure rate | < 1% | > 5% |
|
|
96
|
+
|
|
97
|
+
Note: Audit export is best-effort and should never block the authorization path.
|
|
98
|
+
|
|
99
|
+
### Telemetry Completeness
|
|
100
|
+
|
|
101
|
+
| Metric | Target | Alert Threshold |
|
|
102
|
+
|--------|--------|-----------------|
|
|
103
|
+
| Decision events captured | > 99.9% | < 99% |
|
|
104
|
+
| Context fields present | > 99% | < 95% |
|
|
105
|
+
|
|
106
|
+
## Control Plane Sync SLOs
|
|
107
|
+
|
|
108
|
+
### Policy Sync
|
|
109
|
+
|
|
110
|
+
| Metric | Target | Alert Threshold |
|
|
111
|
+
|--------|--------|-----------------|
|
|
112
|
+
| Sync interval | < 60 seconds | > 5 minutes |
|
|
113
|
+
| Stale policy age | < 5 minutes | > 15 minutes |
|
|
114
|
+
|
|
115
|
+
### Revocation Propagation
|
|
116
|
+
|
|
117
|
+
| Metric | Target | Alert Threshold |
|
|
118
|
+
|--------|--------|-----------------|
|
|
119
|
+
| Revocation latency | < 30 seconds | > 2 minutes |
|
|
120
|
+
| Revocation completeness | 100% | Any missed revocation |
|
|
121
|
+
|
|
122
|
+
## Monitoring Implementation
|
|
123
|
+
|
|
124
|
+
### Required Metrics
|
|
125
|
+
|
|
126
|
+
```typescript
|
|
127
|
+
// Authorization metrics
|
|
128
|
+
counter("predicate_auth_total", { outcome: "allow" | "deny" | "error" });
|
|
129
|
+
histogram("predicate_auth_latency_ms", { action: string });
|
|
130
|
+
|
|
131
|
+
// Circuit breaker metrics
|
|
132
|
+
gauge("predicate_circuit_state", { state: "closed" | "open" | "half_open" });
|
|
133
|
+
counter("predicate_circuit_transitions", { from: string, to: string });
|
|
134
|
+
|
|
135
|
+
// Sync metrics
|
|
136
|
+
gauge("predicate_policy_age_seconds");
|
|
137
|
+
counter("predicate_sync_failures");
|
|
138
|
+
|
|
139
|
+
// Audit metrics
|
|
140
|
+
counter("predicate_audit_exports", { status: "success" | "failure" });
|
|
141
|
+
histogram("predicate_audit_latency_ms");
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Dashboard Panels
|
|
145
|
+
|
|
146
|
+
1. **Authorization Overview**
|
|
147
|
+
- Request rate by action
|
|
148
|
+
- Allow/deny/error distribution
|
|
149
|
+
- p50/p95/p99 latency
|
|
150
|
+
|
|
151
|
+
2. **Circuit Breaker Status**
|
|
152
|
+
- Current state per sidecar
|
|
153
|
+
- Transition history
|
|
154
|
+
- Recovery time
|
|
155
|
+
|
|
156
|
+
3. **Sync Health**
|
|
157
|
+
- Policy version timeline
|
|
158
|
+
- Sync lag
|
|
159
|
+
- Revocation propagation
|
|
160
|
+
|
|
161
|
+
4. **Deny Analysis**
|
|
162
|
+
- Deny rate over time
|
|
163
|
+
- Top deny reasons
|
|
164
|
+
- Deny by tenant/action
|
|
165
|
+
|
|
166
|
+
## Incident Response
|
|
167
|
+
|
|
168
|
+
### P1: Circuit Breaker Stuck Open
|
|
169
|
+
|
|
170
|
+
1. Check sidecar health and logs
|
|
171
|
+
2. Verify network connectivity
|
|
172
|
+
3. Check control plane status
|
|
173
|
+
4. Consider manual circuit reset if sidecar is healthy
|
|
174
|
+
|
|
175
|
+
### P2: Elevated Deny Rate
|
|
176
|
+
|
|
177
|
+
1. Compare to policy change timeline
|
|
178
|
+
2. Check for attack patterns
|
|
179
|
+
3. Review deny reasons distribution
|
|
180
|
+
4. Validate policy sync status
|
|
181
|
+
|
|
182
|
+
### P3: Elevated Latency
|
|
183
|
+
|
|
184
|
+
1. Check sidecar resource usage
|
|
185
|
+
2. Review concurrent request load
|
|
186
|
+
3. Check control plane sync load
|
|
187
|
+
4. Consider scaling sidecars
|
|
188
|
+
|
|
189
|
+
## Review Cadence
|
|
190
|
+
|
|
191
|
+
- **Weekly:** Review latency percentiles and deny trends
|
|
192
|
+
- **Monthly:** Audit SLO compliance and adjust thresholds
|
|
193
|
+
- **Quarterly:** Review and update SLO targets based on operational learnings
|