npm - @safefence/openclaw-guardrails - Versions diffs - 0.6.1 → 0.6.3 - Mend

@safefence/openclaw-guardrails 0.6.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/README.md +414 -67
package/dist/core/token-usage-store.d.ts +5 -0
package/dist/core/token-usage-store.js +23 -19
package/dist/plugin/event-adapter.d.ts +1 -9
package/dist/plugin/openclaw-adapter.d.ts +1 -0
package/dist/plugin/openclaw-adapter.js +6 -22
package/dist/plugin/openclaw-extension.js +16 -28
package/dist/plugin/version.d.ts +1 -0
package/dist/plugin/version.js +1 -0
package/dist/redaction/redact.js +18 -3
package/openclaw.plugin.json +1 -1
package/package.json +13 -2

package/README.md CHANGED Viewed

@@ -1,5 +1,8 @@
 # OpenClaw Guardrails
+[![npm version](https://img.shields.io/npm/v/@safefence/openclaw-guardrails)](https://www.npmjs.com/package/@safefence/openclaw-guardrails)
+[![npm provenance](https://img.shields.io/badge/npm-provenance-brightgreen)](https://docs.npmjs.com/generating-provenance-statements)
 > **Experimental** -- This project is under active development and not yet production-ready. APIs, config schemas, and behavior may change without notice between releases.
 Native TypeScript security kernel for OpenClaw (`>=2026.2.25`) with deterministic local enforcement, principal-aware authorization, and owner approval for group/multi-user safety.
@@ -19,6 +22,200 @@ Native TypeScript security kernel for OpenClaw (`>=2026.2.25`) with deterministi
 - Zero runtime dependencies — uses only Node.js built-ins (`fetch()`, `fs`).
 - Audit mode still applies redaction by default.
+## How It Works
+### Plugin ↔ Engine Flow
+The plugin has three layers: `openclaw-extension.ts` registers typed hooks with OpenClaw, `event-adapter.ts` maps between OpenClaw's structured `(event, ctx)` pairs and the internal `OpenClawContext`, and `openclaw-adapter.ts` converts contexts into `GuardEvent`s for the engine.
+```
+OpenClaw Runtime
+  │
+  ▼
+openclaw-extension.ts ──► api.on(hookName, handler)
+  │
+  ├──► event-adapter.ts: map*(event, ctx) → OpenClawContext
+  │
+  ▼
+openclaw-adapter.ts
+  │  toEvent(phase, ctx) → GuardEvent
+  │
+  ▼
+GuardrailsEngine ──► engine.evaluate(guardEvent, phase)
+  │
+  ▼
+GuardDecision
+  │
+  ▼
+openclaw-adapter.ts
+  │  applyRolloutPolicy()
+  │  updateMetrics()
+  │
+  ▼
+OpenClawHookResult
+  │
+  ▼
+event-adapter.ts: mapTo*Result(hookResult) → typed result
+  │
+  ▼
+OpenClaw Runtime ◄── hook-specific return value
+```
+### Hook Lifecycle
+Six lifecycle hooks span the full agent interaction. Each hook has different blocking/redaction capabilities:
+```
+User / Channel          OpenClaw              Guardrails Plugin
+      │                    │                        │
+      │     ┌──────────────────────────────────────────────────────┐
+      │     │ 1. Agent Initialization                              │
+      │     │    OC ──► before_agent_start(prompt, agentCtx)       │
+      │     │    OC ◄── { prependSystemContext: securityPolicy }   │
+      │     │    Injects immutable security prompt                 │
+      │     └──────────────────────────────────────────────────────┘
+      │                    │                        │
+      │     ┌──────────────────────────────────────────────────────┐
+      │     │ 2. Inbound Message                                   │
+      │ ──► │    OC ──► message_received(from, content, channelCtx)│
+      │     │    OC ◄── void (observe-only, cannot block)          │
+      │     │    Audits violations, defers enforcement             │
+      │     └──────────────────────────────────────────────────────┘
+      │                    │                        │
+      │     ┌──────────────────────────────────────────────────────┐
+      │     │ 3. Tool Execution Gate                               │
+      │     │   OC ──► before_tool_call(toolName, params, agentCtx)│
+      │     │    OC ◄── { block: true, blockReason } or {}         │
+      │     │    *** Primary enforcement point ***                 │
+      │     └──────────────────────────────────────────────────────┘
+      │                    │                        │
+      │     ┌──────────────────────────────────────────────────────┐
+      │     │ 4. Tool Result Persistence                           │
+      │     │    OC ──► tool_result_persist(message, toolCtx)      │
+      │     │    OC ◄── { message: { content: redacted } } or {}   │
+      │     │    Sync regex redaction; async engine eval for audit │
+      │     └──────────────────────────────────────────────────────┘
+      │                    │                        │
+      │     ┌──────────────────────────────────────────────────────┐
+      │     │ 5. Outbound Message Gate                             │
+      │     │    OC ──► message_sending(content, channelCtx)       │
+      │     │    OC ◄── { cancel: true } or { content: redacted }  │
+      │     │    Blocks system prompt leaks                        │
+      │     │    Always enforced in stage_b rollout                │
+      │     └──────────────────────────────────────────────────────┘
+      │                    │                        │
+      │     ┌──────────────────────────────────────────────────────┐
+      │     │ 6. Session End                                       │
+      │     │    OC ──► agent_end(messages, success, agentCtx)     │
+      │     │    OC ◄── void (observe-only)                        │
+      │     │    Emits metrics + monitoring snapshot               │
+      │     └──────────────────────────────────────────────────────┘
+```
+### Hook Capability Matrix
+| Hook | Can Block | Can Redact | Can Cancel | Return Type |
+|---|---|---|---|---|
+| `before_agent_start` | No | No | No | `{ prependSystemContext }` |
+| `message_received` | No (void) | No | No | void |
+| `before_tool_call` | **Yes** | No | No | `{ block, blockReason }` |
+| `tool_result_persist` | No | **Yes** (sync) | No | `{ message }` |
+| `message_sending` | **Yes** | **Yes** | **Yes** | `{ cancel }` or `{ content }` |
+| `agent_end` | No (void) | No | No | void |
+### Detector Pipeline
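+The 12 detectors run sequentially, in a fixed order (D1–D12), on every `engine.evaluate()` call. Phase-specific detectors only apply in their listed phases, and D7 only runs when D6 returns an `approvalRequirement`. There is no short-circuiting — an early DENY does not skip later detectors. All hits are merged, then `DENY > REDACT > ALLOW` precedence determines the outcome.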
+```
+Engine.evaluate()
+  │
+  │  normalizeGuardEvent(rawEvent)
+  │
+  ├──► D1  Input Intent ── size limits, injection, exfil, context probes ──► hits[]
+  ├──► D2  Command Policy ── tool allowlist, binary allowlist, shell ops ──► hits[]
+  │        (before_tool_call only)
+  ├──► D3  Path Canonical ── path traversal, workspace boundary, symlinks ──► hits[]
+  │        (async realpath, before_tool_call only)
+  ├──► D4  Network Egress ── host allowlist, private egress, DNS ──► hits[]
+  │        (async DNS, before_tool_call only)
+  ├──► D5  Provenance ── supply chain trust + retrieval trust ──► hits[]
+  │        (async, before_tool_call only)
+  ├──► D6  Principal Authz ── identity, RBAC, mention-gating ──► hits[] + approvalRequirement?
+  │        (anti-spoofing: owner/admin derived from config only)
+  ├──► D7  Owner Approval ── challenge/verify approval token ──► hits[] + approvalChallenge?
+  │        (only runs if D6 returned approvalRequirement)
+  ├──► D8  Sensitive Data ── secret patterns → PII patterns (cascaded) ──► hits[] + redactedContent?
+  ├──► D9  Restricted Info ── data-class redaction for non-owner principals ──► hits[] + redactedContent?
+  ├──► D10 Output Safety ── prompt leak, injected filenames, suspicious patterns ──► hits[] + redactedContent?
+  │        (receives pre-redacted content from D9/D8)
+  ├──► D11 Budget ── requests/min + tool calls/min (sliding window) ──► hits[]
+  └──► D12 Extensions ── external HTTP + custom validators ──► hits[]
+           (concurrent via Promise.all, custom validators fail-open)
+  │
+  │  decideFromHits(): DENY > REDACT > ALLOW
+  │  aggregateRisk(): 1 - exp(-weighted_sum)
+  │  finalizeDecision(): audit mode override
+  │  auditSink.append() if enabled
+  ▼
+```
+#### Detector Details
+| # | Detector | Active Phases | What It Checks | Decision | Weight |
+|---|---|---|---|---|---|
+| 1 | Input Intent | All | Input size limits, prompt injection patterns, exfiltration patterns, context probing (injected filenames, workspace probing) | DENY | 0.75–0.95 |
+| 2 | Command Policy | `before_tool_call` | Tool allowlist, binary allowlist, shell operators, destructive command patterns, arg pattern validation | DENY | 0.8–1.0 |
+| 3 | Path Canonical | `before_tool_call` | Path traversal patterns, workspace boundary (realpath), symlink traversal | DENY | 0.9–0.95 |
+| 4 | Network Egress | `before_tool_call` | Host allowlist, private/local IP blocking, DNS resolution, egress tool detection | DENY | 0.7–0.9 |
+| 5 | Provenance | `before_tool_call` | Skill source trust, hash integrity, retrieval trust level, signed source | DENY | 0.7–0.85 |
+| 6 | Principal Authz | All | Identity resolution, role-based tool policy, mention-gating, group channel enforcement, data-class restrictions | DENY | 0.7–0.95 |
+| 7 | Owner Approval | Conditional | Challenge creation, token verification (TTL, digest, conversation, replay) | DENY | 0.8–0.9 |
+| 8 | Sensitive Data | All | Secret patterns (AWS keys, GitHub PATs, PEM keys, etc.), PII patterns (emails, SSNs, credit cards) | REDACT | 0.5–0.7 |
+| 9 | Restricted Info | `message_received`, `tool_result_persist`, `message_sending` | Data-class policy for non-owner principals, cross-principal redaction | DENY/REDACT | 0.7–0.9 |
+| 10 | Output Safety | `message_received`, `tool_result_persist`, `message_sending` | System prompt leak patterns, injected filename references, suspicious patterns (script tags, bearer tokens) | DENY/REDACT | 0.55–0.95 |
+| 11 | Budget | All (tool calls: `before_tool_call` only) | Requests/minute, tool calls/minute (sliding 60s window, per-principal partitioned) | DENY | 0.65–0.75 |
+| 12 | Extensions | All | External HTTP validators (circuit breaker, timeout), custom validator functions (phase-filtered) | DENY | 0.5–0.7 |
+### Risk Scoring
+Risk score formula: `1 - exp(-Σ(clamp(weight, 0, 1) × multiplier))`, where the sum runs over all rule hits, the DENY multiplier is 1.0, and the REDACT multiplier is 0.6. This produces a diminishing-returns curve: many small hits converge toward 1.0 but never exceed it. The result is rounded to 4 decimal places.
+### Decision Finalization
+```
+All RuleHits merged
+  │
+  ▼
+Any DENY hit? ──Yes──► decision = DENY ──┐
+  │ No                                    │
+  ▼                                       ▼
+Any REDACT hit? ──Yes──► decision = REDACT ──► mode = audit?
+  │ No                                           │
+  ▼                                         Yes ─┤── No
+decision = ALLOW                                 │      │
+  │                                              ▼      ▼
+  │                              Override to ALLOW    Return as-is
+  │                              + AUDIT_WOULD_DENY   with enforcement
+  │                              + redact only if         │
+  │                                applyInAuditMode       │
+  │                                     │                 │
+  └─────────────────────────────────────┴─────────────────┘
+                        │
+                        ▼
+                 Return GuardDecision
+```
+### Rollout Stages
+```
+stage_a_audit ──────────────────► stage_b_high_risk_enforce ──────────► stage_c_full_enforce ──► Production
+  All violations                    message_sending: always enforce       All violations
+  audit-only                        before_tool_call: enforce if           enforced
+                                      highRiskTools
+                                    others: audit-only
+```
 ## Security Features
 ### Identity and Authorization
@@ -26,37 +223,143 @@ Native TypeScript security kernel for OpenClaw (`>=2026.2.25`) with deterministi
 - **Anti-spoofing**: privileged roles (`owner`/`admin`) are derived exclusively from `principal.ownerIds`/`adminIds` in config — caller-supplied `metadata.role` values of `"owner"` or `"admin"` are downgraded to `"member"`.
 - Group-aware authorization (mention-gating + role-based tool policy).
-### Approval Workflow
-- One-time owner approval challenges with TTL, action digest binding, anti-replay, and requester identity binding.
-- Optional persistent approval store (`approval.storagePath`) with storage path validation (must be within `workspaceRoot`) and expired record pruning.
-- Admin notification bridge (`NotificationSink`) for cross-session approval alerts. Ships with `ConsoleNotificationSink`, `CallbackNotificationSink`, and `NoopNotificationSink`.
-### Detection Pipeline (12 detectors, fixed order)
-- Input intent analysis: prompt injection, exfiltration patterns, context probing, and input limits.
-- Command allow/deny policy enforcement with shell operator blocking.
-- Path canonicalization with symlink traversal detection.
-- Network egress validation (host allowlist, private IP blocking).
-- Supply chain verification (skill source trust + hash integrity).
-- Principal authorization (role-based tool policy, group channel enforcement).
-- Owner approval gating (challenges for restricted actions).
-- Sensitive data detection and redaction (secrets, PII) via regex patterns.
-- Restricted-info redaction for non-privileged group principals.
-- Output safety checks for system prompt leaks and injected filename references.
-- Budget enforcement (per-principal partitioned limits).
-- External/custom validators (HTTP-based + user-injected, run concurrently).
-### Operational Controls
-- **Reason code sanitization**: sensitive internal reason codes (e.g. `PROMPT_INJECTION`) are replaced with `CONTENT_POLICY_VIOLATION` in client-facing output to prevent detection fingerprinting.
-- Principal-partitioned budgets (`agent + principal + conversation`).
-- Staged rollout controls (`stage_a_audit`, `stage_b_high_risk_enforce`, `stage_c_full_enforce`).
-- Monitoring snapshot with false-positive threshold signaling. `consecutiveDaysForTuning` is a pass-through config value for external systems; multi-day tracking is not built in.
-- Fail-closed by default — engine errors result in `DENY` unless explicitly configured otherwise.
-### Extensibility
-- **Immutable JSONL audit trail**: every `evaluate()` call optionally emits a structured `AuditEvent` to a JSONL file via `AuditSink`. Enable with `audit.enabled` + `audit.sinkPath`.
-- **Custom business rule validators**: inject domain-specific logic (spending limits, data access boundaries) via the `CustomValidator` interface without forking. Validators are phase-filtered and run concurrently with external validators.
-- **External validator integration**: optional HTTP-based semantic validation (jailbreak detection, PII scanning) via configurable endpoint. Circuit breaker (3 failures → 60s cooldown), configurable timeout, fail-open mode.
-- **Per-user token usage tracking**: records input/output token counts per user/conversation/tool via `TokenUsageStore`. JSONL persistence, per-user aggregation summaries emitted at `agent_end`. Token recording is wired through the `tool_result_persist` hook in the plugin adapter; direct engine users call `tokenUsageStore.record()` explicitly.
+### Owner Approval Workflow
+```
+┌─────────────────────────────────────────────────────────────────────────┐
+│ Phase 1: Challenge                                                      │
+│                                                                         │
+│  Agent ──► Engine: before_tool_call (restricted tool, member role)       │
+│    Engine ──► D6 Principal Authz: evaluateAuthorization()               │
+│    D6 ◄──── approvalRequirement (requiredRole, reason)                  │
+│    Engine ──► D7 Owner Approval: detectOwnerApproval(requirement)       │
+│      D7 ──► ApprovalBroker: createChallenge(toolName, args, requesterId)│
+│        ApprovalBroker: requestId = randomUUID()                         │
+│        ApprovalBroker: actionDigest = SHA-256({toolName, args, ...})    │
+│        ApprovalBroker ──► ApprovalStore: save(record, expiresAt)        │
+│        ApprovalBroker ──► NotificationSink: notify({requestId, ...})    │
+│      D7 ◄── { requestId, expiresAt, requiredRole }                     │
+│    Engine ◄── DENY + approvalChallenge                                  │
+│  Agent ◄── DENY with approvalChallenge.requestId                        │
+└─────────────────────────────────────────────────────────────────────────┘
+┌─────────────────────────────────────────────────────────────────────────┐
+│ Phase 2: Approval                                                       │
+│                                                                         │
+│  Owner ──► Engine: /approve <requestId>                                 │
+│    Engine ──► ApprovalBroker: approveRequest(requestId, ownerId, "owner")│
+│      ApprovalBroker ──► ApprovalStore: lookup(requestId)                │
+│      ApprovalBroker: Verify not expired, role sufficient, not self      │
+│      ApprovalBroker: Check quorum (approverIds.length >= ownerQuorum?) │
+│      ApprovalBroker: Generate token: apr_<uuid>                        │
+│      ApprovalBroker ──► ApprovalStore: setToken(requestId, token)      │
+│    Engine ◄── token string                                              │
+│  Owner ◄── "Approved. Token: apr_..."                                   │
+└─────────────────────────────────────────────────────────────────────────┘
+┌─────────────────────────────────────────────────────────────────────────┐
+│ Phase 3: Redemption                                                     │
+│                                                                         │
+│  Agent ──► Engine: before_tool_call (same tool + approval.token)        │
+│    Engine ──► D7: detectOwnerApproval(requirement)                      │
+│      D7 ──► ApprovalBroker: verifyAndConsumeToken(token)               │
+│        Verify: not expired, not used, conversation match                │
+│        Verify: action digest match (same tool + args)                   │
+│        ApprovalStore: markUsed(requestId)                               │
+│      D7 ◄── "valid"                                                    │
+│    Engine ◄── no hits (ALLOW)                                           │
+│  Agent ◄── ALLOW                                                        │
+└─────────────────────────────────────────────────────────────────────────┘
+┌─────────────────────────────────────────────────────────────────────────┐
+│ Replay Prevention                                                       │
+│                                                                         │
+│  Agent ──► Engine: before_tool_call (same token again)                  │
+│    D7 ──► ApprovalBroker: verifyAndConsumeToken(token)                 │
+│      Token already has usedAt timestamp                                 │
+│    D7 ◄── "replayed"                                                   │
+│    Engine ◄── DENY (OWNER_APPROVAL_REPLAYED)                           │
+│  Agent ◄── DENY                                                        │
+└─────────────────────────────────────────────────────────────────────────┘
+```
+**Approval verification checks** (in order):
+1. Token exists and maps to a valid record
+2. Record not expired (TTL from creation)
+3. Token not already consumed (`usedAt` is null)
+4. RequestId matches (if provided by caller)
+5. Requester identity matches original requester
+6. Conversation matches (if `bindToConversation` enabled)
+7. Action digest matches (SHA-256 of tool + args + context)
+### Outbound Guard (System Prompt Leak Prevention)
+```
+Agent ──► Adapter: message_sending(context)
+  │
+  │  extractOutboundContent()
+  │  (scans ALL string fields, not just "content")
+  │
+  ▼
+Adapter ──► Engine: evaluate(guardEvent, "message_sending")
+  │
+  ▼
+Engine ──► D10 Output Safety: check leak patterns + injected filenames
+  │
+  ├── System prompt content detected:
+  │     D10 → DENY (SYSTEM_PROMPT_LEAK, weight 0.95)
+  │     Agent ◄── { cancel: true }
+  │
+  ├── Suspicious patterns (script tags, tokens):
+  │     D10 → REDACT (UNTRUSTED_OUTPUT, weight 0.55)
+  │     Agent ◄── { content: redactedContent }
+  │
+  └── Clean:
+        D10 → no hits → ALLOW
+        Agent ◄── {}
+```
+### `tool_result_persist` — Split Sync/Async Strategy
+This hook is synchronous in OpenClaw, but the engine is async, so the plugin splits the work into a synchronous redaction path and a fire-and-forget async path:
+```
+OpenClaw (sync) ──► Extension: tool_result_persist(event, ctx)
+  │
+  ├── [Sync path — returns to OpenClaw immediately]
+  │     Extension: redactWithPatterns(content, precompiled patterns)
+  │     OpenClaw ◄── { message: { content: redacted } } or {}
+  │
+  └── [Async path — fire-and-forget]
+        Extension ──► Adapter: hooks.tool_result_persist(oclCtx)
+          Adapter: engine.evaluate() + metrics
+          Adapter ──► AuditSink: auditSink.append()
+          (Promise .catch() logs errors)
+```
+### Reason Code Sanitization
+Sensitive reason codes are replaced before reaching the client to prevent detection fingerprinting:
+| Internal Code | Client-Facing Code |
+|---|---|
+| `SECRET_DETECTED` | `CONTENT_POLICY_VIOLATION` |
+| `PII_DETECTED` | `CONTENT_POLICY_VIOLATION` |
+| `EXFIL_PATTERN` | `CONTENT_POLICY_VIOLATION` |
+| `SYSTEM_PROMPT_LEAK` | `CONTENT_POLICY_VIOLATION` |
+All other reason codes pass through unchanged.
+### Redaction Cascade
+Sensitive data, restricted info, and output safety detectors produce redacted content in a priority chain:
+```
+D8: Sensitive Data ──► D9: Restricted Info ──► D10: Output Safety ──► Engine picks:
+  (secrets → PII)       (data-class policy)     (leak patterns)        D10 > D9 > D8
+       │                       │                       │
+       └── redactedContent ──► └── redactedContent ──► └── Final redactedContent
+```
 ## Architecture
@@ -101,27 +404,28 @@ src/
 │       ├── restricted-info-detector.ts   # Non-privileged group redaction
 │       └── sensitive-data-detector.ts    # Secret/PII detection
 ├── plugin/
-│   ├── openclaw-adapter.ts           # OpenClaw hook adapter + summary telemetry
-│   └── openclaw-extension.ts         # Plugin entry point (registerOpenClawGuardrails)
+│   ├── version.ts                    # Shared version constant
+│   ├── event-adapter.ts              # OpenClaw typed hook ↔ internal context mapping
+│   ├── openclaw-adapter.ts           # Core guardrails engine adapter + telemetry
+│   └── openclaw-extension.ts         # Plugin entry point (api.on() typed hooks)
 ├── redaction/
-│   └── redact.ts                     # Secret/PII redaction engine
+│   └── redact.ts                     # Secret/PII redaction engine (cached regex)
 └── rules/
     ├── default-policy.ts             # Default config factory + merge
     └── patterns.ts                   # Detection pattern definitions
 ```
-## Owner Approval Flow
+## Provenance
-1. Member in group requests a restricted action.
-2. Engine returns `DENY` with `OWNER_APPROVAL_REQUIRED` and `approvalChallenge`.
-3. Owner/admin approves out-of-band and issues one-time token.
-4. Caller retries with `metadata.approval.token` (and optionally `requestId`).
-5. Engine verifies TTL, digest, conversation binding, requester identity binding, requestId (when provided), and replay status.
-6. Valid token allows reevaluation and execution.
+This package is published with [npm provenance](https://docs.npmjs.com/generating-provenance-statements) via GitHub Actions. Every published version includes a signed attestation linking the tarball to the exact source commit and build workflow in this repository.
-When `notifications.enabled` is true and a `NotificationSink` is configured, the broker automatically notifies admins when a new approval challenge is created.
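+You can verify the registry signatures and provenance attestations of the installed version by running the following in a project that depends on this package: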
-Approval works across all channel types (DM, group, thread), not just groups — group context merely triggers the initial challenge for restricted actions.
+```bash
+npm audit signatures
+```
+The publish workflow (`.github/workflows/publish.yml`) uses GitHub's OIDC token (`id-token: write`) to generate Sigstore-backed provenance statements automatically — no manual signing keys are involved.
 ## Install in OpenClaw
@@ -158,7 +462,13 @@ After changing plugin install/config, restart the OpenClaw service or gateway pr
 Three main entry points:
 ```ts
-// 1. Plugin factory — returns an OpenClaw-compatible plugin with hook handlers
+// 1. OpenClaw plugin — default export, auto-discovered by OpenClaw via
+//    package.json "openclaw.extensions". Registers all typed hooks via api.on().
+import { openclawGuardrailsPlugin } from "@safefence/openclaw-guardrails";
+// openclawGuardrailsPlugin.register(api) is called automatically by OpenClaw.
+// 2. Plugin factory — returns a guardrails engine with hook handlers,
+//    useful for testing or manual integration.
 import { createOpenClawGuardrailsPlugin } from "@safefence/openclaw-guardrails";
 const plugin = createOpenClawGuardrailsPlugin({
@@ -170,10 +480,6 @@ const plugin = createOpenClawGuardrailsPlugin({
 // Out-of-band owner approval
 const token = plugin.approveRequest(requestId, "owner-user-id", "owner");
-// 2. OpenClaw extension entry — auto-registers all hooks from plugin config
-import { registerOpenClawGuardrails } from "@safefence/openclaw-guardrails";
-registerOpenClawGuardrails(api);
 // 3. Engine directly — for custom integrations outside OpenClaw
 import { GuardrailsEngine } from "@safefence/openclaw-guardrails";
 const engine = new GuardrailsEngine(config);
@@ -242,6 +548,58 @@ const engine = new GuardrailsEngine(config, { customValidators: [spendingLimit]
 | Section | Key | Type | Default | Description |
 |---------|-----|------|---------|-------------|
+| *(root)* | `mode` | `"enforce" \| "audit"` | `"enforce"` | Whether violations block or just log |
+| *(root)* | `failClosed` | `boolean` | `true` | On engine error: DENY (true) or ALLOW (false) |
+| *(root)* | `workspaceRoot` | `string` | `process.cwd()` | Anchor for path resolution |
+| `allow` | `tools` | `string[]` | 8 tools | Allowed tool names |
+| `allow` | `commands` | `CommandEntry[]` | 6 binaries | Allowed binaries with optional argPattern |
+| `allow` | `writablePaths` | `string[]` | `[workspaceRoot]` | Filesystem write boundary |
+| `allow` | `networkHosts` | `string[]` | localhost only | Allowed egress hosts |
+| `allow` | `allowPrivateEgress` | `boolean` | `false` | Allow RFC 1918 / loopback destinations |
+| `deny` | `commandPatterns` | `string[]` | 8 patterns | Destructive command regexes |
+| `deny` | `pathPatterns` | `string[]` | 8 patterns | Path traversal regexes |
+| `deny` | `promptInjectionPatterns` | `string[]` | 6 patterns | Injection attempt regexes |
+| `deny` | `exfiltrationPatterns` | `string[]` | 4 patterns | Data exfiltration regexes |
+| `deny` | `shellOperatorPatterns` | `string[]` | 9 patterns | Shell chaining/redirect regexes |
+| `redaction` | `secretPatterns` | `string[]` | 7 patterns | Secret detection regexes (AWS, GitHub, PEM, etc.) |
+| `redaction` | `piiPatterns` | `string[]` | 4 patterns | PII detection regexes (email, SSN, CC, phone) |
+| `redaction` | `replacement` | `string` | `"[REDACTED]"` | Replacement string for matches |
+| `redaction` | `applyInAuditMode` | `boolean` | `true` | Redact even when mode=audit |
+| `limits` | `maxInputChars` | `number` | `20000` | Max input content length |
+| `limits` | `maxToolArgChars` | `number` | `10000` | Max serialized tool args length |
+| `limits` | `maxOutputChars` | `number` | `50000` | Max tool output length |
+| `limits` | `maxRequestsPerMinute` | `number` | `120` | Rate limit: requests per 60s window |
+| `limits` | `maxToolCallsPerMinute` | `number` | `60` | Rate limit: tool calls per 60s window |
+| `pathPolicy` | `enforceCanonicalRealpath` | `boolean` | `true` | Resolve symlinks and verify workspace boundary |
+| `pathPolicy` | `denySymlinkTraversal` | `boolean` | `true` | Block symlinks that escape workspace |
+| `supplyChain` | `trustedSkillSources` | `string[]` | — | Allowed skill installation domains |
+| `supplyChain` | `requireSkillHash` | `boolean` | `true` | Require hash for remote skills |
+| `supplyChain` | `allowedSkillHashes` | `string[]` | — | Pre-approved skill hashes |
+| `principal` | `requireContext` | `boolean` | `true` | Require identity context |
+| `principal` | `ownerIds` | `string[]` | `[]` | User IDs with owner privilege |
+| `principal` | `adminIds` | `string[]` | `[]` | User IDs with admin privilege |
+| `principal` | `failUnknownInGroup` | `boolean` | `true` | Deny unknown users in group channels |
+| `authorization` | `defaultEffect` | `"deny" \| "allow"` | `"deny"` | Default when no explicit rule matches |
+| `authorization` | `requireMentionInGroups` | `boolean` | `true` | Require @mention for group messages |
+| `authorization` | `restrictedTools` | `string[]` | 6 tools | Tools requiring elevated role or approval |
+| `authorization` | `restrictedDataClasses` | `string[]` | — | Data classes requiring elevated access |
+| `authorization` | `toolAllowByRole` | `Record<Role, string[]>` | Role-tiered | Per-role tool access lists |
+| `approval` | `enabled` | `boolean` | `true` | Enable owner approval workflow |
+| `approval` | `ttlSeconds` | `number` | `300` | Approval challenge TTL |
+| `approval` | `requireForTools` | `string[]` | 6 tools | Tools requiring approval |
+| `approval` | `requireForDataClasses` | `string[]` | `["restricted", "secret"]` | Data classes requiring approval |
+| `approval` | `ownerQuorum` | `number` | `1` | Number of approvers required |
+| `approval` | `bindToConversation` | `boolean` | `true` | Bind token to originating conversation |
+| `approval` | `storagePath` | `string?` | — | JSON file for persistent approvals |
+| `tenancy` | `budgetKeyMode` | `string` | `"agent+principal+conversation"` | Budget partitioning strategy |
+| `tenancy` | `redactCrossPrincipalOutput` | `boolean` | `true` | Redact vs deny for restricted data |
+| `outboundGuard` | `enabled` | `boolean` | `true` | Enable outbound leak prevention |
+| `outboundGuard` | `systemPromptLeakPatterns` | `string[]` | 8 patterns | Patterns indicating prompt leakage |
+| `outboundGuard` | `injectedFileNames` | `string[]` | 9 names | Config filenames to block in output |
+| `rollout` | `stage` | `RolloutStage` | `"stage_c_full_enforce"` | Current enforcement stage |
+| `rollout` | `highRiskTools` | `string[]` | — | Tools enforced in stage B |
+| `monitoring` | `falsePositiveThresholdPct` | `number` | `3` | False positive rate threshold |
+| `monitoring` | `consecutiveDaysForTuning` | `number` | `2` | Days above threshold before signaling |
 | `audit` | `enabled` | `boolean` | `false` | Enable JSONL audit trail |
 | `audit` | `sinkPath` | `string?` | — | File path for JSONL audit events |
 | `externalValidation` | `enabled` | `boolean` | `false` | Enable HTTP external validators |
@@ -254,27 +612,16 @@ const engine = new GuardrailsEngine(config, { customValidators: [spendingLimit]
 | `notifications` | `enabled` | `boolean` | `false` | Enable approval notifications |
 | `notifications` | `adminChannelId` | `string?` | — | Target channel for notifications |
-## Config Example (Minimal Overrides)
-Most config has secure defaults. Override only what you need:
-```ts
-const plugin = createOpenClawGuardrailsPlugin({
-  workspaceRoot: "/workspace/project",
-  principal: {
-    ownerIds: ["owner-user-id"],
-    adminIds: ["admin-user-id"]
-  },
-  approval: {
-    enabled: true,
-    storagePath: "/workspace/project/.openclaw/approval-store.json"
-  }
-});
-```
+## Migration
-See the [research doc](../../docs/openclaw-llm-security-research.md) for a full config reference with all fields.
+### v0.6.0 → v0.6.1
-## Migration
+1. **Plugin API alignment**: The plugin now uses OpenClaw's typed hook system (`api.on()`) instead of `api.registerHook()`. Security decisions (block, cancel, redact) are now properly honoured by OpenClaw's pipeline — previously they were silently discarded.
+2. **New event adapter layer**: `src/plugin/event-adapter.ts` bridges OpenClaw's structured `(event, ctx)` hook pairs to the internal `OpenClawContext`. No changes needed for users of `createOpenClawGuardrailsPlugin()` or `GuardrailsEngine` directly.
+3. **Plugin export**: The default export is now an `{ id, name, version, register }` object (compatible with `resolvePluginModuleExport()`). The `registerOpenClawGuardrails` named export is preserved for backward compatibility.
+4. **`tool_result_persist` sync redaction**: Uses the existing `redactWithPatterns()` utility for synchronous redaction. Full async engine evaluation runs fire-and-forget for audit/metrics.
+5. **Manifest cleaned**: Removed unrecognized `entry` and `hooks` fields from `openclaw.plugin.json`. Set `additionalProperties: false` on root config schema.
+6. **Peer dependency**: `openclaw` is now declared as a `peerDependency` (`>=2026.2.25`).
 ### v0.5.x → v0.6.0

package/dist/core/token-usage-store.d.ts CHANGED Viewed

@@ -12,7 +12,12 @@ export interface TokenUsageRecord {
 export declare class TokenUsageStore {
     private records;
     private writer;
+    private totalInput;
+    private totalOutput;
+    private totalTokens;
+    private byUser;
     constructor(storagePath?: string);
+    private addToCounters;
     record(entry: TokenUsageRecord): void;
     getByUser(senderId: string): TokenUsageRecord[];
     getSummary(): TokenUsageSummary;

package/dist/core/token-usage-store.js CHANGED Viewed

@@ -2,41 +2,45 @@ import { JsonlWriter, readJsonlFile } from "./jsonl-writer.js";
 export class TokenUsageStore {
     records = [];
     writer = null;
+    totalInput = 0;
+    totalOutput = 0;
+    totalTokens = 0;
+    byUser = {};
     constructor(storagePath) {
         if (storagePath) {
             this.records = readJsonlFile(storagePath);
             this.writer = new JsonlWriter(storagePath);
+            for (const r of this.records) {
+                this.addToCounters(r);
+            }
+        }
+    }
+    addToCounters(r) {
+        this.totalInput += r.inputTokens;
+        this.totalOutput += r.outputTokens;
+        this.totalTokens += r.totalTokens;
+        if (!this.byUser[r.senderId]) {
+            this.byUser[r.senderId] = { input: 0, output: 0, total: 0 };
         }
+        this.byUser[r.senderId].input += r.inputTokens;
+        this.byUser[r.senderId].output += r.outputTokens;
+        this.byUser[r.senderId].total += r.totalTokens;
     }
     record(entry) {
         this.records.push(entry);
+        this.addToCounters(entry);
         this.writer?.append(entry);
     }
     getByUser(senderId) {
         return this.records.filter((r) => r.senderId === senderId);
     }
     getSummary() {
-        const byUser = {};
-        let totalInput = 0;
-        let totalOutput = 0;
-        let totalTokens = 0;
-        for (const r of this.records) {
-            totalInput += r.inputTokens;
-            totalOutput += r.outputTokens;
-            totalTokens += r.totalTokens;
-            if (!byUser[r.senderId]) {
-                byUser[r.senderId] = { input: 0, output: 0, total: 0 };
-            }
-            byUser[r.senderId].input += r.inputTokens;
-            byUser[r.senderId].output += r.outputTokens;
-            byUser[r.senderId].total += r.totalTokens;
-        }
         return {
-            totalInputTokens: totalInput,
-            totalOutputTokens: totalOutput,
-            totalTokens,
+            totalInputTokens: this.totalInput,
+            totalOutputTokens: this.totalOutput,
+            totalTokens: this.totalTokens,
             recordCount: this.records.length,
-            byUser
+            byUser: { ...this.byUser }
         };
     }
     close() {

package/dist/plugin/event-adapter.d.ts CHANGED Viewed

@@ -101,15 +101,7 @@ export interface AgentEndEvent {
     error?: string;
     durationMs?: number;
 }
-export interface AgentEndContext {
-    agentId?: string;
-    sessionKey?: string;
-    sessionId?: string;
-    workspaceDir?: string;
-    messageProvider?: string;
-    trigger?: string;
-    channelId?: string;
-}
+export type AgentEndContext = BeforeAgentStartContext;
 export declare function mapBeforeAgentStart(event: BeforeAgentStartEvent, ctx: BeforeAgentStartContext): OpenClawContext;
 export declare function mapMessageReceived(event: MessageReceivedEvent, ctx: MessageReceivedContext): OpenClawContext;
 export declare function mapBeforeToolCall(event: BeforeToolCallEvent, ctx: BeforeToolCallContext): OpenClawContext;

package/dist/plugin/openclaw-adapter.d.ts CHANGED Viewed

@@ -34,6 +34,7 @@ export interface OpenClawHookResult extends OpenClawContext {
 export interface OpenClawPlugin {
     name: string;
     version: string;
+    config: GuardrailsConfig;
     approveRequest: (requestId: string, approverId: string, approverRole: ApproverRole) => string | null;
     hooks: {
         before_agent_start: (context: OpenClawContext) => Promise<OpenClawHookResult>;

package/dist/plugin/openclaw-adapter.js CHANGED Viewed

@@ -1,9 +1,10 @@
 import { JsonlAuditSink, NoopAuditSink } from "../core/audit-sink.js";
 import { GuardrailsEngine } from "../core/engine.js";
-import { unique } from "../core/event-utils.js";
+import { isObject, unique } from "../core/event-utils.js";
 import { ConsoleNotificationSink } from "../core/notification-sink.js";
 import { REASON_CODES } from "../core/reason-codes.js";
 import { TokenUsageStore } from "../core/token-usage-store.js";
+import { PLUGIN_VERSION } from "./version.js";
 import { createDefaultConfig, mergeConfig } from "../rules/default-policy.js";
 // Reason codes that reveal what type of sensitive content was detected.
 // Map these to a generic code before exposing in hook results to prevent
@@ -249,7 +250,7 @@ function buildMonitoringSnapshot(config, metrics) {
     };
 }
 function isPluginOptions(arg) {
-    if (!arg || typeof arg !== "object")
+    if (!isObject(arg))
         return false;
     const obj = arg;
     // PluginOptions has keys that never appear on GuardrailsConfig
@@ -282,7 +283,7 @@ export function createOpenClawGuardrailsPlugin(overridesOrOptions = {}) {
     });
     const metrics = createMetrics();
     console.log("[guardrails] plugin created", {
-        version: "0.6.0",
+        version: PLUGIN_VERSION,
         outboundGuardEnabled: config.outboundGuard.enabled,
         injectedFileNames: config.outboundGuard.injectedFileNames,
         mode: config.mode
@@ -295,11 +296,11 @@ export function createOpenClawGuardrailsPlugin(overridesOrOptions = {}) {
     };
     return {
         name: "openclaw-guardrails",
-        version: "0.6.0",
+        version: PLUGIN_VERSION,
+        config,
         approveRequest: (requestId, approverId, approverRole) => engine.approveRequest(requestId, approverId, approverRole),
         hooks: {
             async before_agent_start(context) {
-                console.log("[guardrails:before_agent_start] hook fired", { contextKeys: Object.keys(context) });
                 const decision = await evaluate("before_agent_start", context);
                 const guardPrompt = buildGuardPrompt(config);
                 const existingPrompt = typeof context.systemPrompt === "string"
@@ -323,10 +324,6 @@ export function createOpenClawGuardrailsPlugin(overridesOrOptions = {}) {
                 return output;
             },
             async message_received(context) {
-                console.log("[guardrails:message_received] hook fired", {
-                    contextKeys: Object.keys(context),
-                    contentPreview: typeof context.content === "string" ? context.content.slice(0, 120) : undefined
-                });
                 const decision = await evaluate("message_received", context);
                 const transformedContext = decision.redactedContent
                     ? upsertContentField(context, decision.redactedContent)
@@ -396,19 +393,6 @@ export function createOpenClawGuardrailsPlugin(overridesOrOptions = {}) {
                 };
             },
             async message_sending(context) {
-                const aggregated = extractOutboundContent(context);
-                const stringFields = {};
-                for (const [k, v] of Object.entries(context)) {
-                    if (typeof v === "string" && v.length > 0) {
-                        stringFields[k] = v.length > 120 ? v.slice(0, 120) + "…" : v;
-                    }
-                }
-                console.log("[guardrails:message_sending] hook fired", {
-                    aggregatedLength: aggregated.length,
-                    aggregatedPreview: aggregated.slice(0, 200),
-                    stringFields,
-                    contextKeys: Object.keys(context)
-                });
                 if (!config.outboundGuard.enabled) {
                     return { ...context };
                 }

package/dist/plugin/openclaw-extension.js CHANGED Viewed

@@ -7,8 +7,9 @@
  * - `api.logger` for structured logging
  * - `api.registerCommand()` for the /approve command
  */
-import { createDefaultConfig, mergeConfig } from "../rules/default-policy.js";
+import { redactWithPatterns } from "../redaction/redact.js";
 import { createOpenClawGuardrailsPlugin } from "./openclaw-adapter.js";
+import { PLUGIN_VERSION } from "./version.js";
 import { mapBeforeAgentStart, mapMessageReceived, mapBeforeToolCall, mapToolResultPersist, mapMessageSending, mapAgentEnd, mapToBeforeAgentStartResult, mapToBeforeToolCallResult, mapToMessageSendingResult, } from "./event-adapter.js";
 // ---------------------------------------------------------------------------
 // Plugin definition
@@ -16,12 +17,12 @@ import { mapBeforeAgentStart, mapMessageReceived, mapBeforeToolCall, mapToolResu
 const plugin = {
     id: "openclaw-guardrails",
     name: "OpenClaw Guardrails",
-    version: "0.6.0",
+    version: PLUGIN_VERSION,
     register(api) {
         const rawConfig = (api.pluginConfig ?? {});
         const log = api.logger;
-        const mergedConfig = mergeConfig(createDefaultConfig(rawConfig.workspaceRoot ?? process.cwd()), rawConfig);
         const guardrails = createOpenClawGuardrailsPlugin(rawConfig);
+        const mergedConfig = guardrails.config;
         log.info(`[guardrails] plugin registered (v${guardrails.version}, mode=${mergedConfig.mode})`);
         // ------------------------------------------------------------------
         // before_agent_start — inject security policy prompt
@@ -64,6 +65,12 @@ const plugin = {
         // Outbound content redaction is still enforced by the async
         // `message_sending` hook, which catches leaks before they reach users.
         // ------------------------------------------------------------------
+        // Pre-compile redaction patterns once (config is immutable after merge).
+        const allRedactionPatterns = [
+            ...mergedConfig.redaction.secretPatterns,
+            ...mergedConfig.redaction.piiPatterns,
+        ];
+        const redactionReplacement = mergedConfig.redaction.replacement;
         api.on("tool_result_persist", (event, ctx) => {
             const oclCtx = mapToolResultPersist(event, ctx);
             // Fire engine evaluation async for audit trail and metrics.
@@ -71,31 +78,12 @@ const plugin = {
             guardrails.hooks.tool_result_persist(oclCtx).catch((err) => {
                 log.error(`[guardrails:tool_result_persist] async audit failed: ${String(err)}`);
             });
-            // Sync redaction: apply sensitive-data patterns directly to the
-            // message content if available, without the full engine pipeline.
-            const content = typeof event.message?.content === "string"
-                ? event.message.content
-                : undefined;
-            if (content) {
-                const allPatterns = [
-                    ...mergedConfig.redaction.secretPatterns,
-                    ...mergedConfig.redaction.piiPatterns,
-                ];
-                if (allPatterns.length > 0) {
-                    const replacement = mergedConfig.redaction.replacement;
-                    let redacted = content;
-                    for (const pattern of allPatterns) {
-                        try {
-                            const regex = new RegExp(pattern, "gi");
-                            redacted = redacted.replace(regex, replacement);
-                        }
-                        catch {
-                            // skip invalid patterns
-                        }
-                    }
-                    if (redacted !== content) {
-                        return { message: { ...event.message, content: redacted } };
-                    }
+            // Sync redaction: reuse content already extracted by the mapper.
+            const content = oclCtx.output;
+            if (content && allRedactionPatterns.length > 0) {
+                const { redacted } = redactWithPatterns(content, allRedactionPatterns, redactionReplacement);
+                if (redacted !== content) {
+                    return { message: { ...event.message, content: redacted } };
                 }
             }
             return {};

package/dist/plugin/version.d.ts ADDED Viewed

	@@ -0,0 +1 @@
1	+ export declare const PLUGIN_VERSION = "0.6.2";

package/dist/plugin/version.js ADDED Viewed

	@@ -0,0 +1 @@
1	+ export const PLUGIN_VERSION = "0.6.2";

package/dist/redaction/redact.js CHANGED Viewed

@@ -1,4 +1,14 @@
 import { compilePatterns } from "../rules/patterns.js";
+const patternCache = new Map();
+function getCachedPatterns(patterns, flags) {
+    const key = flags + "\0" + patterns.join("\0");
+    let cached = patternCache.get(key);
+    if (!cached) {
+        cached = compilePatterns(patterns, flags);
+        patternCache.set(key, cached);
+    }
+    return cached;
+}
 export function redactWithPatterns(input, patterns, replacement) {
     if (!input) {
         return {
@@ -8,13 +18,15 @@ export function redactWithPatterns(input, patterns, replacement) {
     }
     let redacted = input;
     const matches = new Set();
-    const regexes = compilePatterns(patterns, "gi");
+    const regexes = getCachedPatterns(patterns, "gi");
     for (const regex of regexes) {
+        regex.lastIndex = 0;
         for (const match of input.matchAll(regex)) {
             if (match[0]) {
                 matches.add(match[0]);
             }
         }
+        regex.lastIndex = 0;
         redacted = redacted.replace(regex, replacement);
     }
     return {
@@ -26,6 +38,9 @@ export function hasPatternMatch(input, patterns) {
     if (!input) {
         return false;
     }
-    const regexes = compilePatterns(patterns, "gi");
-    return regexes.some((regex) => regex.test(input));
+    const regexes = getCachedPatterns(patterns, "gi");
+    return regexes.some((regex) => {
+        regex.lastIndex = 0;
+        return regex.test(input);
+    });
 }

package/openclaw.plugin.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "id": "openclaw-guardrails",
   "name": "openclaw-guardrails",
-  "version": "0.6.0",
+  "version": "0.6.3",
   "description": "Deterministic local guardrails for OpenClaw hooks",
   "configSchema": {
     "type": "object",

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@safefence/openclaw-guardrails",
-  "version": "0.6.1",
+  "version": "0.6.3",
   "description": "Native deterministic guardrails plugin for OpenClaw",
   "openclaw": {
     "extensions": [
@@ -19,7 +19,10 @@
     "build": "tsc -p tsconfig.json",
     "test": "vitest run",
     "test:coverage": "vitest run --coverage",
-    "test:watch": "vitest"
+    "test:watch": "vitest",
+    "preversion": "npm test && npm run build",
+    "version": "bash scripts/sync-version.sh",
+    "postversion": "echo '\nRun this to publish via CI:\n  git push origin master --tags'"
   },
   "engines": {
     "node": ">=20"
@@ -31,6 +34,14 @@
     "owasp",
     "llm"
   ],
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/douglasswm/safefence.git",
+    "directory": "packages/openclaw-guardrails"
+  },
+  "publishConfig": {
+    "provenance": true
+  },
   "license": "MIT",
   "peerDependencies": {
     "openclaw": ">=2026.2.25"