@circuitwall/jarela 0.9.3 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.next/standalone/.next/BUILD_ID +1 -1
- package/.next/standalone/.next/app-path-routes-manifest.json +2 -2
- package/.next/standalone/.next/build-manifest.json +2 -2
- package/.next/standalone/.next/prerender-manifest.json +3 -3
- package/.next/standalone/.next/server/app/_global-error/page_client-reference-manifest.js +1 -1
- package/.next/standalone/.next/server/app/_global-error.html +1 -1
- package/.next/standalone/.next/server/app/_global-error.rsc +1 -1
- package/.next/standalone/.next/server/app/_global-error.segments/_full.segment.rsc +1 -1
- package/.next/standalone/.next/server/app/_global-error.segments/_global-error/__PAGE__.segment.rsc +1 -1
- package/.next/standalone/.next/server/app/_global-error.segments/_global-error.segment.rsc +1 -1
- package/.next/standalone/.next/server/app/_global-error.segments/_head.segment.rsc +1 -1
- package/.next/standalone/.next/server/app/_global-error.segments/_index.segment.rsc +1 -1
- package/.next/standalone/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
- package/.next/standalone/.next/server/app/_not-found/page_client-reference-manifest.js +1 -1
- package/.next/standalone/.next/server/app/_not-found.html +2 -2
- package/.next/standalone/.next/server/app/_not-found.rsc +2 -2
- package/.next/standalone/.next/server/app/_not-found.segments/_full.segment.rsc +2 -2
- package/.next/standalone/.next/server/app/_not-found.segments/_head.segment.rsc +1 -1
- package/.next/standalone/.next/server/app/_not-found.segments/_index.segment.rsc +2 -2
- package/.next/standalone/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +1 -1
- package/.next/standalone/.next/server/app/_not-found.segments/_not-found.segment.rsc +1 -1
- package/.next/standalone/.next/server/app/_not-found.segments/_tree.segment.rsc +2 -2
- package/.next/standalone/.next/server/app/api/v1/dashboard/metrics/route.js +72 -5
- package/.next/standalone/.next/server/app/api/v1/dashboard/metrics/route.js.map +1 -1
- package/.next/standalone/.next/server/app/api/v1/extensions/route.js +2 -2
- package/.next/standalone/.next/server/app/api/v1/extensions/tools/[name]/secrets/route.js +2 -2
- package/.next/standalone/.next/server/app/api/v1/threads/[thread_id]/run/route.js +136 -26
- package/.next/standalone/.next/server/app/api/v1/threads/[thread_id]/run/route.js.map +1 -1
- package/.next/standalone/.next/server/app/api/v1/tools/route.js +2 -2
- package/.next/standalone/.next/server/app/index.html +2 -2
- package/.next/standalone/.next/server/app/index.rsc +3 -3
- package/.next/standalone/.next/server/app/index.segments/__PAGE__.segment.rsc +2 -2
- package/.next/standalone/.next/server/app/index.segments/_full.segment.rsc +3 -3
- package/.next/standalone/.next/server/app/index.segments/_head.segment.rsc +1 -1
- package/.next/standalone/.next/server/app/index.segments/_index.segment.rsc +2 -2
- package/.next/standalone/.next/server/app/index.segments/_tree.segment.rsc +2 -2
- package/.next/standalone/.next/server/app/page.js +266 -40
- package/.next/standalone/.next/server/app/page.js.map +1 -1
- package/.next/standalone/.next/server/app/page_client-reference-manifest.js +1 -1
- package/.next/standalone/.next/server/app/setup/page_client-reference-manifest.js +1 -1
- package/.next/standalone/.next/server/app/setup.html +1 -1
- package/.next/standalone/.next/server/app/setup.rsc +2 -2
- package/.next/standalone/.next/server/app/setup.segments/_full.segment.rsc +2 -2
- package/.next/standalone/.next/server/app/setup.segments/_head.segment.rsc +1 -1
- package/.next/standalone/.next/server/app/setup.segments/_index.segment.rsc +2 -2
- package/.next/standalone/.next/server/app/setup.segments/_tree.segment.rsc +2 -2
- package/.next/standalone/.next/server/app/setup.segments/setup/__PAGE__.segment.rsc +1 -1
- package/.next/standalone/.next/server/app/setup.segments/setup.segment.rsc +1 -1
- package/.next/standalone/.next/server/app-paths-manifest.json +2 -2
- package/.next/standalone/.next/server/chunks/210.js +1 -1
- package/.next/standalone/.next/server/chunks/2151.js +60 -2
- package/.next/standalone/.next/server/chunks/2151.js.map +1 -1
- package/.next/standalone/.next/server/chunks/614.js +336 -93
- package/.next/standalone/.next/server/chunks/614.js.map +1 -1
- package/.next/standalone/.next/server/chunks/6765.js +35 -0
- package/.next/standalone/.next/server/chunks/6765.js.map +1 -1
- package/.next/standalone/.next/server/chunks/8697.js +15246 -15002
- package/.next/standalone/.next/server/chunks/8697.js.map +1 -1
- package/.next/standalone/.next/server/middleware-build-manifest.js +2 -2
- package/.next/standalone/.next/server/pages/404.html +2 -2
- package/.next/standalone/.next/server/pages/500.html +1 -1
- package/.next/standalone/.next/server/server-reference-manifest.json +1 -1
- package/.next/standalone/.next/static/chunks/{3741-344e2bfc5028b9c8.js → 3741-2d64471ff763b8fa.js} +36 -1
- package/.next/standalone/.next/static/chunks/3741-2d64471ff763b8fa.js.map +1 -0
- package/.next/standalone/.next/static/chunks/app/{page-c77ab600642bbfc2.js → page-318743bf47fac345.js} +267 -41
- package/.next/standalone/.next/static/chunks/app/page-318743bf47fac345.js.map +1 -0
- package/.next/standalone/.next/static/css/b6b85b0f13bc0e98.css +5 -0
- package/.next/standalone/.next/static/css/b6b85b0f13bc0e98.css.map +1 -0
- package/.next/standalone/package.json +1 -1
- package/CHANGELOG.md +48 -0
- package/README.md +2 -0
- package/api/client.ts +37 -1
- package/api/types.ts +18 -0
- package/app/api/v1/threads/[thread_id]/run/route.ts +69 -22
- package/components/agents/AgentEditor.tsx +7 -4
- package/components/chat/MessageBubble.tsx +108 -1
- package/components/dashboard/DashboardPanel.tsx +79 -21
- package/hooks/useSSE.ts +22 -9
- package/lib/agents/prepare/system-prompt.ts +30 -0
- package/lib/agents/run-registry.test.ts +94 -0
- package/lib/agents/run-registry.ts +60 -1
- package/lib/stores/dashboard-metrics.test.ts +33 -0
- package/lib/stores/dashboard-metrics.ts +93 -1
- package/lib/tools/exec.ts +9 -5
- package/lib/tools/files.ts +6 -0
- package/lib/tools/safety.test.ts +95 -0
- package/lib/tools/safety.ts +147 -0
- package/package.json +1 -1
- package/.next/standalone/.next/static/chunks/3741-344e2bfc5028b9c8.js.map +0 -1
- package/.next/standalone/.next/static/chunks/app/page-c77ab600642bbfc2.js.map +0 -1
- package/.next/standalone/.next/static/css/53f85613a5500253.css +0 -5
- package/.next/standalone/.next/static/css/53f85613a5500253.css.map +0 -1
- /package/.next/standalone/.next/static/{6uLoytvvEtLKIblEB53e0 → 8qTBpUDFnSMYwe3Zc0bGV}/_buildManifest.js +0 -0
- /package/.next/standalone/.next/static/{6uLoytvvEtLKIblEB53e0 → 8qTBpUDFnSMYwe3Zc0bGV}/_ssgManifest.js +0 -0
|
@@ -15,6 +15,23 @@ type Subscriber = (chunk: StreamChunk) => void;
|
|
|
15
15
|
|
|
16
16
|
const MAX_BUFFERED = 4000; // text_delta chunks accumulate fast; cap them
|
|
17
17
|
const RECENT_TTL_MS = 5 * 60_000; // keep finished runs visible for 5 min
|
|
18
|
+
// Idle (no-progress) ceiling: if no chunk has been broadcast for this
|
|
19
|
+
// long the registry assumes the LLM/tool call wedged and force-finishes
|
|
20
|
+
// the run. This is the user-perceived "stream is dead" signal and is
|
|
21
|
+
// short by design — long legitimate turns keep streaming text/tool
|
|
22
|
+
// chunks, so they reset the idle clock on every broadcast(). The
|
|
23
|
+
// wall-clock ceiling (runMaxMs) is the absolute safety net for the
|
|
24
|
+
// degenerate case where broadcast() is never called at all (or fires
|
|
25
|
+
// faster than the idle window forever).
|
|
26
|
+
// Override with JARELA_RUN_IDLE_MS / JARELA_RUN_MAX_MS.
|
|
27
|
+
/**
 * Idle (no-progress) threshold in milliseconds.
 *
 * Reads `JARELA_RUN_IDLE_MS`. Anything that does not parse to a finite
 * number greater than zero (unset, empty, non-numeric, zero, negative)
 * falls back to the 90 second default.
 */
function runIdleMs(): number {
  const fallbackMs = 90_000;
  const configured = Number(process.env.JARELA_RUN_IDLE_MS);
  if (!Number.isFinite(configured) || configured <= 0) {
    return fallbackMs;
  }
  return configured;
}
|
|
31
|
+
/**
 * Wall-clock ceiling for a single run, in milliseconds.
 *
 * Reads `JARELA_RUN_MAX_MS`. Anything that does not parse to a finite
 * number greater than zero falls back to the 15 minute default.
 */
function runMaxMs(): number {
  const fallbackMs = 15 * 60_000;
  const configured = Number(process.env.JARELA_RUN_MAX_MS);
  if (!Number.isFinite(configured) || configured <= 0) {
    return fallbackMs;
  }
  return configured;
}
|
|
18
35
|
|
|
19
36
|
export interface ActiveRun {
|
|
20
37
|
thread_id: string;
|
|
@@ -30,6 +47,9 @@ export interface ActiveRun {
|
|
|
30
47
|
// disconnects), we signal this controller so the LangGraph stream cancels
|
|
31
48
|
// itself instead of running to completion in the background.
|
|
32
49
|
abort: AbortController;
|
|
50
|
+
// Last activity timestamp — bumped on every broadcast() so the idle
|
|
51
|
+
// watchdog can tell live progress from a wedged stream.
|
|
52
|
+
last_chunk_at: number;
|
|
33
53
|
}
|
|
34
54
|
|
|
35
55
|
const runs = new Map<string, ActiveRun>();
|
|
@@ -40,25 +60,64 @@ export function startRun(thread_id: string, agent_id: string | null): ActiveRun
|
|
|
40
60
|
if (existing && existing.status === "running") {
|
|
41
61
|
throw new Error(`A run is already active for thread ${thread_id}`);
|
|
42
62
|
}
|
|
63
|
+
const now = Date.now();
|
|
43
64
|
const run: ActiveRun = {
|
|
44
65
|
thread_id,
|
|
45
66
|
agent_id,
|
|
46
|
-
started_at:
|
|
67
|
+
started_at: now,
|
|
47
68
|
finished_at: null,
|
|
48
69
|
status: "running",
|
|
49
70
|
events: [],
|
|
50
71
|
subscribers: new Set(),
|
|
51
72
|
final_text: "",
|
|
52
73
|
abort: new AbortController(),
|
|
74
|
+
last_chunk_at: now,
|
|
53
75
|
};
|
|
54
76
|
runs.set(thread_id, run);
|
|
77
|
+
scheduleIdleWatchdog(run);
|
|
78
|
+
scheduleMaxWatchdog(run);
|
|
55
79
|
return run;
|
|
56
80
|
}
|
|
57
81
|
|
|
82
|
+
// Self-rearming idle watchdog. Fires when no chunk has arrived for
|
|
83
|
+
// `idleMs`; otherwise reschedules itself for `(last_chunk_at + idleMs) -
|
|
84
|
+
// now`. We never carry a handle on the run — the closure just bails if
|
|
85
|
+
// the run is no longer the registry's entry or no longer running.
|
|
86
|
+
/**
 * Arms the self-rearming idle watchdog for `run`.
 *
 * The timer is due at `run.last_chunk_at + runIdleMs()`. When it fires it:
 *  - does nothing if the registry entry for the thread is no longer this
 *    run, or the run is no longer "running";
 *  - re-arms itself if `last_chunk_at` moved forward in the meantime;
 *  - otherwise aborts the run with reason "run_idle_timeout" and finishes
 *    it with status "error".
 *
 * No timer handle is kept on the run; the timer is unref'd where the
 * runtime supports it.
 */
function scheduleIdleWatchdog(run: ActiveRun): void {
  // Node coerces any delay above 2^31-1 ms to 1 ms. Without this cap a very
  // large JARELA_RUN_IDLE_MS would make the watchdog wake every millisecond
  // (fire, see idle < idleMs, re-arm) instead of sleeping.
  const MAX_TIMER_DELAY_MS = 2_147_483_647;
  const idleMs = runIdleMs();
  const dueIn = (run.last_chunk_at + idleMs) - Date.now();
  const fireIn = Math.min(Math.max(0, dueIn), MAX_TIMER_DELAY_MS);
  setTimeout(() => {
    const cur = runs.get(run.thread_id);
    if (cur !== run) return;
    if (run.status !== "running") return;
    const idle = Date.now() - run.last_chunk_at;
    if (idle < idleMs) {
      // Progress was made (or the delay was capped): wait out the remainder.
      scheduleIdleWatchdog(run);
      return;
    }
    console.warn(`[run-registry] idle watchdog: force-finishing stalled run for thread ${run.thread_id} after ${idle}ms of no progress`);
    // Best-effort abort; the run is force-finished even if abort() throws.
    try { run.abort.abort("run_idle_timeout"); } catch { /* */ }
    finishRun(run, "error");
  }, fireIn).unref?.();
}
|
|
103
|
+
|
|
104
|
+
/**
 * Arms the one-shot wall-clock watchdog for `run`.
 *
 * After `runMaxMs()` milliseconds, if `run` is still the registry entry for
 * its thread and still "running", it is aborted with reason
 * "run_watchdog_timeout" and finished with status "error". The timer is
 * unref'd where the runtime supports it.
 */
function scheduleMaxWatchdog(run: ActiveRun): void {
  // Node coerces any delay above 2^31-1 ms to 1 ms, so an oversized
  // JARELA_RUN_MAX_MS would kill every run almost immediately. Cap the
  // delay at the largest value the timer honours (about 24.8 days).
  const MAX_TIMER_DELAY_MS = 2_147_483_647;
  const max = runMaxMs();
  const delay = Math.min(max, MAX_TIMER_DELAY_MS);
  setTimeout(() => {
    const cur = runs.get(run.thread_id);
    if (cur !== run) return;
    if (run.status !== "running") return;
    console.warn(`[run-registry] wall-clock watchdog: force-finishing run for thread ${run.thread_id} after ${max}ms`);
    // Best-effort abort; the run is force-finished even if abort() throws.
    try { run.abort.abort("run_watchdog_timeout"); } catch { /* */ }
    finishRun(run, "error");
  }, delay).unref?.();
}
|
|
115
|
+
|
|
58
116
|
export function broadcast(run: ActiveRun, chunk: StreamChunk): void {
|
|
59
117
|
// Identity-check: a superseded run must not smear trailing chunks onto
|
|
60
118
|
// the replacement entry in the registry.
|
|
61
119
|
if (runs.get(run.thread_id) !== run) return;
|
|
120
|
+
run.last_chunk_at = Date.now();
|
|
62
121
|
if (chunk.type === "text_delta") {
|
|
63
122
|
run.final_text += (chunk.data.delta as string) ?? "";
|
|
64
123
|
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { computeDataQuality } from "./dashboard-metrics";
|
|
3
|
+
|
|
4
|
+
// Pins the measured/estimated ratio reported by computeDataQuality,
// including the "no traffic" special case and four-decimal rounding.
describe("computeDataQuality", () => {
  it("treats empty windows as fully measured to avoid a misleading red chip", () => {
    const quality = computeDataQuality(0, 0);
    expect(quality).toEqual({ measured_messages: 0, estimated_messages: 0, measured_pct: 1 });
  });

  it("reports the measured ratio for mixed windows", () => {
    const quality = computeDataQuality(9, 1);
    expect(quality).toEqual({ measured_messages: 9, estimated_messages: 1, measured_pct: 0.9 });
  });

  it("rounds to four decimals", () => {
    expect(computeDataQuality(1, 2).measured_pct).toBe(0.3333);
  });

  it("reports 0% when every row is a legacy estimate", () => {
    const quality = computeDataQuality(0, 7);
    expect(quality).toEqual({ measured_messages: 0, estimated_messages: 7, measured_pct: 0 });
  });
});
|
|
@@ -7,6 +7,24 @@ import type { PersistedToolEvent } from "@/lib/stores/threads";
|
|
|
7
7
|
const CHARS_PER_TOKEN = 4;
|
|
8
8
|
const DEFAULT_WINDOW_DAYS = 30;
|
|
9
9
|
|
|
10
|
+
/**
 * Input-token totals split by tier, as exposed on dashboard responses.
 */
export interface DashboardTierTokens {
  /** Tokens summed from the snapshot `hot_tokens` column. */
  hot_tokens: number;
  /** Tokens summed from the snapshot `warm_tokens` column. */
  warm_tokens: number;
  /** Tokens summed from the snapshot `facts_tokens` column. */
  facts_tokens: number;
  /** Tokens summed from the snapshot `overhead_tokens` column. */
  overhead_tokens: number;
  /** Sum of the four tiers — convenience for stacked-bar totals. */
  measured_input_tokens: number;
}
|
|
18
|
+
|
|
19
|
+
/**
 * How much of the window's assistant traffic is backed by real usage
 * snapshots rather than content-length estimates.
 */
export interface DashboardDataQuality {
  /** Assistant turns in the window that have an immutable message_usage snapshot. */
  measured_messages: number;
  /** Assistant turns falling back to content-length estimates. */
  estimated_messages: number;
  /** measured / (measured + estimated), in the range 0..1; 1 when there is no traffic. */
  measured_pct: number;
}
|
|
27
|
+
|
|
10
28
|
export interface DashboardSeriesPoint {
|
|
11
29
|
day: string;
|
|
12
30
|
input_tokens_est: number;
|
|
@@ -17,6 +35,11 @@ export interface DashboardSeriesPoint {
|
|
|
17
35
|
tool_errors: number;
|
|
18
36
|
success_rate: number;
|
|
19
37
|
error_rate: number;
|
|
38
|
+
/** Per-tier breakdown of authoritative snapshot input tokens for the
|
|
39
|
+
* day. Zero for legacy rows with no message_usage entry — these are
|
|
40
|
+
* surfaced via the `data_quality` chip instead so users know the bar
|
|
41
|
+
* reflects only measured traffic. */
|
|
42
|
+
tier_tokens: DashboardTierTokens;
|
|
20
43
|
}
|
|
21
44
|
|
|
22
45
|
export interface DashboardToolTop {
|
|
@@ -92,6 +115,7 @@ export interface DashboardDayBreakdown {
|
|
|
92
115
|
tool_errors: number;
|
|
93
116
|
success_rate: number;
|
|
94
117
|
error_rate: number;
|
|
118
|
+
tier_tokens: DashboardTierTokens;
|
|
95
119
|
};
|
|
96
120
|
top_agents: DashboardAgentTop[];
|
|
97
121
|
by_provider: DashboardProviderBreakdown[];
|
|
@@ -110,6 +134,8 @@ export interface DashboardMetrics {
|
|
|
110
134
|
tool_errors: number;
|
|
111
135
|
success_rate: number;
|
|
112
136
|
error_rate: number;
|
|
137
|
+
tier_tokens: DashboardTierTokens;
|
|
138
|
+
data_quality: DashboardDataQuality;
|
|
113
139
|
};
|
|
114
140
|
series: DashboardSeriesPoint[];
|
|
115
141
|
top_tools: DashboardToolTop[];
|
|
@@ -146,6 +172,17 @@ type UsageRow = {
|
|
|
146
172
|
mu_model_config_name: string | null;
|
|
147
173
|
mu_agent_id: string | null;
|
|
148
174
|
mu_agent_name: string | null;
|
|
175
|
+
mu_hot_tokens: number | null;
|
|
176
|
+
mu_warm_tokens: number | null;
|
|
177
|
+
mu_facts_tokens: number | null;
|
|
178
|
+
mu_overhead_tokens: number | null;
|
|
179
|
+
};
|
|
180
|
+
|
|
181
|
+
/** Running per-tier token sums used while aggregating usage rows. */
type TierBucket = Record<"hot" | "warm" | "facts" | "overhead", number>;
|
|
150
187
|
|
|
151
188
|
type DayBucket = {
|
|
@@ -155,6 +192,7 @@ type DayBucket = {
|
|
|
155
192
|
toolCalls: number;
|
|
156
193
|
toolSuccesses: number;
|
|
157
194
|
toolErrors: number;
|
|
195
|
+
tier: TierBucket;
|
|
158
196
|
};
|
|
159
197
|
|
|
160
198
|
type AgentBucket = {
|
|
@@ -235,7 +273,11 @@ export async function getDashboardMetrics(days = DEFAULT_WINDOW_DAYS): Promise<D
|
|
|
235
273
|
mu.model_id AS mu_model_id,
|
|
236
274
|
mu.model_config_name AS mu_model_config_name,
|
|
237
275
|
mu.agent_id AS mu_agent_id,
|
|
238
|
-
mu.agent_name AS mu_agent_name
|
|
276
|
+
mu.agent_name AS mu_agent_name,
|
|
277
|
+
mu.hot_tokens AS mu_hot_tokens,
|
|
278
|
+
mu.warm_tokens AS mu_warm_tokens,
|
|
279
|
+
mu.facts_tokens AS mu_facts_tokens,
|
|
280
|
+
mu.overhead_tokens AS mu_overhead_tokens
|
|
239
281
|
FROM messages m
|
|
240
282
|
JOIN threads t ON t.thread_id = m.thread_id
|
|
241
283
|
LEFT JOIN agent_configs a ON a.id = t.agent_id
|
|
@@ -301,6 +343,11 @@ export async function getDashboardMetrics(days = DEFAULT_WINDOW_DAYS): Promise<D
|
|
|
301
343
|
let totalCalls = 0;
|
|
302
344
|
let totalSuccesses = 0;
|
|
303
345
|
let totalErrors = 0;
|
|
346
|
+
const tierTotals: TierBucket = { hot: 0, warm: 0, facts: 0, overhead: 0 };
|
|
347
|
+
// Data-quality counters: only assistant turns are eligible since
|
|
348
|
+
// user/system rows never carry a message_usage snapshot by design.
|
|
349
|
+
let measuredAssistantMessages = 0;
|
|
350
|
+
let estimatedAssistantMessages = 0;
|
|
304
351
|
|
|
305
352
|
for (const row of usageRows) {
|
|
306
353
|
const day = row.created_at.slice(0, 10);
|
|
@@ -332,6 +379,22 @@ export async function getDashboardMetrics(days = DEFAULT_WINDOW_DAYS): Promise<D
|
|
|
332
379
|
attribModelConfig = row.mu_model_config_name ?? attribModelConfig;
|
|
333
380
|
attribAgentId = row.mu_agent_id ?? attribAgentId;
|
|
334
381
|
attribAgentName = row.mu_agent_name ?? attribAgentName;
|
|
382
|
+
if (row.role === "assistant") measuredAssistantMessages += 1;
|
|
383
|
+
// Accumulate tier breakdown — null columns (legacy snapshots
|
|
384
|
+
// before the tier wire-up) contribute zero, which is the right
|
|
385
|
+
// behaviour for a stacked bar that visualises *known* tier split.
|
|
386
|
+
const hot = row.mu_hot_tokens ?? 0;
|
|
387
|
+
const warm = row.mu_warm_tokens ?? 0;
|
|
388
|
+
const facts = row.mu_facts_tokens ?? 0;
|
|
389
|
+
const overhead = row.mu_overhead_tokens ?? 0;
|
|
390
|
+
tierTotals.hot += hot;
|
|
391
|
+
tierTotals.warm += warm;
|
|
392
|
+
tierTotals.facts += facts;
|
|
393
|
+
tierTotals.overhead += overhead;
|
|
394
|
+
dayBucket.tier.hot += hot;
|
|
395
|
+
dayBucket.tier.warm += warm;
|
|
396
|
+
dayBucket.tier.facts += facts;
|
|
397
|
+
dayBucket.tier.overhead += overhead;
|
|
335
398
|
} else if (row.role === "user" && threadHasSnapshot) {
|
|
336
399
|
// Suppressed: snapshotted assistant turns in this thread already
|
|
337
400
|
// capture this user message's tokens in their input_tokens.
|
|
@@ -343,6 +406,7 @@ export async function getDashboardMetrics(days = DEFAULT_WINDOW_DAYS): Promise<D
|
|
|
343
406
|
outputTokens = isInput ? 0 : tokenEstimate;
|
|
344
407
|
const rates = modelRatesFor(byProvider, byProviderModel, byModel, row.provider, row.model_id);
|
|
345
408
|
estCost = estimateCostUsd(inputTokens, outputTokens, rates);
|
|
409
|
+
if (row.role === "assistant") estimatedAssistantMessages += 1;
|
|
346
410
|
}
|
|
347
411
|
|
|
348
412
|
dayBucket.inputTokens += inputTokens;
|
|
@@ -469,6 +533,7 @@ export async function getDashboardMetrics(days = DEFAULT_WINDOW_DAYS): Promise<D
|
|
|
469
533
|
tool_errors: b.toolErrors,
|
|
470
534
|
success_rate: round4(successRate),
|
|
471
535
|
error_rate: round4(errorRate),
|
|
536
|
+
tier_tokens: tierBucketToTokens(b.tier),
|
|
472
537
|
} satisfies DashboardSeriesPoint;
|
|
473
538
|
});
|
|
474
539
|
|
|
@@ -561,6 +626,7 @@ export async function getDashboardMetrics(days = DEFAULT_WINDOW_DAYS): Promise<D
|
|
|
561
626
|
tool_errors: dayPoint?.tool_errors ?? 0,
|
|
562
627
|
success_rate: dayPoint?.success_rate ?? 1,
|
|
563
628
|
error_rate: dayPoint?.error_rate ?? 0,
|
|
629
|
+
tier_tokens: dayPoint?.tier_tokens ?? emptyTierTokens(),
|
|
564
630
|
},
|
|
565
631
|
top_agents: dayAgents,
|
|
566
632
|
by_provider: dayProviders,
|
|
@@ -580,6 +646,8 @@ export async function getDashboardMetrics(days = DEFAULT_WINDOW_DAYS): Promise<D
|
|
|
580
646
|
tool_errors: totalErrors,
|
|
581
647
|
success_rate: round4(overallSuccessRate),
|
|
582
648
|
error_rate: round4(overallErrorRate),
|
|
649
|
+
tier_tokens: tierBucketToTokens(tierTotals),
|
|
650
|
+
data_quality: computeDataQuality(measuredAssistantMessages, estimatedAssistantMessages),
|
|
583
651
|
},
|
|
584
652
|
series,
|
|
585
653
|
top_tools,
|
|
@@ -641,11 +709,35 @@ function seedDayBuckets(now: Date, days: number): Map<string, DayBucket> {
|
|
|
641
709
|
toolCalls: 0,
|
|
642
710
|
toolSuccesses: 0,
|
|
643
711
|
toolErrors: 0,
|
|
712
|
+
tier: { hot: 0, warm: 0, facts: 0, overhead: 0 },
|
|
644
713
|
});
|
|
645
714
|
}
|
|
646
715
|
return out;
|
|
647
716
|
}
|
|
648
717
|
|
|
718
|
+
/** Builds a fresh all-zero tier breakdown. */
function emptyTierTokens(): DashboardTierTokens {
  const zeroed: DashboardTierTokens = {
    hot_tokens: 0,
    warm_tokens: 0,
    facts_tokens: 0,
    overhead_tokens: 0,
    measured_input_tokens: 0,
  };
  return zeroed;
}
|
|
721
|
+
|
|
722
|
+
/**
 * Converts an internal tier accumulator into its response shape.
 *
 * @param b Per-tier running sums.
 * @returns The four tier values under their `*_tokens` names, plus their
 *   total as `measured_input_tokens`.
 */
function tierBucketToTokens(b: TierBucket): DashboardTierTokens {
  const { hot, warm, facts, overhead } = b;
  const total = hot + warm + facts + overhead;
  return {
    hot_tokens: hot,
    warm_tokens: warm,
    facts_tokens: facts,
    overhead_tokens: overhead,
    measured_input_tokens: total,
  };
}
|
|
731
|
+
|
|
732
|
+
/**
 * Summarises how many assistant turns were measured versus estimated.
 *
 * @param measured Count of turns with a usage snapshot.
 * @param estimated Count of turns that fell back to an estimate.
 * @returns Both counts plus `measured_pct`, the measured share rounded via
 *   `round4`. When both counts are zero the share is reported as 1.
 */
export function computeDataQuality(measured: number, estimated: number): DashboardDataQuality {
  const eligible = measured + estimated;
  let measuredShare = 1;
  if (eligible !== 0) {
    measuredShare = round4(measured / eligible);
  }
  return {
    measured_messages: measured,
    estimated_messages: estimated,
    measured_pct: measuredShare,
  };
}
|
|
740
|
+
|
|
649
741
|
function estimateTokens(text: string): number {
|
|
650
742
|
const trimmed = text.trim();
|
|
651
743
|
if (!trimmed) return 0;
|
package/lib/tools/exec.ts
CHANGED
|
@@ -3,6 +3,7 @@ import { tool } from "@langchain/core/tools";
|
|
|
3
3
|
import { z } from "zod";
|
|
4
4
|
import { registerTools } from "./registry";
|
|
5
5
|
import { getInjectedSubprocessEnv } from "@/lib/env/allowlist";
|
|
6
|
+
import { checkExecAllowed, resolveSafetyMode } from "./safety";
|
|
6
7
|
|
|
7
8
|
const MAX_OUTPUT_BYTES = 8_000;
|
|
8
9
|
const DEFAULT_TIMEOUT_MS = 10_000;
|
|
@@ -40,11 +41,14 @@ function runLocalCommand(
|
|
|
40
41
|
|
|
41
42
|
const timeout = Math.min(options.timeout_ms ?? DEFAULT_TIMEOUT_MS, MAX_TIMEOUT_MS);
|
|
42
43
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
44
|
+
const mode = resolveSafetyMode();
|
|
45
|
+
const gate = checkExecAllowed(command, {
|
|
46
|
+
mode,
|
|
47
|
+
allowUnsafe: options.allow_unsafe,
|
|
48
|
+
blockedByPattern: isBlockedCommand(command),
|
|
49
|
+
});
|
|
50
|
+
if (!gate.allowed) {
|
|
51
|
+
return JSON.stringify({ exit_code: 126, stderr: gate.reason, safety_mode: mode });
|
|
48
52
|
}
|
|
49
53
|
|
|
50
54
|
const cwd = options.cwd?.trim() ? options.cwd : process.cwd();
|
package/lib/tools/files.ts
CHANGED
|
@@ -4,6 +4,7 @@ import path from "node:path";
|
|
|
4
4
|
import { tool } from "@langchain/core/tools";
|
|
5
5
|
import { z } from "zod";
|
|
6
6
|
import { registerTools } from "./registry";
|
|
7
|
+
import { checkFsAllowed, resolveSafetyMode } from "./safety";
|
|
7
8
|
|
|
8
9
|
// Dedicated file tools. Agents previously had to drive every edit through
|
|
9
10
|
// `local_exec` / `shell_exec`, which works for "create a new file with this
|
|
@@ -90,6 +91,11 @@ function jarelaDataDir(): string {
|
|
|
90
91
|
}
|
|
91
92
|
|
|
92
93
|
function assertSafePath(abs: string, op: "read" | "write"): void {
|
|
94
|
+
const mode = resolveSafetyMode();
|
|
95
|
+
const gate = checkFsAllowed(op, { mode });
|
|
96
|
+
if (!gate.allowed) throw new Error(gate.reason);
|
|
97
|
+
// bypass mode disables every guard, including the credential denylist.
|
|
98
|
+
if (mode === "bypass") return;
|
|
93
99
|
if (process.env.JARELA_ALLOW_SENSITIVE_FILES === "1") return;
|
|
94
100
|
for (const base of sensitiveBase()) {
|
|
95
101
|
if (isInside(abs, base)) {
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach, afterEach } from "vitest";
|
|
2
|
+
import { checkExecAllowed, checkFsAllowed, resolveSafetyMode } from "./safety";
|
|
3
|
+
|
|
4
|
+
// Value of JARELA_TOOL_SAFETY when the suite was loaded; restored after
// every test so cases that mutate the variable cannot leak into each other.
const ORIGINAL = process.env.JARELA_TOOL_SAFETY;

afterEach(() => {
  if (ORIGINAL !== undefined) {
    process.env.JARELA_TOOL_SAFETY = ORIGINAL;
    return;
  }
  delete process.env.JARELA_TOOL_SAFETY;
});
|
|
10
|
+
|
|
11
|
+
// resolveSafetyMode maps JARELA_TOOL_SAFETY onto one of the three tiers.
describe("resolveSafetyMode", () => {
  it("defaults to mostly_safe", () => {
    delete process.env.JARELA_TOOL_SAFETY;
    expect(resolveSafetyMode()).toBe("mostly_safe");
  });
  it("accepts safe / mostly_safe / bypass", () => {
    // [env value, expected mode], checked in order.
    const cases: Array<[string, string]> = [
      ["safe", "safe"],
      ["BYPASS", "bypass"],
      ["unsafe", "bypass"],
      ["garbage", "mostly_safe"],
    ];
    for (const [value, expected] of cases) {
      process.env.JARELA_TOOL_SAFETY = value;
      expect(resolveSafetyMode()).toBe(expected);
    }
  });
});
|
|
27
|
+
|
|
28
|
+
// In bypass mode even a pattern-blocked command is let through.
describe("checkExecAllowed - bypass", () => {
  it("allows anything", () => {
    const verdict = checkExecAllowed("rm -rf /", { mode: "bypass", blockedByPattern: true });
    expect(verdict.allowed).toBe(true);
  });
});
|
|
33
|
+
|
|
34
|
+
// mostly_safe: the pattern list blocks unless the call passes allow_unsafe.
describe("checkExecAllowed - mostly_safe", () => {
  it("blocks dangerous pattern without allow_unsafe", () => {
    const verdict = checkExecAllowed("rm -rf /", { mode: "mostly_safe", blockedByPattern: true });
    expect(verdict.allowed).toBe(false);
  });
  it("permits dangerous pattern with allow_unsafe", () => {
    const verdict = checkExecAllowed("rm -rf /", {
      mode: "mostly_safe",
      blockedByPattern: true,
      allowUnsafe: true,
    });
    expect(verdict.allowed).toBe(true);
  });
  it("permits normal commands", () => {
    const verdict = checkExecAllowed("ls -la", { mode: "mostly_safe", blockedByPattern: false });
    expect(verdict.allowed).toBe(true);
  });
});
|
|
47
|
+
|
|
48
|
+
// safe: only allowlisted inspection commands pass; composition, code
// runners and the per-call allow_unsafe flag are all refused.
describe("checkExecAllowed - safe", () => {
  const opts = { mode: "safe" as const, blockedByPattern: false };
  const isAllowed = (command: string): boolean => checkExecAllowed(command, opts).allowed;

  it("allows ls", () => {
    expect(isAllowed("ls -la")).toBe(true);
  });
  it("allows git status", () => {
    expect(isAllowed("git status")).toBe(true);
  });
  it("blocks git push", () => {
    expect(isAllowed("git push origin main")).toBe(false);
  });
  it("blocks unknown commands", () => {
    expect(isAllowed("rm file")).toBe(false);
  });
  it("blocks pipelines and composition", () => {
    const composed = ["ls | grep foo", "ls && pwd", "ls; pwd", "ls > out.txt", "echo $(whoami)"];
    for (const command of composed) {
      expect(isAllowed(command)).toBe(false);
    }
  });
  it("blocks tools that execute arbitrary code", () => {
    const runners = ["node -e 'process.exit()'", "python -c 'print(1)'", "npx some-pkg"];
    for (const command of runners) {
      expect(isAllowed(command)).toBe(false);
    }
  });
  it("ignores allow_unsafe", () => {
    const verdict = checkExecAllowed("rm -rf /", {
      mode: "safe",
      blockedByPattern: true,
      allowUnsafe: true,
    });
    expect(verdict.allowed).toBe(false);
  });
  it("rejects empty command", () => {
    expect(isAllowed(" ")).toBe(false);
  });
});
|
|
83
|
+
|
|
84
|
+
// Filesystem gate: only "safe" mode refuses anything, and only writes.
describe("checkFsAllowed", () => {
  it("bypass + mostly_safe always permit", () => {
    for (const op of ["write", "read"] as const) {
      for (const mode of ["bypass", "mostly_safe"] as const) {
        expect(checkFsAllowed(op, { mode }).allowed).toBe(true);
      }
    }
  });
  it("safe permits reads, blocks writes", () => {
    const readVerdict = checkFsAllowed("read", { mode: "safe" });
    expect(readVerdict.allowed).toBe(true);
    const writeVerdict = checkFsAllowed("write", { mode: "safe" });
    expect(writeVerdict.allowed).toBe(false);
  });
});
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
// Safety mode for destructive built-in tools (exec + filesystem writes).
|
|
2
|
+
//
|
|
3
|
+
// Resolved once per call from `JARELA_TOOL_SAFETY`. Three tiers:
|
|
4
|
+
//
|
|
5
|
+
// "safe" — read-only. Exec accepts only an allowlisted set of
|
|
6
|
+
// inspection commands (ls, git status, …); filesystem
|
|
7
|
+
// tools refuse every write, edit, move, copy, delete,
|
|
8
|
+
// or mkdir. Per-call `allow_unsafe` is IGNORED.
|
|
9
|
+
// "mostly_safe" — default. Exec blocks the obviously-dangerous pattern
|
|
10
|
+
// list (rm -rf /, shutdown, fork bomb, …); filesystem
|
|
11
|
+
// tools refuse credential paths and the Jarela data dir.
|
|
12
|
+
// Per-call `allow_unsafe=true` lifts the exec block for
|
|
13
|
+
// that single call.
|
|
14
|
+
// "bypass" — every guard off. For local development on a machine
|
|
15
|
+
// you control and trust completely. NOT for use behind
|
|
16
|
+
// a tunnel or with untrusted prompt sources.
|
|
17
|
+
//
|
|
18
|
+
// The mode itself is process-wide, so prompt injection cannot switch tiers
// by passing arguments. The only per-call lever is `allow_unsafe`: under
// "mostly_safe" passing `allow_unsafe=true` lifts the pattern block for
// that single command, while "safe" ignores the flag entirely.
|
|
21
|
+
|
|
22
|
+
/** The three tool-safety tiers selectable through `JARELA_TOOL_SAFETY`. */
export type SafetyMode =
  | "safe"
  | "mostly_safe"
  | "bypass";
|
|
23
|
+
|
|
24
|
+
/**
 * Resolves the active safety tier from `JARELA_TOOL_SAFETY`.
 *
 * The value is trimmed and compared case-insensitively: "safe" selects
 * "safe"; "bypass" or "unsafe" selects "bypass"; anything else, including
 * an unset variable, selects "mostly_safe".
 */
export function resolveSafetyMode(): SafetyMode {
  const setting = process.env.JARELA_TOOL_SAFETY ?? "";
  switch (setting.trim().toLowerCase()) {
    case "safe":
      return "safe";
    case "bypass":
    case "unsafe":
      return "bypass";
    default:
      return "mostly_safe";
  }
}
|
|
30
|
+
|
|
31
|
+
// Inspection-only commands allowed in `safe` mode. Matched as the FIRST
|
|
32
|
+
// token (after stripping leading whitespace) — pipelines, redirections,
|
|
33
|
+
// command substitution, &&, ;, etc. are all rejected because we cannot
|
|
34
|
+
// reason about what the right-hand side will do.
|
|
35
|
+
// First-token allowlist for `safe` mode.
//
// `env` is deliberately NOT listed: `env <cmd> [args]` executes an arbitrary
// program, and since it takes no verb it cannot be constrained through
// SAFE_SUBCOMMANDS. `printenv` covers read-only inspection of the
// environment.
const SAFE_EXEC_ALLOWLIST = new Set([
  "ls", "dir", "pwd", "cd", "echo", "cat", "type", "head", "tail",
  "wc", "stat", "file", "which", "where", "whoami", "hostname",
  "date", "uname", "df", "du", "ps", "printenv",
  "git", "node", "npm", "npx", "deno", "python", "python3", "pip", "pip3",
]);
|
|
41
|
+
|
|
42
|
+
// Per-tool verbs that `safe` mode treats as read-only. Only tools that take
// a verb and can do damage are keyed here; a command with no entry is
// treated as read-only as a whole (e.g. `cat`, `ls`). An empty set means the
// tool is refused entirely.
const SAFE_SUBCOMMANDS: Record<string, Set<string>> = {
  git: new Set<string>([
    "status",
    "log",
    "diff",
    "show",
    "blame",
    "branch",
    "tag",
    "remote",
    "ls-files",
    "ls-tree",
    "config",
    "rev-parse",
    "describe",
    "shortlog",
    "reflog",
  ]),
  npm: new Set<string>([
    "ls",
    "list",
    "view",
    "info",
    "outdated",
    "config",
    "whoami",
    "ping",
    "doctor",
  ]),
  // npx runs arbitrary code; never allow under "safe".
  npx: new Set<string>(),
  // Bare `node` opens a REPL; `node script.js` runs anything.
  node: new Set<string>(),
  python: new Set<string>(),
  python3: new Set<string>(),
  deno: new Set<string>(["info", "doc"]),
  pip: new Set<string>(["list", "show", "freeze", "config"]),
  pip3: new Set<string>(["list", "show", "freeze", "config"]),
};
|
|
60
|
+
|
|
61
|
+
// Shell metacharacters that compose commands or redirect IO. Their
// presence in `safe` mode is grounds for rejection because the
// allowlist check only inspects the first token.
//
// Line breaks are included: a shell treats a newline as a command separator
// just like `;`, so "ls\nrm -rf x" would otherwise look like a single `ls`
// invocation once the command is split on whitespace. The single-character
// class already covers `$(`, `||` and `&&`, so no multi-character
// alternatives are needed.
const COMPOSER_RE = /[|&;`$<>\r\n]/;
|
|
65
|
+
|
|
66
|
+
/** Outcome of a safety-policy check. */
export interface ExecAllowResult {
  /** `true` when the operation may proceed. */
  allowed: boolean;
  /** Human-readable explanation; the checks in this file set it only on refusal. */
  reason?: string;
}
|
|
70
|
+
|
|
71
|
+
// Arguments that may directly follow a `config` verb in `safe` mode, keyed by
// tool. Every entry selects a form that only reads configuration. Anything
// else (e.g. `git config <name> <value>`, `npm config set`, `pip config edit`)
// can write, and a written git setting such as `core.fsmonitor` or
// `diff.external` is later executed by otherwise read-only git commands.
const READ_ONLY_CONFIG_ACTIONS: Record<string, ReadonlySet<string>> = {
  git: new Set([
    "--get", "--get-all", "--get-regexp", "--get-urlmatch", "--list", "-l", "get", "list",
  ]),
  npm: new Set(["get", "list", "ls"]),
  pip: new Set(["debug", "get", "list"]),
  pip3: new Set(["debug", "get", "list"]),
};

// `--output` / `--output=<file>` makes git's diff machinery write to a file.
const GIT_OUTPUT_FLAG_RE = /^--output(=|$)/;

/**
 * Look for arguments that turn an allowlisted command into one that executes
 * other programs or writes state.
 *
 * @param head - Lower-cased first token of the command.
 * @param verb - Normalized subcommand, or `undefined` when `head` takes none.
 * @param tokens - The whole command split on whitespace (`tokens[0]` is the command).
 * @returns A refusal reason, or `undefined` when nothing objectionable is found.
 */
function findSafeModeArgViolation(
  head: string,
  verb: string | undefined,
  tokens: readonly string[],
): string | undefined {
  // `env [NAME=VALUE]... COMMAND` runs COMMAND, which would sidestep the
  // first-token allowlist entirely; only the bare, print-everything form is safe.
  if (head === "env" && tokens.length > 1) {
    return (
      "safe mode allows 'env' only without arguments because `env <command>` runs that command. " +
      "Use printenv to read a single variable, or mostly_safe/bypass to run the command."
    );
  }
  if (head === "git" && tokens.slice(1).some((arg) => GIT_OUTPUT_FLAG_RE.test(arg))) {
    return "safe mode refuses git's --output option because it writes to a file. Use mostly_safe or bypass.";
  }
  if (verb === "config") {
    const action = tokens[2];
    const readActions = READ_ONLY_CONFIG_ACTIONS[head];
    // A bare `<tool> config` is left alone; any argument must be a known read form.
    if (action !== undefined && !(readActions?.has(action) ?? false)) {
      return (
        `safe mode allows '${head} config' only in read-only forms ` +
        `(first argument one of: ${[...(readActions ?? [])].sort().join(", ")}). ` +
        "Use mostly_safe or bypass to change configuration."
      );
    }
  }
  // NOTE(review): other allowlisted verbs can still mutate repository metadata
  // (e.g. `git branch -D`, `git tag -d`, `git remote add`, `git reflog expire`).
  // They are not filtered here — confirm whether `safe` mode should refuse them too.
  return undefined;
}

/**
 * Decide whether a shell command may run under the given safety mode.
 *
 * - `bypass`: always allowed.
 * - `mostly_safe`: refused only when `opts.blockedByPattern` is set and the
 *   call did not pass `opts.allowUnsafe`.
 * - `safe` (and any other value): the command must be non-empty, contain no
 *   shell composition characters (`COMPOSER_RE`), start with an entry of
 *   `SAFE_EXEC_ALLOWLIST`, use only a verb listed in `SAFE_SUBCOMMANDS` for
 *   tools that have an entry there, and carry no argument that executes
 *   another program or writes state (`env <command>`, config writes,
 *   git `--output`).
 *
 * @param command - The raw command line as it would be handed to the shell.
 * @param opts - `mode` is the active safety mode; `allowUnsafe` is the per-call
 *   opt-out, honoured only under `mostly_safe`; `blockedByPattern` is the
 *   caller's verdict on whether the command was flagged as dangerous.
 * @returns `{ allowed: true }`, or `{ allowed: false, reason }` describing the refusal.
 */
export function checkExecAllowed(
  command: string,
  opts: { mode: SafetyMode; allowUnsafe?: boolean; blockedByPattern: boolean },
): ExecAllowResult {
  if (opts.mode === "bypass") return { allowed: true };
  if (opts.mode === "mostly_safe") {
    if (opts.blockedByPattern && !opts.allowUnsafe) {
      return {
        allowed: false,
        reason:
          "Command blocked by safety policy (mode=mostly_safe). Pass allow_unsafe=true only when you fully trust the command.",
      };
    }
    return { allowed: true };
  }

  // safe mode
  const trimmed = command.trim();
  if (!trimmed) return { allowed: false, reason: "command is required" };
  if (COMPOSER_RE.test(trimmed)) {
    return {
      allowed: false,
      reason:
        "safe mode rejects pipelines, redirection, command substitution, &&, and ;. " +
        "Set JARELA_TOOL_SAFETY=mostly_safe (or bypass) to allow composite commands.",
    };
  }

  const tokens = trimmed.split(/\s+/);
  const head = tokens[0]?.toLowerCase();
  if (!head || !SAFE_EXEC_ALLOWLIST.has(head)) {
    return {
      allowed: false,
      reason:
        `safe mode allows only inspection commands (${[...SAFE_EXEC_ALLOWLIST].sort().join(", ")}). ` +
        "Set JARELA_TOOL_SAFETY=mostly_safe to enable the broader policy.",
    };
  }

  const subAllowlist = SAFE_SUBCOMMANDS[head];
  // Leading dashes are stripped, so `--status` is compared as `status`.
  const sub = subAllowlist ? tokens[1]?.toLowerCase().replace(/^--?/, "") : undefined;
  if (subAllowlist) {
    // An empty verb list marks a tool that runs arbitrary code however it is
    // invoked (node/python/npx), so even the bare command is refused.
    if (subAllowlist.size === 0) {
      return {
        allowed: false,
        reason: `safe mode refuses '${head}' because it can execute arbitrary code. Use mostly_safe or bypass.`,
      };
    }
    // Bare invocations of the remaining tools (e.g. `git` alone prints help)
    // are allowed; a verb, when present, must be on the tool's list.
    if (sub && !subAllowlist.has(sub)) {
      return {
        allowed: false,
        reason:
          `safe mode allows '${head}' only for: ${[...subAllowlist].sort().join(", ")}. ` +
          "Use mostly_safe or bypass for other subcommands.",
      };
    }
  }

  // The first token and verb can be inspection-only while a later argument
  // still executes something or writes state; catch the known cases.
  const violation = findSafeModeArgViolation(head, sub, tokens);
  if (violation !== undefined) return { allowed: false, reason: violation };

  return { allowed: true };
}
|
|
130
|
+
|
|
131
|
+
/**
 * Classification of a filesystem operation for policy purposes: `"read"` for
 * operations that only inspect, `"write"` for anything that mutates.
 */
export type FsOp =
  | "read"
  | "write";
|
|
133
|
+
|
|
134
|
+
/**
 * Decide whether a filesystem operation may proceed under the given mode.
 *
 * `bypass` and `mostly_safe` permit every operation. Any other mode (i.e.
 * `safe`) permits reads and refuses everything else.
 *
 * @param op - Whether the operation reads or mutates.
 * @param opts - Carries the active safety mode.
 * @returns `{ allowed: true }`, or `{ allowed: false, reason }` for a refused mutation.
 */
export function checkFsAllowed(
  op: FsOp,
  opts: { mode: SafetyMode },
): ExecAllowResult {
  const { mode } = opts;
  const writesPermitted = mode === "bypass" || mode === "mostly_safe";
  if (writesPermitted || op === "read") {
    return { allowed: true };
  }

  // Only a mutation under safe mode reaches this point.
  const reason =
    "safe mode refuses filesystem mutations (write/edit/move/copy/delete/mkdir). " +
    "Set JARELA_TOOL_SAFETY=mostly_safe to enable writes outside credential dirs.";
  return { allowed: false, reason };
}
|
package/package.json
CHANGED