mcp-coordinator 0.2.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +846 -846
- package/dashboard/Dockerfile +19 -19
- package/dashboard/public/index.html +1178 -1178
- package/dist/cli/dashboard.js +9 -5
- package/dist/cli/server/backup.d.ts +7 -0
- package/dist/cli/server/backup.js +162 -0
- package/dist/cli/server/index.js +5 -0
- package/dist/cli/server/restore.d.ts +2 -0
- package/dist/cli/server/restore.js +117 -0
- package/dist/cli/server/start.js +24 -1
- package/dist/cli/server/status.js +16 -23
- package/dist/src/agent-activity.js +6 -6
- package/dist/src/agent-registry.js +6 -6
- package/dist/src/announce-workflow.d.ts +52 -0
- package/dist/src/announce-workflow.js +91 -0
- package/dist/src/consultation.d.ts +22 -0
- package/dist/src/consultation.js +118 -45
- package/dist/src/database.js +126 -126
- package/dist/src/db-adapter.d.ts +30 -0
- package/dist/src/db-adapter.js +32 -1
- package/dist/src/dependency-map.js +5 -5
- package/dist/src/file-tracker.d.ts +10 -0
- package/dist/src/file-tracker.js +40 -8
- package/dist/src/http/handle-health.d.ts +23 -0
- package/dist/src/http/handle-health.js +86 -0
- package/dist/src/http/handle-rest.d.ts +23 -0
- package/dist/src/http/handle-rest.js +374 -0
- package/dist/src/http/utils.d.ts +15 -0
- package/dist/src/http/utils.js +39 -0
- package/dist/src/impact-scorer.js +87 -50
- package/dist/src/introspection.js +1 -1
- package/dist/src/metrics.d.ts +83 -0
- package/dist/src/metrics.js +162 -0
- package/dist/src/mqtt-bridge.d.ts +21 -0
- package/dist/src/mqtt-bridge.js +55 -5
- package/dist/src/mqtt-broker.d.ts +16 -0
- package/dist/src/mqtt-broker.js +16 -1
- package/dist/src/path-guard.d.ts +14 -0
- package/dist/src/path-guard.js +44 -0
- package/dist/src/reset-guard.d.ts +16 -0
- package/dist/src/reset-guard.js +24 -0
- package/dist/src/serve-http.d.ts +31 -1
- package/dist/src/serve-http.js +189 -446
- package/dist/src/server-setup.d.ts +2 -0
- package/dist/src/server-setup.js +25 -366
- package/dist/src/sse-emitter.d.ts +6 -0
- package/dist/src/sse-emitter.js +50 -2
- package/dist/src/tools/agents-tools.d.ts +8 -0
- package/dist/src/tools/agents-tools.js +46 -0
- package/dist/src/tools/consultation-tools.d.ts +21 -0
- package/dist/src/tools/consultation-tools.js +170 -0
- package/dist/src/tools/dependencies-tools.d.ts +8 -0
- package/dist/src/tools/dependencies-tools.js +27 -0
- package/dist/src/tools/files-tools.d.ts +8 -0
- package/dist/src/tools/files-tools.js +28 -0
- package/dist/src/tools/mqtt-tools.d.ts +9 -0
- package/dist/src/tools/mqtt-tools.js +33 -0
- package/dist/src/tools/status-tools.d.ts +8 -0
- package/dist/src/tools/status-tools.js +63 -0
- package/package.json +83 -80
|
@@ -1,3 +1,12 @@
|
|
|
1
|
+
// Layer 0 (announced-intent) recency window. Resolved threads older than this
|
|
2
|
+
// are excluded — yesterday's resolved work shouldn't trigger today's scoring.
|
|
3
|
+
// Aligned with file-tracker's default conflict window per the audit guidance.
|
|
4
|
+
const LAYER_0_WINDOW_MINUTES = 30;
|
|
5
|
+
// Layer 1 / 2 (file-activity) recency window. Preserved at 60 minutes to keep
|
|
6
|
+
// strict behavioral parity with the original scorer (the prior implementation
|
|
7
|
+
// hard-coded 60 in the checkFileConflict calls). Performance optimizations
|
|
8
|
+
// must not change scoring outcomes for existing callers.
|
|
9
|
+
const FILE_ACTIVITY_WINDOW_MINUTES = 60;
|
|
1
10
|
export class ImpactScorer {
|
|
2
11
|
registry;
|
|
3
12
|
fileTracker;
|
|
@@ -11,64 +20,92 @@ export class ImpactScorer {
|
|
|
11
20
|
const onlineAgents = this.registry
|
|
12
21
|
.listOnline()
|
|
13
22
|
.filter((a) => a.id !== params.agent_id);
|
|
23
|
+
if (onlineAgents.length === 0)
|
|
24
|
+
return [];
|
|
25
|
+
// O1: cache parsed agent.modules JSON ONCE per scoring call.
|
|
26
|
+
// Previously each agent's modules were JSON.parse'd inside the hot path
|
|
27
|
+
// (Layer 3), which is O(A) parses for A agents. With Layer 0 also reading
|
|
28
|
+
// thread.target_files / depends_on_files per agent, the original code
|
|
29
|
+
// re-parsed agent state up to ~4·A times per call.
|
|
30
|
+
const moduleCache = new Map();
|
|
31
|
+
for (const a of onlineAgents) {
|
|
32
|
+
moduleCache.set(a.id, JSON.parse(a.modules));
|
|
33
|
+
}
|
|
34
|
+
// O3: pre-compute file → set<agent_id> for every file we'll inspect.
|
|
35
|
+
// Replaces N `checkFileConflict` calls (each = 1 SQL round-trip) with a
|
|
36
|
+
// single batched query, and turns the inner per-agent file check into
|
|
37
|
+
// an O(1) Set.has() lookup.
|
|
38
|
+
const filesToIndex = [
|
|
39
|
+
...params.target_files,
|
|
40
|
+
...(params.depends_on_files || []),
|
|
41
|
+
];
|
|
42
|
+
const fileToAgents = filesToIndex.length > 0
|
|
43
|
+
? this.fileTracker.getFileToAgentsIndex(filesToIndex, params.agent_id, FILE_ACTIVITY_WINDOW_MINUTES)
|
|
44
|
+
: new Map();
|
|
45
|
+
// O2: bound the resolved-thread query to a recency window. Without this,
|
|
46
|
+
// listThreads({status:'resolved'}) returns ALL historical resolved threads
|
|
47
|
+
// (unbounded growth). The Layer 0 filter only keeps threads where the
|
|
48
|
+
// initiator is the currently-evaluated agent, but the SQL still scanned
|
|
49
|
+
// every row before the JS filter ran. The resolved-thread query is now bounded by since_minutes at the SQL layer.
|
|
50
|
+
let activeThreadsByAgent = null;
|
|
51
|
+
if (this.consultation) {
|
|
52
|
+
const allActive = [
|
|
53
|
+
...this.consultation.listThreads({ status: "open" }),
|
|
54
|
+
...this.consultation.listThreads({ status: "resolving" }),
|
|
55
|
+
...this.consultation.listThreads({ status: "resolved", since_minutes: LAYER_0_WINDOW_MINUTES }),
|
|
56
|
+
];
|
|
57
|
+
// Group by initiator_id so the per-agent loop is O(threads-for-this-agent)
|
|
58
|
+
// rather than O(all-active-threads). Avoids an outer-product scan over
|
|
59
|
+
// (agents × threads) when both sets are large.
|
|
60
|
+
activeThreadsByAgent = new Map();
|
|
61
|
+
for (const t of allActive) {
|
|
62
|
+
const list = activeThreadsByAgent.get(t.initiator_id);
|
|
63
|
+
if (list) {
|
|
64
|
+
list.push(t);
|
|
65
|
+
}
|
|
66
|
+
else {
|
|
67
|
+
activeThreadsByAgent.set(t.initiator_id, [t]);
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
}
|
|
14
71
|
return onlineAgents.map((agent) => {
|
|
15
|
-
const agentModules =
|
|
72
|
+
const agentModules = moduleCache.get(agent.id);
|
|
16
73
|
const reasons = [];
|
|
17
74
|
let maxScore = 0;
|
|
18
75
|
// Layer 0: Announced intent overlap (checks active threads from this agent).
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
].filter((t) => t.initiator_id === agent.id);
|
|
43
|
-
for (const thread of activeThreads) {
|
|
44
|
-
const threadFiles = JSON.parse(thread.target_files || "[]");
|
|
45
|
-
const threadDeps = JSON.parse(thread.depends_on_files || "[]");
|
|
46
|
-
// 0a: My target_files ∩ their target_files → score 100
|
|
47
|
-
const fileOverlap = params.target_files.filter((f) => threadFiles.includes(f));
|
|
48
|
-
if (fileOverlap.length > 0) {
|
|
49
|
-
maxScore = Math.max(maxScore, 100);
|
|
50
|
-
reasons.push(`announced same file: ${fileOverlap.join(", ")} (thread ${thread.id.slice(0, 8)})`);
|
|
51
|
-
}
|
|
52
|
-
// 0b: My depends_on ∩ their target_files → score 80 (they modify what I depend on)
|
|
53
|
-
if (params.depends_on_files) {
|
|
54
|
-
const depOverlap = params.depends_on_files.filter((f) => threadFiles.includes(f));
|
|
55
|
-
if (depOverlap.length > 0) {
|
|
76
|
+
if (activeThreadsByAgent) {
|
|
77
|
+
const agentThreads = activeThreadsByAgent.get(agent.id);
|
|
78
|
+
if (agentThreads) {
|
|
79
|
+
for (const thread of agentThreads) {
|
|
80
|
+
const threadFiles = JSON.parse(thread.target_files || "[]");
|
|
81
|
+
const threadDeps = JSON.parse(thread.depends_on_files || "[]");
|
|
82
|
+
// 0a: My target_files ∩ their target_files → score 100
|
|
83
|
+
const fileOverlap = params.target_files.filter((f) => threadFiles.includes(f));
|
|
84
|
+
if (fileOverlap.length > 0) {
|
|
85
|
+
maxScore = Math.max(maxScore, 100);
|
|
86
|
+
reasons.push(`announced same file: ${fileOverlap.join(", ")} (thread ${thread.id.slice(0, 8)})`);
|
|
87
|
+
}
|
|
88
|
+
// 0b: My depends_on ∩ their target_files → score 80 (they modify what I depend on)
|
|
89
|
+
if (params.depends_on_files) {
|
|
90
|
+
const depOverlap = params.depends_on_files.filter((f) => threadFiles.includes(f));
|
|
91
|
+
if (depOverlap.length > 0) {
|
|
92
|
+
maxScore = Math.max(maxScore, 80);
|
|
93
|
+
reasons.push(`modifies my dependency: ${depOverlap.join(", ")} (thread ${thread.id.slice(0, 8)})`);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
// 0c: My target_files ∩ their depends_on → score 80 (I modify what they depend on)
|
|
97
|
+
const reverseDepOverlap = params.target_files.filter((f) => threadDeps.includes(f));
|
|
98
|
+
if (reverseDepOverlap.length > 0) {
|
|
56
99
|
maxScore = Math.max(maxScore, 80);
|
|
57
|
-
reasons.push(`
|
|
100
|
+
reasons.push(`they depend on my target: ${reverseDepOverlap.join(", ")} (thread ${thread.id.slice(0, 8)})`);
|
|
58
101
|
}
|
|
59
102
|
}
|
|
60
|
-
// 0c: My target_files ∩ their depends_on → score 80 (I modify what they depend on)
|
|
61
|
-
const reverseDepOverlap = params.target_files.filter((f) => threadDeps.includes(f));
|
|
62
|
-
if (reverseDepOverlap.length > 0) {
|
|
63
|
-
maxScore = Math.max(maxScore, 80);
|
|
64
|
-
reasons.push(`they depend on my target: ${reverseDepOverlap.join(", ")} (thread ${thread.id.slice(0, 8)})`);
|
|
65
|
-
}
|
|
66
103
|
}
|
|
67
104
|
}
|
|
68
|
-
// Layer 1: Same file recently modified (score 100)
|
|
105
|
+
// Layer 1: Same file recently modified (score 100) — uses pre-built index.
|
|
69
106
|
for (const targetFile of params.target_files) {
|
|
70
|
-
const
|
|
71
|
-
if (
|
|
107
|
+
const agentsForFile = fileToAgents.get(targetFile);
|
|
108
|
+
if (agentsForFile && agentsForFile.has(agent.id)) {
|
|
72
109
|
maxScore = Math.max(maxScore, 100);
|
|
73
110
|
reasons.push(`same file: ${targetFile}`);
|
|
74
111
|
}
|
|
@@ -76,8 +113,8 @@ export class ImpactScorer {
|
|
|
76
113
|
// Layer 2: Depends-on file recently modified (score 80)
|
|
77
114
|
if (params.depends_on_files) {
|
|
78
115
|
for (const depFile of params.depends_on_files) {
|
|
79
|
-
const
|
|
80
|
-
if (
|
|
116
|
+
const agentsForFile = fileToAgents.get(depFile);
|
|
117
|
+
if (agentsForFile && agentsForFile.has(agent.id)) {
|
|
81
118
|
maxScore = Math.max(maxScore, 80);
|
|
82
119
|
reasons.push(`depends on: ${depFile}`);
|
|
83
120
|
}
|
|
@@ -4,7 +4,7 @@ export class IntrospectionManager {
|
|
|
4
4
|
create(params) {
|
|
5
5
|
const db = getDb();
|
|
6
6
|
const id = randomUUID();
|
|
7
|
-
db.prepare(`INSERT INTO introspections (id, thread_id, agent_id, score, reasons)
|
|
7
|
+
db.prepare(`INSERT INTO introspections (id, thread_id, agent_id, score, reasons)
|
|
8
8
|
VALUES (?, ?, ?, ?, ?)`).run(id, params.thread_id, params.agent_id, params.score, JSON.stringify(params.reasons));
|
|
9
9
|
return this.get(id);
|
|
10
10
|
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Prometheus /metrics endpoint for mcp-coordinator (v0.4 Operability fix).
|
|
3
|
+
*
|
|
4
|
+
* Audit gap: README claims "Production-ready" but only exposed /health stub.
|
|
5
|
+
* This module wires prom-client counters/gauges keyed off the events that
|
|
6
|
+
* already flow through the coordinator (announces, resolutions, MQTT
|
|
7
|
+
* publishes, REST requests, auth rejections) plus a snapshot of the live
|
|
8
|
+
* system state (agents, threads, MQTT listeners, SSE clients).
|
|
9
|
+
*
|
|
10
|
+
* Design notes:
|
|
11
|
+
* - Uses a per-instance `Registry` (not the global default registry) so that
|
|
12
|
+
* multiple Coordinator instances in the same process (essaim's orchestrator
|
|
13
|
+
* pattern) get isolated metric counters instead of cross-contaminating.
|
|
14
|
+
* - Gauges are pulled lazily from a `services` snapshot at scrape time via
|
|
15
|
+
* `gaugeSnapshot()` — cheaper than maintaining mirror state and guarantees
|
|
16
|
+
* the value matches the DB (no drift from missed events).
|
|
17
|
+
* - SSE clients and MQTT listeners aren't exposed as public counts on those
|
|
18
|
+
* classes today. Callers update those gauges directly via `setSseClients`
|
|
19
|
+
* / `setMqttListeners` from the request lifecycle (see integration patch).
|
|
20
|
+
*/
|
|
21
|
+
import type { IncomingMessage, ServerResponse } from "http";
|
|
22
|
+
import { Registry, Counter, Gauge } from "prom-client";
|
|
23
|
+
import type { CoordinatorServices } from "./server-setup.js";
|
|
24
|
+
export type AnnounceResult = "thread_opened" | "auto_resolved";
|
|
25
|
+
export type ResolutionType = "consensus" | "timeout" | "auto_resolved" | "agent_departure" | "max_rounds" | "closed";
|
|
26
|
+
export interface MetricsOptions {
|
|
27
|
+
/**
|
|
28
|
+
* If true, also collect Node.js process metrics (CPU, memory, event-loop
|
|
29
|
+
* lag, etc.) via prom-client's collectDefaultMetrics. Default: true.
|
|
30
|
+
*/
|
|
31
|
+
collectDefault?: boolean;
|
|
32
|
+
}
|
|
33
|
+
export declare class Metrics {
|
|
34
|
+
readonly registry: Registry;
|
|
35
|
+
readonly announces: Counter<"result">;
|
|
36
|
+
readonly threadsResolved: Counter<"type">;
|
|
37
|
+
readonly mqttPublishes: Counter<string>;
|
|
38
|
+
readonly httpRequests: Counter<"route" | "status">;
|
|
39
|
+
readonly authRejected: Counter<string>;
|
|
40
|
+
readonly agentsOnline: Gauge<string>;
|
|
41
|
+
readonly threadsOpen: Gauge<string>;
|
|
42
|
+
readonly threadsResolving: Gauge<string>;
|
|
43
|
+
readonly mqttListenersActive: Gauge<string>;
|
|
44
|
+
readonly sseClientsActive: Gauge<string>;
|
|
45
|
+
constructor(opts?: MetricsOptions);
|
|
46
|
+
recordAnnounce(result: AnnounceResult): void;
|
|
47
|
+
recordThreadResolved(type: ResolutionType): void;
|
|
48
|
+
recordMqttPublish(): void;
|
|
49
|
+
recordHttpRequest(route: string, status: number): void;
|
|
50
|
+
recordAuthRejected(): void;
|
|
51
|
+
setSseClients(n: number): void;
|
|
52
|
+
incSseClients(): void;
|
|
53
|
+
decSseClients(): void;
|
|
54
|
+
setMqttListeners(n: number): void;
|
|
55
|
+
/**
|
|
56
|
+
* Snapshot the gauges that derive from durable state (agents/threads).
|
|
57
|
+
* Called at scrape time so the values are fresh without us having to mirror
|
|
58
|
+
* every state transition. Safe to call repeatedly.
|
|
59
|
+
*
|
|
60
|
+
* Agents are counted via services.registry.listOnline(); thread counts are read
|
|
61
|
+
* from the DB directly because Consultation doesn't yet expose count getters — adding them would touch unrelated modules.
|
|
62
|
+
*/
|
|
63
|
+
gaugeSnapshot(services: CoordinatorServices): void;
|
|
64
|
+
/**
|
|
65
|
+
* Render the current registry as Prometheus text exposition format.
|
|
66
|
+
* Returns the body string + the content-type to set on the response.
|
|
67
|
+
*/
|
|
68
|
+
render(): Promise<{
|
|
69
|
+
body: string;
|
|
70
|
+
contentType: string;
|
|
71
|
+
}>;
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* HTTP handler for GET /metrics. Refreshes the derived gauges from a
|
|
75
|
+
* services snapshot, then writes the prom-client text exposition.
|
|
76
|
+
*
|
|
77
|
+
* Wire from serve-http.ts:
|
|
78
|
+
* if (url === "/metrics" && req.method === "GET") {
|
|
79
|
+
* await serveMetrics(req, res, services, metrics);
|
|
80
|
+
* return;
|
|
81
|
+
* }
|
|
82
|
+
*/
|
|
83
|
+
export declare function serveMetrics(_req: IncomingMessage, res: ServerResponse, services: CoordinatorServices, metrics: Metrics): Promise<void>;
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import { Registry, Counter, Gauge, collectDefaultMetrics, } from "prom-client";
|
|
2
|
+
import { getDb } from "./database.js";
|
|
3
|
+
export class Metrics {
|
|
4
|
+
registry;
|
|
5
|
+
// Counters
|
|
6
|
+
announces;
|
|
7
|
+
threadsResolved;
|
|
8
|
+
mqttPublishes;
|
|
9
|
+
httpRequests;
|
|
10
|
+
authRejected;
|
|
11
|
+
// Gauges
|
|
12
|
+
agentsOnline;
|
|
13
|
+
threadsOpen;
|
|
14
|
+
threadsResolving;
|
|
15
|
+
mqttListenersActive;
|
|
16
|
+
sseClientsActive;
|
|
17
|
+
constructor(opts = {}) {
|
|
18
|
+
this.registry = new Registry();
|
|
19
|
+
if (opts.collectDefault !== false) {
|
|
20
|
+
collectDefaultMetrics({ register: this.registry });
|
|
21
|
+
}
|
|
22
|
+
this.announces = new Counter({
|
|
23
|
+
name: "mcp_coordinator_announces_total",
|
|
24
|
+
help: "Total announce_work calls, partitioned by outcome",
|
|
25
|
+
labelNames: ["result"],
|
|
26
|
+
registers: [this.registry],
|
|
27
|
+
});
|
|
28
|
+
this.threadsResolved = new Counter({
|
|
29
|
+
name: "mcp_coordinator_threads_resolved_total",
|
|
30
|
+
help: "Total threads resolved, partitioned by resolution type",
|
|
31
|
+
labelNames: ["type"],
|
|
32
|
+
registers: [this.registry],
|
|
33
|
+
});
|
|
34
|
+
this.mqttPublishes = new Counter({
|
|
35
|
+
name: "mcp_coordinator_mqtt_publishes_total",
|
|
36
|
+
help: "Total MQTT publishes by the coordinator bridge",
|
|
37
|
+
registers: [this.registry],
|
|
38
|
+
});
|
|
39
|
+
this.httpRequests = new Counter({
|
|
40
|
+
name: "mcp_coordinator_http_requests_total",
|
|
41
|
+
help: "Total HTTP requests handled, partitioned by route + status",
|
|
42
|
+
labelNames: ["route", "status"],
|
|
43
|
+
registers: [this.registry],
|
|
44
|
+
});
|
|
45
|
+
this.authRejected = new Counter({
|
|
46
|
+
name: "mcp_coordinator_auth_rejected_total",
|
|
47
|
+
help: "Total authentication rejections",
|
|
48
|
+
registers: [this.registry],
|
|
49
|
+
});
|
|
50
|
+
this.agentsOnline = new Gauge({
|
|
51
|
+
name: "mcp_coordinator_agents_online",
|
|
52
|
+
help: "Current number of agents reporting status=online",
|
|
53
|
+
registers: [this.registry],
|
|
54
|
+
});
|
|
55
|
+
this.threadsOpen = new Gauge({
|
|
56
|
+
name: "mcp_coordinator_threads_open",
|
|
57
|
+
help: "Current number of threads in status=open",
|
|
58
|
+
registers: [this.registry],
|
|
59
|
+
});
|
|
60
|
+
this.threadsResolving = new Gauge({
|
|
61
|
+
name: "mcp_coordinator_threads_resolving",
|
|
62
|
+
help: "Current number of threads in status=resolving",
|
|
63
|
+
registers: [this.registry],
|
|
64
|
+
});
|
|
65
|
+
this.mqttListenersActive = new Gauge({
|
|
66
|
+
name: "mcp_coordinator_mqtt_listeners_active",
|
|
67
|
+
help: "Current number of registered MQTT consultation listeners",
|
|
68
|
+
registers: [this.registry],
|
|
69
|
+
});
|
|
70
|
+
this.sseClientsActive = new Gauge({
|
|
71
|
+
name: "mcp_coordinator_sse_clients_active",
|
|
72
|
+
help: "Current number of connected SSE clients",
|
|
73
|
+
registers: [this.registry],
|
|
74
|
+
});
|
|
75
|
+
}
|
|
76
|
+
// ── Counter helpers (named methods make hook points obvious) ──
|
|
77
|
+
recordAnnounce(result) {
|
|
78
|
+
this.announces.inc({ result }, 1);
|
|
79
|
+
}
|
|
80
|
+
recordThreadResolved(type) {
|
|
81
|
+
this.threadsResolved.inc({ type }, 1);
|
|
82
|
+
}
|
|
83
|
+
recordMqttPublish() {
|
|
84
|
+
this.mqttPublishes.inc(1);
|
|
85
|
+
}
|
|
86
|
+
recordHttpRequest(route, status) {
|
|
87
|
+
this.httpRequests.inc({ route, status: String(status) }, 1);
|
|
88
|
+
}
|
|
89
|
+
recordAuthRejected() {
|
|
90
|
+
this.authRejected.inc(1);
|
|
91
|
+
}
|
|
92
|
+
// ── Gauge setters (called from request lifecycle, see integration patch) ──
|
|
93
|
+
setSseClients(n) {
|
|
94
|
+
this.sseClientsActive.set(n);
|
|
95
|
+
}
|
|
96
|
+
incSseClients() {
|
|
97
|
+
this.sseClientsActive.inc(1);
|
|
98
|
+
}
|
|
99
|
+
decSseClients() {
|
|
100
|
+
this.sseClientsActive.dec(1);
|
|
101
|
+
}
|
|
102
|
+
setMqttListeners(n) {
|
|
103
|
+
this.mqttListenersActive.set(n);
|
|
104
|
+
}
|
|
105
|
+
/**
|
|
106
|
+
* Snapshot the gauges that derive from durable state (agents/threads).
|
|
107
|
+
* Called at scrape time so the values are fresh without us having to mirror
|
|
108
|
+
* every state transition. Safe to call repeatedly.
|
|
109
|
+
*
|
|
110
|
+
* Agents are counted via services.registry.listOnline(); thread counts are read
|
|
111
|
+
* from the DB directly because Consultation doesn't yet expose count getters — adding them would touch unrelated modules.
|
|
112
|
+
*/
|
|
113
|
+
gaugeSnapshot(services) {
|
|
114
|
+
try {
|
|
115
|
+
this.agentsOnline.set(services.registry.listOnline().length);
|
|
116
|
+
}
|
|
117
|
+
catch {
|
|
118
|
+
// Registry not initialised yet (test bootstrap race) — leave the gauge at its last-set value (0 until the first successful read).
|
|
119
|
+
}
|
|
120
|
+
try {
|
|
121
|
+
const db = getDb();
|
|
122
|
+
const open = db
|
|
123
|
+
.prepare("SELECT COUNT(*) as c FROM threads WHERE status = 'open'")
|
|
124
|
+
.get();
|
|
125
|
+
const resolving = db
|
|
126
|
+
.prepare("SELECT COUNT(*) as c FROM threads WHERE status = 'resolving'")
|
|
127
|
+
.get();
|
|
128
|
+
this.threadsOpen.set(open.c);
|
|
129
|
+
this.threadsResolving.set(resolving.c);
|
|
130
|
+
}
|
|
131
|
+
catch {
|
|
132
|
+
// DB not initialised — leave gauges at their last-set values.
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
/**
|
|
136
|
+
* Render the current registry as Prometheus text exposition format.
|
|
137
|
+
* Returns the body string + the content-type to set on the response.
|
|
138
|
+
*/
|
|
139
|
+
async render() {
|
|
140
|
+
const body = await this.registry.metrics();
|
|
141
|
+
return { body, contentType: this.registry.contentType };
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
/**
|
|
145
|
+
* HTTP handler for GET /metrics. Refreshes the derived gauges from a
|
|
146
|
+
* services snapshot, then writes the prom-client text exposition.
|
|
147
|
+
*
|
|
148
|
+
* Wire from serve-http.ts:
|
|
149
|
+
* if (url === "/metrics" && req.method === "GET") {
|
|
150
|
+
* await serveMetrics(req, res, services, metrics);
|
|
151
|
+
* return;
|
|
152
|
+
* }
|
|
153
|
+
*/
|
|
154
|
+
export async function serveMetrics(_req, res, services, metrics) {
|
|
155
|
+
metrics.gaugeSnapshot(services);
|
|
156
|
+
const { body, contentType } = await metrics.render();
|
|
157
|
+
res.writeHead(200, {
|
|
158
|
+
"Content-Type": contentType,
|
|
159
|
+
"Cache-Control": "no-cache",
|
|
160
|
+
});
|
|
161
|
+
res.end(body);
|
|
162
|
+
}
|
|
@@ -10,14 +10,35 @@ export declare class MqttBridge {
|
|
|
10
10
|
private onOfflineHandler;
|
|
11
11
|
private listeners;
|
|
12
12
|
private log;
|
|
13
|
+
private agentId;
|
|
14
|
+
/**
|
|
15
|
+
* P1: track the last threadId we retained on `coordinator/consultations/new`.
|
|
16
|
+
* The topic is fixed (not per-thread), so retain holds only the LAST event.
|
|
17
|
+
* `clearRetainedConsultation(threadId)` only clears when it matches, so a
|
|
18
|
+
* later consultation isn't accidentally wiped by a stale resolve callback.
|
|
19
|
+
*/
|
|
20
|
+
private lastRetainedConsultationThreadId;
|
|
13
21
|
constructor(logger?: Logger);
|
|
14
22
|
connect(config: {
|
|
15
23
|
url: string;
|
|
24
|
+
username?: string;
|
|
25
|
+
password?: string;
|
|
26
|
+
agentId?: string;
|
|
16
27
|
}): Promise<void>;
|
|
17
28
|
isConnected(): boolean;
|
|
18
29
|
onOffline(handler: (agentId: string) => void): void;
|
|
19
30
|
registerAgent(agentId: string, name: string): void;
|
|
20
31
|
publishConsultation(threadId: string, agentId: string, subject: string, targetModules: string[]): void;
|
|
32
|
+
/**
|
|
33
|
+
* P1 fix: clear the retained `coordinator/consultations/new` event when the
|
|
34
|
+
* matching thread resolves. The topic is fixed (not per-thread), so retain
|
|
35
|
+
* holds only the LAST consultation — clearing here means a coordinator
|
|
36
|
+
* restart after resolution doesn't re-broadcast a stale "new" event.
|
|
37
|
+
*
|
|
38
|
+
* No-op when the supplied threadId doesn't match the currently retained one
|
|
39
|
+
* (a newer consultation has already overwritten it).
|
|
40
|
+
*/
|
|
41
|
+
clearRetainedConsultation(threadId: string): void;
|
|
21
42
|
publishMessage(threadId: string, agentId: string, type: string, content: string): void;
|
|
22
43
|
publishResolution(threadId: string, status: string, summary: string): void;
|
|
23
44
|
publishBroadcast(agentId: string, message: string): void;
|
package/dist/src/mqtt-bridge.js
CHANGED
|
@@ -6,6 +6,14 @@ export class MqttBridge {
|
|
|
6
6
|
onOfflineHandler = null;
|
|
7
7
|
listeners = new Map();
|
|
8
8
|
log;
|
|
9
|
+
agentId = "coordinator-internal";
|
|
10
|
+
/**
|
|
11
|
+
* P1: track the last threadId we retained on `coordinator/consultations/new`.
|
|
12
|
+
* The topic is fixed (not per-thread), so retain holds only the LAST event.
|
|
13
|
+
* `clearRetainedConsultation(threadId)` only clears when it matches, so a
|
|
14
|
+
* later consultation isn't accidentally wiped by a stale resolve callback.
|
|
15
|
+
*/
|
|
16
|
+
lastRetainedConsultationThreadId = null;
|
|
9
17
|
constructor(logger) {
|
|
10
18
|
this.log = logger || silentLogger;
|
|
11
19
|
}
|
|
@@ -14,9 +22,24 @@ export class MqttBridge {
|
|
|
14
22
|
const timeout = setTimeout(() => {
|
|
15
23
|
reject(new Error("MQTT connection timeout"));
|
|
16
24
|
}, 5000);
|
|
25
|
+
// P1 fix: LWT requires a stable agent identifier. Default to
|
|
26
|
+
// "coordinator-internal" which matches the auth identity used by
|
|
27
|
+
// serve-http for the embedded broker bridge.
|
|
28
|
+
this.agentId = config.agentId || "coordinator-internal";
|
|
17
29
|
this.client = mqtt.connect(config.url, {
|
|
18
|
-
clientId:
|
|
30
|
+
clientId: `${this.agentId}-${Date.now()}`,
|
|
19
31
|
clean: true,
|
|
32
|
+
username: config.username,
|
|
33
|
+
password: config.password,
|
|
34
|
+
// P1 fix: register Last Will & Testament so a crashed/disconnected
|
|
35
|
+
// bridge automatically broadcasts offline status. Without this the
|
|
36
|
+
// agent appears online indefinitely after an unexpected disconnect.
|
|
37
|
+
will: {
|
|
38
|
+
topic: `coordinator/agents/${this.agentId}/status`,
|
|
39
|
+
payload: Buffer.from(JSON.stringify({ status: "offline", reason: "lwt_unexpected" })),
|
|
40
|
+
qos: 1,
|
|
41
|
+
retain: false,
|
|
42
|
+
},
|
|
20
43
|
});
|
|
21
44
|
this.client.on("connect", () => {
|
|
22
45
|
clearTimeout(timeout);
|
|
@@ -78,17 +101,40 @@ export class MqttBridge {
|
|
|
78
101
|
publishConsultation(threadId, agentId, subject, targetModules) {
|
|
79
102
|
if (!this.client || !this.connected)
|
|
80
103
|
return;
|
|
81
|
-
|
|
104
|
+
// P1 fix: QoS 1 (at-least-once) so consultation events survive transient
|
|
105
|
+
// disconnects. retain=true so a coordinator/subscriber restart can rebuild
|
|
106
|
+
// the active state without an event-history replay.
|
|
107
|
+
this.lastRetainedConsultationThreadId = threadId;
|
|
108
|
+
this.client.publish("coordinator/consultations/new", JSON.stringify({ thread_id: threadId, agent_id: agentId, subject, target_modules: targetModules }), { qos: 1, retain: true });
|
|
109
|
+
}
|
|
110
|
+
/**
|
|
111
|
+
* P1 fix: clear the retained `coordinator/consultations/new` event when the
|
|
112
|
+
* matching thread resolves. The topic is fixed (not per-thread), so retain
|
|
113
|
+
* holds only the LAST consultation — clearing here means a coordinator
|
|
114
|
+
* restart after resolution doesn't re-broadcast a stale "new" event.
|
|
115
|
+
*
|
|
116
|
+
* No-op when the supplied threadId doesn't match the currently retained one
|
|
117
|
+
* (a newer consultation has already overwritten it).
|
|
118
|
+
*/
|
|
119
|
+
clearRetainedConsultation(threadId) {
|
|
120
|
+
if (!this.client || !this.connected)
|
|
121
|
+
return;
|
|
122
|
+
if (this.lastRetainedConsultationThreadId !== threadId)
|
|
123
|
+
return;
|
|
124
|
+
this.client.publish("coordinator/consultations/new", "", { qos: 1, retain: true });
|
|
125
|
+
this.lastRetainedConsultationThreadId = null;
|
|
82
126
|
}
|
|
83
127
|
publishMessage(threadId, agentId, type, content) {
|
|
84
128
|
if (!this.client || !this.connected)
|
|
85
129
|
return;
|
|
130
|
+
// QoS 0: high-frequency chat-style traffic, lossy-OK.
|
|
86
131
|
this.client.publish(`coordinator/consultations/${threadId}/messages`, JSON.stringify({ agent_id: agentId, type, content }));
|
|
87
132
|
}
|
|
88
133
|
publishResolution(threadId, status, summary) {
|
|
89
134
|
if (!this.client || !this.connected)
|
|
90
135
|
return;
|
|
91
|
-
|
|
136
|
+
// P1 fix: QoS 1 (at-least-once) — resolution is a state-change event.
|
|
137
|
+
this.client.publish(`coordinator/consultations/${threadId}/status`, JSON.stringify({ status, summary }), { qos: 1, retain: true });
|
|
92
138
|
}
|
|
93
139
|
publishBroadcast(agentId, message) {
|
|
94
140
|
if (!this.client || !this.connected)
|
|
@@ -103,12 +149,15 @@ export class MqttBridge {
|
|
|
103
149
|
publishTaskClaimed(threadId, claimedBy) {
|
|
104
150
|
if (!this.client || !this.connected)
|
|
105
151
|
return;
|
|
106
|
-
|
|
152
|
+
// P1 fix: QoS 1 — claim is a coordination state-change. Loss would mean
|
|
153
|
+
// multiple agents think a task is unclaimed.
|
|
154
|
+
this.client.publish(`coordinator/consultations/${threadId}/claimed`, JSON.stringify({ agent_id: claimedBy, claimed_by: claimedBy, claimed_at: new Date().toISOString() }), { qos: 1 });
|
|
107
155
|
}
|
|
108
156
|
publishTaskCompleted(threadId, completedBy, summary) {
|
|
109
157
|
if (!this.client || !this.connected)
|
|
110
158
|
return;
|
|
111
|
-
|
|
159
|
+
// P1 fix: QoS 1 — completion is a coordination state-change.
|
|
160
|
+
this.client.publish(`coordinator/consultations/${threadId}/completed`, JSON.stringify({ agent_id: completedBy, completed_by: completedBy, summary }), { qos: 1 });
|
|
112
161
|
}
|
|
113
162
|
/**
|
|
114
163
|
* Fanout a refreshed QuotaInfo to live subscribers (dashboard widget,
|
|
@@ -118,6 +167,7 @@ export class MqttBridge {
|
|
|
118
167
|
publishQuotaUpdate(info) {
|
|
119
168
|
if (!this.client || !this.connected)
|
|
120
169
|
return;
|
|
170
|
+
// QoS 0: high-frequency telemetry, lossy-OK (the next refresh overwrites).
|
|
121
171
|
this.client.publish("coordinator/quota/update", JSON.stringify(info));
|
|
122
172
|
}
|
|
123
173
|
// ── Agent listener methods (for integrated MCP tools) ──
|
|
@@ -5,11 +5,27 @@ export interface EmbeddedMqttBroker {
|
|
|
5
5
|
wsPath: string | null;
|
|
6
6
|
close: () => Promise<void>;
|
|
7
7
|
}
|
|
8
|
+
/**
|
|
9
|
+
* B3 fix: opt-in MQTT authentication. When provided, every CONNECT packet's
|
|
10
|
+
* username and password fields are passed to authenticate(). Returning false rejects the
|
|
11
|
+
* client. When omitted (default), the broker accepts anonymous connections —
|
|
12
|
+
* preserving the existing behavior so essaim and other clients without auth
|
|
13
|
+
* keep working unchanged.
|
|
14
|
+
*
|
|
15
|
+
* The internal coordinator client (MqttBridge) bypasses this by passing an
|
|
16
|
+
* internal admin token when AUTH_ENABLED is true.
|
|
17
|
+
*/
|
|
18
|
+
export type MqttAuthVerifier = (username: string | undefined, password: Buffer | undefined) => Promise<boolean>;
|
|
8
19
|
export interface EmbeddedMqttOptions {
|
|
9
20
|
tcpPort?: number;
|
|
10
21
|
httpServer?: HttpServer;
|
|
11
22
|
wsPath?: string;
|
|
12
23
|
logger: Logger;
|
|
24
|
+
/**
|
|
25
|
+
* Per-CONNECT auth verifier. Omit to allow anonymous (default — backwards
|
|
26
|
+
* compatible with essaim and any client not using auth).
|
|
27
|
+
*/
|
|
28
|
+
authenticate?: MqttAuthVerifier;
|
|
13
29
|
}
|
|
14
30
|
/**
|
|
15
31
|
* Start an embedded MQTT broker (aedes) exposed via TCP, WebSocket, or both.
|
package/dist/src/mqtt-broker.js
CHANGED
|
@@ -38,8 +38,23 @@ function wsToDuplex(ws) {
|
|
|
38
38
|
* fully ready, which causes client connect timeouts in compiled binaries.
|
|
39
39
|
*/
|
|
40
40
|
export async function startEmbeddedMqttBroker(opts) {
|
|
41
|
-
const { tcpPort, httpServer, wsPath = "/mqtt", logger } = opts;
|
|
41
|
+
const { tcpPort, httpServer, wsPath = "/mqtt", logger, authenticate } = opts;
|
|
42
42
|
const broker = await Aedes.createBroker();
|
|
43
|
+
if (authenticate) {
|
|
44
|
+
// B3 fix: when AUTH_ENABLED, every CONNECT must present a valid token.
|
|
45
|
+
broker.authenticate =
|
|
46
|
+
(client, username, password, cb) => {
|
|
47
|
+
Promise.resolve(authenticate(username, password)).then((ok) => {
|
|
48
|
+
if (!ok)
|
|
49
|
+
logger.warn({ client_id: client?.id, username }, "MQTT auth rejected");
|
|
50
|
+
cb(null, ok);
|
|
51
|
+
}, (err) => {
|
|
52
|
+
logger.warn({ client_id: client?.id, err: err.message }, "MQTT auth error");
|
|
53
|
+
cb(null, false);
|
|
54
|
+
});
|
|
55
|
+
};
|
|
56
|
+
logger.info("MQTT auth enabled (token in CONNECT password)");
|
|
57
|
+
}
|
|
43
58
|
broker.on("client", (client) => {
|
|
44
59
|
logger.debug({ client_id: client?.id }, "MQTT client connected");
|
|
45
60
|
});
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Resolve a request URL into a safe filesystem path within a known root.
|
|
3
|
+
*
|
|
4
|
+
* Defends against path traversal: a request like `/dashboard/../../etc/passwd`
|
|
5
|
+
* would otherwise escape the dashboard directory because `path.join` does not
|
|
6
|
+
* validate that the result stays under the root.
|
|
7
|
+
*
|
|
8
|
+
* Returns the resolved absolute path on success, or `null` if the path would
|
|
9
|
+
* escape the root, contains a null byte, or is otherwise invalid.
|
|
10
|
+
*
|
|
11
|
+
* `urlPath` should already have the route prefix stripped (e.g. for
|
|
12
|
+
* `/dashboard/app.js` pass `"app.js"`).
|
|
13
|
+
*/
|
|
14
|
+
export declare function safeJoinUnderRoot(root: string, urlPath: string): string | null;
|