@poolzin/pool-bot 2026.3.7 → 2026.3.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +40 -0
- package/README.md +147 -69
- package/dist/.buildstamp +1 -1
- package/dist/agents/error-classifier.js +251 -0
- package/dist/agents/skills/security.js +211 -0
- package/dist/build-info.json +3 -3
- package/dist/cli/cron-cli/register.cron-dashboard.js +339 -0
- package/dist/cli/cron-cli/register.js +2 -0
- package/dist/cli/errors.js +187 -0
- package/dist/cli/lazy-commands.example.js +113 -0
- package/dist/cli/lazy-commands.js +329 -0
- package/dist/cli/program/command-registry.js +26 -0
- package/dist/cli/program/register.maintenance.js +21 -0
- package/dist/cli/program/register.skills.js +4 -0
- package/dist/cli/program/register.subclis.js +9 -0
- package/dist/cli/swarm-cli/register.js +8 -0
- package/dist/cli/swarm-cli/register.swarm-status.js +488 -0
- package/dist/cli/telemetry-cli/register.js +10 -0
- package/dist/cli/telemetry-cli/register.telemetry-alerts.js +176 -0
- package/dist/cli/telemetry-cli/register.telemetry-metrics.js +323 -0
- package/dist/cli/telemetry-cli/register.telemetry-status.js +179 -0
- package/dist/commands/doctor-checks.js +498 -0
- package/dist/config/config.js +1 -0
- package/dist/config/secrets-integration.js +88 -0
- package/dist/context-engine/index.js +33 -0
- package/dist/context-engine/legacy.js +179 -0
- package/dist/context-engine/registry.js +86 -0
- package/dist/context-engine/summarizing.js +290 -0
- package/dist/context-engine/types.js +7 -0
- package/dist/cron/service/timer.js +18 -0
- package/dist/gateway/protocol/index.js +5 -2
- package/dist/gateway/protocol/schema/error-codes.js +1 -0
- package/dist/gateway/protocol/schema/swarm.js +80 -0
- package/dist/gateway/protocol/schema.js +1 -0
- package/dist/gateway/server-close.js +4 -0
- package/dist/gateway/server-constants.js +1 -0
- package/dist/gateway/server-cron.js +29 -0
- package/dist/gateway/server-maintenance.js +35 -2
- package/dist/gateway/server-methods/swarm.js +58 -0
- package/dist/gateway/server-methods/telemetry.js +71 -0
- package/dist/gateway/server-methods-list.js +8 -0
- package/dist/gateway/server-methods.js +9 -2
- package/dist/gateway/server.impl.js +33 -16
- package/dist/infra/abort-pattern.js +106 -0
- package/dist/infra/retry.js +96 -0
- package/dist/secrets/index.js +28 -0
- package/dist/secrets/resolver.js +185 -0
- package/dist/secrets/runtime.js +142 -0
- package/dist/secrets/types.js +11 -0
- package/dist/security/dangerous-tools.js +80 -0
- package/dist/security/types.js +12 -0
- package/dist/skills/commands.js +333 -0
- package/dist/skills/index.js +164 -0
- package/dist/skills/loader.js +282 -0
- package/dist/skills/parser.js +446 -0
- package/dist/skills/registry.js +394 -0
- package/dist/skills/security.js +312 -0
- package/dist/skills/types.js +21 -0
- package/dist/swarm/service.js +247 -0
- package/dist/telemetry/alert-engine.js +258 -0
- package/dist/telemetry/cron-instrumentation.js +49 -0
- package/dist/telemetry/gateway-instrumentation.js +80 -0
- package/dist/telemetry/instrumentation.js +66 -0
- package/dist/telemetry/service.js +345 -0
- package/dist/test-utils/index.js +219 -0
- package/dist/tui/components/assistant-message.js +6 -2
- package/dist/tui/components/hyperlink-markdown.js +32 -0
- package/dist/tui/components/searchable-select-list.js +12 -1
- package/dist/tui/components/user-message.js +6 -2
- package/dist/tui/index.js +611 -0
- package/dist/tui/theme/theme-detection.js +226 -0
- package/dist/tui/tui-command-handlers.js +20 -0
- package/dist/tui/tui-formatters.js +4 -3
- package/dist/tui/utils/ctrl-c-handler.js +67 -0
- package/dist/tui/utils/osc8-hyperlinks.js +208 -0
- package/dist/tui/utils/safe-stop.js +180 -0
- package/dist/tui/utils/session-key-utils.js +81 -0
- package/dist/tui/utils/text-sanitization.js +284 -0
- package/dist/utils/lru-cache.js +116 -0
- package/dist/utils/performance.js +199 -0
- package/dist/utils/retry.js +240 -0
- package/docs/INTEGRATION_PLAN.md +475 -0
- package/docs/INTEGRATION_SUMMARY.md +215 -0
- package/docs/MELHORIAS_IMPLEMENTADAS.md +228 -0
- package/docs/MELHORIAS_PROFISSIONAIS.md +282 -0
- package/docs/PLANO_ACAO_TUI.md +357 -0
- package/docs/PROGRESSO_TUI.md +66 -0
- package/docs/RELATORIO_FINAL.md +217 -0
- package/docs/diagnostico-shell-completion.md +265 -0
- package/docs/features/advanced-memory.md +585 -0
- package/docs/features/discord-components-v2.md +277 -0
- package/docs/features/swarm.md +100 -0
- package/docs/features/telemetry.md +284 -0
- package/docs/integrations/HEXSTRIKE_PLAN.md +796 -0
- package/docs/integrations/INTEGRATION_PLAN.md +744 -0
- package/docs/integrations/PAGE_AGENT_PLAN.md +370 -0
- package/docs/integrations/XYOPS_PLAN.md +978 -0
- package/docs/models/provider-infrastructure.md +400 -0
- package/docs/security/exec-approvals.md +294 -0
- package/docs/skills/IMPLEMENTATION_SUMMARY.md +145 -0
- package/docs/skills/SKILL.md +524 -0
- package/docs/skills.md +405 -0
- package/extensions/bluebubbles/package.json +1 -1
- package/extensions/copilot-proxy/package.json +1 -1
- package/extensions/diagnostics-otel/package.json +1 -1
- package/extensions/discord/package.json +1 -1
- package/extensions/feishu/package.json +1 -1
- package/extensions/google-antigravity-auth/package.json +1 -1
- package/extensions/google-gemini-cli-auth/package.json +1 -1
- package/extensions/googlechat/package.json +1 -1
- package/extensions/hexstrike-bridge/README.md +119 -0
- package/extensions/hexstrike-bridge/index.test.ts +247 -0
- package/extensions/hexstrike-bridge/index.ts +487 -0
- package/extensions/hexstrike-bridge/package.json +17 -0
- package/extensions/imessage/package.json +1 -1
- package/extensions/irc/package.json +1 -1
- package/extensions/line/package.json +1 -1
- package/extensions/llm-task/package.json +1 -1
- package/extensions/lobster/package.json +1 -1
- package/extensions/matrix/CHANGELOG.md +5 -0
- package/extensions/matrix/package.json +1 -1
- package/extensions/mattermost/package.json +1 -1
- package/extensions/mcp-server/index.ts +14 -0
- package/extensions/mcp-server/package.json +11 -0
- package/extensions/mcp-server/src/service.ts +540 -0
- package/extensions/memory-core/package.json +1 -1
- package/extensions/memory-lancedb/package.json +1 -1
- package/extensions/minimax-portal-auth/package.json +1 -1
- package/extensions/msteams/CHANGELOG.md +5 -0
- package/extensions/msteams/package.json +1 -1
- package/extensions/nextcloud-talk/package.json +1 -1
- package/extensions/nostr/CHANGELOG.md +5 -0
- package/extensions/nostr/package.json +1 -1
- package/extensions/open-prose/package.json +1 -1
- package/extensions/openai-codex-auth/package.json +1 -1
- package/extensions/signal/package.json +1 -1
- package/extensions/slack/package.json +1 -1
- package/extensions/telegram/package.json +1 -1
- package/extensions/tlon/package.json +1 -1
- package/extensions/twitch/CHANGELOG.md +5 -0
- package/extensions/twitch/package.json +1 -1
- package/extensions/voice-call/CHANGELOG.md +5 -0
- package/extensions/voice-call/package.json +1 -1
- package/extensions/whatsapp/package.json +1 -1
- package/extensions/zalo/CHANGELOG.md +5 -0
- package/extensions/zalo/package.json +1 -1
- package/extensions/zalouser/CHANGELOG.md +5 -0
- package/extensions/zalouser/package.json +1 -1
- package/package.json +8 -1
- package/skills/example-skill/SKILL.md +195 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { Type } from "@sinclair/typebox";
|
|
2
|
+
/**
 * Union of the literal strategy names accepted wherever a swarm `strategy`
 * field is declared in this module.
 */
export const SwarmStrategySchema = Type.Union(
    ["round_robin", "least_busy", "capability_match", "priority_queue"].map((strategyName) => Type.Literal(strategyName)),
);
|
|
8
|
+
/**
 * Shape of a single swarm member record.
 *
 * Only `currentTaskId` is optional; every other field is required.
 * NOTE(review): the numeric `joinedAt` / `lastHeartbeatAt` fields look like
 * timestamps — confirm the unit against the swarm service.
 */
export const SwarmMemberSchema = Type.Object({
    agentId: Type.String(),
    sessionKey: Type.String(),
    // Member availability: one of "idle", "working" or "offline".
    status: Type.Union(["idle", "working", "offline"].map((state) => Type.Literal(state))),
    capabilities: Type.Array(Type.String()),
    joinedAt: Type.Number(),
    lastHeartbeatAt: Type.Number(),
    completedTasks: Type.Number(),
    failedTasks: Type.Number(),
    currentTaskId: Type.Optional(Type.String()),
});
|
|
19
|
+
/**
 * Shape of a single swarm task record.
 *
 * `startedAt`, `completedAt` and `assignedTo` are optional; the remaining
 * fields are required.
 */
export const SwarmTaskSchema = Type.Object({
    id: Type.String(),
    description: Type.String(),
    // Task lifecycle: "pending", "in_progress", "completed" or "failed".
    status: Type.Union(["pending", "in_progress", "completed", "failed"].map((state) => Type.Literal(state))),
    priority: Type.Number(),
    createdAt: Type.Number(),
    startedAt: Type.Optional(Type.Number()),
    completedAt: Type.Optional(Type.Number()),
    assignedTo: Type.Optional(Type.String()),
    requiredCapabilities: Type.Array(Type.String()),
});
|
|
35
|
+
/**
 * Full state of one swarm: identity, orchestrator, strategy, lifecycle status
 * and the complete member and task lists.
 */
export const SwarmStateSchema = Type.Object({
    id: Type.String(),
    name: Type.String(),
    description: Type.Optional(Type.String()),
    createdAt: Type.Number(),
    orchestratorAgentId: Type.String(),
    members: Type.Array(SwarmMemberSchema),
    tasks: Type.Array(SwarmTaskSchema),
    strategy: SwarmStrategySchema,
    // Swarm lifecycle: "active", "paused" or "shutdown".
    status: Type.Union(["active", "paused", "shutdown"].map((state) => Type.Literal(state))),
});
|
|
46
|
+
/**
 * Parameter schema that declares no properties.
 * Presumably backs `validateSwarmListParams` for `swarm.list` — verify in the
 * protocol index.
 */
export const SwarmListParamsSchema = Type.Object(
    {},
);
|
|
47
|
+
/**
 * Parameter schema with a single optional `swarmId` string.
 * The `swarm.status` handler substitutes "swarm-default" when it is omitted.
 */
export const SwarmStatusParamsSchema = Type.Object({
    swarmId: Type.Optional(
        Type.String(),
    ),
});
|
|
50
|
+
/**
 * Parameters accepted when creating a swarm.
 *
 * Only `name` is required. `strategy` is optional because the `swarm.create`
 * handler already falls back to "capability_match" when it is absent; marking
 * it required here made that fallback unreachable and rejected requests that
 * the handler is written to accept. `orchestrator` is likewise optional (the
 * handler falls back to "main").
 */
export const SwarmCreateParamsSchema = Type.Object({
    name: Type.String(),
    description: Type.Optional(Type.String()),
    strategy: Type.Optional(SwarmStrategySchema),
    orchestrator: Type.Optional(Type.String()),
});
|
|
56
|
+
/**
 * Result schema holding a `swarms` array of summary entries.
 *
 * Unlike `SwarmStateSchema`, each entry's `members` and `tasks` are numbers
 * rather than arrays (presumably counts — confirm against the swarm service).
 */
export const SwarmListResultSchema = Type.Object({
    swarms: Type.Array(
        Type.Object({
            id: Type.String(),
            name: Type.String(),
            status: Type.Union(["active", "paused", "shutdown"].map((state) => Type.Literal(state))),
            members: Type.Number(),
            tasks: Type.Number(),
        }),
    ),
});
|
|
69
|
+
/**
 * Result schema with a single `swarm` field that is either a full
 * `SwarmStateSchema` object or `null`.
 */
export const SwarmStatusResultSchema = Type.Object({
    swarm: Type.Union([
        SwarmStateSchema,
        Type.Null(),
    ]),
});
|
|
72
|
+
/**
 * Result schema describing a swarm without its member and task lists:
 * identity, optional description, strategy, orchestrator, creation time and
 * lifecycle status.
 */
export const SwarmCreateResultSchema = Type.Object({
    id: Type.String(),
    name: Type.String(),
    description: Type.Optional(Type.String()),
    strategy: SwarmStrategySchema,
    orchestratorAgentId: Type.String(),
    createdAt: Type.Number(),
    status: Type.Union(["active", "paused", "shutdown"].map((state) => Type.Literal(state))),
});
|
|
@@ -13,5 +13,6 @@ export * from "./schema/push.js";
|
|
|
13
13
|
export * from "./schema/protocol-schemas.js";
|
|
14
14
|
export * from "./schema/sessions.js";
|
|
15
15
|
export * from "./schema/snapshot.js";
|
|
16
|
+
export * from "./schema/swarm.js";
|
|
16
17
|
export * from "./schema/types.js";
|
|
17
18
|
export * from "./schema/wizard.js";
|
|
@@ -54,6 +54,7 @@ export function createGatewayCloseHandler(params) {
|
|
|
54
54
|
clearInterval(params.tickInterval);
|
|
55
55
|
clearInterval(params.healthInterval);
|
|
56
56
|
clearInterval(params.dedupeCleanup);
|
|
57
|
+
clearInterval(params.telemetryInterval);
|
|
57
58
|
if (params.agentUnsub) {
|
|
58
59
|
try {
|
|
59
60
|
params.agentUnsub();
|
|
@@ -84,6 +85,9 @@ export function createGatewayCloseHandler(params) {
|
|
|
84
85
|
if (params.browserControl) {
|
|
85
86
|
await params.browserControl.stop().catch(() => { });
|
|
86
87
|
}
|
|
88
|
+
if (params.telemetryService) {
|
|
89
|
+
await params.telemetryService.shutdown().catch(() => { });
|
|
90
|
+
}
|
|
87
91
|
await new Promise((resolve) => params.wss.close(() => resolve()));
|
|
88
92
|
const servers = params.httpServers && params.httpServers.length > 0
|
|
89
93
|
? params.httpServers
|
|
@@ -29,5 +29,6 @@ export const getHandshakeTimeoutMs = () => {
|
|
|
29
29
|
};
|
|
30
30
|
export const TICK_INTERVAL_MS = 30_000;
|
|
31
31
|
export const HEALTH_REFRESH_INTERVAL_MS = 60_000;
|
|
32
|
+
export const TELEMETRY_METRICS_INTERVAL_MS = 30_000; // Broadcast telemetry metrics every 30s
|
|
32
33
|
export const DEDUPE_TTL_MS = 5 * 60_000;
|
|
33
34
|
export const DEDUPE_MAX = 1000;
|
|
@@ -338,6 +338,35 @@ export function buildGatewayCronService(params) {
|
|
|
338
338
|
});
|
|
339
339
|
}
|
|
340
340
|
},
|
|
341
|
+
onJobMetrics: (metrics) => {
|
|
342
|
+
// Import telemetry service dynamically to avoid circular dependencies
|
|
343
|
+
void import("../telemetry/service.js")
|
|
344
|
+
.then(({ getGlobalTelemetryService }) => {
|
|
345
|
+
const telemetry = getGlobalTelemetryService();
|
|
346
|
+
if (telemetry) {
|
|
347
|
+
telemetry.recordCounter("poolbot.cron.jobs_executed", 1, {
|
|
348
|
+
job_id: metrics.jobId,
|
|
349
|
+
job_name: metrics.jobName,
|
|
350
|
+
schedule: metrics.schedule,
|
|
351
|
+
status: metrics.success ? "success" : "failure",
|
|
352
|
+
});
|
|
353
|
+
telemetry.recordHistogram("poolbot.cron.job_duration_ms", metrics.durationMs, {
|
|
354
|
+
job_id: metrics.jobId,
|
|
355
|
+
job_name: metrics.jobName,
|
|
356
|
+
});
|
|
357
|
+
if (!metrics.success) {
|
|
358
|
+
telemetry.recordCounter("poolbot.cron.job_failures", 1, {
|
|
359
|
+
job_id: metrics.jobId,
|
|
360
|
+
job_name: metrics.jobName,
|
|
361
|
+
error_type: metrics.errorType ?? "unknown",
|
|
362
|
+
});
|
|
363
|
+
}
|
|
364
|
+
}
|
|
365
|
+
})
|
|
366
|
+
.catch(() => {
|
|
367
|
+
// Silently ignore telemetry errors
|
|
368
|
+
});
|
|
369
|
+
},
|
|
341
370
|
});
|
|
342
371
|
return { cron, storePath, cronEnabled };
|
|
343
372
|
}
|
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
import { createAlertEngine } from "../telemetry/alert-engine.js";
|
|
2
|
+
import { getGlobalTelemetryService } from "../telemetry/service.js";
|
|
1
3
|
import { abortChatRunById } from "./chat-abort.js";
|
|
2
|
-
import { DEDUPE_MAX, DEDUPE_TTL_MS, HEALTH_REFRESH_INTERVAL_MS, TICK_INTERVAL_MS, } from "./server-constants.js";
|
|
4
|
+
import { DEDUPE_MAX, DEDUPE_TTL_MS, HEALTH_REFRESH_INTERVAL_MS, TELEMETRY_METRICS_INTERVAL_MS, TICK_INTERVAL_MS, } from "./server-constants.js";
|
|
3
5
|
import { formatError } from "./server-utils.js";
|
|
4
6
|
import { setBroadcastHealthUpdate } from "./server/health-state.js";
|
|
7
|
+
import { createSubsystemLogger } from "../logging/subsystem.js";
|
|
8
|
+
const logTelemetry = createSubsystemLogger("telemetry:broadcast");
|
|
5
9
|
export function startGatewayMaintenanceTimers(params) {
|
|
6
10
|
setBroadcastHealthUpdate((snap) => {
|
|
7
11
|
params.broadcast("health", snap, {
|
|
@@ -28,6 +32,35 @@ export function startGatewayMaintenanceTimers(params) {
|
|
|
28
32
|
void params
|
|
29
33
|
.refreshGatewayHealthSnapshot({ probe: true })
|
|
30
34
|
.catch((err) => params.logHealth.error(`initial refresh failed: ${formatError(err)}`));
|
|
35
|
+
// Setup alert engine
|
|
36
|
+
const telemetry = getGlobalTelemetryService();
|
|
37
|
+
if (telemetry?.isEnabled()) {
|
|
38
|
+
const alertEngine = createAlertEngine();
|
|
39
|
+
telemetry.setAlertEngine(alertEngine);
|
|
40
|
+
logTelemetry.info(`Alert engine initialized with ${alertEngine.getRules().length} rules`);
|
|
41
|
+
}
|
|
42
|
+
// periodic telemetry metrics broadcast
|
|
43
|
+
const telemetryInterval = setInterval(() => {
|
|
44
|
+
const telemetry = getGlobalTelemetryService();
|
|
45
|
+
if (!telemetry?.isEnabled()) {
|
|
46
|
+
return;
|
|
47
|
+
}
|
|
48
|
+
const snapshot = telemetry.getSnapshot();
|
|
49
|
+
if (snapshot) {
|
|
50
|
+
const payload = { metrics: snapshot, ts: Date.now() };
|
|
51
|
+
params.broadcast("telemetry.metrics", payload, { dropIfSlow: true });
|
|
52
|
+
params.nodeSendToAllSubscribed("telemetry.metrics", payload);
|
|
53
|
+
logTelemetry.debug(`broadcasted ${snapshot.metrics.length} metrics, ${snapshot.spans.length} spans`);
|
|
54
|
+
// Evaluate alerts
|
|
55
|
+
const alerts = telemetry.evaluateAlerts(snapshot);
|
|
56
|
+
for (const alert of alerts) {
|
|
57
|
+
const alertPayload = { alert, ts: Date.now() };
|
|
58
|
+
params.broadcast("telemetry.alert", alertPayload, { dropIfSlow: false });
|
|
59
|
+
params.nodeSendToAllSubscribed("telemetry.alert", alertPayload);
|
|
60
|
+
logTelemetry.warn(`Alert triggered: ${alert.name} (${alert.severity}) - ${alert.message}`);
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
}, TELEMETRY_METRICS_INTERVAL_MS);
|
|
31
64
|
// dedupe cache cleanup
|
|
32
65
|
const dedupeCleanup = setInterval(() => {
|
|
33
66
|
const AGENT_RUN_SEQ_MAX = 10_000;
|
|
@@ -79,5 +112,5 @@ export function startGatewayMaintenanceTimers(params) {
|
|
|
79
112
|
params.chatDeltaSentAt.delete(runId);
|
|
80
113
|
}
|
|
81
114
|
}, 60_000);
|
|
82
|
-
return { tickInterval, healthInterval, dedupeCleanup };
|
|
115
|
+
return { tickInterval, healthInterval, dedupeCleanup, telemetryInterval };
|
|
83
116
|
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { ErrorCodes, errorShape, formatValidationErrors, validateSwarmCreateParams, validateSwarmListParams, validateSwarmStatusParams, } from "../protocol/index.js";
|
|
2
|
+
import { swarmService } from "../../swarm/service.js";
|
|
3
|
+
/**
 * Gateway request handlers for the `swarm.*` methods.
 *
 * Every handler validates `params` first and answers through
 * `respond(ok, payload, error)`: INVALID_REQUEST when validation fails,
 * INTERNAL_ERROR when the swarm service throws, otherwise the service result.
 */
export const swarmHandlers = {
    "swarm.list": async ({ params, respond }) => {
        const paramsOk = validateSwarmListParams(params);
        if (!paramsOk) {
            respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, `invalid swarm.list params: ${formatValidationErrors(validateSwarmListParams.errors)}`));
            return;
        }
        try {
            respond(true, await swarmService.list(), undefined);
        }
        catch (err) {
            respond(false, undefined, errorShape(ErrorCodes.INTERNAL_ERROR, `swarm.list failed: ${err}`));
        }
    },
    "swarm.status": async ({ params, respond }) => {
        const paramsOk = validateSwarmStatusParams(params);
        if (!paramsOk) {
            respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, `invalid swarm.status params: ${formatValidationErrors(validateSwarmStatusParams.errors)}`));
            return;
        }
        try {
            // A missing swarmId means "the default swarm".
            const requestedId = params.swarmId ?? "swarm-default";
            const status = await swarmService.getStatus(requestedId);
            // When the lookup finds nothing, the default swarm is fetched
            // (or created) and returned in its place.
            const payload = status.swarm
                ? status
                : { swarm: await swarmService.getOrCreateDefaultSwarm() };
            respond(true, payload, undefined);
        }
        catch (err) {
            respond(false, undefined, errorShape(ErrorCodes.INTERNAL_ERROR, `swarm.status failed: ${err}`));
        }
    },
    "swarm.create": async ({ params, respond }) => {
        const paramsOk = validateSwarmCreateParams(params);
        if (!paramsOk) {
            respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, `invalid swarm.create params: ${formatValidationErrors(validateSwarmCreateParams.errors)}`));
            return;
        }
        try {
            const { name, description, strategy, orchestrator } = params;
            const created = await swarmService.create({
                name,
                description,
                strategy: strategy ?? "capability_match",
                orchestratorAgentId: orchestrator ?? "main",
            });
            respond(true, created, undefined);
        }
        catch (err) {
            respond(false, undefined, errorShape(ErrorCodes.INTERNAL_ERROR, `swarm.create failed: ${err}`));
        }
    },
};
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import { getGlobalTelemetryService } from "../../telemetry/service.js";
|
|
2
|
+
import { ErrorCodes, errorShape } from "../protocol/index.js";
|
|
3
|
+
/**
 * Gateway request handlers for the `telemetry.*` methods.
 *
 * Each handler looks up the global telemetry service and answers through
 * `respond(ok, payload, error)`. Except for `telemetry.status`, a missing
 * service is reported as an UNAVAILABLE error.
 */
export const telemetryHandlers = {
    // Configuration plus the current snapshot. Answers successfully with
    // `enabled: false, initialized: false` when no service is registered.
    "telemetry.status": ({ respond }) => {
        const telemetry = getGlobalTelemetryService();
        if (!telemetry) {
            respond(true, { enabled: false, initialized: false }, undefined);
            return;
        }
        const config = telemetry.getConfig();
        const snapshot = telemetry.getSnapshot();
        respond(true, {
            enabled: config.enabled,
            initialized: true,
            serviceName: config.serviceName,
            exporter: config.tracing.exporter,
            endpoint: config.tracing.otlpEndpoint,
            tracing: config.tracing,
            metrics: config.metrics,
            sampleRate: config.tracing.sampleRate,
            metricsSnapshot: snapshot,
        }, undefined);
    },
    // Current snapshot only.
    "telemetry.metrics": ({ respond }) => {
        const telemetry = getGlobalTelemetryService();
        if (!telemetry) {
            respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, "Telemetry service not initialized"));
            return;
        }
        const snapshot = telemetry.getSnapshot();
        respond(true, { metrics: snapshot }, undefined);
    },
    // Current configuration. `params` is deliberately not read: a requested
    // `sampleRate` is not applied at runtime (that would require restarting
    // the service with a new config), and touching `params.sampleRate`
    // unguarded threw a TypeError for requests sent without params.
    "telemetry.config": ({ respond }) => {
        const telemetry = getGlobalTelemetryService();
        if (!telemetry) {
            respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, "Telemetry service not initialized"));
            return;
        }
        const config = telemetry.getConfig();
        respond(true, {
            enabled: config.enabled,
            serviceName: config.serviceName,
            tracing: config.tracing,
            metrics: config.metrics,
        }, undefined);
    },
    // Alerts returned by an on-demand evaluation.
    "telemetry.alerts.list": ({ respond }) => {
        const telemetry = getGlobalTelemetryService();
        if (!telemetry) {
            respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, "Telemetry service not initialized"));
            return;
        }
        const alerts = telemetry.evaluateAlerts();
        respond(true, { alerts }, undefined);
    },
    // Forces an evaluation and also reports whether anything triggered.
    "telemetry.alerts.test": ({ respond }) => {
        const telemetry = getGlobalTelemetryService();
        if (!telemetry) {
            respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, "Telemetry service not initialized"));
            return;
        }
        const alerts = telemetry.evaluateAlerts();
        respond(true, { triggered: alerts.length > 0, alerts }, undefined);
    },
};
|
|
@@ -89,6 +89,12 @@ const BASE_METHODS = [
|
|
|
89
89
|
"chat.history",
|
|
90
90
|
"chat.abort",
|
|
91
91
|
"chat.send",
|
|
92
|
+
// Telemetry methods
|
|
93
|
+
"telemetry.status",
|
|
94
|
+
"telemetry.metrics",
|
|
95
|
+
"telemetry.config",
|
|
96
|
+
"telemetry.alerts.list",
|
|
97
|
+
"telemetry.alerts.test",
|
|
92
98
|
];
|
|
93
99
|
export function listGatewayMethods() {
|
|
94
100
|
const channelMethods = listChannelPlugins().flatMap((plugin) => plugin.gatewayMethods ?? []);
|
|
@@ -113,4 +119,6 @@ export const GATEWAY_EVENTS = [
|
|
|
113
119
|
"voicewake.changed",
|
|
114
120
|
"exec.approval.requested",
|
|
115
121
|
"exec.approval.resolved",
|
|
122
|
+
"telemetry.metrics",
|
|
123
|
+
"telemetry.alert",
|
|
116
124
|
];
|
|
@@ -8,7 +8,6 @@ import { configHandlers } from "./server-methods/config.js";
|
|
|
8
8
|
import { connectHandlers } from "./server-methods/connect.js";
|
|
9
9
|
import { cronHandlers } from "./server-methods/cron.js";
|
|
10
10
|
import { deviceHandlers } from "./server-methods/devices.js";
|
|
11
|
-
import { execApprovalsHandlers } from "./server-methods/exec-approvals.js";
|
|
12
11
|
import { healthHandlers } from "./server-methods/health.js";
|
|
13
12
|
import { logsHandlers } from "./server-methods/logs.js";
|
|
14
13
|
import { modelsHandlers } from "./server-methods/models.js";
|
|
@@ -16,7 +15,9 @@ import { nodeHandlers } from "./server-methods/nodes.js";
|
|
|
16
15
|
import { sendHandlers } from "./server-methods/send.js";
|
|
17
16
|
import { sessionsHandlers } from "./server-methods/sessions.js";
|
|
18
17
|
import { skillsHandlers } from "./server-methods/skills.js";
|
|
18
|
+
import { swarmHandlers } from "./server-methods/swarm.js";
|
|
19
19
|
import { systemHandlers } from "./server-methods/system.js";
|
|
20
|
+
import { telemetryHandlers } from "./server-methods/telemetry.js";
|
|
20
21
|
import { talkHandlers } from "./server-methods/talk.js";
|
|
21
22
|
import { ttsHandlers } from "./server-methods/tts.js";
|
|
22
23
|
import { updateHandlers } from "./server-methods/update.js";
|
|
@@ -68,6 +69,8 @@ const READ_METHODS = new Set([
|
|
|
68
69
|
"cron.list",
|
|
69
70
|
"cron.status",
|
|
70
71
|
"cron.runs",
|
|
72
|
+
"swarm.list",
|
|
73
|
+
"swarm.status",
|
|
71
74
|
"system-presence",
|
|
72
75
|
"last-heartbeat",
|
|
73
76
|
"node.list",
|
|
@@ -75,6 +78,9 @@ const READ_METHODS = new Set([
|
|
|
75
78
|
"chat.history",
|
|
76
79
|
"config.get",
|
|
77
80
|
"talk.config",
|
|
81
|
+
"telemetry.status",
|
|
82
|
+
"telemetry.metrics",
|
|
83
|
+
"telemetry.config",
|
|
78
84
|
]);
|
|
79
85
|
const WRITE_METHODS = new Set([
|
|
80
86
|
"send",
|
|
@@ -170,7 +176,7 @@ export const coreGatewayHandlers = {
|
|
|
170
176
|
...chatHandlers,
|
|
171
177
|
...cronHandlers,
|
|
172
178
|
...deviceHandlers,
|
|
173
|
-
...
|
|
179
|
+
...telemetryHandlers,
|
|
174
180
|
...webHandlers,
|
|
175
181
|
...modelsHandlers,
|
|
176
182
|
...configHandlers,
|
|
@@ -187,6 +193,7 @@ export const coreGatewayHandlers = {
|
|
|
187
193
|
...agentHandlers,
|
|
188
194
|
...agentsHandlers,
|
|
189
195
|
...browserHandlers,
|
|
196
|
+
...swarmHandlers,
|
|
190
197
|
};
|
|
191
198
|
export async function handleGatewayRequest(opts) {
|
|
192
199
|
const { req, respond, client, isWebchatConnect, context } = opts;
|
|
@@ -27,6 +27,7 @@ import { createSubsystemLogger, runtimeForLogger } from "../logging/subsystem.js
|
|
|
27
27
|
import { getGlobalHookRunner, runGlobalGatewayStopSafely } from "../plugins/hook-runner-global.js";
|
|
28
28
|
import { createEmptyPluginRegistry } from "../plugins/registry.js";
|
|
29
29
|
import { getTotalQueueSize } from "../process/command-queue.js";
|
|
30
|
+
import { createTelemetryService, setGlobalTelemetryService, telemetryConfigFromOtelConfig, } from "../telemetry/service.js";
|
|
30
31
|
import { runOnboardingWizard } from "../wizard/onboarding.js";
|
|
31
32
|
import { createAuthRateLimiter } from "./auth-rate-limit.js";
|
|
32
33
|
import { startChannelHealthMonitor } from "./channel-health-monitor.js";
|
|
@@ -148,6 +149,18 @@ export async function startGatewayServer(port = 18789, opts = {}) {
|
|
|
148
149
|
if (diagnosticsEnabled) {
|
|
149
150
|
startDiagnosticHeartbeat();
|
|
150
151
|
}
|
|
152
|
+
// Initialize telemetry service if enabled
|
|
153
|
+
let telemetryService = null;
|
|
154
|
+
const otelConfig = cfgAtStart.diagnostics?.otel;
|
|
155
|
+
if (otelConfig?.enabled) {
|
|
156
|
+
const telemetryConfig = telemetryConfigFromOtelConfig(otelConfig, {
|
|
157
|
+
defaultServiceName: "poolbot-gateway",
|
|
158
|
+
});
|
|
159
|
+
telemetryService = createTelemetryService(telemetryConfig);
|
|
160
|
+
setGlobalTelemetryService(telemetryService);
|
|
161
|
+
await telemetryService.start();
|
|
162
|
+
log.info(`telemetry: initialized with ${telemetryConfig.tracing.exporter} exporter`);
|
|
163
|
+
}
|
|
151
164
|
setGatewaySigusr1RestartPolicy({ allowExternal: isRestartEnabled(cfgAtStart) });
|
|
152
165
|
setPreRestartDeferralCheck(() => getTotalQueueSize() + getTotalPendingReplies() + getActiveEmbeddedRunCount());
|
|
153
166
|
initSubagentRegistry();
|
|
@@ -326,23 +339,25 @@ export async function startGatewayServer(port = 18789, opts = {}) {
|
|
|
326
339
|
let tickInterval = noopInterval();
|
|
327
340
|
let healthInterval = noopInterval();
|
|
328
341
|
let dedupeCleanup = noopInterval();
|
|
342
|
+
let telemetryInterval = noopInterval();
|
|
329
343
|
if (!minimalTestGateway) {
|
|
330
|
-
({ tickInterval, healthInterval, dedupeCleanup } =
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
344
|
+
({ tickInterval, healthInterval, dedupeCleanup, telemetryInterval } =
|
|
345
|
+
startGatewayMaintenanceTimers({
|
|
346
|
+
broadcast,
|
|
347
|
+
nodeSendToAllSubscribed,
|
|
348
|
+
getPresenceVersion,
|
|
349
|
+
getHealthVersion,
|
|
350
|
+
refreshGatewayHealthSnapshot,
|
|
351
|
+
logHealth,
|
|
352
|
+
dedupe,
|
|
353
|
+
chatAbortControllers,
|
|
354
|
+
chatRunState,
|
|
355
|
+
chatRunBuffers,
|
|
356
|
+
chatDeltaSentAt,
|
|
357
|
+
removeChatRun,
|
|
358
|
+
agentRunSeq,
|
|
359
|
+
nodeSendToSession,
|
|
360
|
+
}));
|
|
346
361
|
}
|
|
347
362
|
const agentUnsub = minimalTestGateway
|
|
348
363
|
? null
|
|
@@ -564,12 +579,14 @@ export async function startGatewayServer(port = 18789, opts = {}) {
|
|
|
564
579
|
tickInterval,
|
|
565
580
|
healthInterval,
|
|
566
581
|
dedupeCleanup,
|
|
582
|
+
telemetryInterval,
|
|
567
583
|
agentUnsub,
|
|
568
584
|
heartbeatUnsub,
|
|
569
585
|
chatRunState,
|
|
570
586
|
clients,
|
|
571
587
|
configReloader,
|
|
572
588
|
browserControl,
|
|
589
|
+
telemetryService,
|
|
573
590
|
wss,
|
|
574
591
|
httpServer,
|
|
575
592
|
httpServers,
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Abort Pattern Utilities
|
|
3
|
+
*
|
|
4
|
+
* Provides memory-leak-free abort signal handling.
|
|
5
|
+
*
|
|
6
|
+
* CRITICAL FIX: Uses `.bind()` instead of closures to prevent memory leaks.
|
|
7
|
+
* Issue #7174: Closure-based abort handlers capture scope and leak memory.
|
|
8
|
+
*
|
|
9
|
+
* @example
|
|
10
|
+
* ```typescript
|
|
11
|
+
* // BAD: Captures closure scope (leaks memory)
|
|
12
|
+
* signal.addEventListener('abort', () => controller.abort());
|
|
13
|
+
*
|
|
14
|
+
* // GOOD: No closure capture
|
|
15
|
+
* signal.addEventListener('abort', relayAbort.bind(controller));
|
|
16
|
+
* ```
|
|
17
|
+
*/
|
|
18
|
+
/**
 * Abort whatever controller this function has been bound to.
 *
 * Meant to be used as `relayAbort.bind(controller)` so the resulting listener
 * holds only the controller instead of an enclosing closure scope. Any
 * arguments (such as the abort event) are ignored.
 */
export function relayAbort() {
    const boundController = this;
    boundController.abort();
}
|
|
25
|
+
/**
 * Build an abort listener for `controller` without creating a closure.
 *
 * @param controller - Object whose `abort()` the listener should invoke.
 * @returns `relayAbort` bound to `controller`.
 */
export function createAbortRelay(controller) {
    const relay = relayAbort.bind(controller);
    return relay;
}
|
|
31
|
+
/**
 * Propagate an abort from `source` to `target`.
 *
 * @param source - AbortSignal to observe.
 * @param target - AbortController to abort when `source` aborts.
 * @returns Cleanup function that detaches the listener (a no-op when
 *   `source` was already aborted and nothing was attached).
 */
export function linkAbortSignal(source, target) {
    if (source.aborted) {
        // The "abort" event will not fire again on an already-aborted signal,
        // so a listener would never run; propagate the abort right away.
        target.abort();
        return () => { };
    }
    const handler = createAbortRelay(target);
    // `once` lets the listener drop itself after firing.
    source.addEventListener("abort", handler, { once: true });
    return () => {
        source.removeEventListener("abort", handler);
    };
}
|
|
42
|
+
/**
 * Create an AbortController that aborts itself after `timeoutMs`.
 *
 * @param timeoutMs - Delay in milliseconds. Zero, negative, NaN or infinite
 *   values mean "no timeout": the controller is returned without a timer.
 * @returns `{ controller, cleanup }`; call `cleanup()` to cancel the pending
 *   timer once the guarded operation has finished.
 */
export function createTimeoutAbortController(timeoutMs) {
    // Node coerces timer delays above 2^31-1 ms (including Infinity) to 1 ms,
    // which would abort almost immediately instead of "late" or "never".
    const MAX_TIMER_DELAY_MS = 2_147_483_647;
    const controller = new AbortController();
    const requestedMs = Number(timeoutMs);
    if (!(requestedMs > 0) || requestedMs === Infinity) {
        return {
            controller,
            cleanup: () => { },
        };
    }
    const timeout = setTimeout(() => {
        controller.abort();
    }, Math.min(requestedMs, MAX_TIMER_DELAY_MS));
    return {
        controller,
        cleanup: () => clearTimeout(timeout),
    };
}
|
|
61
|
+
/**
 * Abort `controller` as soon as any of `signals` aborts.
 *
 * @param signals - Iterable of AbortSignals to observe.
 * @param controller - AbortController to abort.
 * @returns Cleanup function that detaches every listener this call attached.
 */
export function raceAbortSignals(signals, controller) {
    const sources = [...signals];
    // When a source is already aborted the race is decided up front, so no
    // listeners are attached to the other sources at all.
    if (sources.some((signal) => signal.aborted)) {
        controller.abort();
        return () => { };
    }
    const detachers = sources.map((signal) => {
        const handler = createAbortRelay(controller);
        signal.addEventListener("abort", handler, { once: true });
        return () => signal.removeEventListener("abort", handler);
    });
    return () => {
        for (const detach of detachers) {
            detach();
        }
    };
}
|
|
84
|
+
/**
 * `fetch` wrapper that aborts the request after a timeout.
 *
 * @param url - Passed straight through to `fetch`.
 * @param options - `fetch` options plus `timeoutMs` (default 30000). A
 *   caller-supplied `signal` is linked so that aborting it aborts the request.
 * @returns The Response that `fetch` resolves with.
 *
 * The timer and the signal link are both released as soon as `fetch`
 * settles, whether it resolved or rejected.
 */
export async function fetchWithTimeout(url, options = {}) {
    const { timeoutMs = 30000, ...init } = options;
    const timer = createTimeoutAbortController(timeoutMs);
    // Only link when the caller actually supplied a signal.
    const detachCallerSignal = init.signal
        ? linkAbortSignal(init.signal, timer.controller)
        : undefined;
    try {
        // The internal controller's signal replaces the caller's in the request.
        return await fetch(url, { ...init, signal: timer.controller.signal });
    }
    finally {
        detachCallerSignal?.();
        timer.cleanup();
    }
}
|