@mcoda/mswarm 0.1.76 → 0.1.79

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/runtime.js CHANGED
@@ -1,9 +1,10 @@
1
- import { chmod, mkdir, readFile, rm, writeFile } from "node:fs/promises";
2
- import { dirname, join } from "node:path";
3
- import { hostname, homedir, platform, userInfo } from "node:os";
1
+ import { chmod, lstat, mkdir, readdir, readFile, rm, writeFile } from "node:fs/promises";
2
+ import { dirname, isAbsolute, join, relative, resolve } from "node:path";
3
+ import { cpus, freemem, hostname, homedir, loadavg, platform, totalmem, userInfo } from "node:os";
4
4
  import { spawn } from "node:child_process";
5
- import { createHash, randomUUID } from "node:crypto";
5
+ import { createHash, createHmac, randomUUID } from "node:crypto";
6
6
  import { MswarmCodaliExecutor } from "./codali-executor.js";
7
+ import { MSWARM_CAPABILITY_SCHEMA_VERSION, assertMswarmSafeRelativePath, validateMswarmArchiveEntry, buildMswarmCapabilityNames, buildMswarmPrivateCapabilityCatalogEntry, buildMswarmLocalArtifactUri, buildMswarmSandboxProfile, defaultMswarmArtifactAccessPolicy, defaultMswarmArtifactRetentionPolicy, projectMswarmPublicCapabilities, validateMswarmGenericJobRequest } from "@mcoda/shared";
7
8
  const DEFAULT_GATEWAY_BASE_URL = "http://127.0.0.1:8080";
8
9
  const DEFAULT_SETUP_GATEWAY_BASE_URL = "https://api.mswarm.org";
9
10
  const DEFAULT_OLLAMA_BASE_URL = "http://127.0.0.1:11434";
@@ -14,11 +15,91 @@ const DEFAULT_SELF_HOSTED_NODE_VERSION = "0.1.70";
14
15
  const DEFAULT_REQUEST_TIMEOUT_MS = 10000;
15
16
  const DEFAULT_JOB_TIMEOUT_MS = 3600000;
16
17
  const DEFAULT_SERVICE_COMMAND_TIMEOUT_MS = 60000;
18
+ const DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS = 2000;
19
+ const SELF_HOSTED_RUNTIME_PROTOCOL_VERSION = 1;
20
+ const SELF_HOSTED_LOAD_BALANCER_PROTOCOL_VERSION = 1;
21
+ const SELF_HOSTED_CATALOG_METADATA_VERSION = 1;
22
+ const MAX_TELEMETRY_LATENCY_SAMPLES = 50;
23
+ const MAX_TELEMETRY_FAILURES = 20;
17
24
  const DEFAULT_MCODA_BIN = "mcoda";
18
25
  const DEFAULT_MCODA_LIST_ARGS = ["agent", "list", "--json", "--refresh-health"];
19
26
  const DEFAULT_COMMAND_MAX_BUFFER = 16 * 1024 * 1024;
27
+ const DEFAULT_LOCAL_ARTIFACT_MAX_BYTES = 512 * 1024 * 1024;
20
28
  const DEFAULT_JOB_POLL_WAIT_MS = 25000;
21
29
  const DEFAULT_STREAM_EVENT_BATCH_SIZE = 8;
30
+ const OWNER_LOCAL_TEST_ECHO_JOB_TYPE = "tenant.test-echo";
31
+ const TEST_ECHO_RUNNER_ID = "test.echo";
32
+ const RENDER_BLENDER_JOB_TYPE = "render.blender";
33
+ const BLENDER_RENDER_RUNNER_ID = "blender.render";
34
+ const CUDA_RUN_JOB_TYPE = "cuda.run";
35
+ const CUDA_PACKAGE_RUNNER_ID = "cuda.package";
36
+ const APPROVED_NVIDIA_CUDA_IMAGES = new Set([
37
+ "nvidia/cuda:12.4.1-devel-ubuntu22.04"
38
+ ]);
39
+ const OWNER_LOCAL_GENERIC_JOB_CATALOG = [
40
+ {
41
+ job_type: OWNER_LOCAL_TEST_ECHO_JOB_TYPE,
42
+ args_schema: {
43
+ type: "object",
44
+ additionalProperties: true,
45
+ properties: {
46
+ message: { type: "string" },
47
+ delay_ms: { type: "number", minimum: 0 },
48
+ repeat: { type: "number", minimum: 1 },
49
+ fail: { type: "boolean" }
50
+ }
51
+ },
52
+ policy: {
53
+ trust_mode: "owner-local",
54
+ network: "none",
55
+ allow_raw_command: false
56
+ },
57
+ runner: TEST_ECHO_RUNNER_ID
58
+ },
59
+ {
60
+ job_type: RENDER_BLENDER_JOB_TYPE,
61
+ args_schema: {
62
+ type: "object",
63
+ additionalProperties: false,
64
+ properties: {
65
+ frames: { type: ["string", "number"] },
66
+ engine: { enum: ["cycles", "eevee", "workbench"] },
67
+ resolution: { type: "string", pattern: "^[1-9][0-9]{0,4}x[1-9][0-9]{0,4}$" },
68
+ output_format: { enum: ["png", "jpeg", "open_exr"] },
69
+ scene: { type: "string" },
70
+ camera: { type: "string" }
71
+ }
72
+ },
73
+ policy: {
74
+ trust_mode: "owner-local",
75
+ network: "none",
76
+ allow_raw_command: false
77
+ },
78
+ runner: BLENDER_RENDER_RUNNER_ID,
79
+ required_capabilities: ["software.blender"]
80
+ },
81
+ {
82
+ job_type: CUDA_RUN_JOB_TYPE,
83
+ args_schema: {
84
+ type: "object",
85
+ additionalProperties: false,
86
+ required: ["manifest_path", "profile", "target"],
87
+ properties: {
88
+ manifest_path: { type: "string" },
89
+ profile: { type: "string" },
90
+ target: { type: "string" }
91
+ }
92
+ },
93
+ policy: {
94
+ trust_mode: "owner-local",
95
+ network: "none",
96
+ allow_raw_command: false,
97
+ allowed_images: Array.from(APPROVED_NVIDIA_CUDA_IMAGES)
98
+ },
99
+ runner: CUDA_PACKAGE_RUNNER_ID,
100
+ required_capabilities: ["gpu.nvidia", "software.docker", "docker.nvidia"]
101
+ }
102
+ ];
22
103
  const SERVICE_LABEL = "com.mcoda.mswarm.self-hosted-node";
23
104
  const SYSTEMD_SERVICE_NAME = "mswarm-self-hosted-node.service";
24
105
  const WINDOWS_TASK_NAME = "MswarmSelfHostedNode";
@@ -327,6 +408,9 @@ function defaultStatePath() {
327
408
  function defaultRuntimeTokenPath() {
328
409
  return join(homedir(), ".mswarm", "self-hosted-node", "node.key");
329
410
  }
411
+ function defaultArtifactStorePath() {
412
+ return join(homedir(), ".mswarm", "self-hosted-node", "artifacts");
413
+ }
330
414
  export async function readOrCreateSelfHostedMachineId(machineIdPath = defaultMachineIdPath()) {
331
415
  try {
332
416
  const existing = (await readFile(machineIdPath, "utf8")).trim();
@@ -381,6 +465,112 @@ function optionalBoolean(...values) {
381
465
  }
382
466
  return null;
383
467
  }
468
+ function roundedTelemetryNumber(value, digits = 3) {
469
+ if (!Number.isFinite(value)) {
470
+ return 0;
471
+ }
472
+ const factor = 10 ** digits;
473
+ return Math.round(value * factor) / factor;
474
+ }
475
+ function nonNegativeTelemetryInteger(value) {
476
+ return typeof value === "number" && Number.isFinite(value) && value > 0 ? Math.floor(value) : 0;
477
+ }
478
+ function sha256Json(value) {
479
+ return createHash("sha256").update(JSON.stringify(value)).digest("hex");
480
+ }
481
+ function buildCatalogFingerprint(models) {
482
+ const projection = models
483
+ .map((model) => ({
484
+ name: optionalText(model.name) || "",
485
+ provider: optionalText(model.provider) || null,
486
+ adapter: optionalText(model.adapter) || null,
487
+ source_agent_slug: optionalText(model.source_agent_slug) || null,
488
+ model_id: optionalText(model.model_id) || optionalText(model.model) || null,
489
+ exposed: model.exposed !== false,
490
+ capabilities: normalizeCapabilities(model.capabilities).sort(),
491
+ health_status: normalizeHealthStatus(model.health_status)
492
+ }))
493
+ .sort((left, right) => `${left.provider || ""}:${left.name}`.localeCompare(`${right.provider || ""}:${right.name}`));
494
+ return `sha256:${sha256Json(projection)}`;
495
+ }
496
+ function executionClassCapacity(input) {
497
+ const maxConcurrency = Math.max(1, Math.floor(input.maxConcurrency));
498
+ const activeJobs = nonNegativeTelemetryInteger(input.activeJobs);
499
+ const queuedJobs = nonNegativeTelemetryInteger(input.queuedJobs);
500
+ return {
501
+ max_concurrency: maxConcurrency,
502
+ active_jobs: activeJobs,
503
+ queued_jobs: queuedJobs,
504
+ free_slots: input.drainMode ? 0 : Math.max(0, maxConcurrency - activeJobs - queuedJobs)
505
+ };
506
+ }
507
+ function totalHostMemoryBucket() {
508
+ const gib = totalmem() / (1024 ** 3);
509
+ if (!Number.isFinite(gib) || gib <= 0)
510
+ return "unknown";
511
+ if (gib <= 8)
512
+ return "<=8GiB";
513
+ if (gib <= 16)
514
+ return "<=16GiB";
515
+ if (gib <= 32)
516
+ return "<=32GiB";
517
+ if (gib <= 64)
518
+ return "<=64GiB";
519
+ if (gib <= 128)
520
+ return "<=128GiB";
521
+ return ">128GiB";
522
+ }
523
+ function coarsePublicVramTier(value, gpuCount) {
524
+ if (value === "none" ||
525
+ value === "lt8" ||
526
+ value === "8-15" ||
527
+ value === "16-31" ||
528
+ value === "32plus") {
529
+ return value;
530
+ }
531
+ return gpuCount > 0 ? "unknown" : "none";
532
+ }
533
+ function buildCoarseHardwarePressure(capabilityPayload) {
534
+ const cpuCount = Math.max(1, cpus().length || 1);
535
+ const totalMemory = totalmem();
536
+ const freeMemory = freemem();
537
+ const projection = capabilityPayload?.public_projection;
538
+ const projectionRecord = projection && typeof projection === "object" && !Array.isArray(projection)
539
+ ? projection
540
+ : {};
541
+ const accelerators = projectionRecord.accelerators && typeof projectionRecord.accelerators === "object"
542
+ ? projectionRecord.accelerators
543
+ : {};
544
+ const gpu = accelerators.gpu && typeof accelerators.gpu === "object" && !Array.isArray(accelerators.gpu)
545
+ ? accelerators.gpu
546
+ : null;
547
+ const rawGpuCount = gpu?.["count"];
548
+ const gpuCount = typeof rawGpuCount === "number" && Number.isFinite(rawGpuCount)
549
+ ? Math.max(0, Math.floor(rawGpuCount))
550
+ : 0;
551
+ const vramTier = coarsePublicVramTier(gpu?.["vram_tier"], gpuCount);
552
+ return {
553
+ schema_version: 1,
554
+ collected_at: new Date().toISOString(),
555
+ cpu: {
556
+ core_count: cpuCount,
557
+ load_1m_ratio: roundedTelemetryNumber((loadavg()[0] || 0) / cpuCount)
558
+ },
559
+ ram: {
560
+ used_ratio: totalMemory > 0 ? roundedTelemetryNumber((totalMemory - freeMemory) / totalMemory) : null,
561
+ total_bucket: totalHostMemoryBucket()
562
+ },
563
+ gpu: {
564
+ available: Boolean(gpu?.["available"]),
565
+ count: gpuCount,
566
+ cuda: Boolean(gpu?.["cuda"] || gpu?.["has_cuda"]),
567
+ vram: {
568
+ total_tier: vramTier,
569
+ used_ratio: null
570
+ }
571
+ }
572
+ };
573
+ }
384
574
  function normalizeCapabilities(value) {
385
575
  if (!Array.isArray(value)) {
386
576
  return [];
@@ -537,6 +727,7 @@ function serviceEnvironment(config, env, homeDir) {
537
727
  MSWARM_GATEWAY_BASE_URL: config.gatewayBaseUrl,
538
728
  MSWARM_SELF_HOSTED_NODE_STATE_PATH: config.statePath,
539
729
  MSWARM_SELF_HOSTED_NODE_KEY_PATH: config.runtimeTokenPath,
730
+ MSWARM_SELF_HOSTED_ARTIFACT_STORE_PATH: config.artifactStorePath || null,
540
731
  MSWARM_SELF_HOSTED_RELAY_MODE: config.relayMode || "outbound",
541
732
  MSWARM_SELF_HOSTED_DIRECT_BASE_URL: config.directBaseUrl || null,
542
733
  MSWARM_SELF_HOSTED_DISCOVERY_MODE: config.discoveryMode,
@@ -550,7 +741,18 @@ function serviceEnvironment(config, env, homeDir) {
550
741
  MSWARM_SELF_HOSTED_MODEL_BLOCKLIST: config.modelBlocklist.join(","),
551
742
  MSWARM_SELF_HOSTED_HEARTBEAT_INTERVAL_SECONDS: String(config.heartbeatIntervalSeconds),
552
743
  MSWARM_SELF_HOSTED_REQUEST_TIMEOUT_MS: String(config.requestTimeoutMs),
553
- MSWARM_SELF_HOSTED_JOB_TIMEOUT_MS: String(config.jobTimeoutMs)
744
+ MSWARM_SELF_HOSTED_JOB_TIMEOUT_MS: String(config.jobTimeoutMs),
745
+ MSWARM_SELF_HOSTED_MAX_CONCURRENT_JOBS: String(config.maxConcurrentJobs || 1),
746
+ MSWARM_SELF_HOSTED_MAX_CONCURRENT_LLM_JOBS: String(config.maxConcurrentLlmJobs || config.maxConcurrentJobs || 1),
747
+ MSWARM_SELF_HOSTED_GENERIC_JOBS_ENABLED: config.genericJobsEnabled ? "true" : "false",
748
+ MSWARM_SELF_HOSTED_GENERIC_JOB_TIMEOUT_MS: String(config.genericJobTimeoutMs),
749
+ MSWARM_SELF_HOSTED_GENERIC_JOB_MAX_CONCURRENCY: String(config.genericJobMaxConcurrency),
750
+ MSWARM_SELF_HOSTED_DRAIN_MODE: config.drainMode ? "true" : "false",
751
+ MSWARM_SELF_HOSTED_LOAD_REPORTING_ENABLED: config.loadReportingEnabled === false ? "false" : "true",
752
+ MSWARM_SELF_HOSTED_HARDWARE_TELEMETRY_ENABLED: config.hardwareTelemetryEnabled ? "true" : "false",
753
+ MSWARM_SELF_HOSTED_CAPABILITY_PROBE_TIMEOUT_MS: config.capabilityProbeTimeoutMs
754
+ ? String(config.capabilityProbeTimeoutMs)
755
+ : null
554
756
  };
555
757
  return Object.fromEntries(Object.entries(values).filter((entry) => typeof entry[1] === "string" && entry[1] !== ""));
556
758
  }
@@ -955,6 +1157,8 @@ export async function readSelfHostedNodeConfig(env = process.env) {
955
1157
  optionalText(env.OLLAMA_HOST) ||
956
1158
  DEFAULT_OLLAMA_BASE_URL;
957
1159
  const packageNodeVersion = await readPackageNodeVersion();
1160
+ const maxConcurrentJobs = parsePositiveInteger(env.MSWARM_SELF_HOSTED_MAX_CONCURRENT_JOBS, state.max_concurrent_jobs || 1);
1161
+ const maxConcurrentLlmJobs = parsePositiveInteger(env.MSWARM_SELF_HOSTED_MAX_CONCURRENT_LLM_JOBS, state.max_concurrent_llm_jobs || maxConcurrentJobs);
958
1162
  return {
959
1163
  gatewayBaseUrl: trimTrailingSlash(gatewayBaseUrl),
960
1164
  nodeId,
@@ -970,6 +1174,9 @@ export async function readSelfHostedNodeConfig(env = process.env) {
970
1174
  ollamaBaseUrl: trimTrailingSlash(ollamaBaseUrl),
971
1175
  statePath,
972
1176
  runtimeTokenPath,
1177
+ artifactStorePath: optionalText(env.MSWARM_SELF_HOSTED_ARTIFACT_STORE_PATH) ||
1178
+ state.artifact_store_path ||
1179
+ defaultArtifactStorePath(),
973
1180
  invocationSigningSecret: optionalText(env.MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET) ||
974
1181
  optionalText(env.MSWARM_SELF_HOSTED_RELAY_SIGNING_SECRET),
975
1182
  listenHost: optionalText(env.MSWARM_SELF_HOSTED_LISTEN_HOST) || DEFAULT_LISTEN_HOST,
@@ -981,6 +1188,15 @@ export async function readSelfHostedNodeConfig(env = process.env) {
981
1188
  heartbeatIntervalSeconds: parsePositiveInteger(env.MSWARM_SELF_HOSTED_HEARTBEAT_INTERVAL_SECONDS, state.heartbeat_interval_seconds || DEFAULT_HEARTBEAT_INTERVAL_SECONDS),
982
1189
  requestTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_REQUEST_TIMEOUT_MS, state.request_timeout_ms || DEFAULT_REQUEST_TIMEOUT_MS),
983
1190
  jobTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_JOB_TIMEOUT_MS, state.job_timeout_ms || DEFAULT_JOB_TIMEOUT_MS),
1191
+ maxConcurrentJobs,
1192
+ maxConcurrentLlmJobs,
1193
+ genericJobsEnabled: parseBoolean(env.MSWARM_SELF_HOSTED_GENERIC_JOBS_ENABLED ?? env.MSWARM_SELF_HOSTED_GENERIC_JOBS, state.generic_jobs_enabled === true),
1194
+ genericJobTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_GENERIC_JOB_TIMEOUT_MS, state.generic_job_timeout_ms || state.job_timeout_ms || DEFAULT_JOB_TIMEOUT_MS),
1195
+ genericJobMaxConcurrency: parsePositiveInteger(env.MSWARM_SELF_HOSTED_GENERIC_JOB_MAX_CONCURRENCY, state.generic_job_max_concurrency || 1),
1196
+ capabilityProbeTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_CAPABILITY_PROBE_TIMEOUT_MS, state.capability_probe_timeout_ms || DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS),
1197
+ drainMode: parseBoolean(env.MSWARM_SELF_HOSTED_DRAIN_MODE, state.drain_mode === true),
1198
+ loadReportingEnabled: parseBoolean(env.MSWARM_SELF_HOSTED_LOAD_REPORTING_ENABLED ?? env.MSWARM_SELF_HOSTED_LOAD_REPORTING, state.load_reporting_enabled !== false),
1199
+ hardwareTelemetryEnabled: parseBoolean(env.MSWARM_SELF_HOSTED_HARDWARE_TELEMETRY_ENABLED ?? env.MSWARM_SELF_HOSTED_HARDWARE_TELEMETRY, state.hardware_telemetry_enabled === true),
984
1200
  exposeAllModels: resolveDaemonExposeAllModels(env, state),
985
1201
  modelAllowlist: parseList(env.MSWARM_SELF_HOSTED_MODEL_ALLOWLIST || state.model_allowlist),
986
1202
  modelBlocklist: parseList(env.MSWARM_SELF_HOSTED_MODEL_BLOCKLIST || state.model_blocklist)
@@ -1011,6 +1227,8 @@ export async function readOwnerSetupConfig(argv = process.argv.slice(3), env = p
1011
1227
  const allowlist = parseList(options.allow || env.MSWARM_SELF_HOSTED_MODEL_ALLOWLIST);
1012
1228
  const blocklist = parseList(options.block || env.MSWARM_SELF_HOSTED_MODEL_BLOCKLIST);
1013
1229
  const packageNodeVersion = await readPackageNodeVersion();
1230
+ const maxConcurrentJobs = parsePositiveInteger(options["max-concurrent-jobs"] || env.MSWARM_SELF_HOSTED_MAX_CONCURRENT_JOBS, 1);
1231
+ const maxConcurrentLlmJobs = parsePositiveInteger(options["max-concurrent-llm-jobs"] || env.MSWARM_SELF_HOSTED_MAX_CONCURRENT_LLM_JOBS, maxConcurrentJobs);
1014
1232
  return {
1015
1233
  apiKey,
1016
1234
  gatewayBaseUrl: trimTrailingSlash(gatewayBaseUrl),
@@ -1022,6 +1240,9 @@ export async function readOwnerSetupConfig(argv = process.argv.slice(3), env = p
1022
1240
  discoveryMode: parseDiscoveryMode(env.MSWARM_SELF_HOSTED_DISCOVERY_MODE),
1023
1241
  statePath,
1024
1242
  runtimeTokenPath,
1243
+ artifactStorePath: optionalText(options["artifact-store-path"]) ||
1244
+ optionalText(env.MSWARM_SELF_HOSTED_ARTIFACT_STORE_PATH) ||
1245
+ defaultArtifactStorePath(),
1025
1246
  machineIdPath: optionalText(env.MSWARM_SELF_HOSTED_MACHINE_ID_PATH) || defaultMachineIdPath(),
1026
1247
  mcodaBin: optionalText(env.MSWARM_SELF_HOSTED_MCODA_BIN) || DEFAULT_MCODA_BIN,
1027
1248
  mcodaListArgs: parseArgs(env.MSWARM_SELF_HOSTED_MCODA_LIST_ARGS, DEFAULT_MCODA_LIST_ARGS),
@@ -1030,6 +1251,17 @@ export async function readOwnerSetupConfig(argv = process.argv.slice(3), env = p
1030
1251
  heartbeatIntervalSeconds: parsePositiveInteger(env.MSWARM_SELF_HOSTED_HEARTBEAT_INTERVAL_SECONDS, DEFAULT_HEARTBEAT_INTERVAL_SECONDS),
1031
1252
  requestTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_REQUEST_TIMEOUT_MS, DEFAULT_REQUEST_TIMEOUT_MS),
1032
1253
  jobTimeoutMs: parsePositiveInteger(options["job-timeout-ms"] || env.MSWARM_SELF_HOSTED_JOB_TIMEOUT_MS, DEFAULT_JOB_TIMEOUT_MS),
1254
+ maxConcurrentJobs,
1255
+ maxConcurrentLlmJobs,
1256
+ genericJobsEnabled: parseBoolean(options["enable-generic-jobs"] || env.MSWARM_SELF_HOSTED_GENERIC_JOBS_ENABLED || env.MSWARM_SELF_HOSTED_GENERIC_JOBS, false),
1257
+ genericJobTimeoutMs: parsePositiveInteger(options["generic-job-timeout-ms"] || env.MSWARM_SELF_HOSTED_GENERIC_JOB_TIMEOUT_MS, DEFAULT_JOB_TIMEOUT_MS),
1258
+ genericJobMaxConcurrency: parsePositiveInteger(options["generic-job-max-concurrency"] || env.MSWARM_SELF_HOSTED_GENERIC_JOB_MAX_CONCURRENCY, 1),
1259
+ capabilityProbeTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_CAPABILITY_PROBE_TIMEOUT_MS, DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS),
1260
+ drainMode: parseBoolean(options.drain || env.MSWARM_SELF_HOSTED_DRAIN_MODE, false),
1261
+ loadReportingEnabled: parseBoolean(options["disable-load-reporting"] === true
1262
+ ? false
1263
+ : (env.MSWARM_SELF_HOSTED_LOAD_REPORTING_ENABLED ?? env.MSWARM_SELF_HOSTED_LOAD_REPORTING), true),
1264
+ hardwareTelemetryEnabled: parseBoolean(options["enable-hardware-telemetry"] || env.MSWARM_SELF_HOSTED_HARDWARE_TELEMETRY_ENABLED || env.MSWARM_SELF_HOSTED_HARDWARE_TELEMETRY, false),
1033
1265
  exposeAllModels: resolveOwnerSetupExposeAllModels(options, env),
1034
1266
  modelAllowlist: allowlist,
1035
1267
  modelBlocklist: blocklist,
@@ -1148,10 +1380,17 @@ async function defaultCommandRunner(command, args, options) {
1148
1380
  let stdout = "";
1149
1381
  let stderr = "";
1150
1382
  let settled = false;
1383
+ const abort = () => {
1384
+ if (settled)
1385
+ return;
1386
+ child.kill("SIGTERM");
1387
+ finish(new Error("command aborted"));
1388
+ };
1151
1389
  const timer = setTimeout(() => {
1152
1390
  if (settled)
1153
1391
  return;
1154
1392
  settled = true;
1393
+ options.signal?.removeEventListener("abort", abort);
1155
1394
  child.kill("SIGTERM");
1156
1395
  reject(new Error(`command timed out after ${options.timeoutMs}ms: ${command}`));
1157
1396
  }, options.timeoutMs);
@@ -1160,6 +1399,7 @@ async function defaultCommandRunner(command, args, options) {
1160
1399
  return;
1161
1400
  settled = true;
1162
1401
  clearTimeout(timer);
1402
+ options.signal?.removeEventListener("abort", abort);
1163
1403
  if (error) {
1164
1404
  reject(error);
1165
1405
  return;
@@ -1188,6 +1428,11 @@ async function defaultCommandRunner(command, args, options) {
1188
1428
  }
1189
1429
  finish();
1190
1430
  });
1431
+ if (options.signal?.aborted) {
1432
+ abort();
1433
+ return;
1434
+ }
1435
+ options.signal?.addEventListener("abort", abort, { once: true });
1191
1436
  if (options.input) {
1192
1437
  child.stdin.write(options.input);
1193
1438
  }
@@ -1414,10 +1659,6 @@ function mapMcodaAgentToCodaliAgent(agent, fallbackSlug) {
1414
1659
  maxOutputTokens: optionalNumber(agent.maxOutputTokens, agent.max_output_tokens) ?? undefined,
1415
1660
  };
1416
1661
  }
1417
- function isExposedLocalAgent(agent, config) {
1418
- const mapped = mapMcodaAgentToSelfHostedModel(agent, config);
1419
- return Boolean(mapped?.exposed);
1420
- }
1421
1662
  function buildCodaliWorkspace(job) {
1422
1663
  const root = optionalText(job.workspace?.root);
1423
1664
  if (!root) {
@@ -1437,6 +1678,13 @@ const DOCDEX_JOB_ERROR_CODES = new Set([
1437
1678
  "docdex_repo_access_denied",
1438
1679
  "docdex_unavailable",
1439
1680
  ]);
1681
+ const PRE_START_JOB_ERROR_CODES = new Set([
1682
+ "selected_agent_unavailable",
1683
+ "selected_agent_unhealthy",
1684
+ "validation_failed",
1685
+ "docdex_context_missing",
1686
+ "docdex_api_key_missing",
1687
+ ]);
1440
1688
  class SelfHostedDocdexJobError extends Error {
1441
1689
  constructor(code, message) {
1442
1690
  super(message);
@@ -1444,6 +1692,13 @@ class SelfHostedDocdexJobError extends Error {
1444
1692
  this.code = code;
1445
1693
  }
1446
1694
  }
1695
+ class SelfHostedPreStartJobError extends Error {
1696
+ constructor(code, message) {
1697
+ super(message);
1698
+ this.name = code;
1699
+ this.code = code;
1700
+ }
1701
+ }
1447
1702
  function normalizeDocdexCapabilityMap(value) {
1448
1703
  const record = objectRecord(value);
1449
1704
  if (!record)
@@ -1501,11 +1756,14 @@ function selfHostedErrorCode(error) {
1501
1756
  if (!error || typeof error !== "object")
1502
1757
  return undefined;
1503
1758
  const code = error.code;
1504
- if (typeof code === "string" && DOCDEX_JOB_ERROR_CODES.has(code)) {
1759
+ if (typeof code === "string" &&
1760
+ (DOCDEX_JOB_ERROR_CODES.has(code) || PRE_START_JOB_ERROR_CODES.has(code))) {
1505
1761
  return code;
1506
1762
  }
1507
1763
  const name = error.name;
1508
- return typeof name === "string" && DOCDEX_JOB_ERROR_CODES.has(name) ? name : undefined;
1764
+ return typeof name === "string" && (DOCDEX_JOB_ERROR_CODES.has(name) || PRE_START_JOB_ERROR_CODES.has(name))
1765
+ ? name
1766
+ : undefined;
1509
1767
  }
1510
1768
  function redactRuntimeSecretValues(value, secrets) {
1511
1769
  let output = value;
@@ -1530,159 +1788,1556 @@ function buildCodaliPolicy(job) {
1530
1788
  maxOutputTokens: job.policy?.max_output_tokens ?? job.openai_request.max_tokens,
1531
1789
  };
1532
1790
  }
1533
- function usageTokens(usage) {
1534
- return {
1535
- promptTokens: positiveInteger(usage?.inputTokens),
1536
- completionTokens: positiveInteger(usage?.outputTokens),
1537
- };
1791
+ function numberArg(value, fallback) {
1792
+ const parsed = Number(value);
1793
+ return Number.isFinite(parsed) ? parsed : fallback;
1538
1794
  }
1539
- export class McodaLocalAgentExecutor {
1540
- constructor(input) {
1541
- this.command = input.command || DEFAULT_MCODA_BIN;
1542
- this.timeoutMs = input.timeoutMs || DEFAULT_JOB_TIMEOUT_MS;
1543
- this.runner = input.runner || defaultCommandRunner;
1795
+ function boundedMilliseconds(value, fallback, max) {
1796
+ return Math.max(0, Math.min(max, Math.floor(numberArg(value, fallback))));
1797
+ }
1798
+ function abortErrorCode(signal) {
1799
+ return signal.reason === "timeout" ? "timeout" : "cancelled";
1800
+ }
1801
+ function abortErrorMessage(signal) {
1802
+ return abortErrorCode(signal) === "timeout" ? "generic job timed out" : "generic job cancelled";
1803
+ }
1804
+ async function sleepWithAbort(ms, signal) {
1805
+ if (ms <= 0)
1806
+ return;
1807
+ if (signal.aborted) {
1808
+ throw new Error(abortErrorMessage(signal));
1544
1809
  }
1545
- async invoke(agentSlug, prompt) {
1546
- const stdout = (await this.runner(this.command, ["agent-run", agentSlug, "--json", "--stdin"], {
1547
- timeoutMs: this.timeoutMs,
1548
- maxBuffer: DEFAULT_COMMAND_MAX_BUFFER,
1549
- input: prompt
1550
- })).stdout;
1551
- const parsed = JSON.parse(stdout);
1552
- if (!parsed || typeof parsed !== "object" || !Array.isArray(parsed.responses)) {
1553
- throw new Error("mcoda agent-run returned unsupported JSON");
1810
+ await new Promise((resolve, reject) => {
1811
+ const timer = setTimeout(() => {
1812
+ cleanup();
1813
+ resolve();
1814
+ }, ms);
1815
+ const onAbort = () => {
1816
+ cleanup();
1817
+ reject(new Error(abortErrorMessage(signal)));
1818
+ };
1819
+ const cleanup = () => {
1820
+ clearTimeout(timer);
1821
+ signal.removeEventListener("abort", onAbort);
1822
+ };
1823
+ signal.addEventListener("abort", onAbort, { once: true });
1824
+ });
1825
+ }
1826
+ function safeLocalArtifactJobId(jobId) {
1827
+ const normalized = jobId.replace(/[^a-zA-Z0-9_.-]/g, "_") || "job";
1828
+ return assertMswarmSafeRelativePath(normalized, "job_id");
1829
+ }
1830
+ function safeLocalArtifactName(value, fallback) {
1831
+ const normalized = value.replace(/[^a-zA-Z0-9_.-]/g, "_") || fallback;
1832
+ return assertMswarmSafeRelativePath(normalized, "artifact_name");
1833
+ }
1834
+ function resolveWithinRoot(root, relativePath) {
1835
+ const rootPath = resolve(root);
1836
+ const target = resolve(rootPath, relativePath);
1837
+ const delta = relative(rootPath, target);
1838
+ if (delta === "" || (!delta.startsWith("..") && !isAbsolute(delta))) {
1839
+ return target;
1840
+ }
1841
+ throw new Error("path_escape_not_allowed");
1842
+ }
1843
+ function sha256Hex(buffer) {
1844
+ return createHash("sha256").update(buffer).digest("hex");
1845
+ }
1846
+ function positiveByteLimit(...values) {
1847
+ const positive = values.filter((value) => typeof value === "number" && Number.isFinite(value) && value > 0);
1848
+ return positive.length ? Math.min(...positive) : DEFAULT_LOCAL_ARTIFACT_MAX_BYTES;
1849
+ }
1850
+ function parseLocalArtifactUri(uri) {
1851
+ try {
1852
+ const parsed = new URL(uri);
1853
+ if (parsed.protocol !== "artifact:" || parsed.hostname !== "local") {
1854
+ return null;
1554
1855
  }
1555
- const response = parsed.responses[0] || {};
1556
- const output = optionalText(response.output);
1557
- if (!output) {
1558
- throw new Error("mcoda agent-run response did not include output");
1856
+ const parts = decodeURIComponent(parsed.pathname).split("/").filter(Boolean);
1857
+ if (parts.length < 2) {
1858
+ return null;
1559
1859
  }
1860
+ const [jobId, ...artifactPath] = parts;
1560
1861
  return {
1561
- output,
1562
- adapter: optionalText(response.adapter) || undefined,
1563
- model: optionalText(response.model) || undefined,
1564
- metadata: response.metadata && typeof response.metadata === "object" ? response.metadata : undefined
1862
+ jobId: assertMswarmSafeRelativePath(jobId, "artifact_job_id"),
1863
+ path: assertMswarmSafeRelativePath(artifactPath.join("/"), "artifact_path")
1565
1864
  };
1566
1865
  }
1866
+ catch {
1867
+ return null;
1868
+ }
1567
1869
  }
1568
- export class MswarmSelfHostedNodeClient {
1569
- constructor(input) {
1570
- this.gatewayBaseUrl = trimTrailingSlash(input.gatewayBaseUrl);
1571
- this.fetchImpl = input.fetchImpl || fetch;
1572
- this.timeoutMs = input.timeoutMs || DEFAULT_REQUEST_TIMEOUT_MS;
1870
+ export class MswarmLocalArtifactStore {
1871
+ constructor(input = {}) {
1872
+ this.rootDir = input.rootDir || defaultArtifactStorePath();
1873
+ this.now = input.now || (() => new Date());
1573
1874
  }
1574
- async enroll(nodeId, enrollmentToken) {
1575
- return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/enroll`, {
1576
- method: "POST",
1577
- headers: { "content-type": "application/json" },
1578
- body: JSON.stringify({ node_id: nodeId, enrollment_token: enrollmentToken })
1579
- }, this.timeoutMs);
1875
+ async prepareJobWorkspace(jobId, job) {
1876
+ const safeJobId = safeLocalArtifactJobId(jobId);
1877
+ const workDir = resolveWithinRoot(this.rootDir, safeJobId);
1878
+ const inputDir = resolveWithinRoot(workDir, "inputs");
1879
+ const outputDir = resolveWithinRoot(workDir, "outputs");
1880
+ await rm(workDir, { recursive: true, force: true });
1881
+ await mkdir(inputDir, { recursive: true });
1882
+ await mkdir(outputDir, { recursive: true });
1883
+ const store = {
1884
+ backend: "local-dev",
1885
+ root_uri: `artifact://local/${safeJobId}`
1886
+ };
1887
+ const registeredInputs = await Promise.all((job.inputs || []).map((input, index) => this.registerInput(jobId, job, input, index, inputDir, store)));
1888
+ const outputSpecs = (job.outputs || []).map((output) => ({
1889
+ ...output,
1890
+ path: assertMswarmSafeRelativePath(output.path, "output_path")
1891
+ }));
1892
+ const sandbox = buildMswarmSandboxProfile({
1893
+ policy: job.policy,
1894
+ limits: job.limits,
1895
+ containerized: job.policy.trust_mode === "tenant-owned" || job.job_type === CUDA_RUN_JOB_TYPE,
1896
+ gpu: job.resources?.gpu ? "nvidia" : "none"
1897
+ });
1898
+ return {
1899
+ store,
1900
+ workDir,
1901
+ inputDir,
1902
+ outputDir,
1903
+ registeredInputs,
1904
+ outputSpecs,
1905
+ sandbox
1906
+ };
1580
1907
  }
1581
- async bootstrap(apiKey, payload) {
1582
- return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/bootstrap`, {
1583
- method: "POST",
1584
- headers: {
1585
- "content-type": "application/json",
1586
- "x-api-key": apiKey
1587
- },
1588
- body: JSON.stringify(payload)
1589
- }, this.timeoutMs);
1908
+ async collectOutputs(context, jobId) {
1909
+ const artifacts = [];
1910
+ let totalBytes = 0;
1911
+ for (const output of context.outputSpecs) {
1912
+ const collected = await this.collectDeclaredOutput(context, jobId, output);
1913
+ for (const artifact of collected) {
1914
+ totalBytes += artifact.size_bytes || 0;
1915
+ const totalLimit = positiveByteLimit(context.sandbox.limits.max_output_bytes);
1916
+ if (totalBytes > totalLimit) {
1917
+ throw new Error("output_size_limit_exceeded");
1918
+ }
1919
+ artifacts.push(artifact);
1920
+ }
1921
+ }
1922
+ return artifacts;
1590
1923
  }
1591
- async health() {
1592
- return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/healthz`, { method: "GET" }, this.timeoutMs);
1924
+ async registerInput(jobId, job, input, index, inputDir, store) {
1925
+ const mountPath = input.mount_path
1926
+ ? assertMswarmSafeRelativePath(input.mount_path, "input_mount_path")
1927
+ : safeLocalArtifactName(input.name, `input-${index}`);
1928
+ const targetPath = resolveWithinRoot(inputDir, mountPath);
1929
+ const maxArtifactBytes = positiveByteLimit(job.policy.max_artifact_bytes);
1930
+ if (Number.isFinite(input.artifact.size_bytes) && input.artifact.size_bytes !== undefined) {
1931
+ if (input.artifact.size_bytes > maxArtifactBytes) {
1932
+ throw new Error("input_artifact_size_limit_exceeded");
1933
+ }
1934
+ }
1935
+ const source = parseLocalArtifactUri(input.artifact.uri);
1936
+ let localPath;
1937
+ if (source) {
1938
+ const sourcePath = resolveWithinRoot(resolveWithinRoot(this.rootDir, source.jobId), join("outputs", source.path));
1939
+ try {
1940
+ const sourceStat = await lstat(sourcePath);
1941
+ if (!sourceStat.isFile()) {
1942
+ throw new Error("input_artifact_must_be_file");
1943
+ }
1944
+ if (sourceStat.size > maxArtifactBytes) {
1945
+ throw new Error("input_artifact_size_limit_exceeded");
1946
+ }
1947
+ const bytes = await readFile(sourcePath);
1948
+ if (input.artifact.sha256 && input.artifact.sha256 !== sha256Hex(bytes)) {
1949
+ throw new Error("input_artifact_checksum_mismatch");
1950
+ }
1951
+ await mkdir(dirname(targetPath), { recursive: true });
1952
+ await writeFile(targetPath, bytes);
1953
+ localPath = targetPath;
1954
+ }
1955
+ catch (error) {
1956
+ if (error.code !== "ENOENT" || input.required === true) {
1957
+ throw error;
1958
+ }
1959
+ }
1960
+ }
1961
+ else if (input.required === true) {
1962
+ throw new Error("input_artifact_unavailable");
1963
+ }
1964
+ const registeredAt = this.now().toISOString();
1965
+ return {
1966
+ ...input.artifact,
1967
+ id: input.artifact.id || `input_${sha256Hex(Buffer.from(`${jobId}:${input.name}:${input.artifact.uri}`)).slice(0, 16)}`,
1968
+ job_id: jobId,
1969
+ name: input.name,
1970
+ scope: "input",
1971
+ registered_at: registeredAt,
1972
+ store,
1973
+ access: defaultMswarmArtifactAccessPolicy(job.policy.trust_mode === "tenant-owned" ? "tenant-scoped" : "owner-local"),
1974
+ retention: defaultMswarmArtifactRetentionPolicy(),
1975
+ ...(localPath ? { local_path: localPath } : {})
1976
+ };
1593
1977
  }
1594
- async heartbeat(runtimeToken, payload) {
1595
- return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/heartbeat`, {
1596
- method: "POST",
1597
- headers: {
1598
- "content-type": "application/json",
1599
- authorization: `Bearer ${runtimeToken}`
1600
- },
1601
- body: JSON.stringify(payload)
1602
- }, this.timeoutMs);
1978
+ async collectDeclaredOutput(context, jobId, output) {
1979
+ const normalizedPath = assertMswarmSafeRelativePath(output.path, "output_path");
1980
+ const targetPath = resolveWithinRoot(context.outputDir, normalizedPath);
1981
+ try {
1982
+ const targetStat = await lstat(targetPath);
1983
+ if (targetStat.isSymbolicLink()) {
1984
+ throw new Error("output_symlink_not_allowed");
1985
+ }
1986
+ if (targetStat.isDirectory()) {
1987
+ return this.collectOutputDirectory(context, jobId, output, normalizedPath);
1988
+ }
1989
+ if (targetStat.isFile()) {
1990
+ return [await this.collectOutputFile(context, jobId, output, normalizedPath, targetPath)];
1991
+ }
1992
+ throw new Error("output_entry_type_not_allowed");
1993
+ }
1994
+ catch (error) {
1995
+ if (error.code === "ENOENT" && output.required !== true) {
1996
+ return [];
1997
+ }
1998
+ throw error;
1999
+ }
1603
2000
  }
1604
- async uninstall(runtimeToken, payload) {
1605
- return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/uninstall`, {
1606
- method: "POST",
1607
- headers: {
1608
- "content-type": "application/json",
1609
- authorization: `Bearer ${runtimeToken}`
1610
- },
1611
- body: JSON.stringify(payload)
1612
- }, this.timeoutMs);
2001
+ async collectOutputDirectory(context, jobId, output, relativeDir) {
2002
+ const dirPath = resolveWithinRoot(context.outputDir, relativeDir);
2003
+ const entries = await readdir(dirPath, { withFileTypes: true });
2004
+ const artifacts = [];
2005
+ for (const entry of entries) {
2006
+ const childRelativePath = assertMswarmSafeRelativePath(`${relativeDir}/${entry.name}`, "output_path");
2007
+ const childPath = resolveWithinRoot(context.outputDir, childRelativePath);
2008
+ if (entry.isSymbolicLink()) {
2009
+ throw new Error("output_symlink_not_allowed");
2010
+ }
2011
+ if (entry.isDirectory()) {
2012
+ artifacts.push(...(await this.collectOutputDirectory(context, jobId, output, childRelativePath)));
2013
+ }
2014
+ else if (entry.isFile()) {
2015
+ artifacts.push(await this.collectOutputFile(context, jobId, output, childRelativePath, childPath));
2016
+ }
2017
+ else {
2018
+ throw new Error("output_entry_type_not_allowed");
2019
+ }
2020
+ }
2021
+ return artifacts;
1613
2022
  }
1614
- async pushModels(runtimeToken, payload) {
1615
- return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/models`, {
1616
- method: "POST",
1617
- headers: {
1618
- "content-type": "application/json",
1619
- authorization: `Bearer ${runtimeToken}`
1620
- },
1621
- body: JSON.stringify(payload)
1622
- }, this.timeoutMs);
2023
+ async collectOutputFile(context, jobId, output, relativePath, filePath) {
2024
+ const stat = await lstat(filePath);
2025
+ if (!stat.isFile()) {
2026
+ throw new Error("output_entry_type_not_allowed");
2027
+ }
2028
+ const perArtifactLimit = positiveByteLimit(context.sandbox.limits.max_artifact_bytes, context.sandbox.limits.max_output_bytes);
2029
+ if (stat.size > perArtifactLimit) {
2030
+ throw new Error("output_artifact_size_limit_exceeded");
2031
+ }
2032
+ const bytes = await readFile(filePath);
2033
+ return {
2034
+ id: `output_${sha256Hex(Buffer.from(`${jobId}:${relativePath}`)).slice(0, 16)}`,
2035
+ job_id: jobId,
2036
+ name: output.path === relativePath ? output.name : `${output.name}/${relativePath}`,
2037
+ uri: buildMswarmLocalArtifactUri(jobId, relativePath),
2038
+ content_type: output.content_type,
2039
+ size_bytes: stat.size,
2040
+ sha256: sha256Hex(bytes),
2041
+ scope: "output",
2042
+ registered_at: this.now().toISOString(),
2043
+ store: context.store,
2044
+ access: defaultMswarmArtifactAccessPolicy(context.sandbox.trust_mode === "tenant-owned" ? "tenant-scoped" : "owner-local"),
2045
+ retention: defaultMswarmArtifactRetentionPolicy()
2046
+ };
1623
2047
  }
1624
- async pollJob(runtimeToken, payload) {
1625
- return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/jobs/poll`, {
1626
- method: "POST",
1627
- headers: {
1628
- "content-type": "application/json",
1629
- authorization: `Bearer ${runtimeToken}`
1630
- },
1631
- body: JSON.stringify(payload)
1632
- }, Math.max(this.timeoutMs, (payload.wait_ms || 0) + 5000));
2048
+ }
2049
/**
 * Diagnostic runner that echoes a message as stdout events.
 *
 * Recognised job args: `message` (defaults to "ok"), `repeat` (clamped to
 * 1..20), `delay_ms` (passed through boundedMilliseconds with bounds 0 and
 * 30000) and `fail` (when strictly true the run throws with the message).
 */
export class MswarmTestEchoRunner {
    constructor() {
        this.id = TEST_ECHO_RUNNER_ID;
    }
    /**
     * Emits `repeat` stdout events followed by one progress event.
     *
     * @param context job context providing `job`, `signal` and `emitEvent`
     * @returns a succeeded job result with echo metrics
     * @throws Error when `args.fail` is true or the signal is aborted
     */
    async run(context) {
        // Captured before any work so the result reports the real start time
        // rather than the completion time.
        const startedAt = new Date().toISOString();
        const args = context.job.args || {};
        const message = optionalText(args.message) || "ok";
        const repeat = Math.max(1, Math.min(20, Math.floor(numberArg(args.repeat, 1))));
        const delayMs = boundedMilliseconds(args.delay_ms, 0, 30000);
        if (args.fail === true) {
            throw new Error(message);
        }
        for (let index = 0; index < repeat; index += 1) {
            if (context.signal.aborted) {
                throw new Error(abortErrorMessage(context.signal));
            }
            if (delayMs > 0) {
                await sleepWithAbort(delayMs, context.signal);
            }
            await context.emitEvent({
                type: "stdout",
                message,
                data: {
                    runner: this.id,
                    index,
                    repeat
                }
            });
        }
        await context.emitEvent({
            type: "progress",
            message: "echo complete",
            data: {
                completed: repeat,
                total: repeat
            }
        });
        return {
            job_id: context.job.idempotency_key || "local-generic-job",
            status: "succeeded",
            exit_code: 0,
            started_at: startedAt,
            finished_at: new Date().toISOString(),
            metrics: {
                runner: this.id,
                echoed: repeat,
                message
            }
        };
    }
}
2100
// Lower-case engine names accepted in job args, mapped to the identifier
// passed to Blender's --engine flag.
const BLENDER_ENGINE_ARGS = {
    cycles: "CYCLES",
    eevee: "BLENDER_EEVEE_NEXT",
    workbench: "BLENDER_WORKBENCH"
};
// Lower-case output format names accepted in job args, mapped to the
// identifier passed to Blender's --render-format flag.
const BLENDER_OUTPUT_FORMAT_ARGS = {
    png: "PNG",
    jpeg: "JPEG",
    open_exr: "OPEN_EXR"
};
2110
/**
 * Returns `value` unchanged when it is a number that is a safe integer
 * greater than zero; returns null for anything else (including numeric
 * strings).
 */
function positiveSafeInteger(value) {
    const accepted = typeof value === "number" && Number.isSafeInteger(value) && value > 0;
    return accepted ? value : null;
}
1655
- export class SelfHostedNodeRuntime {
1656
- constructor(config, deps) {
1657
- this.config = config;
1658
- this.gateway =
1659
- deps?.gateway ||
1660
- new MswarmSelfHostedNodeClient({
1661
- gatewayBaseUrl: config.gatewayBaseUrl,
1662
- fetchImpl: deps?.fetchImpl,
1663
- timeoutMs: config.requestTimeoutMs
1664
- });
1665
- this.mcoda =
1666
- deps?.mcoda ||
1667
- new McodaAgentInventoryClient({
1668
- command: config.mcodaBin,
1669
- args: config.mcodaListArgs,
1670
- timeoutMs: config.requestTimeoutMs
1671
- });
1672
- this.mcodaExecutor =
1673
- deps?.mcodaExecutor ||
1674
- new McodaLocalAgentExecutor({
1675
- command: config.mcodaBin,
1676
- timeoutMs: config.jobTimeoutMs
1677
- });
1678
- this.codaliExecutor = deps?.codaliExecutor || new MswarmCodaliExecutor();
1679
- this.ollama =
1680
- deps?.ollama ||
1681
- new OllamaClient({
1682
- baseUrl: config.ollamaBaseUrl,
1683
- fetchImpl: deps?.fetchImpl,
1684
- timeoutMs: config.requestTimeoutMs
1685
- });
2116
/**
 * Parses the `frames` job argument into a frame selection.
 *
 * Accepts undefined/null (frame 1), a positive safe integer, or text of the
 * form "N" or "START-END" where each number has at most seven digits and no
 * leading zero. A range may span at most 10001 frames; a range whose ends
 * are equal collapses to a single frame.
 *
 * @returns `{ mode: "frame", frame, label, total: 1 }` or
 *   `{ mode: "range", start, end, label, total }`
 * @throws Error when the value is not a valid frame or range
 */
function parseBlenderFrameSelection(value) {
    const singleFrame = (frame) => ({ mode: "frame", frame, label: String(frame), total: 1 });
    if (value === undefined || value === null) {
        return singleFrame(1);
    }
    const numeric = positiveSafeInteger(value);
    if (numeric !== null) {
        return singleFrame(numeric);
    }
    const parsed = optionalText(value)?.match(/^([1-9]\d{0,6})(?:-([1-9]\d{0,6}))?$/);
    if (!parsed) {
        throw new Error("render.blender args.frames must be a positive frame number or start-end range");
    }
    const [, startText, endText] = parsed;
    const start = Number(startText);
    const end = endText ? Number(endText) : start;
    const wellFormed = Number.isSafeInteger(start) && Number.isSafeInteger(end) && start > 0 && end > 0 && end >= start;
    if (!wellFormed) {
        throw new Error("render.blender args.frames must use a valid positive frame range");
    }
    if (end - start > 10000) {
        throw new Error("render.blender args.frames range exceeds the maximum supported 10001 frames");
    }
    return start === end
        ? singleFrame(start)
        : { mode: "range", start, end, label: `${start}-${end}`, total: end - start + 1 };
}
2143
/**
 * Normalizes the optional `engine` job argument.
 *
 * @param value raw argument; matched case-insensitively
 * @returns undefined when no engine text was supplied, otherwise
 *   `{ label, blender }` where `blender` is the identifier for --engine
 * @throws Error when the name is not one of the supported engines
 */
function normalizeBlenderEngine(value) {
    const raw = optionalText(value);
    if (!raw)
        return undefined;
    const key = raw.toLowerCase();
    // Own-property check: a plain bracket lookup would also resolve inherited
    // names such as "constructor" or "tostring"-style keys from
    // Object.prototype and treat them as a valid engine.
    const blender = Object.hasOwn(BLENDER_ENGINE_ARGS, key) ? BLENDER_ENGINE_ARGS[key] : undefined;
    if (!blender) {
        throw new Error("render.blender args.engine must be cycles, eevee, or workbench");
    }
    return { label: key, blender };
}
2154
/**
 * Normalizes the `output_format` job argument, defaulting to "png".
 *
 * @param value raw argument; matched case-insensitively
 * @returns `{ label, blender, extension }` where `blender` is the identifier
 *   for --render-format and `extension` is "png", "jpg" or "exr"
 * @throws Error when the format is not supported
 */
function normalizeBlenderOutputFormat(value) {
    const key = (optionalText(value) || "png").toLowerCase();
    // Own-property check so inherited Object.prototype names (for example
    // "constructor") are rejected instead of being treated as a format.
    const blender = Object.hasOwn(BLENDER_OUTPUT_FORMAT_ARGS, key) ? BLENDER_OUTPUT_FORMAT_ARGS[key] : undefined;
    if (!blender) {
        throw new Error("render.blender args.output_format must be png, jpeg, or open_exr");
    }
    return { label: key, blender, extension: key === "open_exr" ? "exr" : key === "jpeg" ? "jpg" : "png" };
}
2162
/**
 * Parses the optional `resolution` job argument ("WIDTHxHEIGHT", the "x" is
 * case-insensitive, each side 1-5 digits without a leading zero).
 *
 * @returns undefined when the argument is undefined/null, otherwise
 *   `{ width, height, label }`
 * @throws Error when the text is malformed or either side exceeds 16384
 */
function parseBlenderResolution(value) {
    if (value === undefined || value === null) {
        return undefined;
    }
    const parts = optionalText(value)?.match(/^([1-9]\d{0,4})x([1-9]\d{0,4})$/i);
    if (!parts) {
        throw new Error("render.blender args.resolution must use WIDTHxHEIGHT");
    }
    const [width, height] = [Number(parts[1]), Number(parts[2])];
    const tooLarge = width > 16384 || height > 16384;
    if (tooLarge) {
        throw new Error("render.blender args.resolution exceeds 16384x16384");
    }
    return { width, height, label: `${width}x${height}` };
}
2177
/**
 * Validates an optional Blender object name taken from job args.
 *
 * @param value raw argument
 * @param label argument name used in the error message (e.g. "scene")
 * @returns undefined when no text was supplied, otherwise the text
 * @throws Error when the name is longer than 128 characters or contains a
 *   NUL, carriage return or line feed
 */
function safeBlenderSceneName(value, label) {
    const name = optionalText(value);
    if (!name) {
        return undefined;
    }
    const unsafe = name.length > 128 || /[\0\r\n]/.test(name);
    if (unsafe) {
        throw new Error(`render.blender args.${label} is not a safe Blender object name`);
    }
    return name;
}
2186
/**
 * Picks the local path of the scene file for a render job.
 *
 * The registered input named "scene" is preferred; when none has that name
 * the first registered input is used.
 *
 * @returns the chosen input's `local_path`
 * @throws Error when there is no such input or it has no `local_path`
 */
function blenderSceneInputPath(context) {
    const inputs = context.artifacts.registeredInputs;
    const chosen = inputs.find((candidate) => candidate.name === "scene") || inputs[0];
    const localPath = chosen?.local_path;
    if (!localPath) {
        throw new Error("render.blender requires a materialized scene input artifact");
    }
    return localPath;
}
2193
/**
 * Builds the absolute --render-output pattern for a render job.
 *
 * Uses the first declared output spec, which must name a directory: a last
 * path segment ending in a 1-8 character alphanumeric extension is treated
 * as a file path and rejected.
 *
 * @returns "<output dir>/<declared path>/frame_####" resolved within
 *   context.artifacts.outputDir
 * @throws Error when no output is declared or the path looks like a file
 */
function blenderOutputPattern(context) {
    const [declared] = context.artifacts.outputSpecs;
    if (!declared) {
        throw new Error("render.blender requires a declared output directory");
    }
    const safePath = assertMswarmSafeRelativePath(declared.path, "render_blender_output_path");
    const segments = safePath.split("/").filter(Boolean);
    const lastSegment = segments[segments.length - 1] || safePath;
    const looksLikeFile = /\.[a-zA-Z0-9]{1,8}$/.test(lastSegment);
    if (looksLikeFile) {
        throw new Error("render.blender output path must be a directory, not a file path");
    }
    return resolveWithinRoot(context.artifacts.outputDir, `${safePath}/frame_####`);
}
2205
/**
 * Replaces job-local filesystem paths in `value` with fixed placeholders so
 * they are not leaked in events or error messages.
 *
 * Replacements are applied longest path first. Otherwise a shorter path
 * that is a prefix of a longer one (for example a work directory that
 * contains the input directory) would be substituted first and the more
 * specific placeholder would never match.
 *
 * @param context job context providing `artifacts` paths and registered inputs
 * @param value text to redact
 * @returns the redacted text
 */
function redactBlenderLocalPaths(context, value) {
    const replacements = [
        [context.artifacts.workDir, "[job-workdir]"],
        [context.artifacts.inputDir, "[job-inputs]"],
        [context.artifacts.outputDir, "[job-outputs]"],
        ...context.artifacts.registeredInputs.map((input) => [input.local_path, "[job-input]"])
    ]
        // Inputs that were never materialized have no local path to redact.
        .filter(([source]) => Boolean(source))
        .sort((left, right) => right[0].length - left[0].length);
    let output = value;
    for (const [source, replacement] of replacements) {
        output = output.split(source).join(replacement);
    }
    return output;
}
2220
/**
 * Emits captured Blender output as job events, one event per line.
 *
 * Lines are trimmed, blank lines are dropped, and only the first 200
 * remaining lines are emitted. Each message is passed through
 * redactBlenderLocalPaths before it is sent.
 *
 * @param type event type to emit (the caller passes "stdout" or "stderr")
 * @param value raw captured output text
 */
async function emitBlenderOutput(context, type, value) {
    let emitted = 0;
    for (const rawLine of value.split(/\r?\n/)) {
        if (emitted >= 200) {
            break;
        }
        const line = rawLine.trim();
        if (!line) {
            continue;
        }
        emitted += 1;
        await context.emitEvent({
            type,
            message: redactBlenderLocalPaths(context, line),
            data: { runner: BLENDER_RENDER_RUNNER_ID }
        });
    }
}
2230
/**
 * Scans Blender output for frame markers ("Fra:<n>" or "Frame <n>") and
 * emits one progress event for each distinct frame that lies inside the
 * requested selection.
 *
 * @param output combined Blender output text
 * @param frames selection as returned by parseBlenderFrameSelection
 */
async function emitBlenderProgress(context, output, frames) {
    const isRange = frames.mode === "range";
    const firstFrame = isRange ? frames.start : frames.frame;
    const lastFrame = isRange ? frames.end : frames.frame;
    const reported = new Set();
    for (const line of output.split(/\r?\n/)) {
        const found = line.match(/\bFra:(\d+)\b/i) || line.match(/\bFrame\s+(\d+)\b/i);
        if (!found) {
            continue;
        }
        const frame = Number(found[1]);
        const inSelection = Number.isSafeInteger(frame) && frame >= firstFrame && frame <= lastFrame;
        // Each frame is reported once, however many lines mention it.
        if (!inSelection || reported.has(frame)) {
            continue;
        }
        reported.add(frame);
        await context.emitEvent({
            type: "progress",
            message: `rendered frame ${frame}`,
            data: {
                runner: BLENDER_RENDER_RUNNER_ID,
                frame,
                completed: reported.size,
                total: frames.total
            }
        });
    }
}
2255
/**
 * Builds a non-retryable failed job result for the Blender runner.
 *
 * @param job job request; its `idempotency_key` is used as the job id,
 *   falling back to "render.blender"
 * @param code machine-readable error code
 * @param message human-readable error message
 * @param startedAt ISO timestamp recorded when the run began
 */
function blenderFailureResult(job, code, message, startedAt) {
    const jobId = job.idempotency_key || "render.blender";
    const error = { code, message, retryable: false };
    return {
        job_id: jobId,
        status: "failed",
        exit_code: 1,
        started_at: startedAt,
        finished_at: new Date().toISOString(),
        error
    };
}
2269
/**
 * Returns the Cycles compute device type requested for GPU renders.
 */
function blenderGpuComputeDeviceType() {
    // Only NVIDIA devices are marked available by the current GPU probe, which
    // makes CUDA the single concrete Blender compute backend this runner can
    // safely ask for.
    const computeDeviceType = "CUDA";
    return computeDeviceType;
}
2274
/**
 * Runner that renders a Blender scene by invoking the `blender` command in
 * background mode through the injected command runner.
 */
export class MswarmBlenderRenderRunner {
    /**
     * @param runner command runner invoked as `runner(command, args, options)`;
     *   the run method reads `stdout` and `stderr` from its resolved value
     */
    constructor(runner = defaultCommandRunner) {
        this.id = BLENDER_RENDER_RUNNER_ID;
        this.runner = runner;
    }
    /**
     * Validates the job, builds the Blender command line, runs it and
     * reports output and per-frame progress as events.
     *
     * @returns a succeeded result with render metrics, or a failed result
     *   with code "policy_denied", "validation_failed" or "runner_failed"
     * @throws Error when the signal is already aborted, or the runner error
     *   when the signal is aborted during the render
     */
    async run(context) {
        const startedAt = new Date().toISOString();
        if (context.signal.aborted) {
            throw new Error(abortErrorMessage(context.signal));
        }
        if (context.job.policy.trust_mode !== "owner-local") {
            return blenderFailureResult(context.job, "policy_denied", "render.blender is owner-local only until containerized Blender execution is available", startedAt);
        }
        const gpuRequested = Boolean(context.job.resources?.gpu);
        let plan;
        try {
            const jobArgs = context.job.args || {};
            const scenePath = blenderSceneInputPath(context);
            const frames = parseBlenderFrameSelection(jobArgs.frames);
            const engine = normalizeBlenderEngine(jobArgs.engine);
            const outputFormat = normalizeBlenderOutputFormat(jobArgs.output_format);
            const resolution = parseBlenderResolution(jobArgs.resolution);
            const sceneName = safeBlenderSceneName(jobArgs.scene, "scene");
            const cameraName = safeBlenderSceneName(jobArgs.camera, "camera");
            const outputPattern = blenderOutputPattern(context);
            await mkdir(dirname(outputPattern), { recursive: true });
            plan = { scenePath, frames, engine, outputFormat, resolution, sceneName, cameraName, outputPattern };
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error || "render.blender validation failed");
            return blenderFailureResult(context.job, "validation_failed", reason, startedAt);
        }
        const { scenePath, frames, engine, outputFormat, resolution, sceneName, cameraName, outputPattern } = plan;
        // Python statements executed inside Blender via --python-expr.
        const pythonStatements = [
            ...(resolution
                ? [
                    `bpy.context.scene.render.resolution_x=${resolution.width}`,
                    `bpy.context.scene.render.resolution_y=${resolution.height}`
                ]
                : []),
            ...(cameraName
                ? [
                    // JSON.stringify quotes and escapes the name for the Python literal.
                    `camera=bpy.data.objects.get(${JSON.stringify(cameraName)})`,
                    "bpy.context.scene.camera=camera if camera is not None else bpy.context.scene.camera"
                ]
                : []),
            ...(gpuRequested
                ? [
                    "cycles_addon=bpy.context.preferences.addons.get('cycles')",
                    "cycles_prefs=cycles_addon.preferences if cycles_addon is not None else None",
                    `setattr(cycles_prefs,'compute_device_type',${JSON.stringify(blenderGpuComputeDeviceType())}) if cycles_prefs is not None and hasattr(cycles_prefs,'compute_device_type') else None`,
                    "getattr(cycles_prefs,'get_devices',lambda: None)() if cycles_prefs is not None else None",
                    "setattr(bpy.context.scene.cycles,'device','GPU') if hasattr(bpy.context.scene,'cycles') else None",
                    "[setattr(device,'use',True) for device in getattr(cycles_prefs,'devices',[]) if hasattr(device,'use')] if cycles_prefs is not None else None"
                ]
                : [])
        ];
        const frameArgs = frames.mode === "range"
            ? ["-s", String(frames.start), "-e", String(frames.end), "-a"]
            : ["--render-frame", String(frames.frame)];
        const blenderArgs = [
            "-b",
            scenePath,
            ...(sceneName ? ["--scene", sceneName] : []),
            ...(engine ? ["--engine", engine.blender] : []),
            ...(pythonStatements.length > 0 ? ["--python-expr", `import bpy; ${pythonStatements.join("; ")}`] : []),
            "--render-output",
            outputPattern,
            "--render-format",
            outputFormat.blender,
            ...frameArgs
        ];
        // The same summary is attached to the start event and the final metrics.
        const describeRender = () => ({
            runner: this.id,
            frames: frames.label,
            engine: engine?.label || "scene-default",
            output_format: outputFormat.label,
            ...(resolution ? { resolution: resolution.label } : {}),
            gpu_requested: gpuRequested,
            render_device: gpuRequested ? "gpu" : "scene-default"
        });
        await context.emitEvent({
            type: "progress",
            message: "blender render starting",
            data: describeRender()
        });
        // Sandbox timeout in milliseconds, clamped to [1000, DEFAULT_JOB_TIMEOUT_MS].
        const requestedTimeoutMs = Math.floor((context.sandbox.limits.timeout_sec || DEFAULT_JOB_TIMEOUT_MS / 1000) * 1000);
        const timeoutMs = Math.max(1000, Math.min(DEFAULT_JOB_TIMEOUT_MS, requestedTimeoutMs));
        // At least 1 MiB of captured output, at most DEFAULT_COMMAND_MAX_BUFFER.
        const requestedBuffer = Math.max(1024 * 1024, context.job.limits?.max_stdout_bytes || 0, context.job.limits?.max_stderr_bytes || 0);
        const maxBuffer = Math.min(DEFAULT_COMMAND_MAX_BUFFER, requestedBuffer);
        try {
            const { stdout, stderr } = await this.runner("blender", blenderArgs, {
                timeoutMs,
                maxBuffer,
                signal: context.signal
            });
            await emitBlenderOutput(context, "stdout", stdout);
            await emitBlenderOutput(context, "stderr", stderr);
            await emitBlenderProgress(context, `${stdout}\n${stderr}`, frames);
            return {
                job_id: context.job.idempotency_key || "render.blender",
                status: "succeeded",
                exit_code: 0,
                started_at: startedAt,
                finished_at: new Date().toISOString(),
                metrics: describeRender()
            };
        }
        catch (error) {
            if (context.signal.aborted) {
                throw error;
            }
            const reason = error instanceof Error ? error.message : String(error || "Blender render failed");
            return blenderFailureResult(context.job, "runner_failed", redactBlenderLocalPaths(context, reason), startedAt);
        }
    }
}
2395
// Profile/target identifiers: an alphanumeric first character followed by up
// to 127 characters from [a-zA-Z0-9_.-].
const SAFE_CUDA_IDENTIFIER = /^[a-zA-Z0-9][a-zA-Z0-9_.-]{0,127}$/;
// Compiler flags and run arguments: 1-200 characters from a fixed allow-list.
const SAFE_CUDA_TOKEN = /^[a-zA-Z0-9_@%+=:,./-]{1,200}$/;
// Manifest keys that are rejected outright wherever they appear in a checked
// record (see assertNoUnsafeCudaManifestKeys).
const UNSAFE_CUDA_MANIFEST_KEYS = new Set([
    "command", "cmd", "shell", "entrypoint",
    "docker_args",
    "mount", "mounts", "volumes", "binds",
    "device", "devices",
    "privileged",
    "network", "host_network"
]);
2413
/**
 * Builds a non-retryable failed job result for the CUDA runner.
 *
 * @param job job request; its `idempotency_key` is used as the job id,
 *   falling back to "cuda.run"
 * @param code machine-readable error code
 * @param message human-readable error message
 * @param startedAt ISO timestamp recorded when the run began
 */
function cudaFailureResult(job, code, message, startedAt) {
    const jobId = job.idempotency_key || "cuda.run";
    const error = { code, message, retryable: false };
    return {
        job_id: jobId,
        status: "failed",
        exit_code: 1,
        started_at: startedAt,
        finished_at: new Date().toISOString(),
        error
    };
}
2427
/**
 * Returns the identifier text when it matches SAFE_CUDA_IDENTIFIER.
 *
 * @param label prefix for the error message
 * @throws Error(`${label}_invalid`) when the value is missing or malformed
 */
function safeCudaIdentifier(value, label) {
    const candidate = optionalText(value);
    const acceptable = Boolean(candidate) && SAFE_CUDA_IDENTIFIER.test(candidate);
    if (!acceptable) {
        throw new Error(`${label}_invalid`);
    }
    return candidate;
}
2434
/**
 * Normalizes `value` with optionalText and validates it through
 * assertMswarmSafeRelativePath, returning whatever that validator returns.
 *
 * @param label label forwarded to the validator
 */
function safeCudaRelativePath(value, label) {
    const candidate = optionalText(value);
    return assertMswarmSafeRelativePath(candidate, label);
}
2437
/**
 * Returns the token text when it matches SAFE_CUDA_TOKEN and contains no
 * shell metacharacter or line break.
 *
 * @param label prefix for the error message
 * @throws Error(`${label}_invalid`) when the value is missing or unsafe
 */
function safeCudaToken(value, label) {
    const candidate = optionalText(value);
    const acceptable = Boolean(candidate)
        && SAFE_CUDA_TOKEN.test(candidate)
        // Explicit deny-list kept in addition to the allow-list above.
        && !/[`$;&|<>\r\n]/.test(candidate);
    if (!acceptable) {
        throw new Error(`${label}_invalid`);
    }
    return candidate;
}
2444
/**
 * Validates an optional array of tokens with safeCudaToken.
 *
 * @param label prefix for error messages; entries use `${label}_${index}`
 * @returns an empty array when `value` is undefined, otherwise the validated tokens
 * @throws Error(`${label}_must_be_array`) when `value` is defined but not an array
 */
function safeCudaTokenList(value, label) {
    if (value === undefined) {
        return [];
    }
    if (!Array.isArray(value)) {
        throw new Error(`${label}_must_be_array`);
    }
    const tokens = [];
    value.forEach((entry, index) => {
        tokens.push(safeCudaToken(entry, `${label}_${index}`));
    });
    return tokens;
}
2452
/**
 * Rejects a manifest record that carries any key listed in
 * UNSAFE_CUDA_MANIFEST_KEYS. Only the record's own enumerable keys are
 * inspected, and the comparison is exact (case-sensitive).
 *
 * @param label prefix for the error message
 * @throws Error(`${label}_${key}_not_allowed`) for the first offending key
 */
function assertNoUnsafeCudaManifestKeys(record, label) {
    const offending = Object.keys(record).find((key) => UNSAFE_CUDA_MANIFEST_KEYS.has(key));
    if (offending !== undefined) {
        throw new Error(`${label}_${offending}_not_allowed`);
    }
}
2459
/**
 * Extracts and validates the three cuda.run job arguments.
 *
 * @returns `{ manifestPath, profile, target }`
 * @throws the validator's error when any argument is missing or unsafe
 */
function parseCudaRunArgs(job) {
    const { manifest_path: rawManifestPath, profile: rawProfile, target: rawTarget } = job.args || {};
    const manifestPath = safeCudaRelativePath(rawManifestPath, "cuda_manifest_path");
    const profile = safeCudaIdentifier(rawProfile, "cuda_profile");
    const target = safeCudaIdentifier(rawTarget, "cuda_target");
    return { manifestPath, profile, target };
}
2467
/**
 * Locates the materialized package archive among the registered inputs.
 *
 * The input named "package" is preferred; otherwise the first materialized
 * input that is not named "manifest" is used. The archive must sit at a safe
 * relative path under the job input directory and end in .tar.gz or .tgz.
 *
 * @returns `{ hostPath, inputPath }`: the absolute path on the host and the
 *   path relative to context.artifacts.inputDir
 * @throws Error("cuda_package_artifact_required" | "cuda_package_archive_must_be_targz")
 */
function cudaPackageArchive(context) {
    const { registeredInputs, inputDir } = context.artifacts;
    const namedPackage = registeredInputs.find((input) => input.name === "package" && input.local_path);
    const chosen = namedPackage
        || registeredInputs.find((input) => input.local_path && input.name !== "manifest");
    const hostPath = chosen?.local_path;
    if (!hostPath) {
        throw new Error("cuda_package_artifact_required");
    }
    const inputPath = assertMswarmSafeRelativePath(relative(inputDir, hostPath), "cuda_package_input_path");
    const isTarGz = /(\.tar\.gz|\.tgz)$/i.test(inputPath);
    if (!isTarGz) {
        throw new Error("cuda_package_archive_must_be_targz");
    }
    return { hostPath, inputPath };
}
2479
/**
 * Creates the error raised for a rejected package archive.
 *
 * @param reason short reason suffix; a falsy value becomes "invalid"
 * @returns an Error whose message is `cuda_package_archive_<reason>`
 */
function cudaArchiveValidationError(reason) {
    const suffix = reason || "invalid";
    return new Error(`cuda_package_archive_${suffix}`);
}
2482
/**
 * Classifies one line of verbose tar listing output by the first
 * non-whitespace character of the line.
 *
 * @returns "directory", "file", "symlink", "hardlink", "device" or "other";
 *   a line with no non-whitespace character is reported as "file"
 */
function cudaTarVerboseEntryType(line) {
    const marker = line.trimStart().charAt(0);
    switch (marker) {
        case "d":
            return "directory";
        case "-":
        case "":
            return "file";
        case "l":
            return "symlink";
        case "h":
            return "hardlink";
        case "b":
        case "c":
            return "device";
        default:
            return "other";
    }
}
2496
/**
 * Inspects a package archive with two `tar` listings before it is used.
 *
 * Pass 1 (`tar -tzf`) validates every entry path through
 * validateMswarmArchiveEntry and rejects an archive with no entries.
 * Pass 2 (`tar -tvzf`) rejects any entry whose verbose type marker is not a
 * regular file or directory.
 *
 * @param runner command runner invoked as `runner(command, args, options)`
 * @param archive `{ hostPath }` as returned by cudaPackageArchive
 * @throws the error from cudaArchiveValidationError for the first violation
 */
async function validateCudaPackageArchive(context, runner, archive) {
    const tarOptions = {
        timeoutMs: 5000,
        maxBuffer: 512 * 1024,
        signal: context.signal
    };
    const splitLines = (text) => text.split(/\r?\n/);
    const nameListing = await runner("tar", ["-tzf", archive.hostPath], tarOptions);
    const entryPaths = splitLines(nameListing.stdout)
        .map((line) => line.trim())
        .filter(Boolean);
    for (const entryPath of entryPaths) {
        // The name-only listing carries no type column; a trailing slash is
        // taken to mean a directory.
        const verdict = validateMswarmArchiveEntry({
            path: entryPath,
            type: entryPath.endsWith("/") ? "directory" : "file"
        });
        if (!verdict.ok) {
            throw cudaArchiveValidationError(verdict.reason);
        }
    }
    if (entryPaths.length === 0) {
        throw cudaArchiveValidationError("empty");
    }
    const verboseListing = await runner("tar", ["-tvzf", archive.hostPath], tarOptions);
    for (const verboseLine of splitLines(verboseListing.stdout)) {
        if (!verboseLine.trim()) {
            continue;
        }
        const entryType = cudaTarVerboseEntryType(verboseLine);
        const permitted = entryType === "file" || entryType === "directory";
        if (permitted) {
            continue;
        }
        // The validator is consulted only for its rejection reason; the entry
        // is refused regardless of what it returns.
        const verdict = validateMswarmArchiveEntry({ path: "entry", type: entryType });
        throw cudaArchiveValidationError(verdict.reason);
    }
}
2531
/**
 * Reads the manifest text for a cuda.run job.
 *
 * A regular file at the manifest path under the job input directory is
 * preferred. When that path does not exist, or is not a regular file, the
 * manifest is extracted to stdout from the package archive with `tar -xOf`.
 *
 * @param runner command runner invoked as `runner(command, args, options)`
 * @param args parsed run arguments containing `manifestPath`
 * @returns the manifest text
 * @throws any lstat error other than ENOENT, plus read/extraction errors
 */
async function readCudaManifestText(context, runner, args) {
    const standalonePath = resolveWithinRoot(context.artifacts.inputDir, args.manifestPath);
    let standaloneStat = null;
    try {
        standaloneStat = await lstat(standalonePath);
    }
    catch (error) {
        // A missing standalone manifest is expected; fall back to the archive.
        if (error.code !== "ENOENT") {
            throw error;
        }
    }
    if (standaloneStat?.isFile()) {
        return await readFile(standalonePath, "utf8");
    }
    const { hostPath } = cudaPackageArchive(context);
    const { stdout } = await runner("tar", ["-xOf", hostPath, args.manifestPath], {
        timeoutMs: 5000,
        maxBuffer: 256 * 1024,
        signal: context.signal
    });
    return stdout;
}
2552
/**
 * Parses a CUDA package manifest and resolves the requested profile and
 * target into a build/run selection.
 *
 * Checks run in a fixed order and the first failure throws: JSON shape,
 * forbidden keys, schema version, publisher policy, profile and target
 * presence, forbidden keys inside them, image approval (global list and job
 * policy), compiler, source path (must end in ".cu"), output path, and
 * finally flag and argument tokens.
 *
 * @param text manifest JSON text
 * @param args parsed run arguments containing `profile` and `target`
 * @param policy job policy; `allowed_package_publishers` and `allowed_images` are read
 * @returns `{ schemaVersion, packageName, publisher, image, compiler, source, output, flags, runArgs }`
 * @throws SyntaxError for invalid JSON, or Error with a `cuda_*` message
 */
function parseCudaPackageManifest(text, args, policy) {
    const manifest = objectRecord(JSON.parse(text));
    if (!manifest) {
        throw new Error("cuda_manifest_must_be_object");
    }
    assertNoUnsafeCudaManifestKeys(manifest, "cuda_manifest");
    const schemaVersion = optionalText(manifest.schema_version);
    if (schemaVersion !== "2026-06-14") {
        throw new Error("cuda_manifest_schema_version_invalid");
    }
    const packageInfo = objectRecord(manifest.package);
    const publisher = optionalText(packageInfo?.publisher);
    // The publisher is only enforced when the policy lists allowed publishers.
    const publisherRestricted = Boolean(policy.allowed_package_publishers?.length);
    if (publisherRestricted && (!publisher || !policy.allowed_package_publishers.includes(publisher))) {
        throw new Error("cuda_manifest_publisher_not_allowed");
    }
    const profileTable = objectRecord(manifest.profiles);
    const targetTable = objectRecord(manifest.targets);
    const profile = objectRecord(profileTable?.[args.profile]);
    const target = objectRecord(targetTable?.[args.target]);
    if (!profile) {
        throw new Error("cuda_manifest_profile_not_found");
    }
    if (!target) {
        throw new Error("cuda_manifest_target_not_found");
    }
    assertNoUnsafeCudaManifestKeys(profile, "cuda_manifest_profile");
    assertNoUnsafeCudaManifestKeys(target, "cuda_manifest_target");
    const image = optionalText(profile.image);
    const imageApproved = Boolean(image) && APPROVED_NVIDIA_CUDA_IMAGES.has(image);
    if (!imageApproved) {
        throw new Error("cuda_image_not_approved");
    }
    // The job policy must also list the image explicitly.
    if (!policy.allowed_images?.includes(image)) {
        throw new Error("cuda_image_not_allowed_by_policy");
    }
    const compiler = optionalText(profile.compiler) || "nvcc";
    if (compiler !== "nvcc") {
        throw new Error("cuda_compiler_not_allowed");
    }
    const source = safeCudaRelativePath(target.source, "cuda_target_source");
    if (!source.endsWith(".cu")) {
        throw new Error("cuda_target_source_must_be_cu");
    }
    const output = safeCudaRelativePath(optionalText(target.output) || `bin/${args.target}`, "cuda_target_output");
    // Profile flags come first, then target flags.
    const profileFlags = safeCudaTokenList(profile.flags, "cuda_profile_flags");
    const targetFlags = safeCudaTokenList(target.flags, "cuda_target_flags");
    const runArgs = safeCudaTokenList(target.args, "cuda_target_args");
    return {
        schemaVersion,
        packageName: optionalText(packageInfo?.name) ?? undefined,
        publisher: publisher ?? undefined,
        image,
        compiler,
        source,
        output,
        flags: [...profileFlags, ...targetFlags],
        runArgs
    };
}
2610
/**
 * Replaces job-local filesystem paths in `value` with fixed placeholders.
 *
 * Candidates are applied longest path first so that a path nested inside
 * another directory receives its own, more specific placeholder.
 *
 * @param context job context providing `artifacts` paths and registered inputs
 * @param value text to redact
 * @returns the redacted text
 */
function redactCudaLocalPaths(context, value) {
    const { registeredInputs, inputDir, outputDir, workDir } = context.artifacts;
    const candidates = [
        ...registeredInputs.map((input) => [input.local_path, "[job-input]"]),
        [inputDir, "[job-inputs]"],
        [outputDir, "[job-outputs]"],
        [workDir, "[job-workdir]"]
    ];
    return candidates
        // Entries without a path have nothing to redact.
        .filter(([source]) => Boolean(source))
        .sort(([leftSource], [rightSource]) => rightSource.length - leftSource.length)
        .reduce((text, [source, placeholder]) => text.split(source).join(placeholder), value);
}
2626
/**
 * Emits captured command output as job events, one event per line.
 *
 * Lines are trimmed, blank lines are dropped, and only the first 200
 * remaining lines are emitted. Each message is passed through
 * redactCudaLocalPaths before it is sent.
 *
 * @param type event type to emit
 * @param value raw captured output text
 */
async function emitCudaOutput(context, type, value) {
    let emitted = 0;
    for (const rawLine of value.split(/\r?\n/)) {
        if (emitted >= 200) {
            break;
        }
        const line = rawLine.trim();
        if (!line) {
            continue;
        }
        emitted += 1;
        await context.emitEvent({
            type,
            message: redactCudaLocalPaths(context, line),
            data: { runner: CUDA_PACKAGE_RUNNER_ID }
        });
    }
}
2636
/**
 * Generates the bash script executed inside the CUDA container.
 *
 * The script unpacks the package archive into /workspace/work/src, compiles
 * the selected source with nvcc and then runs the produced binary. Every
 * word of the compile and run commands is quoted with quotePosixShellValue.
 *
 * @param input `{ selection, archiveInputPath }`, where `selection` supplies
 *   `output`, `source`, `flags` and `runArgs`, and `archiveInputPath` is the
 *   archive path relative to /workspace/inputs
 * @returns the script text, one command per line
 */
function buildCudaRunnerScript(input) {
    const { selection, archiveInputPath } = input;
    const sourceRoot = "/workspace/work/src";
    const binaryPath = `/workspace/work/${selection.output}`;
    const shellCommand = (words) => words.map(quotePosixShellValue).join(" ");
    const compileCommand = shellCommand([
        "/usr/local/cuda/bin/nvcc",
        ...selection.flags,
        "-o",
        binaryPath,
        `${sourceRoot}/${selection.source}`
    ]);
    const runCommand = shellCommand([binaryPath, ...selection.runArgs]);
    const scriptLines = [];
    scriptLines.push("set -euo pipefail");
    scriptLines.push("mkdir -p /workspace/work/src /workspace/outputs");
    scriptLines.push(`tar -xzf ${quotePosixShellValue(`/workspace/inputs/${archiveInputPath}`)} -C /workspace/work/src`);
    scriptLines.push(`mkdir -p ${quotePosixShellValue(dirname(binaryPath))}`);
    scriptLines.push("cd /workspace/work/src");
    scriptLines.push(compileCommand);
    scriptLines.push(runCommand);
    return scriptLines.join("\n");
}
2660
/**
 * Formats a Docker `-v` bind-mount specification.
 *
 * @returns "<hostPath>:<containerPath>:<mode>"
 */
function dockerBindMount(hostPath, containerPath, mode) {
    return `${hostPath}:${containerPath}:${mode}`;
}
/**
 * Converts a size given in gigabytes into a Docker size string.
 *
 * Values of one gigabyte or more are floored to whole gigabytes. Positive
 * values below one gigabyte are expressed in megabytes, because flooring
 * them to "0" gigabytes would not express the requested limit (for
 * `--memory`, zero means unlimited).
 *
 * @returns the size string, or null when the value is not a positive finite number
 */
function cudaDockerSizeLimit(gigabytes, gigaUnit, megaUnit) {
    if (typeof gigabytes !== "number" || !Number.isFinite(gigabytes) || gigabytes <= 0) {
        return null;
    }
    if (gigabytes >= 1) {
        return `${Math.floor(gigabytes)}${gigaUnit}`;
    }
    return `${Math.max(1, Math.floor(gigabytes * 1024))}${megaUnit}`;
}
/**
 * Builds the `docker` argument vector for a cuda.run job.
 *
 * The container runs with no network, a read-only root filesystem, all
 * capabilities dropped, no-new-privileges, the NVIDIA runtime and the
 * sandbox user. Inputs are mounted read-only; outputs and the work
 * directory are mounted read-write.
 *
 * @param input `{ context, selection, workPath, scriptPath }`
 * @returns the arguments to pass to `docker`
 */
function buildCudaDockerArgs(input) {
    const resources = input.context.job.resources;
    // A whole number of GPUs, at least one; anything unusable falls back to 1.
    const requestedGpus = Number(resources?.gpu?.count);
    const gpuCount = Number.isFinite(requestedGpus) && requestedGpus >= 1 ? Math.floor(requestedGpus) : 1;
    const args = [
        "run",
        "--rm",
        "--pull",
        "never",
        "--network",
        "none",
        "--runtime",
        "nvidia",
        "--gpus",
        `count=${gpuCount}`,
        "--user",
        input.context.sandbox.container.user,
        "--read-only",
        "--cap-drop",
        "ALL",
        "--security-opt",
        "no-new-privileges",
        "--workdir",
        "/workspace",
        "--env",
        "CUDA_CACHE_PATH=/workspace/work/.cuda-cache",
        "--tmpfs",
        "/tmp:rw,nosuid,nodev,size=64m"
    ];
    const memoryLimit = cudaDockerSizeLimit(resources?.memory_gb, "g", "m");
    if (memoryLimit) {
        args.push("--memory", memoryLimit);
    }
    const diskLimit = cudaDockerSizeLimit(resources?.disk_gb, "G", "M");
    if (diskLimit) {
        args.push("--storage-opt", `size=${diskLimit}`);
    }
    args.push(
        "-v", dockerBindMount(input.context.artifacts.inputDir, "/workspace/inputs", "ro"),
        "-v", dockerBindMount(input.context.artifacts.outputDir, "/workspace/outputs", "rw"),
        "-v", dockerBindMount(input.workPath, "/workspace/work", "rw"),
        "-v", dockerBindMount(input.scriptPath, "/workspace/__mcoda_cuda_run.sh", "ro"),
        input.selection.image,
        "/bin/bash",
        "/workspace/__mcoda_cuda_run.sh"
    );
    return args;
}
2699
/**
 * Generic-job runner that builds and executes a CUDA package inside a Docker
 * container via the injected command runner.
 */
export class MswarmCudaPackageRunner {
    /**
     * @param {Function} runner - Command runner used for docker and archive
     *   helper commands; defaults to `defaultCommandRunner`.
     */
    constructor(runner = defaultCommandRunner) {
        this.id = CUDA_PACKAGE_RUNNER_ID;
        this.runner = runner;
    }
    /**
     * Validates the job, prepares the work directory and runner script, runs
     * the container, and returns a job result.
     *
     * Policy and validation problems are returned as failure results (via
     * `cudaFailureResult`) rather than thrown. The method throws only when the
     * job's abort signal is already set on entry, or when it is set at the time
     * the docker command fails.
     *
     * @param {object} context - Job execution context (job, signal, sandbox,
     *   artifacts, emitEvent).
     * @returns {Promise<object>} Success result, or a failure result with code
     *   "policy_denied", "validation_failed" or "runner_failed".
     */
    async run(context) {
        const startedAt = new Date().toISOString();
        if (context.signal.aborted) {
            throw new Error(abortErrorMessage(context.signal));
        }
        // Policy gates: the job must explicitly request no network and must
        // explicitly set allow_raw_command to false (undefined is rejected).
        if (context.job.policy.network !== "none") {
            return cudaFailureResult(context.job, "policy_denied", "cuda.run requires network policy none", startedAt);
        }
        if (context.job.policy.allow_raw_command !== false) {
            return cudaFailureResult(context.job, "policy_denied", "cuda.run does not allow raw commands", startedAt);
        }
        if (!context.job.resources?.gpu) {
            return cudaFailureResult(context.job, "validation_failed", "cuda.run requires GPU resources", startedAt);
        }
        if (!context.job.outputs?.length) {
            return cudaFailureResult(context.job, "validation_failed", "cuda.run requires declared outputs", startedAt);
        }
        let args;
        let archive;
        let selection;
        let scriptPath;
        let workPath;
        // Everything in this try block is treated as validation: any throw is
        // converted into a "validation_failed" result with local paths redacted.
        try {
            args = parseCudaRunArgs(context.job);
            archive = cudaPackageArchive(context);
            await validateCudaPackageArchive(context, this.runner, archive);
            const manifestText = await readCudaManifestText(context, this.runner, args);
            selection = parseCudaPackageManifest(manifestText, args, context.job.policy);
            scriptPath = resolveWithinRoot(context.artifacts.workDir, "__mcoda_cuda_run.sh");
            workPath = resolveWithinRoot(context.artifacts.workDir, "cuda-work");
            await mkdir(workPath, { recursive: true });
            // NOTE(review): world-writable modes are presumably needed so the
            // container user can write to the bind mounts — confirm.
            await chmod(workPath, 0o777);
            await chmod(context.artifacts.outputDir, 0o777);
            await writeFile(scriptPath, buildCudaRunnerScript({ archiveInputPath: archive.inputPath, selection }), { mode: 0o644 });
        }
        catch (error) {
            return cudaFailureResult(context.job, "validation_failed", redactCudaLocalPaths(context, error instanceof Error ? error.message : String(error || "cuda.run validation failed")), startedAt);
        }
        const dockerArgs = buildCudaDockerArgs({
            context,
            selection,
            archiveInputPath: archive.inputPath,
            scriptPath,
            workPath
        });
        await context.emitEvent({
            type: "progress",
            message: "cuda package container starting",
            data: {
                runner: this.id,
                image: selection.image,
                profile: args.profile,
                target: args.target,
                gpu_count: Math.max(1, context.job.resources.gpu.count || 1),
                network: "none",
                container_user: context.sandbox.container.user
            }
        });
        // Timeout: the sandbox limit in seconds (or the default job timeout when
        // unset), clamped to the range [1000 ms, DEFAULT_JOB_TIMEOUT_MS].
        const timeoutMs = Math.max(1000, Math.min(DEFAULT_JOB_TIMEOUT_MS, Math.floor((context.sandbox.limits.timeout_sec || DEFAULT_JOB_TIMEOUT_MS / 1000) * 1000)));
        // Buffer: the larger of the job's stdout/stderr limits, at least 1 MiB,
        // capped at DEFAULT_COMMAND_MAX_BUFFER.
        const maxBuffer = Math.min(DEFAULT_COMMAND_MAX_BUFFER, Math.max(1024 * 1024, context.job.limits?.max_stdout_bytes || 0, context.job.limits?.max_stderr_bytes || 0));
        try {
            const result = await this.runner("docker", dockerArgs, {
                timeoutMs,
                maxBuffer,
                signal: context.signal
            });
            await emitCudaOutput(context, "stdout", result.stdout);
            await emitCudaOutput(context, "stderr", result.stderr);
            await context.emitEvent({
                type: "progress",
                message: "cuda package container completed",
                data: {
                    runner: this.id,
                    profile: args.profile,
                    target: args.target
                }
            });
            return {
                job_id: context.job.idempotency_key || "cuda.run",
                status: "succeeded",
                exit_code: 0,
                started_at: startedAt,
                finished_at: new Date().toISOString(),
                metrics: {
                    runner: this.id,
                    image: selection.image,
                    profile: args.profile,
                    target: args.target,
                    package: selection.packageName,
                    publisher: selection.publisher,
                    gpu_count: Math.max(1, context.job.resources.gpu.count || 1),
                    network: "none",
                    container_user: context.sandbox.container.user
                }
            };
        }
        catch (error) {
            // An abort is rethrown for the caller to handle; every other
            // failure becomes a "runner_failed" result.
            if (context.signal.aborted) {
                throw error;
            }
            return cudaFailureResult(context.job, "runner_failed", redactCudaLocalPaths(context, error instanceof Error ? error.message : String(error || "cuda.run failed")), startedAt);
        }
    }
}
2808
/**
 * Instantiates the built-in generic job runners.
 *
 * @param {Function} [runner=defaultCommandRunner] - Command runner handed to
 *   the Blender and CUDA runners (the echo runner takes no arguments).
 * @returns {object[]} Echo, Blender and CUDA runner instances, in that order.
 */
function createDefaultGenericJobRunners(runner = defaultCommandRunner) {
    const echoRunner = new MswarmTestEchoRunner();
    const blenderRunner = new MswarmBlenderRenderRunner(runner);
    const cudaRunner = new MswarmCudaPackageRunner(runner);
    return [echoRunner, blenderRunner, cudaRunner];
}
2811
/**
 * Returns the distinct non-empty strings from `values`, sorted with the
 * default (code-unit) ordering. Non-string entries are dropped.
 *
 * @param {Array<*>} values
 * @returns {string[]}
 */
function uniqueSortedStrings(values) {
    const isNonEmptyString = (candidate) => typeof candidate === "string" && candidate.length > 0;
    const distinct = new Set(values.filter(isNonEmptyString));
    return [...distinct].sort();
}
2814
/**
 * Resolves the capability probe timeout from the runtime config.
 *
 * @param {object} config - Runtime config; `capabilityProbeTimeoutMs` is read.
 * @returns {*} Whatever `parsePositiveInteger` returns for the configured
 *   value with DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS as its fallback argument.
 */
function capabilityProbeTimeoutMs(config) {
    const { capabilityProbeTimeoutMs: configured } = config;
    return parsePositiveInteger(configured, DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS);
}
2817
/**
 * Produces a human-readable message for a failed capability probe.
 *
 * @param {*} error - Thrown value from the command runner.
 * @returns {string} The Error's message when it has a non-empty one;
 *   otherwise the stringified value, or "capability probe failed" when the
 *   value is falsy.
 */
function capabilityCommandFailureMessage(error) {
    const hasMessage = error instanceof Error && Boolean(error.message);
    return hasMessage ? error.message : String(error || "capability probe failed");
}
2822
/**
 * Decides whether a probe failure means the command is not installed.
 *
 * The failure message and the optional stderr text are combined, lowercased
 * and scanned for "not found"-style phrases.
 *
 * @param {*} error - Thrown value from the command runner.
 * @param {string} [stderr=""] - Captured stderr, if any.
 * @returns {boolean} True when the text looks like a missing executable.
 */
function isMissingCapabilityCommand(error, stderr = "") {
    const failureText = capabilityCommandFailureMessage(error);
    const combined = `${failureText}\n${stderr}`;
    const missingPattern = /enoent|not found|command not found|no such file|executable file not found/;
    return missingPattern.test(combined.toLowerCase());
}
2826
/**
 * Runs a capability probe command and normalizes the outcome instead of
 * throwing.
 *
 * On failure, any `stderr` carried by the thrown error is forwarded to
 * `isMissingCapabilityCommand`, so a "command not found" that appears only on
 * stderr is classified as `missing` rather than `error`.
 * NOTE(review): assumes the injected runner attaches `stderr` to rejected
 * errors the way Node's child_process helpers do; when it does not, the
 * behavior is unchanged from message-only matching.
 *
 * @param {Function} runner - `(command, args, options) => Promise<{stdout, stderr}>`.
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments for the executable.
 * @param {number} timeoutMs - Timeout passed through to the runner.
 * @returns {Promise<{ok: true, stdout: *, stderr: *} | {ok: false, missing: boolean, message: string}>}
 */
async function runCapabilityCommand(runner, command, args, timeoutMs) {
    try {
        const result = await runner(command, args, {
            timeoutMs,
            // Probe output is small; cap the buffer at 512 KiB.
            maxBuffer: Math.min(DEFAULT_COMMAND_MAX_BUFFER, 512 * 1024)
        });
        return { ok: true, stdout: result.stdout, stderr: result.stderr };
    }
    catch (error) {
        const rawStderr = error?.stderr;
        const stderr = rawStderr == null ? "" : String(rawStderr);
        return {
            ok: false,
            missing: isMissingCapabilityCommand(error, stderr),
            message: capabilityCommandFailureMessage(error)
        };
    }
}
2842
/**
 * Converts a memory field into gigabytes, rounded to one decimal place.
 *
 * The numeric part of the field is divided by 1024, so the input is treated
 * as a mebibyte count.
 *
 * @param {string|undefined} value - Raw field; non-digit characters other
 *   than "." are ignored.
 * @returns {number|undefined} Gigabytes, or undefined when the field is
 *   empty, unparseable, or not positive.
 */
function parseNvidiaSmiMemoryGb(value) {
    if (!value) {
        return undefined;
    }
    const mebibytes = Number(value.replace(/[^\d.]/g, ""));
    const usable = Number.isFinite(mebibytes) && mebibytes > 0;
    return usable ? Math.round((mebibytes / 1024) * 10) / 10 : undefined;
}
2850
/**
 * Parses CSV rows from an `nvidia-smi --query-gpu=... --format=csv,noheader,nounits`
 * call into a GPU capability probe.
 *
 * Columns are read positionally: index, name, memory (MiB), driver version,
 * compute capability and, when present, CUDA version. Fields that nvidia-smi
 * reports as bracketed placeholders (for example `[N/A]` or
 * `[Not Supported]`) are treated as absent instead of being stored as values.
 *
 * @param {string} stdout - Raw command output.
 * @returns {object} Probe with `status` ("available" when at least one row
 *   was parsed, otherwise "missing"), `count`, `vendors`, `devices`, and —
 *   when known — `cuda_versions` and `max_vram_gb`. A `message` is included
 *   only when no rows were found.
 */
function parseNvidiaGpuProbe(stdout) {
    // Bracketed tokens are nvidia-smi's way of saying "no value for this field".
    const presentField = (field) => (field && !/^\[[^\]]*\]$/.test(field) ? field : undefined);
    const devices = [];
    const cudaVersions = new Set();
    let maxVramGb;
    for (const line of stdout.split(/\r?\n/)) {
        const trimmed = line.trim();
        if (!trimmed) {
            continue;
        }
        const [index, name, memoryMb, driverVersion, computeCapability, cudaVersion] = trimmed
            .split(",")
            .map((part) => presentField(part.trim()));
        // Fall back to the row position when the index column is unusable.
        const id = index ? `gpu-${index}` : `gpu-${devices.length}`;
        const vramGb = parseNvidiaSmiMemoryGb(memoryMb);
        if (cudaVersion) {
            cudaVersions.add(cudaVersion);
        }
        if (vramGb) {
            maxVramGb = maxVramGb === undefined ? vramGb : Math.max(maxVramGb, vramGb);
        }
        devices.push({
            id,
            vendor: "nvidia",
            ...(name ? { name } : {}),
            ...(vramGb ? { vram_gb: vramGb } : {}),
            ...(driverVersion ? { driver_version: driverVersion } : {}),
            ...(cudaVersion ? { cuda_version: cudaVersion } : {}),
            ...(computeCapability ? { compute_capability: computeCapability } : {}),
            capabilities: ["cuda"]
        });
    }
    const hasDevices = devices.length > 0;
    return {
        status: hasDevices ? "available" : "missing",
        count: devices.length,
        vendors: hasDevices ? ["nvidia"] : [],
        devices,
        ...(cudaVersions.size > 0 ? { cuda_versions: Array.from(cudaVersions).sort() } : {}),
        ...(maxVramGb !== undefined ? { max_vram_gb: maxVramGb } : {}),
        ...(hasDevices ? {} : { message: "nvidia-smi returned no GPU rows" })
    };
}
2890
/**
 * Extracts the version number that follows "CUDA Version:" in the given text.
 *
 * @param {string} stdout - Text to search (case-insensitive).
 * @returns {string|undefined} Version such as "12.4", or undefined when the
 *   label is not present.
 */
function parseNvidiaSmiCudaVersion(stdout) {
    const found = stdout.match(/CUDA\s+Version:\s*([0-9]+(?:\.[0-9]+)?)/i);
    return found === null ? undefined : found[1];
}
2893
/**
 * Probes NVIDIA GPUs through `nvidia-smi`.
 *
 * A first call queries the per-device inventory. A second, argument-less
 * call is scanned for the CUDA version, which is then merged into the probe
 * and into every device that does not already carry one.
 *
 * @param {Function} runner - Command runner passed to `runCapabilityCommand`.
 * @param {number} timeoutMs - Timeout applied to each command.
 * @returns {Promise<object>} GPU probe. When the inventory command fails the
 *   status is "missing" or "error" with zero devices; when only the second
 *   call fails the inventory probe is returned unchanged.
 */
async function probeNvidiaGpuCapabilities(runner, timeoutMs) {
    const inventoryArgs = ["--query-gpu=index,name,memory.total,driver_version,compute_cap", "--format=csv,noheader,nounits"];
    const inventory = await runCapabilityCommand(runner, "nvidia-smi", inventoryArgs, timeoutMs);
    if (!inventory.ok) {
        const status = inventory.missing ? "missing" : "error";
        return { status, count: 0, vendors: [], devices: [], message: inventory.message };
    }
    const probe = parseNvidiaGpuProbe(inventory.stdout);
    const banner = await runCapabilityCommand(runner, "nvidia-smi", [], timeoutMs);
    const cudaVersion = banner.ok ? parseNvidiaSmiCudaVersion(banner.stdout || banner.stderr) : undefined;
    if (!cudaVersion) {
        return probe;
    }
    const knownVersions = probe.cuda_versions || [];
    const mergedVersions = Array.from(new Set([...knownVersions, cudaVersion])).sort();
    const devices = probe.devices.map((device) => ({
        ...device,
        cuda_version: device.cuda_version || cudaVersion
    }));
    return { ...probe, cuda_versions: mergedVersions, devices };
}
2923
/**
 * Builds a software probe result with status "missing".
 *
 * @param {string} name - Software name.
 * @param {string} [message] - Optional detail; omitted from the result when falsy.
 * @returns {{name: string, status: "missing", message?: string}}
 */
function missingSoftwareProbe(name, message) {
    const probe = { name, status: "missing" };
    if (message) {
        probe.message = message;
    }
    return probe;
}
2930
/**
 * Builds a software probe result with status "error".
 *
 * @param {string} name - Software name.
 * @param {string} message - Failure detail; always included, even when undefined.
 * @returns {{name: string, status: "error", message: string}}
 */
function errorSoftwareProbe(name, message) {
    const status = "error";
    return { name, status, message };
}
2937
/**
 * Pulls a version string out of the first non-blank line of a tool's output.
 *
 * @param {string} stdout - Command output.
 * @param {string} tool - Tool name; "blender" and "ffmpeg" have dedicated
 *   patterns, any other tool yields the trimmed first line itself.
 * @returns {string|undefined} The version, or undefined when nothing matched
 *   or the output had no non-blank line.
 */
function extractToolVersion(stdout, tool) {
    let headline = "";
    for (const line of stdout.split(/\r?\n/)) {
        if (line.trim().length > 0) {
            headline = line.trim();
            break;
        }
    }
    switch (tool) {
        case "blender":
            return headline.match(/Blender\s+([^\s]+)/i)?.[1];
        case "ffmpeg":
            return headline.match(/ffmpeg\s+version\s+([^\s]+)/i)?.[1];
        default:
            return headline || undefined;
    }
}
2947
/**
 * Runs a version command for a tool and reports its availability.
 *
 * @param {Function} runner - Command runner passed to `runCapabilityCommand`.
 * @param {string} name - Probe name; also selects the version pattern.
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments for the executable.
 * @param {number} timeoutMs - Command timeout.
 * @returns {Promise<object>} "available" probe (with `version` when one could
 *   be extracted from stdout, or stderr when stdout is empty), or a
 *   "missing"/"error" probe.
 */
async function probeVersionedSoftware(runner, name, command, args, timeoutMs) {
    const outcome = await runCapabilityCommand(runner, command, args, timeoutMs);
    if (outcome.ok) {
        const version = extractToolVersion(outcome.stdout || outcome.stderr, name);
        return version ? { name, status: "available", version } : { name, status: "available" };
    }
    const describeFailure = outcome.missing ? missingSoftwareProbe : errorSoftwareProbe;
    return describeFailure(name, outcome.message);
}
2960
/**
 * Probes Docker and its registered runtimes via
 * `docker info --format "{{json .Runtimes}}"`.
 *
 * @param {Function} runner - Command runner passed to `runCapabilityCommand`.
 * @param {number} timeoutMs - Command timeout.
 * @returns {Promise<{docker: object, dockerNvidia: object}>} Probe results
 *   for Docker itself and for the "nvidia" runtime (matched
 *   case-insensitively). When the command fails both share the same status;
 *   when its output cannot be parsed both are reported as "error".
 */
async function probeDockerCapabilities(runner, timeoutMs) {
    const inventory = await runCapabilityCommand(runner, "docker", ["info", "--format", "{{json .Runtimes}}"], timeoutMs);
    if (!inventory.ok) {
        const { message } = inventory;
        const docker = inventory.missing
            ? missingSoftwareProbe("docker", message)
            : errorSoftwareProbe("docker", message);
        const dockerNvidia = { name: "docker-nvidia", status: docker.status, message };
        return { docker, dockerNvidia };
    }
    try {
        const runtimeNames = Object.keys(JSON.parse(inventory.stdout || "{}"));
        const nvidiaRegistered = runtimeNames.some((runtimeName) => runtimeName.toLowerCase() === "nvidia");
        const docker = { name: "docker", status: "available" };
        if (nvidiaRegistered) {
            return { docker, dockerNvidia: { name: "docker-nvidia", status: "available", version: "nvidia" } };
        }
        return {
            docker,
            dockerNvidia: missingSoftwareProbe("docker-nvidia", "Docker is available but the nvidia runtime is not registered")
        };
    }
    catch (error) {
        const message = capabilityCommandFailureMessage(error);
        const detail = `Unable to parse docker runtime inventory: ${message}`;
        return {
            docker: errorSoftwareProbe("docker", detail),
            dockerNvidia: errorSoftwareProbe("docker-nvidia", detail)
        };
    }
}
2990
/**
 * Collects diagnostics for every capability that is not "available".
 *
 * @param {object} snapshot - Capability snapshot; `gpu` and the values of
 *   `software` are inspected.
 * @returns {Array<{name: string, status: string, message: *}>|undefined}
 *   GPU entry first (when applicable), then software entries in object
 *   order; undefined when everything is available.
 */
function capabilityDiagnostics(snapshot) {
    const gpuEntries = snapshot.gpu.status !== "available"
        ? [{ name: "gpu", status: snapshot.gpu.status, message: snapshot.gpu.message }]
        : [];
    const softwareEntries = Object.values(snapshot.software)
        .filter((result) => result.status !== "available")
        .map(({ name, status, message }) => ({ name, status, message }));
    const diagnostics = [...gpuEntries, ...softwareEntries];
    return diagnostics.length ? diagnostics : undefined;
}
3010
/**
 * Derives a snapshot id from the snapshot's JSON serialization.
 *
 * @param {object} snapshot - Value to fingerprint.
 * @returns {string} "caps_" followed by the first 16 hex characters of the
 *   SHA-256 digest of `JSON.stringify(snapshot)`.
 */
function buildCapabilitySnapshotId(snapshot) {
    const hasher = createHash("sha256");
    hasher.update(JSON.stringify(snapshot));
    const shortDigest = hasher.digest("hex").slice(0, 16);
    return `caps_${shortDigest}`;
}
3014
/**
 * Lists the catalog entries whose runner is registered on this node.
 *
 * @param {object} config - Runtime config; `genericJobsEnabled` gates the result.
 * @param {Map<string, object>} runners - Registered runners keyed by runner id.
 * @returns {Array<{job_type: string, runner: string, trust_modes: string[], required_capabilities: string[]}>}
 *   Empty when generic jobs are disabled.
 */
function buildRunnerCapabilityCatalog(config, runners) {
    if (!config.genericJobsEnabled) {
        return [];
    }
    const catalog = [];
    for (const entry of OWNER_LOCAL_GENERIC_JOB_CATALOG) {
        if (!runners.has(entry.runner)) {
            continue;
        }
        catalog.push({
            job_type: entry.job_type,
            runner: entry.runner,
            trust_modes: uniqueSortedStrings([entry.policy.trust_mode]),
            required_capabilities: entry.required_capabilities || []
        });
    }
    return catalog;
}
3027
/**
 * Checks whether every capability a catalog entry requires is offered by the
 * probed GPU/software state.
 *
 * A throwaway snapshot (fixed id, epoch timestamp, empty job lists) is built
 * only so that `buildMswarmCapabilityNames` can derive the capability names.
 *
 * @param {object} entry - Catalog entry; `required_capabilities` is read.
 * @param {object} input - `{ genericJobsEnabled, gpu, software }`.
 * @returns {boolean} False when generic jobs are disabled; true when the
 *   entry requires nothing or every requirement is present.
 */
function runnerCapabilityRequirementsAvailable(entry, input) {
    if (!input.genericJobsEnabled) {
        return false;
    }
    const required = entry.required_capabilities;
    if (!required?.length) {
        return true;
    }
    const available = new Set(buildMswarmCapabilityNames({
        schema_version: MSWARM_CAPABILITY_SCHEMA_VERSION,
        snapshot_id: "caps_requirement_check",
        captured_at: new Date(0).toISOString(),
        generic_jobs_enabled: input.genericJobsEnabled,
        job_types: [],
        trust_modes: [],
        gpu: input.gpu,
        software: input.software,
        runner_catalog: []
    }));
    return required.every((capability) => available.has(capability));
}
3046
/**
 * Returns the catalog entries whose job type starts with "tenant." or
 * "package.".
 *
 * @returns {object[]} Matching entries from OWNER_LOCAL_GENERIC_JOB_CATALOG.
 */
function registeredOwnerLocalGenericJobCatalog() {
    const registeredPrefixes = ["tenant.", "package."];
    return OWNER_LOCAL_GENERIC_JOB_CATALOG.filter(({ job_type: jobType }) => registeredPrefixes.some((prefix) => jobType.startsWith(prefix)));
}
3049
/**
 * Encodes a buffer as unpadded, URL-safe base64.
 *
 * @param {Buffer} buffer - Bytes to encode.
 * @returns {string} Base64 text with "=" removed, "+" replaced by "-" and
 *   "/" replaced by "_".
 */
function base64UrlEncodeRuntime(buffer) {
    const urlSafe = { "=": "", "+": "-", "/": "_" };
    return buffer.toString("base64").replace(/[=+/]/g, (symbol) => urlSafe[symbol]);
}
3052
/**
 * Wraps a private capability catalog entry in a payload signed with
 * HMAC-SHA256.
 *
 * The MAC is computed over the JSON serialization of the unsigned payload,
 * so the property order below is part of the signature contract.
 *
 * @param {object} input
 * @param {object} input.privateCatalogEntry - Entry providing `snapshot_id`,
 *   `scheduler_match` and `public_projection`.
 * @param {string} input.runtimeToken - HMAC key.
 * @returns {object} The unsigned payload plus a `signature` object holding
 *   the algorithm label, the URL-safe base64 MAC, the signing time and a key id.
 */
function signCapabilityPayload(input) {
    const { privateCatalogEntry, runtimeToken } = input;
    const unsignedPayload = {
        schema_version: MSWARM_CAPABILITY_SCHEMA_VERSION,
        snapshot_id: privateCatalogEntry.snapshot_id,
        private_catalog_entry: privateCatalogEntry,
        scheduler_match: privateCatalogEntry.scheduler_match,
        public_projection: privateCatalogEntry.public_projection
    };
    const mac = createHmac("sha256", runtimeToken);
    mac.update(JSON.stringify(unsignedPayload));
    const signature = {
        alg: "HS256",
        value: base64UrlEncodeRuntime(mac.digest()),
        signed_at: new Date().toISOString(),
        key_id: "self_hosted_runtime_token"
    };
    return { ...unsignedPayload, signature };
}
3071
/**
 * Finds the registered runner for a generic job via the job catalog.
 *
 * @param {object} job - Job request; `job_type` selects the catalog entry.
 * @param {Map<string, object>} runners - Registered runners keyed by runner id.
 * @returns {object|null} The runner, or null when the job type is not in the
 *   catalog or its runner is not registered.
 */
function runnerForGenericJob(job, runners) {
    const match = OWNER_LOCAL_GENERIC_JOB_CATALOG.find((entry) => entry.job_type === job.job_type);
    if (!match) {
        return null;
    }
    return runners.get(match.runner) || null;
}
3075
/**
 * Compares two dotted version strings segment by segment.
 *
 * Non-digit characters inside a segment are ignored and missing segments
 * count as 0, so "12" equals "12.0.0".
 *
 * @param {string} left
 * @param {string} right
 * @returns {number} Negative when `left` is older, positive when newer, 0
 *   when equal or when either argument is falsy.
 */
function compareDottedVersion(left, right) {
    if (!left || !right) {
        return 0;
    }
    const toSegments = (version) => version.split(".").map((part) => Number(part.replace(/[^\d]/g, "")) || 0);
    const leftSegments = toSegments(left);
    const rightSegments = toSegments(right);
    const width = Math.max(leftSegments.length, rightSegments.length);
    let position = 0;
    while (position < width) {
        const difference = (leftSegments[position] || 0) - (rightSegments[position] || 0);
        if (difference !== 0) {
            return difference;
        }
        position += 1;
    }
    return 0;
}
3088
/**
 * Tells whether the snapshot reports a CUDA version at or above `minVersion`.
 *
 * Both the snapshot-level `cuda_versions` list and each device's
 * `cuda_version` are considered.
 *
 * @param {object} snapshot - Capability snapshot with a `gpu` section.
 * @param {string} [minVersion] - Minimum version; falsy means no requirement.
 * @returns {boolean}
 */
function snapshotHasCudaVersion(snapshot, minVersion) {
    if (!minVersion) {
        return true;
    }
    const reported = snapshot.gpu.cuda_versions || [];
    const perDevice = snapshot.gpu.devices
        .map((device) => device.cuda_version)
        .filter((value) => Boolean(value));
    return [...reported, ...perDevice].some((version) => compareDottedVersion(version, minVersion) >= 0);
}
3097
/**
 * Explains why this node cannot run a generic job, if it cannot.
 *
 * Checks run in a fixed order and the first failure wins: generic jobs
 * enabled, Blender availability (Blender jobs), NVIDIA GPU and Docker NVIDIA
 * runtime (CUDA jobs), advertised job types, then the requested GPU count,
 * vendor, VRAM and CUDA version.
 *
 * @param {object} job - Generic job request.
 * @param {object} snapshot - Capability snapshot of this node.
 * @returns {{code: "no_capable_node", message: string}|null} Mismatch
 *   description, or null when the node is capable.
 */
export function genericJobCapabilityMismatch(job, snapshot) {
    const noCapableNode = (message) => ({ code: "no_capable_node", message });
    if (!snapshot.generic_jobs_enabled) {
        return noCapableNode("Generic jobs are disabled on this node.");
    }
    const { job_type: jobType } = job;
    if (jobType === RENDER_BLENDER_JOB_TYPE && snapshot.software.blender.status !== "available") {
        return noCapableNode("Blender is not available on this node.");
    }
    if (jobType === CUDA_RUN_JOB_TYPE) {
        const nvidiaGpuReady = snapshot.gpu.status === "available" && snapshot.gpu.vendors.includes("nvidia");
        if (!nvidiaGpuReady) {
            return noCapableNode("No NVIDIA GPU is available on this node.");
        }
        const dockerReady = snapshot.software.docker.status === "available" &&
            snapshot.software["docker-nvidia"].status === "available";
        if (!dockerReady) {
            return noCapableNode("Docker with the NVIDIA runtime is not available on this node.");
        }
    }
    if (!snapshot.job_types.includes(jobType)) {
        return noCapableNode(`No capable owner-local node is available for ${jobType}.`);
    }
    const requestedGpu = job.resources?.gpu;
    if (!requestedGpu) {
        return null;
    }
    const requestedCount = Math.max(1, requestedGpu.count || 1);
    if (snapshot.gpu.status !== "available" || snapshot.gpu.count < requestedCount) {
        return noCapableNode(`Requested ${requestedCount} GPU(s), but this node reports ${snapshot.gpu.count}.`);
    }
    if (requestedGpu.vendor && !snapshot.gpu.vendors.includes(requestedGpu.vendor)) {
        return noCapableNode(`Requested GPU vendor ${requestedGpu.vendor} is not available on this node.`);
    }
    const minVramGb = requestedGpu.min_vram_gb;
    if (Number.isFinite(minVramGb) && minVramGb !== undefined) {
        const nodeVramGb = snapshot.gpu.max_vram_gb;
        if (!Number.isFinite(nodeVramGb) || (nodeVramGb || 0) < minVramGb) {
            return noCapableNode(`Requested GPU VRAM ${minVramGb}GB exceeds this node capability.`);
        }
    }
    if (!snapshotHasCudaVersion(snapshot, requestedGpu.cuda_min_version)) {
        return noCapableNode(`Requested CUDA ${requestedGpu.cuda_min_version} is not available on this node.`);
    }
    return null;
}
3158
/**
 * Computes the effective timeout for a generic job.
 *
 * @param {object} job - Job request; `limits.timeout_sec` is honored when
 *   `positiveInteger` accepts it.
 * @param {number} fallbackMs - Node-level timeout, which also caps the job's
 *   own limit.
 * @returns {number} `fallbackMs` when the job sets no usable limit;
 *   otherwise the job limit in milliseconds, at most `fallbackMs` and at least 1.
 */
function genericJobTimeoutMs(job, fallbackMs) {
    const limitSeconds = positiveInteger(job.limits?.timeout_sec);
    return limitSeconds ? Math.max(1, Math.min(fallbackMs, limitSeconds * 1000)) : fallbackMs;
}
3165
/**
 * Decides whether a failure should be treated as an abort or timeout.
 *
 * @param {*} error - Thrown value.
 * @param {{aborted: boolean}} signal - Abort signal for the job.
 * @returns {boolean} True when the signal is aborted, or when `error` is an
 *   Error whose message mentions cancellation, abort or timeout.
 */
function isGenericAbortError(error, signal) {
    if (signal.aborted) {
        return true;
    }
    return error instanceof Error && /cancelled|canceled|aborted|timed out|timeout/i.test(error.message);
}
3172
/**
 * Maps an executor usage record onto prompt/completion token counts.
 *
 * @param {{inputTokens?: *, outputTokens?: *}|null|undefined} usage
 * @returns {{promptTokens: *, completionTokens: *}} Each value is whatever
 *   `positiveInteger` returns for the corresponding usage field.
 */
function usageTokens(usage) {
    const promptTokens = positiveInteger(usage?.inputTokens);
    const completionTokens = positiveInteger(usage?.outputTokens);
    return { promptTokens, completionTokens };
}
3178
/**
 * Executes a local mcoda agent by running `<command> agent-run <slug> --json --stdin`
 * and parsing its JSON output.
 */
export class McodaLocalAgentExecutor {
    /**
     * @param {object} input
     * @param {string} [input.command] - mcoda executable; defaults to DEFAULT_MCODA_BIN.
     * @param {number} [input.timeoutMs] - Command timeout; defaults to DEFAULT_JOB_TIMEOUT_MS.
     * @param {Function} [input.runner] - Command runner; defaults to defaultCommandRunner.
     */
    constructor(input) {
        const { command, timeoutMs, runner } = input;
        this.command = command || DEFAULT_MCODA_BIN;
        this.timeoutMs = timeoutMs || DEFAULT_JOB_TIMEOUT_MS;
        this.runner = runner || defaultCommandRunner;
    }
    /**
     * Runs the agent with `prompt` on stdin and returns its first response.
     *
     * @param {string} agentSlug - Agent to run.
     * @param {string} prompt - Prompt passed as the command's input.
     * @returns {Promise<{output: string, adapter: (string|undefined), model: (string|undefined), metadata: (object|undefined)}>}
     * @throws {Error} When stdout is not JSON, lacks a `responses` array, or
     *   the first response has no output.
     */
    async invoke(agentSlug, prompt) {
        const argv = ["agent-run", agentSlug, "--json", "--stdin"];
        const { stdout } = await this.runner(this.command, argv, {
            timeoutMs: this.timeoutMs,
            maxBuffer: DEFAULT_COMMAND_MAX_BUFFER,
            input: prompt
        });
        const parsed = JSON.parse(stdout);
        const isSupported = Boolean(parsed) && typeof parsed === "object" && Array.isArray(parsed.responses);
        if (!isSupported) {
            throw new Error("mcoda agent-run returned unsupported JSON");
        }
        // Only the first response is used; a missing one is treated as empty.
        const response = parsed.responses[0] || {};
        const output = optionalText(response.output);
        if (!output) {
            throw new Error("mcoda agent-run response did not include output");
        }
        const hasMetadata = response.metadata && typeof response.metadata === "object";
        return {
            output,
            adapter: optionalText(response.adapter) || undefined,
            model: optionalText(response.model) || undefined,
            metadata: hasMetadata ? response.metadata : undefined
        };
    }
}
3207
/**
 * Sends a JSON POST to a self-hosted node gateway endpoint.
 *
 * @param {MswarmSelfHostedNodeClient} client - Supplies fetchImpl, base URL and default timeout.
 * @param {string} path - Path appended to the gateway base URL.
 * @param {*} body - Value serialized as the JSON request body.
 * @param {object} extraHeaders - Headers added after "content-type".
 * @param {number} [timeoutMs] - Request timeout; defaults to the client's timeout.
 * @returns {Promise<*>} Whatever `fetchJson` resolves to.
 */
function postSelfHostedNodeJson(client, path, body, extraHeaders, timeoutMs = client.timeoutMs) {
    return fetchJson(client.fetchImpl, `${client.gatewayBaseUrl}${path}`, {
        method: "POST",
        headers: { "content-type": "application/json", ...extraHeaders },
        body: JSON.stringify(body)
    }, timeoutMs);
}
/**
 * HTTP client for the gateway's self-hosted node endpoints.
 *
 * Runtime-authenticated calls send `authorization: Bearer <runtimeToken>`;
 * `bootstrap` sends `x-api-key`; `enroll` and `health` send no credentials.
 */
export class MswarmSelfHostedNodeClient {
    /**
     * @param {object} input
     * @param {string} input.gatewayBaseUrl - Gateway base URL (trailing slash trimmed).
     * @param {Function} [input.fetchImpl] - fetch implementation; defaults to global fetch.
     * @param {number} [input.timeoutMs] - Request timeout; defaults to DEFAULT_REQUEST_TIMEOUT_MS.
     */
    constructor(input) {
        this.gatewayBaseUrl = trimTrailingSlash(input.gatewayBaseUrl);
        this.fetchImpl = input.fetchImpl || fetch;
        this.timeoutMs = input.timeoutMs || DEFAULT_REQUEST_TIMEOUT_MS;
    }
    /** Enrolls a node using its enrollment token. */
    async enroll(nodeId, enrollmentToken) {
        const body = { node_id: nodeId, enrollment_token: enrollmentToken };
        return postSelfHostedNodeJson(this, "/v1/swarm/self-hosted/node/enroll", body, {});
    }
    /** Bootstraps a node, authenticated with an API key. */
    async bootstrap(apiKey, payload) {
        return postSelfHostedNodeJson(this, "/v1/swarm/self-hosted/node/bootstrap", payload, { "x-api-key": apiKey });
    }
    /** Fetches the gateway health endpoint. */
    async health() {
        return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/healthz`, { method: "GET" }, this.timeoutMs);
    }
    /** Posts a heartbeat payload. */
    async heartbeat(runtimeToken, payload) {
        return postSelfHostedNodeJson(this, "/v1/swarm/self-hosted/node/heartbeat", payload, { authorization: `Bearer ${runtimeToken}` });
    }
    /** Posts an uninstall notification. */
    async uninstall(runtimeToken, payload) {
        return postSelfHostedNodeJson(this, "/v1/swarm/self-hosted/node/uninstall", payload, { authorization: `Bearer ${runtimeToken}` });
    }
    /** Pushes the node's model inventory. */
    async pushModels(runtimeToken, payload) {
        return postSelfHostedNodeJson(this, "/v1/swarm/self-hosted/node/models", payload, { authorization: `Bearer ${runtimeToken}` });
    }
    /**
     * Polls for a job. The request timeout is extended to cover the requested
     * `wait_ms` plus a 5 second margin when that exceeds the client timeout.
     */
    async pollJob(runtimeToken, payload) {
        const pollTimeoutMs = Math.max(this.timeoutMs, (payload.wait_ms || 0) + 5000);
        return postSelfHostedNodeJson(this, "/v1/swarm/self-hosted/node/jobs/poll", payload, { authorization: `Bearer ${runtimeToken}` }, pollTimeoutMs);
    }
    /** Posts the final result of a job. */
    async postJobResult(runtimeToken, jobId, payload) {
        const path = `/v1/swarm/self-hosted/node/jobs/${encodeURIComponent(jobId)}/result`;
        return postSelfHostedNodeJson(this, path, payload, { authorization: `Bearer ${runtimeToken}` });
    }
    /** Reports that a job has started. */
    async postJobStart(runtimeToken, jobId, payload) {
        const path = `/v1/swarm/self-hosted/node/jobs/${encodeURIComponent(jobId)}/start`;
        return postSelfHostedNodeJson(this, path, payload, { authorization: `Bearer ${runtimeToken}` });
    }
    /** Posts job events. */
    async postJobEvents(runtimeToken, jobId, payload) {
        const path = `/v1/swarm/self-hosted/node/jobs/${encodeURIComponent(jobId)}/events`;
        return postSelfHostedNodeJson(this, path, payload, { authorization: `Bearer ${runtimeToken}` });
    }
}
3304
+ export class SelfHostedNodeRuntime {
3305
+ constructor(config, deps) {
3306
+ this.activeLlmJobs = 0;
3307
+ this.activeGenericJobs = 0;
3308
+ this.queuedLlmJobs = 0;
3309
+ this.queuedGenericJobs = 0;
3310
+ this.latencySamplesMs = [];
3311
+ this.recentFailures = [];
3312
+ this.config = config;
3313
+ this.gateway =
3314
+ deps?.gateway ||
3315
+ new MswarmSelfHostedNodeClient({
3316
+ gatewayBaseUrl: config.gatewayBaseUrl,
3317
+ fetchImpl: deps?.fetchImpl,
3318
+ timeoutMs: config.requestTimeoutMs
3319
+ });
3320
+ this.mcoda =
3321
+ deps?.mcoda ||
3322
+ new McodaAgentInventoryClient({
3323
+ command: config.mcodaBin,
3324
+ args: config.mcodaListArgs,
3325
+ timeoutMs: config.requestTimeoutMs
3326
+ });
3327
+ this.mcodaExecutor =
3328
+ deps?.mcodaExecutor ||
3329
+ new McodaLocalAgentExecutor({
3330
+ command: config.mcodaBin,
3331
+ timeoutMs: config.jobTimeoutMs
3332
+ });
3333
+ this.codaliExecutor = deps?.codaliExecutor || new MswarmCodaliExecutor();
3334
+ this.ollama =
3335
+ deps?.ollama ||
3336
+ new OllamaClient({
3337
+ baseUrl: config.ollamaBaseUrl,
3338
+ fetchImpl: deps?.fetchImpl,
3339
+ timeoutMs: config.requestTimeoutMs
3340
+ });
1686
3341
  this.jobOllama =
1687
3342
  deps?.ollama ||
1688
3343
  new OllamaClient({
@@ -1690,6 +3345,107 @@ export class SelfHostedNodeRuntime {
1690
3345
  fetchImpl: deps?.fetchImpl,
1691
3346
  timeoutMs: config.jobTimeoutMs
1692
3347
  });
3348
+ this.capabilityRunner = deps?.capabilityRunner || defaultCommandRunner;
3349
+ this.genericRunners = new Map((deps?.genericRunners || createDefaultGenericJobRunners(this.capabilityRunner)).map((runner) => [runner.id, runner]));
3350
+ this.artifactStore =
3351
+ deps?.artifactStore ||
3352
+ new MswarmLocalArtifactStore({
3353
+ rootDir: config.artifactStorePath || defaultArtifactStorePath()
3354
+ });
3355
+ }
3356
+ updateLocalQueueTelemetry(input) {
3357
+ if (input.llmQueuedJobs !== undefined) {
3358
+ this.queuedLlmJobs = nonNegativeTelemetryInteger(input.llmQueuedJobs);
3359
+ }
3360
+ if (input.genericQueuedJobs !== undefined) {
3361
+ this.queuedGenericJobs = nonNegativeTelemetryInteger(input.genericQueuedJobs);
3362
+ }
3363
+ }
3364
+ beginExecutionTelemetry(executionClass) {
3365
+ if (executionClass === "generic_job") {
3366
+ this.activeGenericJobs += 1;
3367
+ return;
3368
+ }
3369
+ this.activeLlmJobs += 1;
3370
+ }
3371
+ finishExecutionTelemetry(input) {
3372
+ if (input.executionClass === "generic_job") {
3373
+ this.activeGenericJobs = Math.max(0, this.activeGenericJobs - 1);
3374
+ }
3375
+ else {
3376
+ this.activeLlmJobs = Math.max(0, this.activeLlmJobs - 1);
3377
+ }
3378
+ this.latencySamplesMs.push(Math.max(0, Date.now() - input.startedAt));
3379
+ while (this.latencySamplesMs.length > MAX_TELEMETRY_LATENCY_SAMPLES) {
3380
+ this.latencySamplesMs.shift();
3381
+ }
3382
+ if (!input.ok) {
3383
+ this.recentFailures.unshift({
3384
+ execution_class: input.executionClass === "generic_job" ? "generic_job" : "agentic",
3385
+ code: optionalText(input.code) || "upstream_error",
3386
+ at: new Date().toISOString()
3387
+ });
3388
+ this.recentFailures.splice(MAX_TELEMETRY_FAILURES);
3389
+ }
3390
+ }
3391
+ averageLatencyMs(fallback = null) {
3392
+ if (this.latencySamplesMs.length === 0) {
3393
+ return fallback;
3394
+ }
3395
+ const total = this.latencySamplesMs.reduce((sum, value) => sum + value, 0);
3396
+ return Math.round(total / this.latencySamplesMs.length);
3397
+ }
3398
/**
 * Builds the load telemetry object from the configured concurrency limits
 * and the runtime's active/queued job counters.
 *
 * @param {object} input - Reads `models`, `discoveryFailureCount`,
 *   `discoveryLatencyMs` and `capabilityPayload`.
 * @returns {object} Telemetry with protocol versions, catalog fingerprint,
 *   concurrency limits, job counts, per-execution-class capacity, average
 *   latency and recent failures. `hardware_pressure` is added only when
 *   hardware telemetry is enabled in the config.
 */
buildLoadTelemetry(input) {
    const drainMode = this.config.drainMode === true;
    // Every limit is floored and forced to at least 1.
    const llmMaxConcurrency = Math.max(1, Math.floor(this.config.maxConcurrentLlmJobs || this.config.maxConcurrentJobs || 1));
    const genericMaxConcurrency = Math.max(1, Math.floor(this.config.genericJobMaxConcurrency || 1));
    // Overall limit is the largest of the configured limits; the generic limit
    // only counts when generic jobs are enabled.
    const maxConcurrency = Math.max(1, Math.floor(this.config.maxConcurrentJobs || 1), llmMaxConcurrency, this.config.genericJobsEnabled ? genericMaxConcurrency : 1);
    const activeLlmJobs = nonNegativeTelemetryInteger(this.activeLlmJobs);
    const activeGenericJobs = nonNegativeTelemetryInteger(this.activeGenericJobs);
    const queuedLlmJobs = nonNegativeTelemetryInteger(this.queuedLlmJobs);
    const queuedGenericJobs = nonNegativeTelemetryInteger(this.queuedGenericJobs);
    const llmCapacity = executionClassCapacity({
        maxConcurrency: llmMaxConcurrency,
        activeJobs: activeLlmJobs,
        queuedJobs: queuedLlmJobs,
        drainMode
    });
    // Generic capacity is computed as if draining when generic jobs are disabled.
    const genericCapacity = executionClassCapacity({
        maxConcurrency: genericMaxConcurrency,
        activeJobs: activeGenericJobs,
        queuedJobs: queuedGenericJobs,
        drainMode: drainMode || !this.config.genericJobsEnabled
    });
    const activeJobs = activeLlmJobs + activeGenericJobs;
    const queuedJobs = queuedLlmJobs + queuedGenericJobs;
    // No free slots are advertised while draining.
    const freeSlots = drainMode ? 0 : Math.max(0, maxConcurrency - activeJobs - queuedJobs);
    // Only the 10 most recent failures are reported.
    const failures = this.recentFailures.slice(0, 10);
    const discoveryFailureCount = nonNegativeTelemetryInteger(input.discoveryFailureCount);
    const telemetry = {
        runtime_protocol_version: SELF_HOSTED_RUNTIME_PROTOCOL_VERSION,
        load_balancer_protocol_version: SELF_HOSTED_LOAD_BALANCER_PROTOCOL_VERSION,
        catalog_metadata_version: SELF_HOSTED_CATALOG_METADATA_VERSION,
        catalog_fingerprint: buildCatalogFingerprint(input.models),
        max_concurrency: maxConcurrency,
        max_concurrent_llm_jobs: llmMaxConcurrency,
        max_concurrent_generic_jobs: this.config.genericJobsEnabled ? genericMaxConcurrency : 0,
        active_jobs: activeJobs,
        queued_jobs: queuedJobs,
        free_slots: freeSlots,
        drain_mode: drainMode,
        // The chat and agentic classes share the LLM capacity object.
        execution_class_capacity: {
            chat: llmCapacity,
            agentic: llmCapacity,
            generic_job: genericCapacity
        },
        // Falls back to the discovery latency (or null) when no samples exist.
        avg_latency_ms: this.averageLatencyMs(input.discoveryLatencyMs ?? null),
        recent_failure_count: failures.length + discoveryFailureCount,
        recent_failures: failures
    };
    if (this.config.hardwareTelemetryEnabled === true) {
        telemetry.hardware_pressure = buildCoarseHardwarePressure(input.capabilityPayload || null);
    }
    return telemetry;
}
1694
3450
  static async setup(setupConfig, deps) {
1695
3451
  const gateway = deps?.gateway ||
@@ -1711,7 +3467,13 @@ export class SelfHostedNodeRuntime {
1711
3467
  expose_all_models: setupConfig.exposeAllModels,
1712
3468
  model_allowlist: setupConfig.modelAllowlist,
1713
3469
  model_blocklist: setupConfig.modelBlocklist,
1714
- heartbeat_interval_seconds: setupConfig.heartbeatIntervalSeconds
3470
+ heartbeat_interval_seconds: setupConfig.heartbeatIntervalSeconds,
3471
+ max_concurrent_jobs: setupConfig.maxConcurrentJobs,
3472
+ max_concurrent_llm_jobs: setupConfig.maxConcurrentLlmJobs,
3473
+ drain_mode: setupConfig.drainMode,
3474
+ load_reporting_enabled: setupConfig.loadReportingEnabled,
3475
+ hardware_telemetry_enabled: setupConfig.hardwareTelemetryEnabled,
3476
+ generic_job_max_concurrency: setupConfig.genericJobMaxConcurrency
1715
3477
  });
1716
3478
  const nodeId = optionalText(bootstrap.node?.node_id);
1717
3479
  const runtimeToken = optionalText(bootstrap.runtime_token);
@@ -1726,6 +3488,7 @@ export class SelfHostedNodeRuntime {
1726
3488
  machine_fingerprint: machineFingerprint,
1727
3489
  direct_base_url: setupConfig.directBaseUrl || null,
1728
3490
  runtime_token: undefined,
3491
+ artifact_store_path: setupConfig.artifactStorePath || defaultArtifactStorePath(),
1729
3492
  config_version: bootstrap.config_version,
1730
3493
  heartbeat_interval_seconds: heartbeatInterval,
1731
3494
  heartbeat_timeout_seconds: bootstrap.heartbeat_timeout_seconds,
@@ -1739,6 +3502,15 @@ export class SelfHostedNodeRuntime {
1739
3502
  node_version: setupConfig.nodeVersion,
1740
3503
  request_timeout_ms: setupConfig.requestTimeoutMs,
1741
3504
  job_timeout_ms: setupConfig.jobTimeoutMs,
3505
+ max_concurrent_jobs: setupConfig.maxConcurrentJobs,
3506
+ max_concurrent_llm_jobs: setupConfig.maxConcurrentLlmJobs,
3507
+ generic_jobs_enabled: setupConfig.genericJobsEnabled,
3508
+ generic_job_timeout_ms: setupConfig.genericJobTimeoutMs,
3509
+ generic_job_max_concurrency: setupConfig.genericJobMaxConcurrency,
3510
+ capability_probe_timeout_ms: setupConfig.capabilityProbeTimeoutMs || DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS,
3511
+ drain_mode: setupConfig.drainMode,
3512
+ load_reporting_enabled: setupConfig.loadReportingEnabled,
3513
+ hardware_telemetry_enabled: setupConfig.hardwareTelemetryEnabled,
1742
3514
  expose_all_models: setupConfig.exposeAllModels,
1743
3515
  exposure_policy: setupConfig.exposeAllModels ? "all" : "none",
1744
3516
  model_allowlist: setupConfig.modelAllowlist,
@@ -1761,6 +3533,7 @@ export class SelfHostedNodeRuntime {
1761
3533
  ollamaBaseUrl: setupConfig.ollamaBaseUrl,
1762
3534
  statePath: setupConfig.statePath,
1763
3535
  runtimeTokenPath: setupConfig.runtimeTokenPath,
3536
+ artifactStorePath: setupConfig.artifactStorePath || defaultArtifactStorePath(),
1764
3537
  invocationSigningSecret: null,
1765
3538
  listenHost: DEFAULT_LISTEN_HOST,
1766
3539
  listenPort: DEFAULT_LISTEN_PORT,
@@ -1768,6 +3541,15 @@ export class SelfHostedNodeRuntime {
1768
3541
  heartbeatIntervalSeconds: heartbeatInterval,
1769
3542
  requestTimeoutMs: setupConfig.requestTimeoutMs,
1770
3543
  jobTimeoutMs: setupConfig.jobTimeoutMs,
3544
+ maxConcurrentJobs: setupConfig.maxConcurrentJobs,
3545
+ maxConcurrentLlmJobs: setupConfig.maxConcurrentLlmJobs,
3546
+ genericJobsEnabled: setupConfig.genericJobsEnabled,
3547
+ genericJobTimeoutMs: setupConfig.genericJobTimeoutMs,
3548
+ genericJobMaxConcurrency: setupConfig.genericJobMaxConcurrency,
3549
+ capabilityProbeTimeoutMs: setupConfig.capabilityProbeTimeoutMs || DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS,
3550
+ drainMode: setupConfig.drainMode,
3551
+ loadReportingEnabled: setupConfig.loadReportingEnabled,
3552
+ hardwareTelemetryEnabled: setupConfig.hardwareTelemetryEnabled,
1771
3553
  exposeAllModels: setupConfig.exposeAllModels,
1772
3554
  modelAllowlist: setupConfig.modelAllowlist,
1773
3555
  modelBlocklist: setupConfig.modelBlocklist
@@ -1795,6 +3577,53 @@ export class SelfHostedNodeRuntime {
1795
3577
  const models = await this.mcoda.listAgents(this.config);
1796
3578
  return { source: "mcoda", status: "online", models, version: null, failureCount: 0 };
1797
3579
  }
3580
+ async probeCapabilities() {
3581
+ const timeoutMs = capabilityProbeTimeoutMs(this.config);
3582
+ const [gpu, docker, blender, ffmpeg] = await Promise.all([
3583
+ probeNvidiaGpuCapabilities(this.capabilityRunner, timeoutMs),
3584
+ probeDockerCapabilities(this.capabilityRunner, timeoutMs),
3585
+ probeVersionedSoftware(this.capabilityRunner, "blender", "blender", ["--version"], timeoutMs),
3586
+ probeVersionedSoftware(this.capabilityRunner, "ffmpeg", "ffmpeg", ["-version"], timeoutMs)
3587
+ ]);
3588
+ const software = {
3589
+ docker: docker.docker,
3590
+ "docker-nvidia": docker.dockerNvidia,
3591
+ blender,
3592
+ ffmpeg
3593
+ };
3594
+ const runnerCatalog = buildRunnerCapabilityCatalog(this.config, this.genericRunners).filter((entry) => runnerCapabilityRequirementsAvailable(entry, {
3595
+ gpu,
3596
+ software,
3597
+ genericJobsEnabled: this.config.genericJobsEnabled
3598
+ }));
3599
+ const snapshotWithoutId = {
3600
+ schema_version: MSWARM_CAPABILITY_SCHEMA_VERSION,
3601
+ captured_at: new Date().toISOString(),
3602
+ node_id: this.config.nodeId,
3603
+ platform: platform(),
3604
+ arch: process.arch,
3605
+ generic_jobs_enabled: this.config.genericJobsEnabled,
3606
+ job_types: uniqueSortedStrings(runnerCatalog.map((entry) => entry.job_type)),
3607
+ trust_modes: uniqueSortedStrings(runnerCatalog.flatMap((entry) => entry.trust_modes)),
3608
+ gpu,
3609
+ software,
3610
+ runner_catalog: runnerCatalog
3611
+ };
3612
+ const snapshot = {
3613
+ ...snapshotWithoutId,
3614
+ snapshot_id: buildCapabilitySnapshotId(snapshotWithoutId)
3615
+ };
3616
+ const diagnostics = capabilityDiagnostics(snapshot);
3617
+ return diagnostics ? { ...snapshot, diagnostics } : snapshot;
3618
+ }
3619
+ async publicCapabilityProjection() {
3620
+ return projectMswarmPublicCapabilities(await this.probeCapabilities());
3621
+ }
3622
+ async buildCapabilityHeartbeatPayload(runtimeToken) {
3623
+ const snapshot = await this.probeCapabilities();
3624
+ const privateCatalogEntry = buildMswarmPrivateCapabilityCatalogEntry(snapshot);
3625
+ return signCapabilityPayload({ privateCatalogEntry, runtimeToken });
3626
+ }
1798
3627
  async ensureEnrolled() {
1799
3628
  const currentState = await readSelfHostedNodeState(this.config.statePath);
1800
3629
  const persistedRuntimeToken = await readSelfHostedRuntimeToken(this.config.runtimeTokenPath);
@@ -1827,6 +3656,14 @@ export class SelfHostedNodeRuntime {
1827
3656
  node_version: this.config.nodeVersion,
1828
3657
  request_timeout_ms: this.config.requestTimeoutMs,
1829
3658
  job_timeout_ms: this.config.jobTimeoutMs,
3659
+ max_concurrent_jobs: this.config.maxConcurrentJobs,
3660
+ max_concurrent_llm_jobs: this.config.maxConcurrentLlmJobs,
3661
+ generic_jobs_enabled: this.config.genericJobsEnabled,
3662
+ generic_job_timeout_ms: this.config.genericJobTimeoutMs,
3663
+ generic_job_max_concurrency: this.config.genericJobMaxConcurrency,
3664
+ drain_mode: this.config.drainMode === true,
3665
+ load_reporting_enabled: this.config.loadReportingEnabled !== false,
3666
+ hardware_telemetry_enabled: this.config.hardwareTelemetryEnabled === true,
1830
3667
  expose_all_models: this.config.exposeAllModels,
1831
3668
  exposure_policy: this.config.exposeAllModels ? "all" : "none",
1832
3669
  model_allowlist: this.config.modelAllowlist,
@@ -1837,27 +3674,213 @@ export class SelfHostedNodeRuntime {
1837
3674
  return { runtimeToken, state: nextState, enrolled: true };
1838
3675
  }
1839
3676
  async resolveMcodaAgentForJob(job) {
1840
- const selected = optionalText(job.source_agent_slug) ||
1841
- optionalText(job.agent_slug) ||
1842
- optionalText(job.model) ||
1843
- optionalText(job.openai_request.model);
3677
+ const selectedSourceAgentSlug = optionalText(job.source_agent_slug);
3678
+ const selectedAgentSlug = optionalText(job.agent_slug);
3679
+ const selectedModel = optionalText(job.model) || optionalText(job.openai_request.model);
3680
+ const selected = selectedSourceAgentSlug || selectedAgentSlug || selectedModel;
1844
3681
  if (!selected) {
1845
- throw new Error("mcoda source agent slug is required");
3682
+ throw new SelfHostedPreStartJobError("selected_agent_unavailable", "mcoda source agent slug is required");
1846
3683
  }
1847
3684
  const rawAgents = await this.mcoda.listRawAgents();
3685
+ const strictSelectedAgent = selectedSourceAgentSlug || selectedAgentSlug;
1848
3686
  const agent = rawAgents.find((entry) => {
1849
3687
  const slug = optionalText(entry.slug);
3688
+ if (strictSelectedAgent) {
3689
+ return slug === strictSelectedAgent;
3690
+ }
1850
3691
  const defaultModel = mcodaAgentDefaultModel(entry);
1851
3692
  return slug === selected || defaultModel === selected;
1852
3693
  });
1853
- if (!agent || !isExposedLocalAgent(agent, this.config)) {
1854
- throw new Error("selected local mcoda agent is not exposed by this node");
3694
+ if (!agent) {
3695
+ throw new SelfHostedPreStartJobError("selected_agent_unavailable", `selected local mcoda agent ${selected} is not available on this node`);
3696
+ }
3697
+ const mapped = mapMcodaAgentToSelfHostedModel(agent, this.config);
3698
+ if (!mapped?.exposed) {
3699
+ throw new SelfHostedPreStartJobError("selected_agent_unavailable", `selected local mcoda agent ${selected} is not exposed by this node`);
3700
+ }
3701
+ if (mapped.health_status && mapped.health_status !== "healthy" && mapped.health_status !== "unknown") {
3702
+ throw new SelfHostedPreStartJobError("selected_agent_unhealthy", `selected local mcoda agent ${selected} is ${mapped.health_status}`);
1855
3703
  }
1856
3704
  return mapMcodaAgentToCodaliAgent(agent, selected);
1857
3705
  }
3706
+ async executeGenericJob(envelope, options = {}) {
3707
+ const startedAt = Date.now();
3708
+ this.beginExecutionTelemetry("generic_job");
3709
+ const events = [];
3710
+ let sequence = 0;
3711
+ const emitEvent = async (event) => {
3712
+ const next = {
3713
+ job_id: envelope.job_id,
3714
+ sequence,
3715
+ timestamp: new Date().toISOString(),
3716
+ ...event
3717
+ };
3718
+ sequence += 1;
3719
+ events.push(next);
3720
+ await options.onEvent?.(next);
3721
+ };
3722
+ const failed = async (code, message, validationIssues) => {
3723
+ await emitEvent({
3724
+ type: code === "cancelled" ? "cancelled" : "failed",
3725
+ message,
3726
+ data: { code }
3727
+ });
3728
+ const status = code === "cancelled" ? "cancelled" : "failed";
3729
+ const result = {
3730
+ job_id: envelope.job_id,
3731
+ status,
3732
+ error: {
3733
+ code,
3734
+ message,
3735
+ retryable: code === "timeout"
3736
+ },
3737
+ finished_at: new Date().toISOString()
3738
+ };
3739
+ this.finishExecutionTelemetry({
3740
+ executionClass: "generic_job",
3741
+ startedAt,
3742
+ ok: false,
3743
+ code
3744
+ });
3745
+ return {
3746
+ job_id: envelope.job_id,
3747
+ request_id: envelope.request_id,
3748
+ status,
3749
+ result,
3750
+ events,
3751
+ ...(validationIssues?.length ? { validation_issues: validationIssues } : {}),
3752
+ timing: { local_latency_ms: Date.now() - startedAt }
3753
+ };
3754
+ };
3755
+ if (!this.config.genericJobsEnabled) {
3756
+ return failed("feature_disabled", "Generic node jobs are disabled on this node.");
3757
+ }
3758
+ if (envelope.node_id !== this.config.nodeId) {
3759
+ return failed("validation_failed", "generic job node_id does not match this node");
3760
+ }
3761
+ const validation = validateMswarmGenericJobRequest(envelope.job, {
3762
+ registeredJobCatalog: registeredOwnerLocalGenericJobCatalog()
3763
+ });
3764
+ if (!validation.ok || !validation.value) {
3765
+ return failed("validation_failed", "generic job request failed validation", validation.issues);
3766
+ }
3767
+ const job = validation.value;
3768
+ const runner = runnerForGenericJob(job, this.genericRunners);
3769
+ if (!runner) {
3770
+ return failed("runner_unavailable", `No generic job runner is registered for ${job.job_type}.`);
3771
+ }
3772
+ if (job.job_type === RENDER_BLENDER_JOB_TYPE || job.job_type === CUDA_RUN_JOB_TYPE) {
3773
+ const capabilityMismatch = genericJobCapabilityMismatch(job, await this.probeCapabilities());
3774
+ if (capabilityMismatch) {
3775
+ return failed(capabilityMismatch.code, capabilityMismatch.message);
3776
+ }
3777
+ }
3778
+ let artifactContext;
3779
+ try {
3780
+ artifactContext = await this.artifactStore.prepareJobWorkspace(envelope.job_id, job);
3781
+ }
3782
+ catch (error) {
3783
+ return failed("validation_failed", error instanceof Error ? error.message : String(error || "generic job artifact preparation failed"));
3784
+ }
3785
+ const controller = new AbortController();
3786
+ const timeoutMs = genericJobTimeoutMs(job, this.config.genericJobTimeoutMs || this.config.jobTimeoutMs);
3787
+ const onAbort = () => {
3788
+ if (!controller.signal.aborted) {
3789
+ controller.abort(options.signal?.reason || "cancelled");
3790
+ }
3791
+ };
3792
+ if (options.signal?.aborted) {
3793
+ controller.abort(options.signal.reason || "cancelled");
3794
+ }
3795
+ options.signal?.addEventListener("abort", onAbort, { once: true });
3796
+ const timeout = setTimeout(() => {
3797
+ if (!controller.signal.aborted) {
3798
+ controller.abort("timeout");
3799
+ }
3800
+ }, timeoutMs);
3801
+ try {
3802
+ await emitEvent({
3803
+ type: "started",
3804
+ message: `Running ${job.job_type}`,
3805
+ data: {
3806
+ runner: runner.id,
3807
+ sandbox_profile: artifactContext.sandbox.name,
3808
+ timeout_ms: timeoutMs
3809
+ }
3810
+ });
3811
+ const runnerResult = await runner.run({
3812
+ job,
3813
+ signal: controller.signal,
3814
+ emitEvent,
3815
+ artifacts: artifactContext,
3816
+ sandbox: artifactContext.sandbox
3817
+ });
3818
+ const status = runnerResult.status || "succeeded";
3819
+ const outputContext = status === "succeeded"
3820
+ ? artifactContext
3821
+ : {
3822
+ ...artifactContext,
3823
+ outputSpecs: artifactContext.outputSpecs.map((output) => ({ ...output, required: false }))
3824
+ };
3825
+ const outputArtifacts = await this.artifactStore.collectOutputs(outputContext, envelope.job_id);
3826
+ for (const artifact of outputArtifacts) {
3827
+ await emitEvent({
3828
+ type: "artifact",
3829
+ message: "output artifact collected",
3830
+ data: { artifact }
3831
+ });
3832
+ }
3833
+ const result = {
3834
+ ...runnerResult,
3835
+ job_id: envelope.job_id,
3836
+ status,
3837
+ artifacts: [...(runnerResult.artifacts || []), ...outputArtifacts],
3838
+ started_at: runnerResult.started_at || new Date(startedAt).toISOString(),
3839
+ finished_at: runnerResult.finished_at || new Date().toISOString()
3840
+ };
3841
+ await emitEvent({
3842
+ type: status === "succeeded" ? "completed" : "failed",
3843
+ message: status === "succeeded" ? "generic job completed" : runnerResult.error?.message || "generic job failed",
3844
+ data: {
3845
+ status,
3846
+ exit_code: result.exit_code,
3847
+ runner: runner.id
3848
+ }
3849
+ });
3850
+ this.finishExecutionTelemetry({
3851
+ executionClass: "generic_job",
3852
+ startedAt,
3853
+ ok: status === "succeeded",
3854
+ code: runnerResult.error?.code || status
3855
+ });
3856
+ return {
3857
+ job_id: envelope.job_id,
3858
+ request_id: envelope.request_id,
3859
+ status,
3860
+ result,
3861
+ events,
3862
+ timing: { local_latency_ms: Date.now() - startedAt }
3863
+ };
3864
+ }
3865
+ catch (error) {
3866
+ const code = isGenericAbortError(error, controller.signal) ? abortErrorCode(controller.signal) : "runner_error";
3867
+ const message = code === "timeout" || code === "cancelled"
3868
+ ? abortErrorMessage(controller.signal)
3869
+ : error instanceof Error
3870
+ ? error.message
3871
+ : String(error);
3872
+ return failed(code, message);
3873
+ }
3874
+ finally {
3875
+ clearTimeout(timeout);
3876
+ options.signal?.removeEventListener("abort", onAbort);
3877
+ }
3878
+ }
1858
3879
  async executeJob(job, options = {}) {
1859
3880
  const startedAt = Date.now();
3881
+ this.beginExecutionTelemetry("llm");
1860
3882
  let selectedAgent;
3883
+ let jobStarted = false;
1861
3884
  const progressEvents = [];
1862
3885
  const streamEvents = [];
1863
3886
  const recordProgress = async (event) => {
@@ -1868,13 +3891,35 @@ export class SelfHostedNodeRuntime {
1868
3891
  streamEvents.push(chunk);
1869
3892
  await options.onOpenAIChunk?.(chunk);
1870
3893
  };
3894
+ const acknowledgeStarted = async (agent) => {
3895
+ if (jobStarted) {
3896
+ return;
3897
+ }
3898
+ await options.onStarted?.({
3899
+ job_id: job.job_id,
3900
+ request_id: job.request_id,
3901
+ node_id: job.node_id,
3902
+ agent_slug: optionalText(job.agent_slug) || agent?.slug || "",
3903
+ source_agent_slug: optionalText(job.source_agent_slug) || agent?.slug || null,
3904
+ model: optionalText(job.model) || optionalText(job.openai_request.model)
3905
+ });
3906
+ jobStarted = true;
3907
+ };
1871
3908
  if (job.node_id !== this.config.nodeId) {
1872
- return {
3909
+ const result = {
1873
3910
  job_id: job.job_id,
1874
3911
  request_id: job.request_id,
1875
3912
  status: "failed",
3913
+ pre_start_failure: true,
1876
3914
  error: { code: "validation_failed", message: "job node_id does not match this node" }
1877
3915
  };
3916
+ this.finishExecutionTelemetry({
3917
+ executionClass: "llm",
3918
+ startedAt,
3919
+ ok: false,
3920
+ code: "validation_failed"
3921
+ });
3922
+ return result;
1878
3923
  }
1879
3924
  try {
1880
3925
  if (job.provider === "ollama") {
@@ -1887,7 +3932,8 @@ export class SelfHostedNodeRuntime {
1887
3932
  options.num_predict = job.openai_request.max_tokens;
1888
3933
  if (job.openai_request.stop !== undefined)
1889
3934
  options.stop = job.openai_request.stop;
1890
- const result = await this.jobOllama.chat({
3935
+ await acknowledgeStarted();
3936
+ const ollamaResult = await this.jobOllama.chat({
1891
3937
  model: job.model || job.openai_request.model,
1892
3938
  messages: job.openai_request.messages,
1893
3939
  options,
@@ -1900,7 +3946,7 @@ export class SelfHostedNodeRuntime {
1900
3946
  created: Math.floor(Date.now() / 1000),
1901
3947
  model: job.openai_request.model,
1902
3948
  choices: [
1903
- { index: 0, delta: { content: result.content }, finish_reason: null }
3949
+ { index: 0, delta: { content: ollamaResult.content }, finish_reason: null }
1904
3950
  ]
1905
3951
  });
1906
3952
  await emitOpenAIChunk({
@@ -1913,22 +3959,28 @@ export class SelfHostedNodeRuntime {
1913
3959
  ]
1914
3960
  });
1915
3961
  }
1916
- return {
3962
+ const invocationResult = {
1917
3963
  job_id: job.job_id,
1918
3964
  request_id: job.request_id,
1919
3965
  status: "success",
1920
3966
  openai_response: buildOpenAIChatCompletion({
1921
3967
  requestId: job.request_id,
1922
3968
  model: job.openai_request.model,
1923
- content: result.content,
1924
- promptTokens: result.promptTokens,
1925
- completionTokens: result.completionTokens,
1926
- metadata: { provider: "ollama", raw: result.raw }
3969
+ content: ollamaResult.content,
3970
+ promptTokens: ollamaResult.promptTokens,
3971
+ completionTokens: ollamaResult.completionTokens,
3972
+ metadata: { provider: "ollama", raw: ollamaResult.raw }
1927
3973
  }),
1928
3974
  ...(streamEvents.length ? { stream_events: streamEvents } : {}),
1929
3975
  ...(progressEvents.length ? { progress_events: progressEvents } : {}),
1930
3976
  timing: { local_latency_ms: Date.now() - startedAt }
1931
3977
  };
3978
+ this.finishExecutionTelemetry({
3979
+ executionClass: "llm",
3980
+ startedAt,
3981
+ ok: true
3982
+ });
3983
+ return invocationResult;
1932
3984
  }
1933
3985
  const taskPreview = messagesToPrompt(job.openai_request.messages);
1934
3986
  if (!taskPreview) {
@@ -1938,6 +3990,7 @@ export class SelfHostedNodeRuntime {
1938
3990
  selectedAgent = agent;
1939
3991
  validateRequiredDocdexContext(job, options.attachedMswarmApiKey);
1940
3992
  const attachedMswarmApiKey = attachedMswarmApiKeyForDocdex(job, options.attachedMswarmApiKey);
3993
+ await acknowledgeStarted(agent);
1941
3994
  await recordProgress({
1942
3995
  type: "agent_selected",
1943
3996
  job_id: job.job_id,
@@ -1976,7 +4029,7 @@ export class SelfHostedNodeRuntime {
1976
4029
  }
1977
4030
  });
1978
4031
  const tokens = usageTokens(response.usage);
1979
- return {
4032
+ const result = {
1980
4033
  job_id: job.job_id,
1981
4034
  request_id: job.request_id,
1982
4035
  status: "success",
@@ -2002,6 +4055,12 @@ export class SelfHostedNodeRuntime {
2002
4055
  ...(progressEvents.length ? { progress_events: progressEvents } : {}),
2003
4056
  timing: { local_latency_ms: Date.now() - startedAt }
2004
4057
  };
4058
+ this.finishExecutionTelemetry({
4059
+ executionClass: "llm",
4060
+ startedAt,
4061
+ ok: true
4062
+ });
4063
+ return result;
2005
4064
  }
2006
4065
  catch (error) {
2007
4066
  const message = redactRuntimeSecretValues(error instanceof Error ? error.message : String(error), [selectedAgent?.apiKey, options.attachedMswarmApiKey]);
@@ -2014,10 +4073,11 @@ export class SelfHostedNodeRuntime {
2014
4073
  : /permission|policy|denied/i.test(message)
2015
4074
  ? "policy_denied"
2016
4075
  : "upstream_error");
2017
- return {
4076
+ const result = {
2018
4077
  job_id: job.job_id,
2019
4078
  request_id: job.request_id,
2020
4079
  status: "failed",
4080
+ ...(!jobStarted ? { pre_start_failure: true } : {}),
2021
4081
  error: {
2022
4082
  code,
2023
4083
  message
@@ -2026,6 +4086,13 @@ export class SelfHostedNodeRuntime {
2026
4086
  ...(progressEvents.length ? { progress_events: progressEvents } : {}),
2027
4087
  timing: { local_latency_ms: Date.now() - startedAt }
2028
4088
  };
4089
+ this.finishExecutionTelemetry({
4090
+ executionClass: "llm",
4091
+ startedAt,
4092
+ ok: false,
4093
+ code
4094
+ });
4095
+ return result;
2029
4096
  }
2030
4097
  }
2031
4098
  async runOnce() {
@@ -2050,11 +4117,49 @@ export class SelfHostedNodeRuntime {
2050
4117
  models = [];
2051
4118
  version = null;
2052
4119
  }
4120
+ const discoveryLatencyMs = Date.now() - startedAt;
4121
+ const capabilityPayload = await this.buildCapabilityHeartbeatPayload(enrollment.runtimeToken);
4122
+ const loadTelemetry = this.buildLoadTelemetry({
4123
+ models,
4124
+ discoveryLatencyMs,
4125
+ discoveryFailureCount: recentFailureCount,
4126
+ capabilityPayload
4127
+ });
4128
+ const exposedModelCount = models.filter((model) => model.exposed !== false).length;
4129
+ const loadReportingEnabled = this.config.loadReportingEnabled !== false;
4130
+ const capacityPayload = loadReportingEnabled
4131
+ ? {
4132
+ protocol_version: loadTelemetry.runtime_protocol_version,
4133
+ runtime_protocol_version: loadTelemetry.runtime_protocol_version,
4134
+ load_balancer_protocol_version: loadTelemetry.load_balancer_protocol_version,
4135
+ catalog_metadata_version: loadTelemetry.catalog_metadata_version,
4136
+ catalog_fingerprint: loadTelemetry.catalog_fingerprint,
4137
+ max_concurrency: loadTelemetry.max_concurrency,
4138
+ max_concurrent_llm_jobs: loadTelemetry.max_concurrent_llm_jobs,
4139
+ max_concurrent_generic_jobs: loadTelemetry.max_concurrent_generic_jobs,
4140
+ active_jobs: loadTelemetry.active_jobs,
4141
+ queued_jobs: loadTelemetry.queued_jobs,
4142
+ free_slots: loadTelemetry.free_slots,
4143
+ drain_mode: loadTelemetry.drain_mode,
4144
+ execution_class_capacity: loadTelemetry.execution_class_capacity
4145
+ }
4146
+ : {
4147
+ active_jobs: loadTelemetry.active_jobs,
4148
+ queued_jobs: loadTelemetry.queued_jobs
4149
+ };
2053
4150
  const heartbeatPayload = {
2054
4151
  node_id: this.config.nodeId,
2055
4152
  node_version: this.config.nodeVersion,
4153
+ runtime_protocol_version: SELF_HOSTED_RUNTIME_PROTOCOL_VERSION,
2056
4154
  config_version: enrollment.state.config_version ?? null,
2057
4155
  status,
4156
+ runtime: {
4157
+ protocol_version: SELF_HOSTED_RUNTIME_PROTOCOL_VERSION,
4158
+ relay_mode: this.config.relayMode || "outbound",
4159
+ load_reporting_enabled: loadReportingEnabled,
4160
+ hardware_telemetry_enabled: this.config.hardwareTelemetryEnabled === true,
4161
+ drain_mode: this.config.drainMode === true
4162
+ },
2058
4163
  discovery: {
2059
4164
  source: discoverySource,
2060
4165
  mcoda_status: discoverySource === "mcoda" && status === "online" ? "ok" : status === "degraded" ? "error" : null
@@ -2068,19 +4173,24 @@ export class SelfHostedNodeRuntime {
2068
4173
  status: null,
2069
4174
  version: null
2070
4175
  },
2071
- capacity: {
2072
- active_jobs: 0,
2073
- queued_jobs: 0
2074
- },
4176
+ capacity: capacityPayload,
2075
4177
  health: {
2076
- avg_latency_ms: Date.now() - startedAt,
2077
- recent_failure_count: recentFailureCount,
4178
+ avg_latency_ms: loadTelemetry.avg_latency_ms ?? discoveryLatencyMs,
4179
+ recent_failure_count: loadTelemetry.recent_failure_count,
4180
+ recent_failures: loadTelemetry.recent_failures,
2078
4181
  last_success_at: status === "online" ? new Date().toISOString() : null
2079
4182
  },
2080
- models
4183
+ local_agent_catalog: {
4184
+ revision: loadTelemetry.catalog_fingerprint,
4185
+ metadata_version: loadTelemetry.catalog_metadata_version,
4186
+ model_count: models.length,
4187
+ exposed_model_count: exposedModelCount
4188
+ },
4189
+ models,
4190
+ capabilities: capabilityPayload,
4191
+ ...(loadTelemetry.hardware_pressure ? { hardware_pressure: loadTelemetry.hardware_pressure } : {})
2081
4192
  };
2082
4193
  const heartbeatResponse = await this.gateway.heartbeat(enrollment.runtimeToken, heartbeatPayload);
2083
- const exposedModelCount = models.filter((model) => model.exposed !== false).length;
2084
4194
  return {
2085
4195
  enrolled: enrollment.enrolled,
2086
4196
  status,
@@ -2088,6 +4198,7 @@ export class SelfHostedNodeRuntime {
2088
4198
  discovery_source: discoverySource,
2089
4199
  mcoda_agent_count: discoverySource === "mcoda" ? exposedModelCount : undefined,
2090
4200
  ollama_version: version,
4201
+ capacity: loadTelemetry,
2091
4202
  heartbeat_response: heartbeatResponse
2092
4203
  };
2093
4204
  }
@@ -2122,9 +4233,17 @@ export class SelfHostedNodeRuntime {
2122
4233
  }
2123
4234
  async pollAndExecuteJob(waitMs = DEFAULT_JOB_POLL_WAIT_MS) {
2124
4235
  const enrollment = await this.ensureEnrolled();
4236
+ const pollCapacity = this.buildLoadTelemetry({ models: [] });
2125
4237
  const response = await this.gateway.pollJob(enrollment.runtimeToken, {
2126
4238
  node_id: this.config.nodeId,
2127
- capacity: { active_jobs: 0, max_jobs: 1 },
4239
+ capacity: {
4240
+ active_jobs: pollCapacity.active_jobs,
4241
+ queued_jobs: pollCapacity.queued_jobs,
4242
+ max_jobs: pollCapacity.max_concurrency,
4243
+ max_concurrency: pollCapacity.max_concurrency,
4244
+ free_slots: pollCapacity.free_slots,
4245
+ drain_mode: pollCapacity.drain_mode
4246
+ },
2128
4247
  wait_ms: waitMs
2129
4248
  });
2130
4249
  const job = response.job || null;
@@ -2152,6 +4271,14 @@ export class SelfHostedNodeRuntime {
2152
4271
  };
2153
4272
  const result = await this.executeJob(job, {
2154
4273
  attachedMswarmApiKey: optionalText(response.attached_mswarm_api_key) || undefined,
4274
+ onStarted: async (event) => {
4275
+ await this.gateway.postJobStart(enrollment.runtimeToken, job.job_id, {
4276
+ node_id: this.config.nodeId,
4277
+ agent_slug: event.agent_slug || job.agent_slug,
4278
+ source_agent_slug: event.source_agent_slug || job.source_agent_slug || null,
4279
+ model: event.model || job.model || job.openai_request.model
4280
+ });
4281
+ },
2155
4282
  onOpenAIChunk: async (chunk) => {
2156
4283
  if (job.openai_request.stream !== true || streamEventForwardingFailed) {
2157
4284
  return;