@mcoda/mswarm 0.1.76 → 0.1.79
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +93 -0
- package/dist/invocation-token.d.ts +48 -0
- package/dist/invocation-token.d.ts.map +1 -1
- package/dist/invocation-token.js +109 -0
- package/dist/invocation-token.js.map +1 -1
- package/dist/runtime.d.ts +183 -0
- package/dist/runtime.d.ts.map +1 -1
- package/dist/runtime.js +2299 -172
- package/dist/runtime.js.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +1416 -4
- package/dist/server.js.map +1 -1
- package/package.json +4 -4
package/dist/runtime.js
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
import { chmod, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
2
|
-
import { dirname, join } from "node:path";
|
|
3
|
-
import { hostname, homedir, platform, userInfo } from "node:os";
|
|
1
|
+
import { chmod, lstat, mkdir, readdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
2
|
+
import { dirname, isAbsolute, join, relative, resolve } from "node:path";
|
|
3
|
+
import { cpus, freemem, hostname, homedir, loadavg, platform, totalmem, userInfo } from "node:os";
|
|
4
4
|
import { spawn } from "node:child_process";
|
|
5
|
-
import { createHash, randomUUID } from "node:crypto";
|
|
5
|
+
import { createHash, createHmac, randomUUID } from "node:crypto";
|
|
6
6
|
import { MswarmCodaliExecutor } from "./codali-executor.js";
|
|
7
|
+
import { MSWARM_CAPABILITY_SCHEMA_VERSION, assertMswarmSafeRelativePath, validateMswarmArchiveEntry, buildMswarmCapabilityNames, buildMswarmPrivateCapabilityCatalogEntry, buildMswarmLocalArtifactUri, buildMswarmSandboxProfile, defaultMswarmArtifactAccessPolicy, defaultMswarmArtifactRetentionPolicy, projectMswarmPublicCapabilities, validateMswarmGenericJobRequest } from "@mcoda/shared";
|
|
7
8
|
const DEFAULT_GATEWAY_BASE_URL = "http://127.0.0.1:8080";
|
|
8
9
|
const DEFAULT_SETUP_GATEWAY_BASE_URL = "https://api.mswarm.org";
|
|
9
10
|
const DEFAULT_OLLAMA_BASE_URL = "http://127.0.0.1:11434";
|
|
@@ -14,11 +15,91 @@ const DEFAULT_SELF_HOSTED_NODE_VERSION = "0.1.70";
|
|
|
14
15
|
const DEFAULT_REQUEST_TIMEOUT_MS = 10000;
|
|
15
16
|
const DEFAULT_JOB_TIMEOUT_MS = 3600000;
|
|
16
17
|
const DEFAULT_SERVICE_COMMAND_TIMEOUT_MS = 60000;
|
|
18
|
+
const DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS = 2000;
|
|
19
|
+
const SELF_HOSTED_RUNTIME_PROTOCOL_VERSION = 1;
|
|
20
|
+
const SELF_HOSTED_LOAD_BALANCER_PROTOCOL_VERSION = 1;
|
|
21
|
+
const SELF_HOSTED_CATALOG_METADATA_VERSION = 1;
|
|
22
|
+
const MAX_TELEMETRY_LATENCY_SAMPLES = 50;
|
|
23
|
+
const MAX_TELEMETRY_FAILURES = 20;
|
|
17
24
|
const DEFAULT_MCODA_BIN = "mcoda";
|
|
18
25
|
const DEFAULT_MCODA_LIST_ARGS = ["agent", "list", "--json", "--refresh-health"];
|
|
19
26
|
const DEFAULT_COMMAND_MAX_BUFFER = 16 * 1024 * 1024;
|
|
27
|
+
const DEFAULT_LOCAL_ARTIFACT_MAX_BYTES = 512 * 1024 * 1024;
|
|
20
28
|
const DEFAULT_JOB_POLL_WAIT_MS = 25000;
|
|
21
29
|
const DEFAULT_STREAM_EVENT_BATCH_SIZE = 8;
|
|
30
|
+
// Generic job types listed in the owner-local catalog, each paired with the
// runner id its catalog entry names.
const OWNER_LOCAL_TEST_ECHO_JOB_TYPE = "tenant.test-echo";
const TEST_ECHO_RUNNER_ID = "test.echo";
const RENDER_BLENDER_JOB_TYPE = "render.blender";
const BLENDER_RENDER_RUNNER_ID = "blender.render";
const CUDA_RUN_JOB_TYPE = "cuda.run";
const CUDA_PACKAGE_RUNNER_ID = "cuda.package";
// Images copied into the `allowed_images` policy field of the cuda.run entry.
const APPROVED_NVIDIA_CUDA_IMAGES = new Set(["nvidia/cuda:12.4.1-devel-ubuntu22.04"]);
/**
 * Catalog of generic job types: for each one the JSON-schema-style description
 * of its `args`, the policy attached to it, the runner id, and (where present)
 * the capability names the entry requires.
 */
const OWNER_LOCAL_GENERIC_JOB_CATALOG = (() => {
    // All entries share the same three base policy fields. A fresh object is
    // built per call so catalog entries never alias one another's policy.
    const ownerLocalPolicy = (extra = {}) => ({
        trust_mode: "owner-local",
        network: "none",
        allow_raw_command: false,
        ...extra
    });
    const testEcho = {
        job_type: OWNER_LOCAL_TEST_ECHO_JOB_TYPE,
        args_schema: {
            type: "object",
            additionalProperties: true,
            properties: {
                message: { type: "string" },
                delay_ms: { type: "number", minimum: 0 },
                repeat: { type: "number", minimum: 1 },
                fail: { type: "boolean" }
            }
        },
        policy: ownerLocalPolicy(),
        runner: TEST_ECHO_RUNNER_ID
    };
    const blenderRender = {
        job_type: RENDER_BLENDER_JOB_TYPE,
        args_schema: {
            type: "object",
            additionalProperties: false,
            properties: {
                frames: { type: ["string", "number"] },
                engine: { enum: ["cycles", "eevee", "workbench"] },
                resolution: { type: "string", pattern: "^[1-9][0-9]{0,4}x[1-9][0-9]{0,4}$" },
                output_format: { enum: ["png", "jpeg", "open_exr"] },
                scene: { type: "string" },
                camera: { type: "string" }
            }
        },
        policy: ownerLocalPolicy(),
        runner: BLENDER_RENDER_RUNNER_ID,
        required_capabilities: ["software.blender"]
    };
    const cudaRun = {
        job_type: CUDA_RUN_JOB_TYPE,
        args_schema: {
            type: "object",
            additionalProperties: false,
            required: ["manifest_path", "profile", "target"],
            properties: {
                manifest_path: { type: "string" },
                profile: { type: "string" },
                target: { type: "string" }
            }
        },
        policy: ownerLocalPolicy({ allowed_images: Array.from(APPROVED_NVIDIA_CUDA_IMAGES) }),
        runner: CUDA_PACKAGE_RUNNER_ID,
        required_capabilities: ["gpu.nvidia", "software.docker", "docker.nvidia"]
    };
    return [testEcho, blenderRender, cudaRun];
})();
|
|
22
103
|
const SERVICE_LABEL = "com.mcoda.mswarm.self-hosted-node";
|
|
23
104
|
const SYSTEMD_SERVICE_NAME = "mswarm-self-hosted-node.service";
|
|
24
105
|
const WINDOWS_TASK_NAME = "MswarmSelfHostedNode";
|
|
@@ -327,6 +408,9 @@ function defaultStatePath() {
|
|
|
327
408
|
/**
 * Default location of the node key file under the current user's home directory.
 *
 * @returns {string} `<home>/.mswarm/self-hosted-node/node.key`
 */
function defaultRuntimeTokenPath() {
    const segments = [homedir(), ".mswarm", "self-hosted-node", "node.key"];
    return join(...segments);
}
|
|
411
|
+
/**
 * Default root directory of the local artifact store under the current user's
 * home directory.
 *
 * @returns {string} `<home>/.mswarm/self-hosted-node/artifacts`
 */
function defaultArtifactStorePath() {
    const segments = [homedir(), ".mswarm", "self-hosted-node", "artifacts"];
    return join(...segments);
}
|
|
330
414
|
export async function readOrCreateSelfHostedMachineId(machineIdPath = defaultMachineIdPath()) {
|
|
331
415
|
try {
|
|
332
416
|
const existing = (await readFile(machineIdPath, "utf8")).trim();
|
|
@@ -381,6 +465,112 @@ function optionalBoolean(...values) {
|
|
|
381
465
|
}
|
|
382
466
|
return null;
|
|
383
467
|
}
|
|
468
|
+
/**
 * Rounds a telemetry value to a fixed number of decimal places.
 *
 * @param {number} value - Value to round.
 * @param {number} [digits=3] - Decimal places to keep.
 * @returns {number} The rounded value, or 0 when `value` is not a finite number.
 */
function roundedTelemetryNumber(value, digits = 3) {
    const usable = Number.isFinite(value);
    if (usable === false) {
        return 0;
    }
    const scale = 10 ** digits;
    const scaled = Math.round(value * scale);
    return scaled / scale;
}
|
|
475
|
+
/**
 * Coerces a telemetry counter to a whole number that is never negative.
 *
 * @param {unknown} value - Candidate counter value.
 * @returns {number} `Math.floor(value)` for finite numbers greater than zero;
 *   0 for everything else (non-numbers, NaN, infinities, zero, negatives).
 */
function nonNegativeTelemetryInteger(value) {
    if (typeof value !== "number") {
        return 0;
    }
    if (!Number.isFinite(value) || value <= 0) {
        return 0;
    }
    return Math.floor(value);
}
|
|
478
|
+
/**
 * Hashes the JSON serialization of a value with SHA-256.
 *
 * @param {unknown} value - Value passed to `JSON.stringify`.
 * @returns {string} Lowercase hex digest of the serialized text.
 */
function sha256Json(value) {
    const serialized = JSON.stringify(value);
    const hasher = createHash("sha256");
    hasher.update(serialized);
    return hasher.digest("hex");
}
|
|
481
|
+
/**
 * Produces a fingerprint of a model catalog that does not depend on the order
 * in which the models are supplied (for models with distinct provider/name).
 *
 * Each model is reduced to a fixed set of fields, the reduced entries are
 * sorted by `provider:name`, and the sorted list is hashed via `sha256Json`.
 *
 * @param {Array<object>} models - Catalog entries to fingerprint.
 * @returns {string} The hash prefixed with `sha256:`.
 */
function buildCatalogFingerprint(models) {
    // Field order matters: it is the key order JSON.stringify will emit.
    const toFingerprintEntry = (model) => ({
        name: optionalText(model.name) || "",
        provider: optionalText(model.provider) || null,
        adapter: optionalText(model.adapter) || null,
        source_agent_slug: optionalText(model.source_agent_slug) || null,
        model_id: optionalText(model.model_id) || optionalText(model.model) || null,
        exposed: model.exposed !== false,
        capabilities: normalizeCapabilities(model.capabilities).sort(),
        health_status: normalizeHealthStatus(model.health_status)
    });
    const sortKey = (entry) => `${entry.provider || ""}:${entry.name}`;
    const entries = models.map((model) => toFingerprintEntry(model));
    // NOTE(review): localeCompare without an explicit locale uses the runtime's
    // default locale, so ordering could in principle differ between hosts —
    // confirm that is acceptable for this fingerprint.
    entries.sort((first, second) => sortKey(first).localeCompare(sortKey(second)));
    const digest = sha256Json(entries);
    return `sha256:${digest}`;
}
|
|
496
|
+
/**
 * Summarizes the capacity of one execution class for load reporting.
 *
 * @param {object} input
 * @param {number} input.maxConcurrency - Configured concurrency limit. Values
 *   below 1 are raised to 1; a missing or non-numeric value is treated as 1.
 * @param {number} input.activeJobs - Jobs currently running.
 * @param {number} input.queuedJobs - Jobs waiting to run.
 * @param {boolean} [input.drainMode] - When truthy, no free slots are reported.
 * @returns {{max_concurrency: number, active_jobs: number, queued_jobs: number, free_slots: number}}
 */
function executionClassCapacity(input) {
    // Math.max(1, NaN) is NaN, so a missing or non-numeric limit used to leak
    // NaN into both max_concurrency and free_slots. Fall back to a single slot.
    const requestedLimit = Number(input.maxConcurrency);
    const maxConcurrency = Number.isFinite(requestedLimit)
        ? Math.max(1, Math.floor(requestedLimit))
        : 1;
    const activeJobs = nonNegativeTelemetryInteger(input.activeJobs);
    const queuedJobs = nonNegativeTelemetryInteger(input.queuedJobs);
    return {
        max_concurrency: maxConcurrency,
        active_jobs: activeJobs,
        queued_jobs: queuedJobs,
        // Queued work counts against capacity; a draining node advertises none.
        free_slots: input.drainMode ? 0 : Math.max(0, maxConcurrency - activeJobs - queuedJobs)
    };
}
|
|
507
|
+
/**
 * Reports the host's total memory as a coarse bucket label rather than an
 * exact figure.
 *
 * @returns {string} One of `"<=8GiB"`, `"<=16GiB"`, `"<=32GiB"`, `"<=64GiB"`,
 *   `"<=128GiB"`, `">128GiB"`, or `"unknown"` when the reported total is not a
 *   positive finite number.
 */
function totalHostMemoryBucket() {
    const totalGib = totalmem() / (1024 ** 3);
    if (!(Number.isFinite(totalGib) && totalGib > 0)) {
        return "unknown";
    }
    // Ascending upper bounds; the first one that fits names the bucket.
    for (const ceilingGib of [8, 16, 32, 64, 128]) {
        if (totalGib <= ceilingGib) {
            return `<=${ceilingGib}GiB`;
        }
    }
    return ">128GiB";
}
|
|
523
|
+
/**
 * Normalizes a VRAM tier to one of the recognized coarse labels.
 *
 * @param {unknown} value - Candidate tier.
 * @param {number} gpuCount - Number of GPUs the tier describes.
 * @returns {string} `value` when it is one of `"none"`, `"lt8"`, `"8-15"`,
 *   `"16-31"`, `"32plus"`; otherwise `"unknown"` if there is at least one GPU
 *   and `"none"` if there is not.
 */
function coarsePublicVramTier(value, gpuCount) {
    const recognizedTiers = ["none", "lt8", "8-15", "16-31", "32plus"];
    if (recognizedTiers.includes(value)) {
        return value;
    }
    if (gpuCount > 0) {
        return "unknown";
    }
    return "none";
}
|
|
533
|
+
/**
 * Builds a coarse hardware-pressure snapshot from host readings and the GPU
 * section of a capability payload's public projection.
 *
 * CPU and RAM figures are read from `node:os`; GPU fields are taken from
 * `capabilityPayload.public_projection.accelerators.gpu` when that path holds
 * an object, and default to "no GPU" values otherwise.
 *
 * @param {object} [capabilityPayload] - Capability payload; may be absent.
 * @returns {object} Snapshot with `schema_version`, `collected_at`, `cpu`,
 *   `ram` and `gpu` sections. `gpu.vram.used_ratio` is always `null`.
 */
function buildCoarseHardwarePressure(capabilityPayload) {
    const isObjectLike = (candidate) => Boolean(candidate) && typeof candidate === "object";
    const coreCount = Math.max(1, cpus().length || 1);
    const memoryTotal = totalmem();
    const memoryFree = freemem();
    // Walk payload -> public_projection -> accelerators -> gpu, tolerating any
    // missing or mistyped level along the way.
    const publicProjection = capabilityPayload?.public_projection;
    const projectionRecord = isObjectLike(publicProjection) && !Array.isArray(publicProjection)
        ? publicProjection
        : {};
    const acceleratorRecord = isObjectLike(projectionRecord.accelerators)
        ? projectionRecord.accelerators
        : {};
    const gpuRecord = isObjectLike(acceleratorRecord.gpu) && !Array.isArray(acceleratorRecord.gpu)
        ? acceleratorRecord.gpu
        : null;
    const reportedCount = gpuRecord?.["count"];
    let gpuCount = 0;
    if (typeof reportedCount === "number" && Number.isFinite(reportedCount)) {
        gpuCount = Math.max(0, Math.floor(reportedCount));
    }
    const vramTier = coarsePublicVramTier(gpuRecord?.["vram_tier"], gpuCount);
    let ramUsedRatio = null;
    if (memoryTotal > 0) {
        ramUsedRatio = roundedTelemetryNumber((memoryTotal - memoryFree) / memoryTotal);
    }
    const cpuSection = {
        core_count: coreCount,
        // 1-minute load average normalized by core count.
        load_1m_ratio: roundedTelemetryNumber((loadavg()[0] || 0) / coreCount)
    };
    const ramSection = {
        used_ratio: ramUsedRatio,
        total_bucket: totalHostMemoryBucket()
    };
    const gpuSection = {
        available: Boolean(gpuRecord?.["available"]),
        count: gpuCount,
        cuda: Boolean(gpuRecord?.["cuda"] || gpuRecord?.["has_cuda"]),
        vram: {
            total_tier: vramTier,
            used_ratio: null
        }
    };
    return {
        schema_version: 1,
        collected_at: new Date().toISOString(),
        cpu: cpuSection,
        ram: ramSection,
        gpu: gpuSection
    };
}
|
|
384
574
|
function normalizeCapabilities(value) {
|
|
385
575
|
if (!Array.isArray(value)) {
|
|
386
576
|
return [];
|
|
@@ -537,6 +727,7 @@ function serviceEnvironment(config, env, homeDir) {
|
|
|
537
727
|
MSWARM_GATEWAY_BASE_URL: config.gatewayBaseUrl,
|
|
538
728
|
MSWARM_SELF_HOSTED_NODE_STATE_PATH: config.statePath,
|
|
539
729
|
MSWARM_SELF_HOSTED_NODE_KEY_PATH: config.runtimeTokenPath,
|
|
730
|
+
MSWARM_SELF_HOSTED_ARTIFACT_STORE_PATH: config.artifactStorePath || null,
|
|
540
731
|
MSWARM_SELF_HOSTED_RELAY_MODE: config.relayMode || "outbound",
|
|
541
732
|
MSWARM_SELF_HOSTED_DIRECT_BASE_URL: config.directBaseUrl || null,
|
|
542
733
|
MSWARM_SELF_HOSTED_DISCOVERY_MODE: config.discoveryMode,
|
|
@@ -550,7 +741,18 @@ function serviceEnvironment(config, env, homeDir) {
|
|
|
550
741
|
MSWARM_SELF_HOSTED_MODEL_BLOCKLIST: config.modelBlocklist.join(","),
|
|
551
742
|
MSWARM_SELF_HOSTED_HEARTBEAT_INTERVAL_SECONDS: String(config.heartbeatIntervalSeconds),
|
|
552
743
|
MSWARM_SELF_HOSTED_REQUEST_TIMEOUT_MS: String(config.requestTimeoutMs),
|
|
553
|
-
MSWARM_SELF_HOSTED_JOB_TIMEOUT_MS: String(config.jobTimeoutMs)
|
|
744
|
+
MSWARM_SELF_HOSTED_JOB_TIMEOUT_MS: String(config.jobTimeoutMs),
|
|
745
|
+
MSWARM_SELF_HOSTED_MAX_CONCURRENT_JOBS: String(config.maxConcurrentJobs || 1),
|
|
746
|
+
MSWARM_SELF_HOSTED_MAX_CONCURRENT_LLM_JOBS: String(config.maxConcurrentLlmJobs || config.maxConcurrentJobs || 1),
|
|
747
|
+
MSWARM_SELF_HOSTED_GENERIC_JOBS_ENABLED: config.genericJobsEnabled ? "true" : "false",
|
|
748
|
+
MSWARM_SELF_HOSTED_GENERIC_JOB_TIMEOUT_MS: String(config.genericJobTimeoutMs),
|
|
749
|
+
MSWARM_SELF_HOSTED_GENERIC_JOB_MAX_CONCURRENCY: String(config.genericJobMaxConcurrency),
|
|
750
|
+
MSWARM_SELF_HOSTED_DRAIN_MODE: config.drainMode ? "true" : "false",
|
|
751
|
+
MSWARM_SELF_HOSTED_LOAD_REPORTING_ENABLED: config.loadReportingEnabled === false ? "false" : "true",
|
|
752
|
+
MSWARM_SELF_HOSTED_HARDWARE_TELEMETRY_ENABLED: config.hardwareTelemetryEnabled ? "true" : "false",
|
|
753
|
+
MSWARM_SELF_HOSTED_CAPABILITY_PROBE_TIMEOUT_MS: config.capabilityProbeTimeoutMs
|
|
754
|
+
? String(config.capabilityProbeTimeoutMs)
|
|
755
|
+
: null
|
|
554
756
|
};
|
|
555
757
|
return Object.fromEntries(Object.entries(values).filter((entry) => typeof entry[1] === "string" && entry[1] !== ""));
|
|
556
758
|
}
|
|
@@ -955,6 +1157,8 @@ export async function readSelfHostedNodeConfig(env = process.env) {
|
|
|
955
1157
|
optionalText(env.OLLAMA_HOST) ||
|
|
956
1158
|
DEFAULT_OLLAMA_BASE_URL;
|
|
957
1159
|
const packageNodeVersion = await readPackageNodeVersion();
|
|
1160
|
+
const maxConcurrentJobs = parsePositiveInteger(env.MSWARM_SELF_HOSTED_MAX_CONCURRENT_JOBS, state.max_concurrent_jobs || 1);
|
|
1161
|
+
const maxConcurrentLlmJobs = parsePositiveInteger(env.MSWARM_SELF_HOSTED_MAX_CONCURRENT_LLM_JOBS, state.max_concurrent_llm_jobs || maxConcurrentJobs);
|
|
958
1162
|
return {
|
|
959
1163
|
gatewayBaseUrl: trimTrailingSlash(gatewayBaseUrl),
|
|
960
1164
|
nodeId,
|
|
@@ -970,6 +1174,9 @@ export async function readSelfHostedNodeConfig(env = process.env) {
|
|
|
970
1174
|
ollamaBaseUrl: trimTrailingSlash(ollamaBaseUrl),
|
|
971
1175
|
statePath,
|
|
972
1176
|
runtimeTokenPath,
|
|
1177
|
+
artifactStorePath: optionalText(env.MSWARM_SELF_HOSTED_ARTIFACT_STORE_PATH) ||
|
|
1178
|
+
state.artifact_store_path ||
|
|
1179
|
+
defaultArtifactStorePath(),
|
|
973
1180
|
invocationSigningSecret: optionalText(env.MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET) ||
|
|
974
1181
|
optionalText(env.MSWARM_SELF_HOSTED_RELAY_SIGNING_SECRET),
|
|
975
1182
|
listenHost: optionalText(env.MSWARM_SELF_HOSTED_LISTEN_HOST) || DEFAULT_LISTEN_HOST,
|
|
@@ -981,6 +1188,15 @@ export async function readSelfHostedNodeConfig(env = process.env) {
|
|
|
981
1188
|
heartbeatIntervalSeconds: parsePositiveInteger(env.MSWARM_SELF_HOSTED_HEARTBEAT_INTERVAL_SECONDS, state.heartbeat_interval_seconds || DEFAULT_HEARTBEAT_INTERVAL_SECONDS),
|
|
982
1189
|
requestTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_REQUEST_TIMEOUT_MS, state.request_timeout_ms || DEFAULT_REQUEST_TIMEOUT_MS),
|
|
983
1190
|
jobTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_JOB_TIMEOUT_MS, state.job_timeout_ms || DEFAULT_JOB_TIMEOUT_MS),
|
|
1191
|
+
maxConcurrentJobs,
|
|
1192
|
+
maxConcurrentLlmJobs,
|
|
1193
|
+
genericJobsEnabled: parseBoolean(env.MSWARM_SELF_HOSTED_GENERIC_JOBS_ENABLED ?? env.MSWARM_SELF_HOSTED_GENERIC_JOBS, state.generic_jobs_enabled === true),
|
|
1194
|
+
genericJobTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_GENERIC_JOB_TIMEOUT_MS, state.generic_job_timeout_ms || state.job_timeout_ms || DEFAULT_JOB_TIMEOUT_MS),
|
|
1195
|
+
genericJobMaxConcurrency: parsePositiveInteger(env.MSWARM_SELF_HOSTED_GENERIC_JOB_MAX_CONCURRENCY, state.generic_job_max_concurrency || 1),
|
|
1196
|
+
capabilityProbeTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_CAPABILITY_PROBE_TIMEOUT_MS, state.capability_probe_timeout_ms || DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS),
|
|
1197
|
+
drainMode: parseBoolean(env.MSWARM_SELF_HOSTED_DRAIN_MODE, state.drain_mode === true),
|
|
1198
|
+
loadReportingEnabled: parseBoolean(env.MSWARM_SELF_HOSTED_LOAD_REPORTING_ENABLED ?? env.MSWARM_SELF_HOSTED_LOAD_REPORTING, state.load_reporting_enabled !== false),
|
|
1199
|
+
hardwareTelemetryEnabled: parseBoolean(env.MSWARM_SELF_HOSTED_HARDWARE_TELEMETRY_ENABLED ?? env.MSWARM_SELF_HOSTED_HARDWARE_TELEMETRY, state.hardware_telemetry_enabled === true),
|
|
984
1200
|
exposeAllModels: resolveDaemonExposeAllModels(env, state),
|
|
985
1201
|
modelAllowlist: parseList(env.MSWARM_SELF_HOSTED_MODEL_ALLOWLIST || state.model_allowlist),
|
|
986
1202
|
modelBlocklist: parseList(env.MSWARM_SELF_HOSTED_MODEL_BLOCKLIST || state.model_blocklist)
|
|
@@ -1011,6 +1227,8 @@ export async function readOwnerSetupConfig(argv = process.argv.slice(3), env = p
|
|
|
1011
1227
|
const allowlist = parseList(options.allow || env.MSWARM_SELF_HOSTED_MODEL_ALLOWLIST);
|
|
1012
1228
|
const blocklist = parseList(options.block || env.MSWARM_SELF_HOSTED_MODEL_BLOCKLIST);
|
|
1013
1229
|
const packageNodeVersion = await readPackageNodeVersion();
|
|
1230
|
+
const maxConcurrentJobs = parsePositiveInteger(options["max-concurrent-jobs"] || env.MSWARM_SELF_HOSTED_MAX_CONCURRENT_JOBS, 1);
|
|
1231
|
+
const maxConcurrentLlmJobs = parsePositiveInteger(options["max-concurrent-llm-jobs"] || env.MSWARM_SELF_HOSTED_MAX_CONCURRENT_LLM_JOBS, maxConcurrentJobs);
|
|
1014
1232
|
return {
|
|
1015
1233
|
apiKey,
|
|
1016
1234
|
gatewayBaseUrl: trimTrailingSlash(gatewayBaseUrl),
|
|
@@ -1022,6 +1240,9 @@ export async function readOwnerSetupConfig(argv = process.argv.slice(3), env = p
|
|
|
1022
1240
|
discoveryMode: parseDiscoveryMode(env.MSWARM_SELF_HOSTED_DISCOVERY_MODE),
|
|
1023
1241
|
statePath,
|
|
1024
1242
|
runtimeTokenPath,
|
|
1243
|
+
artifactStorePath: optionalText(options["artifact-store-path"]) ||
|
|
1244
|
+
optionalText(env.MSWARM_SELF_HOSTED_ARTIFACT_STORE_PATH) ||
|
|
1245
|
+
defaultArtifactStorePath(),
|
|
1025
1246
|
machineIdPath: optionalText(env.MSWARM_SELF_HOSTED_MACHINE_ID_PATH) || defaultMachineIdPath(),
|
|
1026
1247
|
mcodaBin: optionalText(env.MSWARM_SELF_HOSTED_MCODA_BIN) || DEFAULT_MCODA_BIN,
|
|
1027
1248
|
mcodaListArgs: parseArgs(env.MSWARM_SELF_HOSTED_MCODA_LIST_ARGS, DEFAULT_MCODA_LIST_ARGS),
|
|
@@ -1030,6 +1251,17 @@ export async function readOwnerSetupConfig(argv = process.argv.slice(3), env = p
|
|
|
1030
1251
|
heartbeatIntervalSeconds: parsePositiveInteger(env.MSWARM_SELF_HOSTED_HEARTBEAT_INTERVAL_SECONDS, DEFAULT_HEARTBEAT_INTERVAL_SECONDS),
|
|
1031
1252
|
requestTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_REQUEST_TIMEOUT_MS, DEFAULT_REQUEST_TIMEOUT_MS),
|
|
1032
1253
|
jobTimeoutMs: parsePositiveInteger(options["job-timeout-ms"] || env.MSWARM_SELF_HOSTED_JOB_TIMEOUT_MS, DEFAULT_JOB_TIMEOUT_MS),
|
|
1254
|
+
maxConcurrentJobs,
|
|
1255
|
+
maxConcurrentLlmJobs,
|
|
1256
|
+
genericJobsEnabled: parseBoolean(options["enable-generic-jobs"] || env.MSWARM_SELF_HOSTED_GENERIC_JOBS_ENABLED || env.MSWARM_SELF_HOSTED_GENERIC_JOBS, false),
|
|
1257
|
+
genericJobTimeoutMs: parsePositiveInteger(options["generic-job-timeout-ms"] || env.MSWARM_SELF_HOSTED_GENERIC_JOB_TIMEOUT_MS, DEFAULT_JOB_TIMEOUT_MS),
|
|
1258
|
+
genericJobMaxConcurrency: parsePositiveInteger(options["generic-job-max-concurrency"] || env.MSWARM_SELF_HOSTED_GENERIC_JOB_MAX_CONCURRENCY, 1),
|
|
1259
|
+
capabilityProbeTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_CAPABILITY_PROBE_TIMEOUT_MS, DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS),
|
|
1260
|
+
drainMode: parseBoolean(options.drain || env.MSWARM_SELF_HOSTED_DRAIN_MODE, false),
|
|
1261
|
+
loadReportingEnabled: parseBoolean(options["disable-load-reporting"] === true
|
|
1262
|
+
? false
|
|
1263
|
+
: (env.MSWARM_SELF_HOSTED_LOAD_REPORTING_ENABLED ?? env.MSWARM_SELF_HOSTED_LOAD_REPORTING), true),
|
|
1264
|
+
hardwareTelemetryEnabled: parseBoolean(options["enable-hardware-telemetry"] || env.MSWARM_SELF_HOSTED_HARDWARE_TELEMETRY_ENABLED || env.MSWARM_SELF_HOSTED_HARDWARE_TELEMETRY, false),
|
|
1033
1265
|
exposeAllModels: resolveOwnerSetupExposeAllModels(options, env),
|
|
1034
1266
|
modelAllowlist: allowlist,
|
|
1035
1267
|
modelBlocklist: blocklist,
|
|
@@ -1148,10 +1380,17 @@ async function defaultCommandRunner(command, args, options) {
|
|
|
1148
1380
|
let stdout = "";
|
|
1149
1381
|
let stderr = "";
|
|
1150
1382
|
let settled = false;
|
|
1383
|
+
const abort = () => {
|
|
1384
|
+
if (settled)
|
|
1385
|
+
return;
|
|
1386
|
+
child.kill("SIGTERM");
|
|
1387
|
+
finish(new Error("command aborted"));
|
|
1388
|
+
};
|
|
1151
1389
|
const timer = setTimeout(() => {
|
|
1152
1390
|
if (settled)
|
|
1153
1391
|
return;
|
|
1154
1392
|
settled = true;
|
|
1393
|
+
options.signal?.removeEventListener("abort", abort);
|
|
1155
1394
|
child.kill("SIGTERM");
|
|
1156
1395
|
reject(new Error(`command timed out after ${options.timeoutMs}ms: ${command}`));
|
|
1157
1396
|
}, options.timeoutMs);
|
|
@@ -1160,6 +1399,7 @@ async function defaultCommandRunner(command, args, options) {
|
|
|
1160
1399
|
return;
|
|
1161
1400
|
settled = true;
|
|
1162
1401
|
clearTimeout(timer);
|
|
1402
|
+
options.signal?.removeEventListener("abort", abort);
|
|
1163
1403
|
if (error) {
|
|
1164
1404
|
reject(error);
|
|
1165
1405
|
return;
|
|
@@ -1188,6 +1428,11 @@ async function defaultCommandRunner(command, args, options) {
|
|
|
1188
1428
|
}
|
|
1189
1429
|
finish();
|
|
1190
1430
|
});
|
|
1431
|
+
if (options.signal?.aborted) {
|
|
1432
|
+
abort();
|
|
1433
|
+
return;
|
|
1434
|
+
}
|
|
1435
|
+
options.signal?.addEventListener("abort", abort, { once: true });
|
|
1191
1436
|
if (options.input) {
|
|
1192
1437
|
child.stdin.write(options.input);
|
|
1193
1438
|
}
|
|
@@ -1414,10 +1659,6 @@ function mapMcodaAgentToCodaliAgent(agent, fallbackSlug) {
|
|
|
1414
1659
|
maxOutputTokens: optionalNumber(agent.maxOutputTokens, agent.max_output_tokens) ?? undefined,
|
|
1415
1660
|
};
|
|
1416
1661
|
}
|
|
1417
|
-
function isExposedLocalAgent(agent, config) {
|
|
1418
|
-
const mapped = mapMcodaAgentToSelfHostedModel(agent, config);
|
|
1419
|
-
return Boolean(mapped?.exposed);
|
|
1420
|
-
}
|
|
1421
1662
|
function buildCodaliWorkspace(job) {
|
|
1422
1663
|
const root = optionalText(job.workspace?.root);
|
|
1423
1664
|
if (!root) {
|
|
@@ -1437,6 +1678,13 @@ const DOCDEX_JOB_ERROR_CODES = new Set([
|
|
|
1437
1678
|
"docdex_repo_access_denied",
|
|
1438
1679
|
"docdex_unavailable",
|
|
1439
1680
|
]);
|
|
1681
|
+
/**
 * Error codes recognized as "pre-start" job failures. An error whose `code` or
 * `name` is in this set is reported under that code by `selfHostedErrorCode`.
 */
const PRE_START_JOB_ERROR_CODES = new Set([
    // Agent selection
    "selected_agent_unavailable",
    "selected_agent_unhealthy",
    // Request validation
    "validation_failed",
    // Docdex prerequisites
    "docdex_context_missing",
    "docdex_api_key_missing"
]);
|
|
1440
1688
|
class SelfHostedDocdexJobError extends Error {
|
|
1441
1689
|
constructor(code, message) {
|
|
1442
1690
|
super(message);
|
|
@@ -1444,6 +1692,13 @@ class SelfHostedDocdexJobError extends Error {
|
|
|
1444
1692
|
this.code = code;
|
|
1445
1693
|
}
|
|
1446
1694
|
}
|
|
1695
|
+
/**
 * Error carrying a machine-readable code. The code is stored on both `name`
 * and `code`, so it can be recovered from either property.
 */
class SelfHostedPreStartJobError extends Error {
    /**
     * @param {string} code - Machine-readable error code.
     * @param {string} message - Human-readable description.
     */
    constructor(code, message) {
        super(message);
        Object.assign(this, { name: code, code });
    }
}
|
|
1447
1702
|
function normalizeDocdexCapabilityMap(value) {
|
|
1448
1703
|
const record = objectRecord(value);
|
|
1449
1704
|
if (!record)
|
|
@@ -1501,11 +1756,14 @@ function selfHostedErrorCode(error) {
|
|
|
1501
1756
|
if (!error || typeof error !== "object")
|
|
1502
1757
|
return undefined;
|
|
1503
1758
|
const code = error.code;
|
|
1504
|
-
if (typeof code === "string" &&
|
|
1759
|
+
if (typeof code === "string" &&
|
|
1760
|
+
(DOCDEX_JOB_ERROR_CODES.has(code) || PRE_START_JOB_ERROR_CODES.has(code))) {
|
|
1505
1761
|
return code;
|
|
1506
1762
|
}
|
|
1507
1763
|
const name = error.name;
|
|
1508
|
-
return typeof name === "string" && DOCDEX_JOB_ERROR_CODES.has(name)
|
|
1764
|
+
return typeof name === "string" && (DOCDEX_JOB_ERROR_CODES.has(name) || PRE_START_JOB_ERROR_CODES.has(name))
|
|
1765
|
+
? name
|
|
1766
|
+
: undefined;
|
|
1509
1767
|
}
|
|
1510
1768
|
function redactRuntimeSecretValues(value, secrets) {
|
|
1511
1769
|
let output = value;
|
|
@@ -1530,159 +1788,1556 @@ function buildCodaliPolicy(job) {
|
|
|
1530
1788
|
maxOutputTokens: job.policy?.max_output_tokens ?? job.openai_request.max_tokens,
|
|
1531
1789
|
};
|
|
1532
1790
|
}
|
|
1533
|
-
function
|
|
1534
|
-
|
|
1535
|
-
|
|
1536
|
-
completionTokens: positiveInteger(usage?.outputTokens),
|
|
1537
|
-
};
|
|
1791
|
+
/**
 * Reads a numeric argument, substituting a fallback when the value does not
 * convert to a finite number.
 *
 * Conversion uses `Number(value)`, so numeric strings are accepted and `null`
 * and `""` convert to 0 rather than triggering the fallback.
 *
 * @param {unknown} value - Raw argument.
 * @param {number} fallback - Returned when the conversion is NaN or infinite.
 * @returns {number}
 */
function numberArg(value, fallback) {
    const numeric = Number(value);
    if (Number.isFinite(numeric)) {
        return numeric;
    }
    return fallback;
}
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
1795
|
+
/**
 * Converts an argument to a whole number of milliseconds within `[0, max]`.
 *
 * @param {unknown} value - Raw argument, interpreted via `numberArg`.
 * @param {number} fallback - Used when `value` is not a finite number.
 * @param {number} max - Upper bound, inclusive.
 * @returns {number} The floored value, clamped to the range.
 */
function boundedMilliseconds(value, fallback, max) {
    const wholeMs = Math.floor(numberArg(value, fallback));
    const capped = Math.min(max, wholeMs);
    return Math.max(0, capped);
}
|
|
1798
|
+
/**
 * Classifies why an abort signal fired.
 *
 * @param {{reason?: unknown}} signal - Signal whose `reason` is inspected.
 * @returns {"timeout"|"cancelled"} `"timeout"` only when the reason is exactly
 *   the string `"timeout"`; `"cancelled"` for any other reason.
 */
function abortErrorCode(signal) {
    if (signal.reason === "timeout") {
        return "timeout";
    }
    return "cancelled";
}
|
|
1801
|
+
/**
 * Human-readable message for an aborted generic job.
 *
 * @param {{reason?: unknown}} signal - Signal classified via `abortErrorCode`.
 * @returns {string} `"generic job timed out"` for a timeout, otherwise
 *   `"generic job cancelled"`.
 */
function abortErrorMessage(signal) {
    if (abortErrorCode(signal) === "timeout") {
        return "generic job timed out";
    }
    return "generic job cancelled";
}
|
|
1804
|
+
/**
 * Waits for `ms` milliseconds, ending early with a rejection if `signal` aborts.
 *
 * @param {number} ms - Delay; values of 0 or less return immediately without
 *   looking at the signal.
 * @param {AbortSignal} signal - Abort signal to observe.
 * @returns {Promise<void>} Resolves once the delay has elapsed.
 * @throws {Error} With the message from `abortErrorMessage(signal)` when the
 *   signal is already aborted or aborts during the wait.
 */
async function sleepWithAbort(ms, signal) {
    if (ms <= 0) {
        return;
    }
    if (signal.aborted) {
        throw new Error(abortErrorMessage(signal));
    }
    await new Promise((wake, fail) => {
        let timer;
        // Whichever side settles first tears down the other, so neither a live
        // timer nor a registered listener outlives the wait.
        const release = () => {
            clearTimeout(timer);
            signal.removeEventListener("abort", handleAbort);
        };
        function handleAbort() {
            release();
            fail(new Error(abortErrorMessage(signal)));
        }
        timer = setTimeout(() => {
            release();
            wake();
        }, ms);
        signal.addEventListener("abort", handleAbort, { once: true });
    });
}
|
|
1826
|
+
/**
 * Turns a job id into a name that is safe to use as a directory inside the
 * artifact store.
 *
 * Every character outside `[a-zA-Z0-9_.-]` becomes `_`; an empty id becomes
 * `"job"`. The result is then passed through `assertMswarmSafeRelativePath`.
 *
 * @param {string} jobId - Raw job id.
 * @returns {string} Whatever `assertMswarmSafeRelativePath` returns for the
 *   sanitized id.
 */
function safeLocalArtifactJobId(jobId) {
    const sanitized = jobId.replace(/[^a-zA-Z0-9_.-]/g, "_");
    const candidate = sanitized === "" ? "job" : sanitized;
    return assertMswarmSafeRelativePath(candidate, "job_id");
}
|
|
1830
|
+
/**
 * Turns an artifact name into a safe file name, substituting `fallback` when
 * no usable name is supplied.
 *
 * Every character outside `[a-zA-Z0-9_.-]` becomes `_`. The result is then
 * passed through `assertMswarmSafeRelativePath`.
 *
 * @param {unknown} value - Raw artifact name; may be missing.
 * @param {string} fallback - Name used when `value` is empty or not a string.
 * @returns {string} Whatever `assertMswarmSafeRelativePath` returns for the
 *   sanitized name.
 */
function safeLocalArtifactName(value, fallback) {
    // A missing name used to throw a TypeError on `.replace` before the
    // fallback could apply; treat any non-string the same as an empty name.
    const rawName = typeof value === "string" ? value : "";
    const normalized = rawName.replace(/[^a-zA-Z0-9_.-]/g, "_") || fallback;
    return assertMswarmSafeRelativePath(normalized, "artifact_name");
}
|
|
1834
|
+
/**
 * Resolves `relativePath` against `root` and refuses results that land
 * outside of `root`.
 *
 * @param {string} root - Directory the result must stay inside.
 * @param {string} relativePath - Path to resolve against `root`.
 * @returns {string} The absolute resolved path (equal to the resolved root
 *   when `relativePath` resolves to the root itself).
 * @throws {Error} `path_escape_not_allowed` when the path from root to target
 *   starts with `..` or is absolute. Note this also rejects entries whose own
 *   name merely begins with `..` (for example `..hidden`).
 */
function resolveWithinRoot(root, relativePath) {
    const base = resolve(root);
    const candidate = resolve(base, relativePath);
    const offset = relative(base, candidate);
    const escapesRoot = offset !== "" && (offset.startsWith("..") || isAbsolute(offset));
    if (escapesRoot) {
        throw new Error("path_escape_not_allowed");
    }
    return candidate;
}
|
|
1843
|
+
/**
 * Computes the SHA-256 digest of some bytes.
 *
 * @param {Buffer|string} buffer - Data to hash.
 * @returns {string} Lowercase hex digest.
 */
function sha256Hex(buffer) {
    const hasher = createHash("sha256");
    hasher.update(buffer);
    return hasher.digest("hex");
}
|
|
1846
|
+
/**
 * Picks the tightest applicable byte limit from a list of candidates.
 *
 * @param {...unknown} values - Candidate limits; anything that is not a
 *   finite number greater than zero is ignored.
 * @returns {number} The smallest valid candidate, or
 *   `DEFAULT_LOCAL_ARTIFACT_MAX_BYTES` when none is valid.
 */
function positiveByteLimit(...values) {
    let tightest = Infinity;
    for (const candidate of values) {
        const usable = typeof candidate === "number" && Number.isFinite(candidate) && candidate > 0;
        if (usable && candidate < tightest) {
            tightest = candidate;
        }
    }
    // Infinity is never a valid candidate, so it doubles as "nothing found".
    return tightest === Infinity ? DEFAULT_LOCAL_ARTIFACT_MAX_BYTES : tightest;
}
|
|
1850
|
+
/**
 * Parses an `artifact://local/<jobId>/<path...>` URI.
 *
 * The path is percent-decoded before it is split on `/`, and both resulting
 * parts are passed through `assertMswarmSafeRelativePath`.
 *
 * @param {string} uri - Candidate artifact URI.
 * @returns {{jobId: string, path: string}|null} The job id and the artifact
 *   path, or `null` when the URI cannot be parsed, is not an `artifact:` URI
 *   on host `local`, has fewer than two path segments, or fails any step
 *   (including the safe-path assertions).
 */
function parseLocalArtifactUri(uri) {
    try {
        const { protocol, hostname, pathname } = new URL(uri);
        const isLocalArtifact = protocol === "artifact:" && hostname === "local";
        if (!isLocalArtifact) {
            return null;
        }
        const segments = decodeURIComponent(pathname)
            .split("/")
            .filter((segment) => segment.length > 0);
        // Need at least a job id plus one path segment.
        if (segments.length < 2) {
            return null;
        }
        const jobId = segments[0];
        const artifactPath = segments.slice(1).join("/");
        return {
            jobId: assertMswarmSafeRelativePath(jobId, "artifact_job_id"),
            path: assertMswarmSafeRelativePath(artifactPath, "artifact_path")
        };
    }
    catch {
        // Any failure above means "not a usable local artifact URI".
        return null;
    }
}
|
|
1568
|
-
export class
|
|
1569
|
-
constructor(input) {
|
|
1570
|
-
this.
|
|
1571
|
-
this.
|
|
1572
|
-
this.timeoutMs = input.timeoutMs || DEFAULT_REQUEST_TIMEOUT_MS;
|
|
1870
|
+
export class MswarmLocalArtifactStore {
|
|
1871
|
+
constructor(input = {}) {
|
|
1872
|
+
this.rootDir = input.rootDir || defaultArtifactStorePath();
|
|
1873
|
+
this.now = input.now || (() => new Date());
|
|
1573
1874
|
}
|
|
1574
|
-
async
|
|
1575
|
-
|
|
1576
|
-
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
|
|
1875
|
+
async prepareJobWorkspace(jobId, job) {
|
|
1876
|
+
const safeJobId = safeLocalArtifactJobId(jobId);
|
|
1877
|
+
const workDir = resolveWithinRoot(this.rootDir, safeJobId);
|
|
1878
|
+
const inputDir = resolveWithinRoot(workDir, "inputs");
|
|
1879
|
+
const outputDir = resolveWithinRoot(workDir, "outputs");
|
|
1880
|
+
await rm(workDir, { recursive: true, force: true });
|
|
1881
|
+
await mkdir(inputDir, { recursive: true });
|
|
1882
|
+
await mkdir(outputDir, { recursive: true });
|
|
1883
|
+
const store = {
|
|
1884
|
+
backend: "local-dev",
|
|
1885
|
+
root_uri: `artifact://local/${safeJobId}`
|
|
1886
|
+
};
|
|
1887
|
+
const registeredInputs = await Promise.all((job.inputs || []).map((input, index) => this.registerInput(jobId, job, input, index, inputDir, store)));
|
|
1888
|
+
const outputSpecs = (job.outputs || []).map((output) => ({
|
|
1889
|
+
...output,
|
|
1890
|
+
path: assertMswarmSafeRelativePath(output.path, "output_path")
|
|
1891
|
+
}));
|
|
1892
|
+
const sandbox = buildMswarmSandboxProfile({
|
|
1893
|
+
policy: job.policy,
|
|
1894
|
+
limits: job.limits,
|
|
1895
|
+
containerized: job.policy.trust_mode === "tenant-owned" || job.job_type === CUDA_RUN_JOB_TYPE,
|
|
1896
|
+
gpu: job.resources?.gpu ? "nvidia" : "none"
|
|
1897
|
+
});
|
|
1898
|
+
return {
|
|
1899
|
+
store,
|
|
1900
|
+
workDir,
|
|
1901
|
+
inputDir,
|
|
1902
|
+
outputDir,
|
|
1903
|
+
registeredInputs,
|
|
1904
|
+
outputSpecs,
|
|
1905
|
+
sandbox
|
|
1906
|
+
};
|
|
1580
1907
|
}
|
|
1581
|
-
async
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
1908
|
+
async collectOutputs(context, jobId) {
|
|
1909
|
+
const artifacts = [];
|
|
1910
|
+
let totalBytes = 0;
|
|
1911
|
+
for (const output of context.outputSpecs) {
|
|
1912
|
+
const collected = await this.collectDeclaredOutput(context, jobId, output);
|
|
1913
|
+
for (const artifact of collected) {
|
|
1914
|
+
totalBytes += artifact.size_bytes || 0;
|
|
1915
|
+
const totalLimit = positiveByteLimit(context.sandbox.limits.max_output_bytes);
|
|
1916
|
+
if (totalBytes > totalLimit) {
|
|
1917
|
+
throw new Error("output_size_limit_exceeded");
|
|
1918
|
+
}
|
|
1919
|
+
artifacts.push(artifact);
|
|
1920
|
+
}
|
|
1921
|
+
}
|
|
1922
|
+
return artifacts;
|
|
1590
1923
|
}
|
|
1591
|
-
async
|
|
1592
|
-
|
|
1924
|
+
async registerInput(jobId, job, input, index, inputDir, store) {
|
|
1925
|
+
const mountPath = input.mount_path
|
|
1926
|
+
? assertMswarmSafeRelativePath(input.mount_path, "input_mount_path")
|
|
1927
|
+
: safeLocalArtifactName(input.name, `input-${index}`);
|
|
1928
|
+
const targetPath = resolveWithinRoot(inputDir, mountPath);
|
|
1929
|
+
const maxArtifactBytes = positiveByteLimit(job.policy.max_artifact_bytes);
|
|
1930
|
+
if (Number.isFinite(input.artifact.size_bytes) && input.artifact.size_bytes !== undefined) {
|
|
1931
|
+
if (input.artifact.size_bytes > maxArtifactBytes) {
|
|
1932
|
+
throw new Error("input_artifact_size_limit_exceeded");
|
|
1933
|
+
}
|
|
1934
|
+
}
|
|
1935
|
+
const source = parseLocalArtifactUri(input.artifact.uri);
|
|
1936
|
+
let localPath;
|
|
1937
|
+
if (source) {
|
|
1938
|
+
const sourcePath = resolveWithinRoot(resolveWithinRoot(this.rootDir, source.jobId), join("outputs", source.path));
|
|
1939
|
+
try {
|
|
1940
|
+
const sourceStat = await lstat(sourcePath);
|
|
1941
|
+
if (!sourceStat.isFile()) {
|
|
1942
|
+
throw new Error("input_artifact_must_be_file");
|
|
1943
|
+
}
|
|
1944
|
+
if (sourceStat.size > maxArtifactBytes) {
|
|
1945
|
+
throw new Error("input_artifact_size_limit_exceeded");
|
|
1946
|
+
}
|
|
1947
|
+
const bytes = await readFile(sourcePath);
|
|
1948
|
+
if (input.artifact.sha256 && input.artifact.sha256 !== sha256Hex(bytes)) {
|
|
1949
|
+
throw new Error("input_artifact_checksum_mismatch");
|
|
1950
|
+
}
|
|
1951
|
+
await mkdir(dirname(targetPath), { recursive: true });
|
|
1952
|
+
await writeFile(targetPath, bytes);
|
|
1953
|
+
localPath = targetPath;
|
|
1954
|
+
}
|
|
1955
|
+
catch (error) {
|
|
1956
|
+
if (error.code !== "ENOENT" || input.required === true) {
|
|
1957
|
+
throw error;
|
|
1958
|
+
}
|
|
1959
|
+
}
|
|
1960
|
+
}
|
|
1961
|
+
else if (input.required === true) {
|
|
1962
|
+
throw new Error("input_artifact_unavailable");
|
|
1963
|
+
}
|
|
1964
|
+
const registeredAt = this.now().toISOString();
|
|
1965
|
+
return {
|
|
1966
|
+
...input.artifact,
|
|
1967
|
+
id: input.artifact.id || `input_${sha256Hex(Buffer.from(`${jobId}:${input.name}:${input.artifact.uri}`)).slice(0, 16)}`,
|
|
1968
|
+
job_id: jobId,
|
|
1969
|
+
name: input.name,
|
|
1970
|
+
scope: "input",
|
|
1971
|
+
registered_at: registeredAt,
|
|
1972
|
+
store,
|
|
1973
|
+
access: defaultMswarmArtifactAccessPolicy(job.policy.trust_mode === "tenant-owned" ? "tenant-scoped" : "owner-local"),
|
|
1974
|
+
retention: defaultMswarmArtifactRetentionPolicy(),
|
|
1975
|
+
...(localPath ? { local_path: localPath } : {})
|
|
1976
|
+
};
|
|
1593
1977
|
}
|
|
1594
|
-
async
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1601
|
-
|
|
1602
|
-
|
|
1978
|
+
async collectDeclaredOutput(context, jobId, output) {
|
|
1979
|
+
const normalizedPath = assertMswarmSafeRelativePath(output.path, "output_path");
|
|
1980
|
+
const targetPath = resolveWithinRoot(context.outputDir, normalizedPath);
|
|
1981
|
+
try {
|
|
1982
|
+
const targetStat = await lstat(targetPath);
|
|
1983
|
+
if (targetStat.isSymbolicLink()) {
|
|
1984
|
+
throw new Error("output_symlink_not_allowed");
|
|
1985
|
+
}
|
|
1986
|
+
if (targetStat.isDirectory()) {
|
|
1987
|
+
return this.collectOutputDirectory(context, jobId, output, normalizedPath);
|
|
1988
|
+
}
|
|
1989
|
+
if (targetStat.isFile()) {
|
|
1990
|
+
return [await this.collectOutputFile(context, jobId, output, normalizedPath, targetPath)];
|
|
1991
|
+
}
|
|
1992
|
+
throw new Error("output_entry_type_not_allowed");
|
|
1993
|
+
}
|
|
1994
|
+
catch (error) {
|
|
1995
|
+
if (error.code === "ENOENT" && output.required !== true) {
|
|
1996
|
+
return [];
|
|
1997
|
+
}
|
|
1998
|
+
throw error;
|
|
1999
|
+
}
|
|
1603
2000
|
}
|
|
1604
|
-
async
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
|
|
1609
|
-
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
|
|
2001
|
+
async collectOutputDirectory(context, jobId, output, relativeDir) {
|
|
2002
|
+
const dirPath = resolveWithinRoot(context.outputDir, relativeDir);
|
|
2003
|
+
const entries = await readdir(dirPath, { withFileTypes: true });
|
|
2004
|
+
const artifacts = [];
|
|
2005
|
+
for (const entry of entries) {
|
|
2006
|
+
const childRelativePath = assertMswarmSafeRelativePath(`${relativeDir}/${entry.name}`, "output_path");
|
|
2007
|
+
const childPath = resolveWithinRoot(context.outputDir, childRelativePath);
|
|
2008
|
+
if (entry.isSymbolicLink()) {
|
|
2009
|
+
throw new Error("output_symlink_not_allowed");
|
|
2010
|
+
}
|
|
2011
|
+
if (entry.isDirectory()) {
|
|
2012
|
+
artifacts.push(...(await this.collectOutputDirectory(context, jobId, output, childRelativePath)));
|
|
2013
|
+
}
|
|
2014
|
+
else if (entry.isFile()) {
|
|
2015
|
+
artifacts.push(await this.collectOutputFile(context, jobId, output, childRelativePath, childPath));
|
|
2016
|
+
}
|
|
2017
|
+
else {
|
|
2018
|
+
throw new Error("output_entry_type_not_allowed");
|
|
2019
|
+
}
|
|
2020
|
+
}
|
|
2021
|
+
return artifacts;
|
|
1613
2022
|
}
|
|
1614
|
-
async
|
|
1615
|
-
|
|
1616
|
-
|
|
1617
|
-
|
|
1618
|
-
|
|
1619
|
-
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
}
|
|
2023
|
+
async collectOutputFile(context, jobId, output, relativePath, filePath) {
|
|
2024
|
+
const stat = await lstat(filePath);
|
|
2025
|
+
if (!stat.isFile()) {
|
|
2026
|
+
throw new Error("output_entry_type_not_allowed");
|
|
2027
|
+
}
|
|
2028
|
+
const perArtifactLimit = positiveByteLimit(context.sandbox.limits.max_artifact_bytes, context.sandbox.limits.max_output_bytes);
|
|
2029
|
+
if (stat.size > perArtifactLimit) {
|
|
2030
|
+
throw new Error("output_artifact_size_limit_exceeded");
|
|
2031
|
+
}
|
|
2032
|
+
const bytes = await readFile(filePath);
|
|
2033
|
+
return {
|
|
2034
|
+
id: `output_${sha256Hex(Buffer.from(`${jobId}:${relativePath}`)).slice(0, 16)}`,
|
|
2035
|
+
job_id: jobId,
|
|
2036
|
+
name: output.path === relativePath ? output.name : `${output.name}/${relativePath}`,
|
|
2037
|
+
uri: buildMswarmLocalArtifactUri(jobId, relativePath),
|
|
2038
|
+
content_type: output.content_type,
|
|
2039
|
+
size_bytes: stat.size,
|
|
2040
|
+
sha256: sha256Hex(bytes),
|
|
2041
|
+
scope: "output",
|
|
2042
|
+
registered_at: this.now().toISOString(),
|
|
2043
|
+
store: context.store,
|
|
2044
|
+
access: defaultMswarmArtifactAccessPolicy(context.sandbox.trust_mode === "tenant-owned" ? "tenant-scoped" : "owner-local"),
|
|
2045
|
+
retention: defaultMswarmArtifactRetentionPolicy()
|
|
2046
|
+
};
|
|
1623
2047
|
}
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
"content-type": "application/json",
|
|
1629
|
-
authorization: `Bearer ${runtimeToken}`
|
|
1630
|
-
},
|
|
1631
|
-
body: JSON.stringify(payload)
|
|
1632
|
-
}, Math.max(this.timeoutMs, (payload.wait_ms || 0) + 5000));
|
|
2048
|
+
}
|
|
2049
|
+
/**
 * Test runner that echoes a message as `stdout` events and then reports
 * success. Used to exercise the generic job pipeline.
 */
export class MswarmTestEchoRunner {
    constructor() {
        this.id = TEST_ECHO_RUNNER_ID;
    }
    /**
     * Emits `args.message` `args.repeat` times (clamped to 1..20), optionally
     * waiting `args.delay_ms` before each echo, then emits a final progress event.
     *
     * @param {object} context - Run context (`job`, `signal`, `emitEvent`).
     * @returns {Promise<object>} Succeeded job result with echo metrics.
     * @throws {Error} With `args.message` when `args.fail === true`, or with the
     *   abort message when the signal is already aborted before an echo.
     */
    async run(context) {
        // Sample the start time before any work so started_at/finished_at
        // actually bracket the run instead of being taken back-to-back at the end.
        const startedAt = new Date().toISOString();
        const args = context.job.args || {};
        const message = optionalText(args.message) || "ok";
        const repeat = Math.max(1, Math.min(20, Math.floor(numberArg(args.repeat, 1))));
        const delayMs = boundedMilliseconds(args.delay_ms, 0, 30000);
        if (args.fail === true) {
            throw new Error(message);
        }
        for (let index = 0; index < repeat; index += 1) {
            if (context.signal.aborted) {
                throw new Error(abortErrorMessage(context.signal));
            }
            if (delayMs > 0) {
                await sleepWithAbort(delayMs, context.signal);
            }
            await context.emitEvent({
                type: "stdout",
                message,
                data: {
                    runner: this.id,
                    index,
                    repeat
                }
            });
        }
        await context.emitEvent({
            type: "progress",
            message: "echo complete",
            data: {
                completed: repeat,
                total: repeat
            }
        });
        return {
            job_id: context.job.idempotency_key || "local-generic-job",
            status: "succeeded",
            exit_code: 0,
            started_at: startedAt,
            finished_at: new Date().toISOString(),
            metrics: {
                runner: this.id,
                echoed: repeat,
                message
            }
        };
    }
}
|
|
2100
|
+
// Lookup tables from the lower-cased job argument to the value passed to
// Blender's --engine / --render-format options. They are null-prototype and
// frozen so a user-supplied key such as "constructor" or "__proto__" can never
// resolve to an inherited Object.prototype member.
const BLENDER_ENGINE_ARGS = Object.freeze({
    __proto__: null,
    cycles: "CYCLES",
    eevee: "BLENDER_EEVEE_NEXT",
    workbench: "BLENDER_WORKBENCH"
});
const BLENDER_OUTPUT_FORMAT_ARGS = Object.freeze({
    __proto__: null,
    png: "PNG",
    jpeg: "JPEG",
    open_exr: "OPEN_EXR"
});
|
|
2110
|
+
/**
 * Returns `value` when it is a number that is a safe integer greater than
 * zero; otherwise returns `null`.
 *
 * @param {unknown} value - Candidate value.
 * @returns {number|null}
 */
function positiveSafeInteger(value) {
    const accepted = typeof value === "number" && Number.isSafeInteger(value) && value > 0;
    return accepted ? value : null;
}
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
2116
|
+
/**
 * Parses the `frames` job argument into a single-frame or range selection.
 *
 * Accepts nothing (defaults to frame 1), a positive integer, or text of the
 * form "N" / "START-END" with each number at most seven digits.
 *
 * @param {unknown} value - Raw `args.frames` value.
 * @returns {{mode: "frame", frame: number, label: string, total: number} |
 *   {mode: "range", start: number, end: number, label: string, total: number}}
 * @throws {Error} When the value is malformed, reversed, or spans more than 10001 frames.
 */
function parseBlenderFrameSelection(value) {
    const singleFrame = (frame) => ({ mode: "frame", frame, label: String(frame), total: 1 });
    if (value === undefined || value === null) {
        return singleFrame(1);
    }
    const asNumber = positiveSafeInteger(value);
    if (asNumber !== null) {
        return singleFrame(asNumber);
    }
    const text = optionalText(value);
    const parts = text?.match(/^([1-9]\d{0,6})(?:-([1-9]\d{0,6}))?$/);
    if (!parts) {
        throw new Error("render.blender args.frames must be a positive frame number or start-end range");
    }
    const [, startText, endText] = parts;
    const start = Number(startText);
    const end = endText ? Number(endText) : start;
    const bothSafe = Number.isSafeInteger(start) && Number.isSafeInteger(end);
    if (!bothSafe || start <= 0 || end <= 0 || end < start) {
        throw new Error("render.blender args.frames must use a valid positive frame range");
    }
    if (end - start > 10000) {
        throw new Error("render.blender args.frames range exceeds the maximum supported 10001 frames");
    }
    return start === end
        ? singleFrame(start)
        : { mode: "range", start, end, label: `${start}-${end}`, total: end - start + 1 };
}
|
|
2143
|
+
/**
 * Maps the optional `engine` job argument to the Blender `--engine` value.
 *
 * @param {unknown} value - Raw `args.engine` value (case-insensitive).
 * @returns {{label: string, blender: string}|undefined} `undefined` when no engine was given.
 * @throws {Error} When the engine is not one of cycles, eevee or workbench.
 */
function normalizeBlenderEngine(value) {
    const raw = optionalText(value);
    if (!raw)
        return undefined;
    const key = raw.toLowerCase();
    // Own-property lookup only: a key such as "constructor" or "__proto__"
    // would otherwise resolve to an inherited Object.prototype member and slip
    // past validation as a non-string engine value.
    const blender = Object.hasOwn(BLENDER_ENGINE_ARGS, key) ? BLENDER_ENGINE_ARGS[key] : undefined;
    if (typeof blender !== "string" || !blender) {
        throw new Error("render.blender args.engine must be cycles, eevee, or workbench");
    }
    return { label: key, blender };
}
|
|
2154
|
+
/**
 * Maps the optional `output_format` job argument (default "png") to the
 * Blender `--render-format` value and the matching file extension.
 *
 * @param {unknown} value - Raw `args.output_format` value (case-insensitive).
 * @returns {{label: string, blender: string, extension: string}}
 * @throws {Error} When the format is not one of png, jpeg or open_exr.
 */
function normalizeBlenderOutputFormat(value) {
    const key = (optionalText(value) || "png").toLowerCase();
    // Own-property lookup only, so inherited keys such as "constructor" are
    // rejected instead of resolving to an Object.prototype member.
    const blender = Object.hasOwn(BLENDER_OUTPUT_FORMAT_ARGS, key) ? BLENDER_OUTPUT_FORMAT_ARGS[key] : undefined;
    if (typeof blender !== "string" || !blender) {
        throw new Error("render.blender args.output_format must be png, jpeg, or open_exr");
    }
    return { label: key, blender, extension: key === "open_exr" ? "exr" : key === "jpeg" ? "jpg" : "png" };
}
|
|
2162
|
+
/**
 * Parses the optional `resolution` job argument ("WIDTHxHEIGHT").
 *
 * @param {unknown} value - Raw `args.resolution` value.
 * @returns {{width: number, height: number, label: string}|undefined}
 *   `undefined` when no resolution was given.
 * @throws {Error} When the text is malformed or either side exceeds 16384.
 */
function parseBlenderResolution(value) {
    const MAX_SIDE = 16384;
    if (value === undefined || value === null) {
        return undefined;
    }
    const parts = optionalText(value)?.match(/^([1-9]\d{0,4})x([1-9]\d{0,4})$/i);
    if (!parts) {
        throw new Error("render.blender args.resolution must use WIDTHxHEIGHT");
    }
    const [width, height] = [Number(parts[1]), Number(parts[2])];
    if (Math.max(width, height) > MAX_SIDE) {
        throw new Error("render.blender args.resolution exceeds 16384x16384");
    }
    return { width, height, label: `${width}x${height}` };
}
|
|
2177
|
+
/**
 * Validates an optional Blender object name (scene or camera) from job args.
 *
 * @param {unknown} value - Raw argument value.
 * @param {string} label - Argument name used in the error message.
 * @returns {string|undefined} The name, or `undefined` when none was given.
 * @throws {Error} When the name is longer than 128 characters or contains
 *   NUL, carriage return or line feed.
 */
function safeBlenderSceneName(value, label) {
    const name = optionalText(value);
    if (!name) {
        return undefined;
    }
    const tooLong = name.length > 128;
    const hasControlBreak = /[\0\r\n]/.test(name);
    if (tooLong || hasControlBreak) {
        throw new Error(`render.blender args.${label} is not a safe Blender object name`);
    }
    return name;
}
|
|
2186
|
+
/**
 * Resolves the on-disk path of the scene input: the registered input named
 * "scene" if present, otherwise the first registered input.
 *
 * @param {object} context - Run context with `artifacts.registeredInputs`.
 * @returns {string} `local_path` of the chosen input.
 * @throws {Error} When the chosen input has no `local_path`.
 */
function blenderSceneInputPath(context) {
    const inputs = context.artifacts.registeredInputs;
    const named = inputs.find((candidate) => candidate.name === "scene");
    const chosen = named || inputs[0];
    const localPath = chosen?.local_path;
    if (!localPath) {
        throw new Error("render.blender requires a materialized scene input artifact");
    }
    return localPath;
}
|
|
2193
|
+
/**
 * Builds the `--render-output` pattern inside the first declared output
 * directory ("<output dir>/frame_####").
 *
 * @param {object} context - Run context with `artifacts.outputSpecs` and `artifacts.outputDir`.
 * @returns {string} Frame output pattern resolved under the output root.
 * @throws {Error} When no output is declared or the declared path looks like a
 *   file (its last segment ends in a 1-8 character alphanumeric extension).
 */
function blenderOutputPattern(context) {
    const [firstOutput] = context.artifacts.outputSpecs;
    if (!firstOutput) {
        throw new Error("render.blender requires a declared output directory");
    }
    const relativeDir = assertMswarmSafeRelativePath(firstOutput.path, "render_blender_output_path");
    const segments = relativeDir.split("/").filter(Boolean);
    const lastSegment = segments.at(-1) || relativeDir;
    const looksLikeFile = /\.[a-zA-Z0-9]{1,8}$/.test(lastSegment);
    if (looksLikeFile) {
        throw new Error("render.blender output path must be a directory, not a file path");
    }
    return resolveWithinRoot(context.artifacts.outputDir, `${relativeDir}/frame_####`);
}
|
|
2205
|
+
/**
 * Replaces host filesystem paths of the job (work dir, input/output dirs and
 * materialized input files) in `value` with stable placeholders.
 *
 * @param {object} context - Run context with `artifacts.workDir`, `inputDir`,
 *   `outputDir` and `registeredInputs`.
 * @param {string} value - Text that may contain local paths.
 * @returns {string} The text with every known local path replaced.
 */
function redactBlenderLocalPaths(context, value) {
    const replacements = [
        [context.artifacts.workDir, "[job-workdir]"],
        [context.artifacts.inputDir, "[job-inputs]"],
        [context.artifacts.outputDir, "[job-outputs]"],
        ...context.artifacts.registeredInputs.map((input) => [input.local_path, "[job-input]"])
    ];
    // Replace the longest paths first. A path nested under the work dir would
    // otherwise be rewritten by the shorter work-dir match before its own,
    // more specific placeholder gets a chance to apply.
    replacements.sort((left, right) => (right[0]?.length || 0) - (left[0]?.length || 0));
    let output = value;
    for (const [source, replacement] of replacements) {
        if (source) {
            output = output.split(source).join(replacement);
        }
    }
    return output;
}
|
|
2220
|
+
/**
 * Emits captured Blender output as one event per non-empty line, with local
 * paths redacted. Only the first 200 non-empty lines are emitted.
 *
 * @param {object} context - Run context providing `emitEvent`.
 * @param {string} type - Event type to emit (the runner passes "stdout" or "stderr").
 * @param {string} value - Captured output text.
 * @returns {Promise<void>}
 */
async function emitBlenderOutput(context, type, value) {
    const MAX_LINES = 200;
    const nonEmpty = [];
    for (const rawLine of value.split(/\r?\n/)) {
        const trimmed = rawLine.trim();
        if (trimmed) {
            nonEmpty.push(trimmed);
        }
    }
    for (const line of nonEmpty.slice(0, MAX_LINES)) {
        const message = redactBlenderLocalPaths(context, line);
        await context.emitEvent({ type, message, data: { runner: BLENDER_RENDER_RUNNER_ID } });
    }
}
|
|
2230
|
+
/**
 * Scans Blender output for frame markers ("Fra:N" or "Frame N") and emits one
 * progress event per distinct frame inside the requested selection.
 *
 * @param {object} context - Run context providing `emitEvent`.
 * @param {string} output - Combined Blender output text.
 * @param {object} frames - Frame selection (`mode`, `frame` or `start`/`end`, `total`).
 * @returns {Promise<void>}
 */
async function emitBlenderProgress(context, output, frames) {
    const isRange = frames.mode === "range";
    const firstFrame = isRange ? frames.start : frames.frame;
    const lastFrame = isRange ? frames.end : frames.frame;
    const reported = new Set();
    for (const line of output.split(/\r?\n/)) {
        const marker = line.match(/\bFra:(\d+)\b/i) || line.match(/\bFrame\s+(\d+)\b/i);
        if (!marker) {
            continue;
        }
        const frame = Number(marker[1]);
        const inSelection = Number.isSafeInteger(frame) && frame >= firstFrame && frame <= lastFrame;
        // Skip frames outside the request and frames already reported.
        if (!inSelection || reported.has(frame)) {
            continue;
        }
        reported.add(frame);
        await context.emitEvent({
            type: "progress",
            message: `rendered frame ${frame}`,
            data: {
                runner: BLENDER_RENDER_RUNNER_ID,
                frame,
                completed: reported.size,
                total: frames.total
            }
        });
    }
}
|
|
2255
|
+
/**
 * Builds the failed job result returned by the Blender runner.
 *
 * @param {object} job - Job request; its `idempotency_key` becomes the job id when set.
 * @param {string} code - Error code.
 * @param {string} message - Error message.
 * @param {string} startedAt - ISO timestamp of when the run started.
 * @returns {object} Failed result with exit code 1 and a non-retryable error.
 */
function blenderFailureResult(job, code, message, startedAt) {
    const jobId = job.idempotency_key || "render.blender";
    const finishedAt = new Date().toISOString();
    const error = { code, message, retryable: false };
    return {
        job_id: jobId,
        status: "failed",
        exit_code: 1,
        started_at: startedAt,
        finished_at: finishedAt,
        error
    };
}
|
|
2269
|
+
/**
 * Returns the Cycles compute device type requested for GPU renders.
 *
 * Per the original author's note, the GPU probe currently only reports NVIDIA
 * devices as available, which is why CUDA is the single backend returned here.
 *
 * @returns {string} Always "CUDA".
 */
function blenderGpuComputeDeviceType() {
    const computeBackend = "CUDA";
    return computeBackend;
}
|
|
2274
|
+
/**
 * Runner that renders a Blender scene by invoking the `blender` executable in
 * background mode through an injectable command runner.
 */
export class MswarmBlenderRenderRunner {
    /**
     * @param {Function} [runner] - Command runner `(command, args, options)`
     *   resolving to `{ stdout, stderr }`; defaults to `defaultCommandRunner`.
     */
    constructor(runner = defaultCommandRunner) {
        this.id = BLENDER_RENDER_RUNNER_ID;
        this.runner = runner;
    }
    /**
     * Validates the job arguments, runs Blender and reports the outcome.
     *
     * Policy, validation and command failures are returned as failed results;
     * only aborts (before start or during the command) are thrown.
     *
     * @param {object} context - Run context (`job`, `signal`, `artifacts`, `sandbox`, `emitEvent`).
     * @returns {Promise<object>} Succeeded or failed job result.
     */
    async run(context) {
        const startedAt = new Date().toISOString();
        const { job, signal } = context;
        if (signal.aborted) {
            throw new Error(abortErrorMessage(signal));
        }
        if (job.policy.trust_mode !== "owner-local") {
            return blenderFailureResult(job, "policy_denied", "render.blender is owner-local only until containerized Blender execution is available", startedAt);
        }
        const gpuRequested = Boolean(job.resources?.gpu);
        let plan;
        try {
            const args = job.args || {};
            // Properties are evaluated top to bottom, so the first invalid
            // argument in this order decides the validation error.
            plan = {
                scenePath: blenderSceneInputPath(context),
                frames: parseBlenderFrameSelection(args.frames),
                engine: normalizeBlenderEngine(args.engine),
                outputFormat: normalizeBlenderOutputFormat(args.output_format),
                resolution: parseBlenderResolution(args.resolution),
                sceneName: safeBlenderSceneName(args.scene, "scene"),
                cameraName: safeBlenderSceneName(args.camera, "camera"),
                outputPattern: blenderOutputPattern(context)
            };
            await mkdir(dirname(plan.outputPattern), { recursive: true });
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error || "render.blender validation failed");
            return blenderFailureResult(job, "validation_failed", reason, startedAt);
        }
        const { scenePath, frames, engine, outputFormat, resolution, sceneName, cameraName, outputPattern } = plan;
        // Python snippets executed by Blender before rendering.
        const pythonStatements = [];
        if (resolution) {
            pythonStatements.push(`bpy.context.scene.render.resolution_x=${resolution.width}`, `bpy.context.scene.render.resolution_y=${resolution.height}`);
        }
        if (cameraName) {
            pythonStatements.push(`camera=bpy.data.objects.get(${JSON.stringify(cameraName)})`, "bpy.context.scene.camera=camera if camera is not None else bpy.context.scene.camera");
        }
        if (gpuRequested) {
            const computeDeviceType = blenderGpuComputeDeviceType();
            pythonStatements.push("cycles_addon=bpy.context.preferences.addons.get('cycles')", "cycles_prefs=cycles_addon.preferences if cycles_addon is not None else None", `setattr(cycles_prefs,'compute_device_type',${JSON.stringify(computeDeviceType)}) if cycles_prefs is not None and hasattr(cycles_prefs,'compute_device_type') else None`, "getattr(cycles_prefs,'get_devices',lambda: None)() if cycles_prefs is not None else None", "setattr(bpy.context.scene.cycles,'device','GPU') if hasattr(bpy.context.scene,'cycles') else None", "[setattr(device,'use',True) for device in getattr(cycles_prefs,'devices',[]) if hasattr(device,'use')] if cycles_prefs is not None else None");
        }
        const frameArgs = frames.mode === "range"
            ? ["-s", String(frames.start), "-e", String(frames.end), "-a"]
            : ["--render-frame", String(frames.frame)];
        const blenderArgs = [
            "-b",
            scenePath,
            ...(sceneName ? ["--scene", sceneName] : []),
            ...(engine ? ["--engine", engine.blender] : []),
            ...(pythonStatements.length > 0 ? ["--python-expr", `import bpy; ${pythonStatements.join("; ")}`] : []),
            "--render-output",
            outputPattern,
            "--render-format",
            outputFormat.blender,
            ...frameArgs
        ];
        // Shared description of the render, used for the start event and the metrics.
        const describeRender = () => ({
            runner: this.id,
            frames: frames.label,
            engine: engine?.label || "scene-default",
            output_format: outputFormat.label,
            ...(resolution ? { resolution: resolution.label } : {}),
            gpu_requested: gpuRequested,
            render_device: gpuRequested ? "gpu" : "scene-default"
        });
        await context.emitEvent({
            type: "progress",
            message: "blender render starting",
            data: describeRender()
        });
        const requestedTimeoutMs = Math.floor((context.sandbox.limits.timeout_sec || DEFAULT_JOB_TIMEOUT_MS / 1000) * 1000);
        const timeoutMs = Math.max(1000, Math.min(DEFAULT_JOB_TIMEOUT_MS, requestedTimeoutMs));
        const requestedBuffer = Math.max(1024 * 1024, job.limits?.max_stdout_bytes || 0, job.limits?.max_stderr_bytes || 0);
        const maxBuffer = Math.min(DEFAULT_COMMAND_MAX_BUFFER, requestedBuffer);
        try {
            const result = await this.runner("blender", blenderArgs, {
                timeoutMs,
                maxBuffer,
                signal
            });
            await emitBlenderOutput(context, "stdout", result.stdout);
            await emitBlenderOutput(context, "stderr", result.stderr);
            await emitBlenderProgress(context, `${result.stdout}\n${result.stderr}`, frames);
            return {
                job_id: job.idempotency_key || "render.blender",
                status: "succeeded",
                exit_code: 0,
                started_at: startedAt,
                finished_at: new Date().toISOString(),
                metrics: describeRender()
            };
        }
        catch (error) {
            // An abort is propagated to the caller rather than reported as a runner failure.
            if (signal.aborted) {
                throw error;
            }
            const reason = error instanceof Error ? error.message : String(error || "Blender render failed");
            return blenderFailureResult(job, "runner_failed", redactBlenderLocalPaths(context, reason), startedAt);
        }
    }
}
|
|
2395
|
+
// Identifier: starts with an alphanumeric, then up to 127 of [a-zA-Z0-9_.-].
const SAFE_CUDA_IDENTIFIER = /^[a-zA-Z0-9][a-zA-Z0-9_.-]{0,127}$/;
// Token: 1-200 characters drawn from a conservative allow-list.
const SAFE_CUDA_TOKEN = /^[a-zA-Z0-9_@%+=:,./-]{1,200}$/;
// Manifest keys that are rejected outright wherever the manifest is checked.
const UNSAFE_CUDA_MANIFEST_KEYS = new Set([
    "command", "cmd", "shell", "entrypoint",
    "docker_args",
    "mount", "mounts", "volumes", "binds",
    "device", "devices",
    "privileged",
    "network", "host_network"
]);
|
|
2413
|
+
/**
 * Builds the failed job result returned for `cuda.run` jobs.
 *
 * @param {object} job - Job request; its `idempotency_key` becomes the job id when set.
 * @param {string} code - Error code.
 * @param {string} message - Error message.
 * @param {string} startedAt - ISO timestamp of when the run started.
 * @returns {object} Failed result with exit code 1 and a non-retryable error.
 */
function cudaFailureResult(job, code, message, startedAt) {
    const jobId = job.idempotency_key || "cuda.run";
    const finishedAt = new Date().toISOString();
    const error = { code, message, retryable: false };
    return {
        job_id: jobId,
        status: "failed",
        exit_code: 1,
        started_at: startedAt,
        finished_at: finishedAt,
        error
    };
}
|
|
2427
|
+
/**
 * Validates a CUDA identifier (for example a profile or target name).
 *
 * @param {unknown} value - Raw value.
 * @param {string} label - Prefix of the error message.
 * @returns {string} The validated identifier text.
 * @throws {Error} `<label>_invalid` when the value is empty or does not match
 *   `SAFE_CUDA_IDENTIFIER`.
 */
function safeCudaIdentifier(value, label) {
    const candidate = optionalText(value);
    const isValid = Boolean(candidate) && SAFE_CUDA_IDENTIFIER.test(candidate);
    if (isValid) {
        return candidate;
    }
    throw new Error(`${label}_invalid`);
}
|
|
2434
|
+
/**
 * Normalizes a value with `optionalText` and validates it with
 * `assertMswarmSafeRelativePath`.
 *
 * @param {unknown} value - Raw path value.
 * @param {string} label - Label forwarded to the path validator.
 * @returns {string} Whatever `assertMswarmSafeRelativePath` returns for the path.
 */
function safeCudaRelativePath(value, label) {
    const candidate = optionalText(value);
    return assertMswarmSafeRelativePath(candidate, label);
}
|
|
2437
|
+
/**
 * Validates a single CUDA flag or argument token.
 *
 * @param {unknown} value - Raw token.
 * @param {string} label - Prefix of the error message.
 * @returns {string} The validated token text.
 * @throws {Error} `<label>_invalid` when the token is empty, does not match
 *   `SAFE_CUDA_TOKEN`, or contains a shell metacharacter or line break.
 */
function safeCudaToken(value, label) {
    const token = optionalText(value);
    const matchesAllowList = Boolean(token) && SAFE_CUDA_TOKEN.test(token);
    // Explicit second check for shell metacharacters and line breaks, applied
    // in addition to the allow-list above.
    const hasShellMeta = matchesAllowList && /[`$;&|<>\r\n]/.test(token);
    if (!matchesAllowList || hasShellMeta) {
        throw new Error(`${label}_invalid`);
    }
    return token;
}
|
|
2444
|
+
/**
 * Validates an optional list of CUDA tokens.
 *
 * @param {unknown} value - Raw list; `undefined` is treated as an empty list.
 * @param {string} label - Prefix for error messages; each entry is validated
 *   under `<label>_<index>`.
 * @returns {string[]} Validated tokens in their original order.
 * @throws {Error} `<label>_must_be_array` for non-arrays, or the error raised by
 *   `safeCudaToken` for the first invalid entry.
 */
function safeCudaTokenList(value, label) {
    if (value === undefined) {
        return [];
    }
    if (!Array.isArray(value)) {
        throw new Error(`${label}_must_be_array`);
    }
    const tokens = [];
    value.forEach((entry, position) => {
        tokens.push(safeCudaToken(entry, `${label}_${position}`));
    });
    return tokens;
}
|
|
2452
|
+
/**
 * Rejects a manifest record that has any own enumerable key listed in
 * `UNSAFE_CUDA_MANIFEST_KEYS`.
 *
 * @param {object} record - Manifest section to inspect.
 * @param {string} label - Prefix of the error message.
 * @throws {Error} `<label>_<key>_not_allowed` for the first offending key in key order.
 */
function assertNoUnsafeCudaManifestKeys(record, label) {
    const offendingKey = Object.keys(record).find((key) => UNSAFE_CUDA_MANIFEST_KEYS.has(key));
    if (offendingKey !== undefined) {
        throw new Error(`${label}_${offendingKey}_not_allowed`);
    }
}
|
|
2459
|
+
/**
 * Extracts and validates the `cuda.run` job arguments.
 *
 * @param {object} job - Job request with optional `args`.
 * @returns {{manifestPath: string, profile: string, target: string}}
 * @throws {Error} Whatever the path/identifier validators throw for the first
 *   invalid argument (manifest path, then profile, then target).
 */
function parseCudaRunArgs(job) {
    const { manifest_path: rawManifestPath, profile: rawProfile, target: rawTarget } = job.args || {};
    const manifestPath = safeCudaRelativePath(rawManifestPath, "cuda_manifest_path");
    const profile = safeCudaIdentifier(rawProfile, "cuda_profile");
    const target = safeCudaIdentifier(rawTarget, "cuda_target");
    return { manifestPath, profile, target };
}
|
|
2467
|
+
/**
 * Locates the materialized CUDA package archive among the registered inputs:
 * the input named "package" if it has a local path, otherwise the first
 * materialized input that is not named "manifest".
 *
 * @param {object} context - Run context with `artifacts.registeredInputs` and `artifacts.inputDir`.
 * @returns {{hostPath: string, inputPath: string}} Local path of the archive and
 *   its path relative to the input directory.
 * @throws {Error} `cuda_package_artifact_required` when none is found, or
 *   `cuda_package_archive_must_be_targz` when it is not a .tar.gz / .tgz file.
 */
function cudaPackageArchive(context) {
    const { registeredInputs, inputDir } = context.artifacts;
    const isMaterialized = (candidate) => Boolean(candidate.local_path);
    const preferred = registeredInputs.find((candidate) => candidate.name === "package" && isMaterialized(candidate));
    const chosen = preferred
        || registeredInputs.find((candidate) => isMaterialized(candidate) && candidate.name !== "manifest");
    const hostPath = chosen?.local_path;
    if (!hostPath) {
        throw new Error("cuda_package_artifact_required");
    }
    const inputPath = assertMswarmSafeRelativePath(relative(inputDir, hostPath), "cuda_package_input_path");
    const isTarGz = /(\.tar\.gz|\.tgz)$/i.test(inputPath);
    if (!isTarGz) {
        throw new Error("cuda_package_archive_must_be_targz");
    }
    return { hostPath, inputPath };
}
|
|
2479
|
+
/**
 * Creates the error used to reject a CUDA package archive.
 *
 * @param {string} [reason] - Failure reason; a falsy value becomes "invalid".
 * @returns {Error} Error whose message is `cuda_package_archive_<reason>`.
 */
function cudaArchiveValidationError(reason) {
    const suffix = reason || "invalid";
    return new Error(`cuda_package_archive_${suffix}`);
}
|
|
2482
|
+
/**
 * Classifies one line of a verbose `tar` listing by its first non-whitespace
 * character (the entry-type marker of the mode column).
 *
 * @param {string} line - One line of the verbose listing.
 * @returns {"directory"|"file"|"symlink"|"hardlink"|"device"|"other"} Entry
 *   type; a line with no marker at all is reported as "file".
 */
function cudaTarVerboseEntryType(line) {
    const marker = line.trimStart()[0];
    switch (marker) {
        case "d":
            return "directory";
        case "-":
            return "file";
        case "l":
            return "symlink";
        case "h":
            return "hardlink";
        case "b":
        case "c":
            return "device";
        default:
            return marker ? "other" : "file";
    }
}
|
|
2496
|
+
/**
 * Validates a CUDA package archive without extracting it: every listed entry
 * path must pass `validateMswarmArchiveEntry`, the archive must not be empty,
 * and the verbose listing must contain only regular files and directories.
 *
 * @param {object} context - Run context; `context.signal` is forwarded to the runner.
 * @param {Function} runner - Command runner `(command, args, options)` resolving to `{ stdout }`.
 * @param {{hostPath: string}} archive - Archive location on disk.
 * @returns {Promise<void>}
 * @throws {Error} `cuda_package_archive_<reason>` on the first violation.
 */
async function validateCudaPackageArchive(context, runner, archive) {
    const tarOptions = { timeoutMs: 5000, maxBuffer: 512 * 1024, signal: context.signal };
    // Pass 1: entry names only, to validate every path.
    const listing = await runner("tar", ["-tzf", archive.hostPath], tarOptions);
    const entryPaths = listing.stdout
        .split(/\r?\n/)
        .map((rawLine) => rawLine.trim())
        .filter(Boolean);
    for (const entryPath of entryPaths) {
        const entryType = entryPath.endsWith("/") ? "directory" : "file";
        const verdict = validateMswarmArchiveEntry({ path: entryPath, type: entryType });
        if (!verdict.ok) {
            throw cudaArchiveValidationError(verdict.reason);
        }
    }
    if (entryPaths.length === 0) {
        throw cudaArchiveValidationError("empty");
    }
    // Pass 2: verbose listing, to reject anything that is not a file or directory.
    const verboseListing = await runner("tar", ["-tvzf", archive.hostPath], tarOptions);
    for (const rawLine of verboseListing.stdout.split(/\r?\n/)) {
        if (!rawLine.trim()) {
            continue;
        }
        const entryType = cudaTarVerboseEntryType(rawLine);
        const isPlainEntry = entryType === "file" || entryType === "directory";
        if (!isPlainEntry) {
            const verdict = validateMswarmArchiveEntry({ path: "entry", type: entryType });
            throw cudaArchiveValidationError(verdict.reason);
        }
    }
}
|
|
2531
|
+
/**
 * Reads the CUDA package manifest text, preferring a manifest materialized
 * directly in the input directory and otherwise extracting it from the
 * package archive.
 *
 * @param {object} context - Run context (`artifacts.inputDir`, `signal`).
 * @param {Function} runner - Command runner `(command, args, options)` resolving to `{ stdout }`.
 * @param {{manifestPath: string}} args - Parsed run arguments.
 * @returns {Promise<string>} Manifest file contents.
 * @throws {Error} Filesystem errors other than ENOENT, archive lookup errors,
 *   or whatever the command runner rejects with.
 */
async function readCudaManifestText(context, runner, args) {
    const directManifestPath = resolveWithinRoot(context.artifacts.inputDir, args.manifestPath);
    try {
        const directStat = await lstat(directManifestPath);
        if (directStat.isFile()) {
            return await readFile(directManifestPath, "utf8");
        }
    }
    catch (error) {
        // A missing direct manifest simply means "look inside the archive".
        if (error.code !== "ENOENT") {
            throw error;
        }
    }
    const archive = cudaPackageArchive(context);
    // "--" ends option parsing, so a job-supplied manifest path that begins
    // with "-" is always treated as an archive member name, never as a tar option.
    const extracted = await runner("tar", ["-xOf", archive.hostPath, "--", args.manifestPath], {
        timeoutMs: 5000,
        maxBuffer: 256 * 1024,
        signal: context.signal
    });
    return extracted.stdout;
}
|
|
2552
|
+
/**
 * Parses and validates a CUDA package manifest for the requested profile and
 * target, returning only the fields needed to build and run it.
 *
 * @param {string} text - Manifest JSON text.
 * @param {{profile: string, target: string}} args - Parsed run arguments.
 * @param {object} policy - Job policy (`allowed_package_publishers`, `allowed_images`).
 * @returns {object} `{ schemaVersion, packageName, publisher, image, compiler,
 *   source, output, flags, runArgs }`.
 * @throws {SyntaxError} When `text` is not valid JSON.
 * @throws {Error} A `cuda_*` error for the first failed check, in the order the
 *   checks appear below.
 */
function parseCudaPackageManifest(text, args, policy) {
    const fail = (code) => {
        throw new Error(code);
    };
    const manifest = objectRecord(JSON.parse(text));
    if (!manifest) {
        fail("cuda_manifest_must_be_object");
    }
    assertNoUnsafeCudaManifestKeys(manifest, "cuda_manifest");
    const schemaVersion = optionalText(manifest.schema_version);
    if (schemaVersion !== "2026-06-14") {
        fail("cuda_manifest_schema_version_invalid");
    }
    const packageInfo = objectRecord(manifest.package);
    const publisher = optionalText(packageInfo?.publisher);
    // The publisher is only checked when the policy lists allowed publishers.
    const publisherAllowList = policy.allowed_package_publishers;
    if (publisherAllowList?.length && (!publisher || !publisherAllowList.includes(publisher))) {
        fail("cuda_manifest_publisher_not_allowed");
    }
    const profile = objectRecord(objectRecord(manifest.profiles)?.[args.profile]);
    const target = objectRecord(objectRecord(manifest.targets)?.[args.target]);
    if (!profile) {
        fail("cuda_manifest_profile_not_found");
    }
    if (!target) {
        fail("cuda_manifest_target_not_found");
    }
    assertNoUnsafeCudaManifestKeys(profile, "cuda_manifest_profile");
    assertNoUnsafeCudaManifestKeys(target, "cuda_manifest_target");
    const image = optionalText(profile.image);
    // The image must be in the approved set AND allowed by the job policy.
    if (!image || !APPROVED_NVIDIA_CUDA_IMAGES.has(image)) {
        fail("cuda_image_not_approved");
    }
    if (!policy.allowed_images?.includes(image)) {
        fail("cuda_image_not_allowed_by_policy");
    }
    const compiler = optionalText(profile.compiler) || "nvcc";
    if (compiler !== "nvcc") {
        fail("cuda_compiler_not_allowed");
    }
    const source = safeCudaRelativePath(target.source, "cuda_target_source");
    if (!source.endsWith(".cu")) {
        fail("cuda_target_source_must_be_cu");
    }
    const output = safeCudaRelativePath(optionalText(target.output) || `bin/${args.target}`, "cuda_target_output");
    const packageName = optionalText(packageInfo?.name) ?? undefined;
    const profileFlags = safeCudaTokenList(profile.flags, "cuda_profile_flags");
    const targetFlags = safeCudaTokenList(target.flags, "cuda_target_flags");
    const runArgs = safeCudaTokenList(target.args, "cuda_target_args");
    return {
        schemaVersion,
        packageName,
        publisher: publisher ?? undefined,
        image,
        compiler,
        source,
        output,
        flags: [...profileFlags, ...targetFlags],
        runArgs
    };
}
|
|
2610
|
+
/**
 * Replaces host-side job paths found in `value` with fixed placeholders.
 *
 * Longer paths are substituted first so a registered input file is replaced as a
 * whole before its containing directory is.
 *
 * @param {object} context Job context; `artifacts.registeredInputs`, `inputDir`,
 *   `outputDir` and `workDir` are read.
 * @param {string} value Text to redact.
 * @returns {string} The text with every occurrence of a known path replaced.
 */
function redactCudaLocalPaths(context, value) {
    const { registeredInputs, inputDir, outputDir, workDir } = context.artifacts;
    const pairs = registeredInputs.map((entry) => [entry.local_path, "[job-input]"]);
    pairs.push([inputDir, "[job-inputs]"], [outputDir, "[job-outputs]"], [workDir, "[job-workdir]"]);
    const needleLength = (pair) => pair[0]?.length || 0;
    pairs.sort((a, b) => needleLength(b) - needleLength(a));
    // Entries with an empty or missing path are skipped.
    return pairs.reduce((text, [needle, placeholder]) => (needle ? text.split(needle).join(placeholder) : text), value);
}
|
|
2626
|
+
/**
 * Emits captured command output as job events, one event per non-empty line.
 *
 * Lines are trimmed, blank lines are dropped, and at most the first 200 remaining
 * lines are emitted. Each message is passed through redactCudaLocalPaths first.
 *
 * @param {object} context Job context providing `emitEvent`.
 * @param {string} type Event type to emit.
 * @param {string} value Raw output text.
 * @returns {Promise<void>}
 */
async function emitCudaOutput(context, type, value) {
    const meaningfulLines = [];
    for (const rawLine of value.split(/\r?\n/)) {
        const trimmed = rawLine.trim();
        if (trimmed) {
            meaningfulLines.push(trimmed);
        }
    }
    // Events are awaited one at a time so they are emitted in output order.
    for (const line of meaningfulLines.slice(0, 200)) {
        const message = redactCudaLocalPaths(context, line);
        await context.emitEvent({ type, message, data: { runner: CUDA_PACKAGE_RUNNER_ID } });
    }
}
|
|
2636
|
+
/**
 * Builds the shell script text that unpacks the package archive, compiles the
 * selected CUDA source with nvcc and runs the resulting binary.
 *
 * @param {{ archiveInputPath: string, selection: object }} input Archive path relative to
 *   /workspace/inputs and the selection (output, source, flags, runArgs are read).
 * @returns {string} Newline-joined script.
 */
function buildCudaRunnerScript(input) {
    const { archiveInputPath, selection } = input;
    const sourceRoot = "/workspace/work/src";
    const binaryPath = `/workspace/work/${selection.output}`;
    // Every token is shell-quoted individually before being joined into a command line.
    const toCommandLine = (tokens) => tokens.map(quotePosixShellValue).join(" ");
    const compileCommand = toCommandLine([
        "/usr/local/cuda/bin/nvcc",
        ...selection.flags,
        "-o",
        binaryPath,
        `${sourceRoot}/${selection.source}`
    ]);
    const runCommand = toCommandLine([binaryPath, ...selection.runArgs]);
    const quotedArchive = quotePosixShellValue(`/workspace/inputs/${archiveInputPath}`);
    const quotedOutputDir = quotePosixShellValue(dirname(binaryPath));
    const scriptLines = [
        "set -euo pipefail",
        "mkdir -p /workspace/work/src /workspace/outputs",
        `tar -xzf ${quotedArchive} -C /workspace/work/src`,
        `mkdir -p ${quotedOutputDir}`,
        "cd /workspace/work/src",
        compileCommand,
        runCommand
    ];
    return scriptLines.join("\n");
}
|
|
2660
|
+
/**
 * Formats a `host:container:mode` volume specification.
 *
 * @param {string} hostPath Path on the host.
 * @param {string} containerPath Mount point inside the container.
 * @param {string} mode Mount mode text (the visible callers pass "ro" or "rw").
 * @returns {string} The colon-separated specification.
 */
function dockerBindMount(hostPath, containerPath, mode) {
    return "".concat(hostPath, ":", containerPath, ":", mode);
}
|
|
2663
|
+
/**
 * Builds the `docker run` argument list for a CUDA package job.
 *
 * The container is started with no network, the nvidia runtime, a read-only root
 * filesystem, all capabilities dropped and no-new-privileges. Inputs and the runner
 * script are mounted read-only; outputs and the work directory are mounted read-write.
 *
 * @param {object} input
 * @param {object} input.context Job context (job.resources, sandbox.container.user, artifacts).
 * @param {object} input.selection Parsed manifest selection; `image` is read.
 * @param {string} input.workPath Host work directory mounted at /workspace/work.
 * @param {string} input.scriptPath Host path of the runner script.
 * @returns {string[]} Arguments to pass to the docker command.
 */
function buildCudaDockerArgs(input) {
    const { context, selection, workPath, scriptPath } = input;
    const resources = context.job.resources;
    const gpuCount = Math.max(1, resources?.gpu?.count || 1);
    // Renders a gigabyte amount as a size value. Whole gigabytes keep the original
    // "<n>g" form. A positive amount below 1 GB used to floor to "0g", which no longer
    // expresses the requested cap, so it is emitted in megabytes instead.
    const formatSize = (gigabytes, gigaUnit, megaUnit) => {
        const wholeGigabytes = Math.floor(gigabytes);
        if (gigabytes > 0 && wholeGigabytes < 1) {
            return `${Math.max(1, Math.floor(gigabytes * 1024))}${megaUnit}`;
        }
        return `${wholeGigabytes}${gigaUnit}`;
    };
    const args = [
        "run",
        "--rm",
        "--pull",
        "never",
        "--network",
        "none",
        "--runtime",
        "nvidia",
        "--gpus",
        `count=${gpuCount}`,
        "--user",
        context.sandbox.container.user,
        "--read-only",
        "--cap-drop",
        "ALL",
        "--security-opt",
        "no-new-privileges",
        "--workdir",
        "/workspace",
        "--env",
        "CUDA_CACHE_PATH=/workspace/work/.cuda-cache",
        "--tmpfs",
        "/tmp:rw,nosuid,nodev,size=64m"
    ];
    // Limits are only added when the job declares a finite, non-zero value.
    if (Number.isFinite(resources?.memory_gb) && resources?.memory_gb) {
        args.push("--memory", formatSize(resources.memory_gb, "g", "m"));
    }
    if (Number.isFinite(resources?.disk_gb) && resources?.disk_gb) {
        args.push("--storage-opt", `size=${formatSize(resources.disk_gb, "G", "M")}`);
    }
    args.push(
        "-v", dockerBindMount(context.artifacts.inputDir, "/workspace/inputs", "ro"),
        "-v", dockerBindMount(context.artifacts.outputDir, "/workspace/outputs", "rw"),
        "-v", dockerBindMount(workPath, "/workspace/work", "rw"),
        "-v", dockerBindMount(scriptPath, "/workspace/__mcoda_cuda_run.sh", "ro"),
        selection.image,
        "/bin/bash",
        "/workspace/__mcoda_cuda_run.sh"
    );
    return args;
}
|
|
2699
|
+
/**
 * Generic-job runner that compiles and runs a CUDA package inside a Docker container.
 */
export class MswarmCudaPackageRunner {
    /**
     * @param {Function} [runner] Command runner invoked as `(command, args, options)`;
     *   its resolved value is read for `stdout` and `stderr`.
     */
    constructor(runner = defaultCommandRunner) {
        this.id = CUDA_PACKAGE_RUNNER_ID;
        this.runner = runner;
    }
    /**
     * Validates the job, prepares the work directory and runner script, runs the
     * container and reports the outcome.
     *
     * @param {object} context Job context (job, signal, artifacts, sandbox, emitEvent).
     * @returns {Promise<object>} A succeeded result, or a failure result built by
     *   cudaFailureResult for policy, validation and runner failures.
     * @throws {Error} When the job's abort signal is (or becomes) aborted.
     */
    async run(context) {
        const startedAt = new Date().toISOString();
        if (context.signal.aborted) {
            throw new Error(abortErrorMessage(context.signal));
        }
        if (context.job.policy.network !== "none") {
            return cudaFailureResult(context.job, "policy_denied", "cuda.run requires network policy none", startedAt);
        }
        // Anything other than an explicit `false` is treated as a denial.
        if (context.job.policy.allow_raw_command !== false) {
            return cudaFailureResult(context.job, "policy_denied", "cuda.run does not allow raw commands", startedAt);
        }
        if (!context.job.resources?.gpu) {
            return cudaFailureResult(context.job, "validation_failed", "cuda.run requires GPU resources", startedAt);
        }
        if (!context.job.outputs?.length) {
            return cudaFailureResult(context.job, "validation_failed", "cuda.run requires declared outputs", startedAt);
        }
        let args;
        let archive;
        let selection;
        let scriptPath;
        let workPath;
        try {
            args = parseCudaRunArgs(context.job);
            archive = cudaPackageArchive(context);
            await validateCudaPackageArchive(context, this.runner, archive);
            const manifestText = await readCudaManifestText(context, this.runner, args);
            selection = parseCudaPackageManifest(manifestText, args, context.job.policy);
            scriptPath = resolveWithinRoot(context.artifacts.workDir, "__mcoda_cuda_run.sh");
            workPath = resolveWithinRoot(context.artifacts.workDir, "cuda-work");
            await mkdir(workPath, { recursive: true });
            // NOTE(review): world-writable modes are presumably needed so the container
            // user can write to the bind mounts — confirm.
            await chmod(workPath, 0o777);
            await chmod(context.artifacts.outputDir, 0o777);
            await writeFile(scriptPath, buildCudaRunnerScript({ archiveInputPath: archive.inputPath, selection }), { mode: 0o644 });
        }
        catch (error) {
            // An abort during preparation is rethrown, matching the abort handling at
            // the start of run() and around the container run below, rather than being
            // reported as a validation failure.
            if (context.signal.aborted) {
                throw error;
            }
            return cudaFailureResult(context.job, "validation_failed", redactCudaLocalPaths(context, error instanceof Error ? error.message : String(error || "cuda.run validation failed")), startedAt);
        }
        const dockerArgs = buildCudaDockerArgs({
            context,
            selection,
            archiveInputPath: archive.inputPath,
            scriptPath,
            workPath
        });
        await context.emitEvent({
            type: "progress",
            message: "cuda package container starting",
            data: {
                runner: this.id,
                image: selection.image,
                profile: args.profile,
                target: args.target,
                gpu_count: Math.max(1, context.job.resources.gpu.count || 1),
                network: "none",
                container_user: context.sandbox.container.user
            }
        });
        // The timeout is clamped between one second and DEFAULT_JOB_TIMEOUT_MS.
        const timeoutMs = Math.max(1000, Math.min(DEFAULT_JOB_TIMEOUT_MS, Math.floor((context.sandbox.limits.timeout_sec || DEFAULT_JOB_TIMEOUT_MS / 1000) * 1000)));
        // The buffer is at least 1 MiB and never exceeds DEFAULT_COMMAND_MAX_BUFFER.
        const maxBuffer = Math.min(DEFAULT_COMMAND_MAX_BUFFER, Math.max(1024 * 1024, context.job.limits?.max_stdout_bytes || 0, context.job.limits?.max_stderr_bytes || 0));
        try {
            const result = await this.runner("docker", dockerArgs, {
                timeoutMs,
                maxBuffer,
                signal: context.signal
            });
            await emitCudaOutput(context, "stdout", result.stdout);
            await emitCudaOutput(context, "stderr", result.stderr);
            await context.emitEvent({
                type: "progress",
                message: "cuda package container completed",
                data: {
                    runner: this.id,
                    profile: args.profile,
                    target: args.target
                }
            });
            return {
                job_id: context.job.idempotency_key || "cuda.run",
                status: "succeeded",
                exit_code: 0,
                started_at: startedAt,
                finished_at: new Date().toISOString(),
                metrics: {
                    runner: this.id,
                    image: selection.image,
                    profile: args.profile,
                    target: args.target,
                    package: selection.packageName,
                    publisher: selection.publisher,
                    gpu_count: Math.max(1, context.job.resources.gpu.count || 1),
                    network: "none",
                    container_user: context.sandbox.container.user
                }
            };
        }
        catch (error) {
            if (context.signal.aborted) {
                throw error;
            }
            return cudaFailureResult(context.job, "runner_failed", redactCudaLocalPaths(context, error instanceof Error ? error.message : String(error || "cuda.run failed")), startedAt);
        }
    }
}
|
|
2808
|
+
/**
 * Creates the built-in generic job runners: test echo, Blender render and CUDA package.
 *
 * @param {Function} [runner] Command runner handed to the Blender and CUDA runners.
 * @returns {object[]} Runner instances in that order.
 */
function createDefaultGenericJobRunners(runner = defaultCommandRunner) {
    const echoRunner = new MswarmTestEchoRunner();
    const blenderRunner = new MswarmBlenderRenderRunner(runner);
    const cudaRunner = new MswarmCudaPackageRunner(runner);
    return [echoRunner, blenderRunner, cudaRunner];
}
|
|
2811
|
+
/**
 * Returns the distinct non-empty strings of `values` in default sort order.
 *
 * @param {Array} values Candidate values; anything that is not a non-empty string is dropped.
 * @returns {string[]} Deduplicated, sorted strings.
 */
function uniqueSortedStrings(values) {
    const isNonEmptyString = (candidate) => typeof candidate === "string" && candidate.length > 0;
    const distinct = new Set(values.filter(isNonEmptyString));
    return [...distinct].sort();
}
|
|
2814
|
+
/**
 * Resolves the capability probe timeout from configuration.
 *
 * @param {object} config Runtime configuration; `capabilityProbeTimeoutMs` is read.
 * @returns {number} Result of parsePositiveInteger with
 *   DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS as its second argument.
 */
function capabilityProbeTimeoutMs(config) {
    const { capabilityProbeTimeoutMs: configuredTimeout } = config;
    return parsePositiveInteger(configuredTimeout, DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS);
}
|
|
2817
|
+
/**
 * Produces a display message for a failed capability probe.
 *
 * @param {unknown} error The thrown value.
 * @returns {string} The error's message when it is an Error with a non-empty message;
 *   otherwise the stringified value, or "capability probe failed" when it is falsy.
 */
function capabilityCommandFailureMessage(error) {
    const hasOwnMessage = error instanceof Error && Boolean(error.message);
    return hasOwnMessage ? error.message : String(error || "capability probe failed");
}
|
|
2822
|
+
/**
 * Decides whether a probe failure looks like "the command is not installed".
 *
 * @param {unknown} error The thrown value.
 * @param {string} [stderr] Optional captured stderr, searched as well.
 * @returns {boolean} True when the lower-cased failure text matches one of the
 *   known "missing executable" phrases.
 */
function isMissingCapabilityCommand(error, stderr = "") {
    const missingPattern = /enoent|not found|command not found|no such file|executable file not found/;
    const failureText = "".concat(capabilityCommandFailureMessage(error), "\n", stderr).toLowerCase();
    return missingPattern.test(failureText);
}
|
|
2826
|
+
/**
 * Runs a probe command and converts the outcome into a result object instead of throwing.
 *
 * @param {Function} runner Command runner invoked as `(command, args, options)`.
 * @param {string} command Executable name.
 * @param {string[]} args Command arguments.
 * @param {number} timeoutMs Timeout passed to the runner.
 * @returns {Promise<object>} `{ ok: true, stdout, stderr }` on success, otherwise
 *   `{ ok: false, missing, message }`.
 */
async function runCapabilityCommand(runner, command, args, timeoutMs) {
    try {
        // Probe output is capped at 512 KiB, or less if the global cap is lower.
        const options = { timeoutMs, maxBuffer: Math.min(DEFAULT_COMMAND_MAX_BUFFER, 512 * 1024) };
        const completed = await runner(command, args, options);
        return { ok: true, stdout: completed.stdout, stderr: completed.stderr };
    }
    catch (failure) {
        const missing = isMissingCapabilityCommand(failure);
        const message = capabilityCommandFailureMessage(failure);
        return { ok: false, missing, message };
    }
}
|
|
2842
|
+
/**
 * Converts a memory figure in megabytes to gigabytes, rounded to one decimal place.
 *
 * @param {string} [value] Memory text; every character except digits and "." is ignored.
 * @returns {number|undefined} Gigabytes, or undefined when the text is empty or does
 *   not yield a positive finite number.
 */
function parseNvidiaSmiMemoryGb(value) {
    if (!value) {
        return undefined;
    }
    const megabytes = Number(value.replace(/[^\d.]/g, ""));
    const isUsable = Number.isFinite(megabytes) && megabytes > 0;
    return isUsable ? Math.round((megabytes / 1024) * 10) / 10 : undefined;
}
|
|
2850
|
+
/**
 * Parses comma-separated GPU rows into a GPU capability probe.
 *
 * Each non-blank row is read as
 * `index, name, memory (MB), driver version, compute capability[, CUDA version]`.
 *
 * @param {string} stdout Raw command output, one GPU per line.
 * @returns {object} Probe with status, count, vendors, devices and, when known,
 *   cuda_versions and max_vram_gb; a message is included when no rows were found.
 */
function parseNvidiaGpuProbe(stdout) {
    const rows = stdout
        .split(/\r?\n/)
        .map((line) => line.trim())
        .filter(Boolean);
    const seenCudaVersions = new Set();
    const devices = rows.map((row, position) => {
        const [index, name, memoryMb, driverVersion, computeCapability, cudaVersion] = row
            .split(",")
            .map((field) => field.trim());
        if (cudaVersion) {
            seenCudaVersions.add(cudaVersion);
        }
        const vramGb = parseNvidiaSmiMemoryGb(memoryMb);
        return {
            // Falls back to the row position when the index column is empty.
            id: index ? `gpu-${index}` : `gpu-${position}`,
            vendor: "nvidia",
            ...(name ? { name } : {}),
            ...(vramGb ? { vram_gb: vramGb } : {}),
            ...(driverVersion ? { driver_version: driverVersion } : {}),
            ...(cudaVersion ? { cuda_version: cudaVersion } : {}),
            ...(computeCapability ? { compute_capability: computeCapability } : {}),
            capabilities: ["cuda"]
        };
    });
    // Devices without a parsed VRAM figure do not contribute to the maximum.
    const vramValues = devices.map((device) => device.vram_gb).filter((vram) => Number.isFinite(vram));
    const maxVramGb = vramValues.length > 0 ? Math.max(...vramValues) : undefined;
    const hasDevices = devices.length > 0;
    return {
        status: hasDevices ? "available" : "missing",
        count: devices.length,
        vendors: hasDevices ? ["nvidia"] : [],
        devices,
        ...(seenCudaVersions.size > 0 ? { cuda_versions: Array.from(seenCudaVersions).sort() } : {}),
        ...(maxVramGb !== undefined ? { max_vram_gb: maxVramGb } : {}),
        ...(hasDevices ? {} : { message: "nvidia-smi returned no GPU rows" })
    };
}
|
|
2890
|
+
/**
 * Extracts the version that follows a "CUDA Version:" label.
 *
 * @param {string} stdout Text to search (case-insensitive).
 * @returns {string|undefined} Version such as "12.4", or undefined when absent.
 */
function parseNvidiaSmiCudaVersion(stdout) {
    const found = stdout.match(/CUDA\s+Version:\s*([0-9]+(?:\.[0-9]+)?)/i);
    return found ? found[1] : undefined;
}
|
|
2893
|
+
/**
 * Probes NVIDIA GPUs by running nvidia-smi twice: once for the device inventory and
 * once with no arguments to read the CUDA version from its output.
 *
 * @param {Function} runner Command runner.
 * @param {number} timeoutMs Timeout applied to each command.
 * @returns {Promise<object>} GPU probe. When the inventory command fails the status is
 *   "missing" or "error"; when only the version lookup fails the inventory is returned as is.
 */
async function probeNvidiaGpuCapabilities(runner, timeoutMs) {
    const inventoryArgs = ["--query-gpu=index,name,memory.total,driver_version,compute_cap", "--format=csv,noheader,nounits"];
    const inventory = await runCapabilityCommand(runner, "nvidia-smi", inventoryArgs, timeoutMs);
    if (!inventory.ok) {
        const status = inventory.missing ? "missing" : "error";
        return { status, count: 0, vendors: [], devices: [], message: inventory.message };
    }
    const probe = parseNvidiaGpuProbe(inventory.stdout);
    const banner = await runCapabilityCommand(runner, "nvidia-smi", [], timeoutMs);
    const cudaVersion = banner.ok ? parseNvidiaSmiCudaVersion(banner.stdout || banner.stderr) : undefined;
    if (!cudaVersion) {
        return probe;
    }
    const mergedVersions = new Set(probe.cuda_versions || []);
    mergedVersions.add(cudaVersion);
    // Devices that did not report their own CUDA version inherit the banner's version.
    const devices = probe.devices.map((device) => ({
        ...device,
        cuda_version: device.cuda_version || cudaVersion
    }));
    return { ...probe, cuda_versions: [...mergedVersions].sort(), devices };
}
|
|
2923
|
+
/**
 * Builds a software probe with status "missing".
 *
 * @param {string} name Software name.
 * @param {string} [message] Optional detail; omitted from the result when falsy.
 * @returns {object} The probe.
 */
function missingSoftwareProbe(name, message) {
    const probe = { name, status: "missing" };
    if (message) {
        probe.message = message;
    }
    return probe;
}
|
|
2930
|
+
/**
 * Builds a software probe with status "error".
 *
 * @param {string} name Software name.
 * @param {string} message Failure detail; always present as a key on the result.
 * @returns {object} The probe.
 */
function errorSoftwareProbe(name, message) {
    const probe = { name, status: "error" };
    probe.message = message;
    return probe;
}
|
|
2937
|
+
/**
 * Extracts a version string from the first non-blank line of a tool's output.
 *
 * @param {string} stdout Tool output.
 * @param {string} tool Tool name; "blender" and "ffmpeg" use dedicated patterns, any
 *   other tool yields the whole first line.
 * @returns {string|undefined} The version, or undefined when nothing matched.
 */
function extractToolVersion(stdout, tool) {
    const headline = stdout.split(/\r?\n/).find((line) => line.trim().length > 0)?.trim() || "";
    switch (tool) {
        case "blender":
            return headline.match(/Blender\s+([^\s]+)/i)?.[1];
        case "ffmpeg":
            return headline.match(/ffmpeg\s+version\s+([^\s]+)/i)?.[1];
        default:
            return headline || undefined;
    }
}
|
|
2947
|
+
/**
 * Probes a tool by running its version command.
 *
 * @param {Function} runner Command runner.
 * @param {string} name Probe name, also used to choose the version pattern.
 * @param {string} command Executable to run.
 * @param {string[]} args Arguments for the version command.
 * @param {number} timeoutMs Command timeout.
 * @returns {Promise<object>} An "available" probe (with `version` when one was parsed),
 *   or a "missing"/"error" probe when the command failed.
 */
async function probeVersionedSoftware(runner, name, command, args, timeoutMs) {
    const outcome = await runCapabilityCommand(runner, command, args, timeoutMs);
    if (!outcome.ok) {
        const describeFailure = outcome.missing ? missingSoftwareProbe : errorSoftwareProbe;
        return describeFailure(name, outcome.message);
    }
    const probe = { name, status: "available" };
    // The version is read from stdout, or from stderr when stdout is empty.
    const version = extractToolVersion(outcome.stdout || outcome.stderr, name);
    if (version) {
        probe.version = version;
    }
    return probe;
}
|
|
2960
|
+
/**
 * Probes Docker and its registered runtimes via `docker info`.
 *
 * @param {Function} runner Command runner.
 * @param {number} timeoutMs Command timeout.
 * @returns {Promise<{ docker: object, dockerNvidia: object }>} Probes for Docker itself
 *   and for the "nvidia" runtime.
 */
async function probeDockerCapabilities(runner, timeoutMs) {
    const info = await runCapabilityCommand(runner, "docker", ["info", "--format", "{{json .Runtimes}}"], timeoutMs);
    if (!info.ok) {
        const describeFailure = info.missing ? missingSoftwareProbe : errorSoftwareProbe;
        const docker = describeFailure("docker", info.message);
        // Without Docker the nvidia runtime inherits the same status and message.
        const dockerNvidia = { name: "docker-nvidia", status: docker.status, message: info.message };
        return { docker, dockerNvidia };
    }
    try {
        const runtimes = JSON.parse(info.stdout || "{}");
        const nvidiaRegistered = Object.keys(runtimes).some((runtimeName) => runtimeName.toLowerCase() === "nvidia");
        const dockerNvidia = nvidiaRegistered
            ? { name: "docker-nvidia", status: "available", version: "nvidia" }
            : missingSoftwareProbe("docker-nvidia", "Docker is available but the nvidia runtime is not registered");
        return { docker: { name: "docker", status: "available" }, dockerNvidia };
    }
    catch (error) {
        const detail = `Unable to parse docker runtime inventory: ${capabilityCommandFailureMessage(error)}`;
        return {
            docker: errorSoftwareProbe("docker", detail),
            dockerNvidia: errorSoftwareProbe("docker-nvidia", detail)
        };
    }
}
|
|
2990
|
+
/**
 * Lists every probe in a capability snapshot whose status is not "available".
 *
 * @param {object} snapshot Snapshot with a `gpu` probe and a `software` map of probes.
 * @returns {object[]|undefined} `{ name, status, message }` entries, GPU first, or
 *   undefined when everything is available.
 */
function capabilityDiagnostics(snapshot) {
    const { gpu } = snapshot;
    const candidates = [
        { name: "gpu", status: gpu.status, message: gpu.message },
        ...Object.values(snapshot.software).map((probe) => ({
            name: probe.name,
            status: probe.status,
            message: probe.message
        }))
    ];
    const diagnostics = candidates.filter((entry) => entry.status !== "available");
    return diagnostics.length > 0 ? diagnostics : undefined;
}
|
|
3010
|
+
/**
 * Derives a snapshot identifier from the snapshot's JSON serialization.
 *
 * @param {object} snapshot Capability snapshot.
 * @returns {string} "caps_" followed by the first 16 hex characters of the SHA-256 digest.
 */
function buildCapabilitySnapshotId(snapshot) {
    const hasher = createHash("sha256");
    hasher.update(JSON.stringify(snapshot));
    const fingerprint = hasher.digest("hex").slice(0, 16);
    return `caps_${fingerprint}`;
}
|
|
3014
|
+
/**
 * Lists the owner-local catalog entries whose runner is registered on this node.
 *
 * @param {object} config Runtime configuration; `genericJobsEnabled` gates the result.
 * @param {Map} runners Registered runners keyed by runner id.
 * @returns {object[]} Entries with job_type, runner, trust_modes and
 *   required_capabilities; empty when generic jobs are disabled.
 */
function buildRunnerCapabilityCatalog(config, runners) {
    if (!config.genericJobsEnabled) {
        return [];
    }
    const catalog = [];
    for (const entry of OWNER_LOCAL_GENERIC_JOB_CATALOG) {
        if (!runners.has(entry.runner)) {
            continue;
        }
        catalog.push({
            job_type: entry.job_type,
            runner: entry.runner,
            trust_modes: uniqueSortedStrings([entry.policy.trust_mode]),
            required_capabilities: entry.required_capabilities || []
        });
    }
    return catalog;
}
|
|
3027
|
+
/**
 * Checks whether a catalog entry's required capabilities are all present on this node.
 *
 * @param {object} entry Catalog entry; `required_capabilities` is read.
 * @param {object} input Probe results: `genericJobsEnabled`, `gpu` and `software`.
 * @returns {boolean} False when generic jobs are disabled, true when nothing is
 *   required, otherwise whether every required capability is among the names
 *   returned by buildMswarmCapabilityNames.
 */
function runnerCapabilityRequirementsAvailable(entry, input) {
    const { genericJobsEnabled, gpu, software } = input;
    if (!genericJobsEnabled) {
        return false;
    }
    const required = entry.required_capabilities;
    if (!required?.length) {
        return true;
    }
    // A placeholder snapshot carrying only the probe results, with a fixed id and epoch timestamp.
    const requirementSnapshot = {
        schema_version: MSWARM_CAPABILITY_SCHEMA_VERSION,
        snapshot_id: "caps_requirement_check",
        captured_at: new Date(0).toISOString(),
        generic_jobs_enabled: genericJobsEnabled,
        job_types: [],
        trust_modes: [],
        gpu,
        software,
        runner_catalog: []
    };
    const available = new Set(buildMswarmCapabilityNames(requirementSnapshot));
    for (const capability of required) {
        if (!available.has(capability)) {
            return false;
        }
    }
    return true;
}
|
|
3046
|
+
/**
 * Returns the owner-local catalog entries whose job type starts with "tenant." or "package.".
 *
 * @returns {object[]} Matching catalog entries.
 */
function registeredOwnerLocalGenericJobCatalog() {
    const registeredPrefixes = ["tenant.", "package."];
    return OWNER_LOCAL_GENERIC_JOB_CATALOG.filter((entry) => registeredPrefixes.some((prefix) => entry.job_type.startsWith(prefix)));
}
|
|
3049
|
+
/**
 * Encodes a buffer as unpadded URL-safe base64.
 *
 * @param {Buffer} buffer Bytes to encode.
 * @returns {string} Base64 text with "+" → "-", "/" → "_" and "=" padding removed.
 */
function base64UrlEncodeRuntime(buffer) {
    const urlSafe = { "+": "-", "/": "_", "=": "" };
    return buffer.toString("base64").replace(/[+/=]/g, (character) => urlSafe[character]);
}
|
|
3052
|
+
/**
 * Wraps a private capability catalog entry in a payload signed with HMAC-SHA256.
 *
 * @param {{ privateCatalogEntry: object, runtimeToken: string }} input Catalog entry to
 *   publish and the runtime token used as the HMAC key.
 * @returns {object} The unsigned payload fields plus a `signature` object.
 */
function signCapabilityPayload(input) {
    const { privateCatalogEntry, runtimeToken } = input;
    // Key order is significant: the signature covers JSON.stringify of this object.
    const unsignedPayload = {
        schema_version: MSWARM_CAPABILITY_SCHEMA_VERSION,
        snapshot_id: privateCatalogEntry.snapshot_id,
        private_catalog_entry: privateCatalogEntry,
        scheduler_match: privateCatalogEntry.scheduler_match,
        public_projection: privateCatalogEntry.public_projection
    };
    const mac = createHmac("sha256", runtimeToken);
    mac.update(JSON.stringify(unsignedPayload));
    const signatureValue = base64UrlEncodeRuntime(mac.digest());
    return {
        ...unsignedPayload,
        signature: {
            alg: "HS256",
            value: signatureValue,
            signed_at: new Date().toISOString(),
            key_id: "self_hosted_runtime_token"
        }
    };
}
|
|
3071
|
+
/**
 * Finds the registered runner that handles a job's type.
 *
 * @param {object} job Generic job; `job_type` is matched against the catalog.
 * @param {Map} runners Registered runners keyed by runner id.
 * @returns {object|null} The runner, or null when the job type is not in the catalog
 *   or its runner is not registered.
 */
function runnerForGenericJob(job, runners) {
    const catalogEntry = OWNER_LOCAL_GENERIC_JOB_CATALOG.find((entry) => entry.job_type === job.job_type);
    if (!catalogEntry) {
        return null;
    }
    return runners.get(catalogEntry.runner) || null;
}
|
|
3075
|
+
/**
 * Compares two dotted version strings segment by segment.
 *
 * Non-digit characters inside a segment are ignored and missing segments count as 0.
 *
 * @param {string} left First version.
 * @param {string} right Second version.
 * @returns {number} Negative when left is lower, positive when higher, 0 when equal
 *   or when either argument is empty.
 */
function compareDottedVersion(left, right) {
    if (!left || !right) {
        return 0;
    }
    const numericSegments = (version) => version.split(".").map((segment) => Number(segment.replace(/[^\d]/g, "")) || 0);
    const leftSegments = numericSegments(left);
    const rightSegments = numericSegments(right);
    const width = Math.max(leftSegments.length, rightSegments.length);
    let position = 0;
    while (position < width) {
        const difference = (leftSegments[position] || 0) - (rightSegments[position] || 0);
        if (difference !== 0) {
            return difference;
        }
        position += 1;
    }
    return 0;
}
|
|
3088
|
+
/**
 * Checks whether a snapshot reports a CUDA version at or above a minimum.
 *
 * @param {object} snapshot Capability snapshot; `gpu.cuda_versions` and each device's
 *   `cuda_version` are considered.
 * @param {string} [minVersion] Minimum version; when falsy the check passes.
 * @returns {boolean} True when any reported version satisfies the minimum.
 */
function snapshotHasCudaVersion(snapshot, minVersion) {
    if (!minVersion) {
        return true;
    }
    const { gpu } = snapshot;
    const nodeVersions = gpu.cuda_versions || [];
    const deviceVersions = gpu.devices.map((device) => device.cuda_version).filter((value) => Boolean(value));
    for (const version of [...nodeVersions, ...deviceVersions]) {
        if (compareDottedVersion(version, minVersion) >= 0) {
            return true;
        }
    }
    return false;
}
|
|
3097
|
+
/**
 * Explains why a node cannot run a generic job, based on its capability snapshot.
 *
 * Checks run in order: generic jobs enabled, job-type specific software (Blender,
 * or NVIDIA GPU plus Docker with the nvidia runtime for CUDA), advertised job types,
 * then the requested GPU count, vendor, VRAM and CUDA version.
 *
 * @param {object} job Generic job request (`job_type`, optional `resources.gpu`).
 * @param {object} snapshot Node capability snapshot.
 * @returns {{ code: string, message: string }|null} The first mismatch found, or null
 *   when the node satisfies the job.
 */
export function genericJobCapabilityMismatch(job, snapshot) {
    const noCapableNode = (message) => ({ code: "no_capable_node", message });
    const isAvailable = (probe) => probe.status === "available";
    if (!snapshot.generic_jobs_enabled) {
        return noCapableNode("Generic jobs are disabled on this node.");
    }
    const { gpu, software } = snapshot;
    if (job.job_type === RENDER_BLENDER_JOB_TYPE && !isAvailable(software.blender)) {
        return noCapableNode("Blender is not available on this node.");
    }
    if (job.job_type === CUDA_RUN_JOB_TYPE) {
        if (!isAvailable(gpu) || !gpu.vendors.includes("nvidia")) {
            return noCapableNode("No NVIDIA GPU is available on this node.");
        }
        if (!isAvailable(software.docker) || !isAvailable(software["docker-nvidia"])) {
            return noCapableNode("Docker with the NVIDIA runtime is not available on this node.");
        }
    }
    if (!snapshot.job_types.includes(job.job_type)) {
        return noCapableNode(`No capable owner-local node is available for ${job.job_type}.`);
    }
    const requestedGpu = job.resources?.gpu;
    if (!requestedGpu) {
        return null;
    }
    // A GPU request without a count asks for one device.
    const requestedCount = Math.max(1, requestedGpu.count || 1);
    if (!isAvailable(gpu) || gpu.count < requestedCount) {
        return noCapableNode(`Requested ${requestedCount} GPU(s), but this node reports ${gpu.count}.`);
    }
    if (requestedGpu.vendor && !gpu.vendors.includes(requestedGpu.vendor)) {
        return noCapableNode(`Requested GPU vendor ${requestedGpu.vendor} is not available on this node.`);
    }
    const minVramGb = requestedGpu.min_vram_gb;
    const vramRequested = Number.isFinite(minVramGb) && minVramGb !== undefined;
    // An unknown node VRAM figure is treated as not satisfying an explicit request.
    if (vramRequested && (!Number.isFinite(gpu.max_vram_gb) || (gpu.max_vram_gb || 0) < minVramGb)) {
        return noCapableNode(`Requested GPU VRAM ${minVramGb}GB exceeds this node capability.`);
    }
    if (!snapshotHasCudaVersion(snapshot, requestedGpu.cuda_min_version)) {
        return noCapableNode(`Requested CUDA ${requestedGpu.cuda_min_version} is not available on this node.`);
    }
    return null;
}
|
|
3158
|
+
/**
 * Resolves a job's timeout in milliseconds, never exceeding the fallback.
 *
 * @param {object} job Generic job; `limits.timeout_sec` is read.
 * @param {number} fallbackMs Timeout used when the job sets none; also the upper bound.
 * @returns {number} The fallback when positiveInteger yields nothing for the job's
 *   limit, otherwise the limit in milliseconds clamped to [1, fallbackMs].
 */
function genericJobTimeoutMs(job, fallbackMs) {
    const limitSeconds = positiveInteger(job.limits?.timeout_sec);
    return limitSeconds ? Math.max(1, Math.min(fallbackMs, limitSeconds * 1000)) : fallbackMs;
}
|
|
3165
|
+
/**
 * Decides whether a failure should be treated as a cancellation or timeout.
 *
 * @param {unknown} error The thrown value.
 * @param {{ aborted: boolean }} signal The job's abort signal.
 * @returns {boolean} True when the signal is aborted, or when the value is an Error
 *   whose message mentions cancellation, abort or timeout.
 */
function isGenericAbortError(error, signal) {
    if (signal.aborted) {
        return true;
    }
    const abortPattern = /cancelled|canceled|aborted|timed out|timeout/i;
    return error instanceof Error && abortPattern.test(error.message);
}
|
|
3172
|
+
/**
 * Maps a usage record's input/output token counts to prompt/completion token fields.
 *
 * @param {object} [usage] Usage record; `inputTokens` and `outputTokens` are read.
 * @returns {{ promptTokens: *, completionTokens: * }} Each count as returned by positiveInteger.
 */
function usageTokens(usage) {
    const promptTokens = positiveInteger(usage?.inputTokens);
    const completionTokens = positiveInteger(usage?.outputTokens);
    return { promptTokens, completionTokens };
}
|
|
3178
|
+
/**
 * Runs a local agent through the mcoda command-line tool and parses its JSON reply.
 */
export class McodaLocalAgentExecutor {
    /**
     * @param {object} input
     * @param {string} [input.command] Executable to run; defaults to DEFAULT_MCODA_BIN.
     * @param {number} [input.timeoutMs] Per-invocation timeout; defaults to DEFAULT_JOB_TIMEOUT_MS.
     * @param {Function} [input.runner] Command runner; defaults to defaultCommandRunner.
     */
    constructor(input) {
        const { command, timeoutMs, runner } = input;
        this.command = command || DEFAULT_MCODA_BIN;
        this.timeoutMs = timeoutMs || DEFAULT_JOB_TIMEOUT_MS;
        this.runner = runner || defaultCommandRunner;
    }
    /**
     * Invokes `agent-run <agentSlug> --json --stdin`, passing the prompt as the runner's `input` option.
     *
     * @param {string} agentSlug Agent to run.
     * @param {string} prompt Prompt text.
     * @returns {Promise<object>} `{ output, adapter, model, metadata }` taken from the first response.
     * @throws {Error} When the reply is not JSON with a `responses` array, or the first
     *   response has no output.
     */
    async invoke(agentSlug, prompt) {
        const commandArgs = ["agent-run", agentSlug, "--json", "--stdin"];
        const completed = await this.runner(this.command, commandArgs, {
            timeoutMs: this.timeoutMs,
            maxBuffer: DEFAULT_COMMAND_MAX_BUFFER,
            input: prompt
        });
        const parsed = JSON.parse(completed.stdout);
        const isSupported = parsed && typeof parsed === "object" && Array.isArray(parsed.responses);
        if (!isSupported) {
            throw new Error("mcoda agent-run returned unsupported JSON");
        }
        // Only the first response is used; an empty list fails the output check below.
        const response = parsed.responses[0] || {};
        const output = optionalText(response.output);
        if (!output) {
            throw new Error("mcoda agent-run response did not include output");
        }
        const hasMetadata = response.metadata && typeof response.metadata === "object";
        return {
            output,
            adapter: optionalText(response.adapter) || undefined,
            model: optionalText(response.model) || undefined,
            metadata: hasMetadata ? response.metadata : undefined
        };
    }
}
|
|
3207
|
+
/**
 * Builds the request init shared by the self-hosted node POST endpoints.
 *
 * @param {object} payload Body to serialize as JSON.
 * @param {object} [extraHeaders] Headers added after "content-type".
 * @returns {{ method: string, headers: object, body: string }} Request init.
 */
function buildSelfHostedNodePostInit(payload, extraHeaders = {}) {
    return {
        method: "POST",
        headers: { "content-type": "application/json", ...extraHeaders },
        body: JSON.stringify(payload)
    };
}
/**
 * HTTP client for the gateway's self-hosted node endpoints.
 *
 * Every method delegates to fetchJson with the configured fetch implementation and
 * timeout. Runtime-token endpoints send the token as a Bearer authorization header.
 */
export class MswarmSelfHostedNodeClient {
    /**
     * @param {object} input
     * @param {string} input.gatewayBaseUrl Gateway base URL, passed through trimTrailingSlash.
     * @param {Function} [input.fetchImpl] Fetch implementation; defaults to the global fetch.
     * @param {number} [input.timeoutMs] Request timeout; defaults to DEFAULT_REQUEST_TIMEOUT_MS.
     */
    constructor(input) {
        const { gatewayBaseUrl, fetchImpl, timeoutMs } = input;
        this.gatewayBaseUrl = trimTrailingSlash(gatewayBaseUrl);
        this.fetchImpl = fetchImpl || fetch;
        this.timeoutMs = timeoutMs || DEFAULT_REQUEST_TIMEOUT_MS;
    }
    /** Enrolls a node using an enrollment token. */
    async enroll(nodeId, enrollmentToken) {
        const body = { node_id: nodeId, enrollment_token: enrollmentToken };
        return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/enroll`, buildSelfHostedNodePostInit(body), this.timeoutMs);
    }
    /** Bootstraps a node, authenticating with an API key header. */
    async bootstrap(apiKey, payload) {
        return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/bootstrap`, buildSelfHostedNodePostInit(payload, { "x-api-key": apiKey }), this.timeoutMs);
    }
    /** Fetches the gateway health endpoint. */
    async health() {
        return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/healthz`, { method: "GET" }, this.timeoutMs);
    }
    /** Posts a heartbeat payload. */
    async heartbeat(runtimeToken, payload) {
        return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/heartbeat`, buildSelfHostedNodePostInit(payload, { authorization: `Bearer ${runtimeToken}` }), this.timeoutMs);
    }
    /** Posts an uninstall payload. */
    async uninstall(runtimeToken, payload) {
        return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/uninstall`, buildSelfHostedNodePostInit(payload, { authorization: `Bearer ${runtimeToken}` }), this.timeoutMs);
    }
    /** Posts a models payload. */
    async pushModels(runtimeToken, payload) {
        return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/models`, buildSelfHostedNodePostInit(payload, { authorization: `Bearer ${runtimeToken}` }), this.timeoutMs);
    }
    /**
     * Polls for a job. The request timeout is extended to at least `wait_ms` plus
     * five seconds so the long-poll wait does not exceed it.
     */
    async pollJob(runtimeToken, payload) {
        const pollTimeoutMs = Math.max(this.timeoutMs, (payload.wait_ms || 0) + 5000);
        return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/jobs/poll`, buildSelfHostedNodePostInit(payload, { authorization: `Bearer ${runtimeToken}` }), pollTimeoutMs);
    }
    /** Posts the result of a job. */
    async postJobResult(runtimeToken, jobId, payload) {
        return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/jobs/${encodeURIComponent(jobId)}/result`, buildSelfHostedNodePostInit(payload, { authorization: `Bearer ${runtimeToken}` }), this.timeoutMs);
    }
    /** Posts a start payload for a job. */
    async postJobStart(runtimeToken, jobId, payload) {
        return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/jobs/${encodeURIComponent(jobId)}/start`, buildSelfHostedNodePostInit(payload, { authorization: `Bearer ${runtimeToken}` }), this.timeoutMs);
    }
    /** Posts an events payload for a job. */
    async postJobEvents(runtimeToken, jobId, payload) {
        return fetchJson(this.fetchImpl, `${this.gatewayBaseUrl}/v1/swarm/self-hosted/node/jobs/${encodeURIComponent(jobId)}/events`, buildSelfHostedNodePostInit(payload, { authorization: `Bearer ${runtimeToken}` }), this.timeoutMs);
    }
}
|
|
3304
|
+
export class SelfHostedNodeRuntime {
|
|
3305
|
+
constructor(config, deps) {
|
|
3306
|
+
this.activeLlmJobs = 0;
|
|
3307
|
+
this.activeGenericJobs = 0;
|
|
3308
|
+
this.queuedLlmJobs = 0;
|
|
3309
|
+
this.queuedGenericJobs = 0;
|
|
3310
|
+
this.latencySamplesMs = [];
|
|
3311
|
+
this.recentFailures = [];
|
|
3312
|
+
this.config = config;
|
|
3313
|
+
this.gateway =
|
|
3314
|
+
deps?.gateway ||
|
|
3315
|
+
new MswarmSelfHostedNodeClient({
|
|
3316
|
+
gatewayBaseUrl: config.gatewayBaseUrl,
|
|
3317
|
+
fetchImpl: deps?.fetchImpl,
|
|
3318
|
+
timeoutMs: config.requestTimeoutMs
|
|
3319
|
+
});
|
|
3320
|
+
this.mcoda =
|
|
3321
|
+
deps?.mcoda ||
|
|
3322
|
+
new McodaAgentInventoryClient({
|
|
3323
|
+
command: config.mcodaBin,
|
|
3324
|
+
args: config.mcodaListArgs,
|
|
3325
|
+
timeoutMs: config.requestTimeoutMs
|
|
3326
|
+
});
|
|
3327
|
+
this.mcodaExecutor =
|
|
3328
|
+
deps?.mcodaExecutor ||
|
|
3329
|
+
new McodaLocalAgentExecutor({
|
|
3330
|
+
command: config.mcodaBin,
|
|
3331
|
+
timeoutMs: config.jobTimeoutMs
|
|
3332
|
+
});
|
|
3333
|
+
this.codaliExecutor = deps?.codaliExecutor || new MswarmCodaliExecutor();
|
|
3334
|
+
this.ollama =
|
|
3335
|
+
deps?.ollama ||
|
|
3336
|
+
new OllamaClient({
|
|
3337
|
+
baseUrl: config.ollamaBaseUrl,
|
|
3338
|
+
fetchImpl: deps?.fetchImpl,
|
|
3339
|
+
timeoutMs: config.requestTimeoutMs
|
|
3340
|
+
});
|
|
1686
3341
|
this.jobOllama =
|
|
1687
3342
|
deps?.ollama ||
|
|
1688
3343
|
new OllamaClient({
|
|
@@ -1690,6 +3345,107 @@ export class SelfHostedNodeRuntime {
|
|
|
1690
3345
|
fetchImpl: deps?.fetchImpl,
|
|
1691
3346
|
timeoutMs: config.jobTimeoutMs
|
|
1692
3347
|
});
|
|
3348
|
+
this.capabilityRunner = deps?.capabilityRunner || defaultCommandRunner;
|
|
3349
|
+
this.genericRunners = new Map((deps?.genericRunners || createDefaultGenericJobRunners(this.capabilityRunner)).map((runner) => [runner.id, runner]));
|
|
3350
|
+
this.artifactStore =
|
|
3351
|
+
deps?.artifactStore ||
|
|
3352
|
+
new MswarmLocalArtifactStore({
|
|
3353
|
+
rootDir: config.artifactStorePath || defaultArtifactStorePath()
|
|
3354
|
+
});
|
|
3355
|
+
}
|
|
3356
|
+
updateLocalQueueTelemetry(input) {
|
|
3357
|
+
if (input.llmQueuedJobs !== undefined) {
|
|
3358
|
+
this.queuedLlmJobs = nonNegativeTelemetryInteger(input.llmQueuedJobs);
|
|
3359
|
+
}
|
|
3360
|
+
if (input.genericQueuedJobs !== undefined) {
|
|
3361
|
+
this.queuedGenericJobs = nonNegativeTelemetryInteger(input.genericQueuedJobs);
|
|
3362
|
+
}
|
|
3363
|
+
}
|
|
3364
|
+
beginExecutionTelemetry(executionClass) {
|
|
3365
|
+
if (executionClass === "generic_job") {
|
|
3366
|
+
this.activeGenericJobs += 1;
|
|
3367
|
+
return;
|
|
3368
|
+
}
|
|
3369
|
+
this.activeLlmJobs += 1;
|
|
3370
|
+
}
|
|
3371
|
+
finishExecutionTelemetry(input) {
|
|
3372
|
+
if (input.executionClass === "generic_job") {
|
|
3373
|
+
this.activeGenericJobs = Math.max(0, this.activeGenericJobs - 1);
|
|
3374
|
+
}
|
|
3375
|
+
else {
|
|
3376
|
+
this.activeLlmJobs = Math.max(0, this.activeLlmJobs - 1);
|
|
3377
|
+
}
|
|
3378
|
+
this.latencySamplesMs.push(Math.max(0, Date.now() - input.startedAt));
|
|
3379
|
+
while (this.latencySamplesMs.length > MAX_TELEMETRY_LATENCY_SAMPLES) {
|
|
3380
|
+
this.latencySamplesMs.shift();
|
|
3381
|
+
}
|
|
3382
|
+
if (!input.ok) {
|
|
3383
|
+
this.recentFailures.unshift({
|
|
3384
|
+
execution_class: input.executionClass === "generic_job" ? "generic_job" : "agentic",
|
|
3385
|
+
code: optionalText(input.code) || "upstream_error",
|
|
3386
|
+
at: new Date().toISOString()
|
|
3387
|
+
});
|
|
3388
|
+
this.recentFailures.splice(MAX_TELEMETRY_FAILURES);
|
|
3389
|
+
}
|
|
3390
|
+
}
|
|
3391
|
+
averageLatencyMs(fallback = null) {
|
|
3392
|
+
if (this.latencySamplesMs.length === 0) {
|
|
3393
|
+
return fallback;
|
|
3394
|
+
}
|
|
3395
|
+
const total = this.latencySamplesMs.reduce((sum, value) => sum + value, 0);
|
|
3396
|
+
return Math.round(total / this.latencySamplesMs.length);
|
|
3397
|
+
}
|
|
3398
|
+
buildLoadTelemetry(input) {
|
|
3399
|
+
const drainMode = this.config.drainMode === true;
|
|
3400
|
+
const llmMaxConcurrency = Math.max(1, Math.floor(this.config.maxConcurrentLlmJobs || this.config.maxConcurrentJobs || 1));
|
|
3401
|
+
const genericMaxConcurrency = Math.max(1, Math.floor(this.config.genericJobMaxConcurrency || 1));
|
|
3402
|
+
const maxConcurrency = Math.max(1, Math.floor(this.config.maxConcurrentJobs || 1), llmMaxConcurrency, this.config.genericJobsEnabled ? genericMaxConcurrency : 1);
|
|
3403
|
+
const activeLlmJobs = nonNegativeTelemetryInteger(this.activeLlmJobs);
|
|
3404
|
+
const activeGenericJobs = nonNegativeTelemetryInteger(this.activeGenericJobs);
|
|
3405
|
+
const queuedLlmJobs = nonNegativeTelemetryInteger(this.queuedLlmJobs);
|
|
3406
|
+
const queuedGenericJobs = nonNegativeTelemetryInteger(this.queuedGenericJobs);
|
|
3407
|
+
const llmCapacity = executionClassCapacity({
|
|
3408
|
+
maxConcurrency: llmMaxConcurrency,
|
|
3409
|
+
activeJobs: activeLlmJobs,
|
|
3410
|
+
queuedJobs: queuedLlmJobs,
|
|
3411
|
+
drainMode
|
|
3412
|
+
});
|
|
3413
|
+
const genericCapacity = executionClassCapacity({
|
|
3414
|
+
maxConcurrency: genericMaxConcurrency,
|
|
3415
|
+
activeJobs: activeGenericJobs,
|
|
3416
|
+
queuedJobs: queuedGenericJobs,
|
|
3417
|
+
drainMode: drainMode || !this.config.genericJobsEnabled
|
|
3418
|
+
});
|
|
3419
|
+
const activeJobs = activeLlmJobs + activeGenericJobs;
|
|
3420
|
+
const queuedJobs = queuedLlmJobs + queuedGenericJobs;
|
|
3421
|
+
const freeSlots = drainMode ? 0 : Math.max(0, maxConcurrency - activeJobs - queuedJobs);
|
|
3422
|
+
const failures = this.recentFailures.slice(0, 10);
|
|
3423
|
+
const discoveryFailureCount = nonNegativeTelemetryInteger(input.discoveryFailureCount);
|
|
3424
|
+
const telemetry = {
|
|
3425
|
+
runtime_protocol_version: SELF_HOSTED_RUNTIME_PROTOCOL_VERSION,
|
|
3426
|
+
load_balancer_protocol_version: SELF_HOSTED_LOAD_BALANCER_PROTOCOL_VERSION,
|
|
3427
|
+
catalog_metadata_version: SELF_HOSTED_CATALOG_METADATA_VERSION,
|
|
3428
|
+
catalog_fingerprint: buildCatalogFingerprint(input.models),
|
|
3429
|
+
max_concurrency: maxConcurrency,
|
|
3430
|
+
max_concurrent_llm_jobs: llmMaxConcurrency,
|
|
3431
|
+
max_concurrent_generic_jobs: this.config.genericJobsEnabled ? genericMaxConcurrency : 0,
|
|
3432
|
+
active_jobs: activeJobs,
|
|
3433
|
+
queued_jobs: queuedJobs,
|
|
3434
|
+
free_slots: freeSlots,
|
|
3435
|
+
drain_mode: drainMode,
|
|
3436
|
+
execution_class_capacity: {
|
|
3437
|
+
chat: llmCapacity,
|
|
3438
|
+
agentic: llmCapacity,
|
|
3439
|
+
generic_job: genericCapacity
|
|
3440
|
+
},
|
|
3441
|
+
avg_latency_ms: this.averageLatencyMs(input.discoveryLatencyMs ?? null),
|
|
3442
|
+
recent_failure_count: failures.length + discoveryFailureCount,
|
|
3443
|
+
recent_failures: failures
|
|
3444
|
+
};
|
|
3445
|
+
if (this.config.hardwareTelemetryEnabled === true) {
|
|
3446
|
+
telemetry.hardware_pressure = buildCoarseHardwarePressure(input.capabilityPayload || null);
|
|
3447
|
+
}
|
|
3448
|
+
return telemetry;
|
|
1693
3449
|
}
|
|
1694
3450
|
static async setup(setupConfig, deps) {
|
|
1695
3451
|
const gateway = deps?.gateway ||
|
|
@@ -1711,7 +3467,13 @@ export class SelfHostedNodeRuntime {
|
|
|
1711
3467
|
expose_all_models: setupConfig.exposeAllModels,
|
|
1712
3468
|
model_allowlist: setupConfig.modelAllowlist,
|
|
1713
3469
|
model_blocklist: setupConfig.modelBlocklist,
|
|
1714
|
-
heartbeat_interval_seconds: setupConfig.heartbeatIntervalSeconds
|
|
3470
|
+
heartbeat_interval_seconds: setupConfig.heartbeatIntervalSeconds,
|
|
3471
|
+
max_concurrent_jobs: setupConfig.maxConcurrentJobs,
|
|
3472
|
+
max_concurrent_llm_jobs: setupConfig.maxConcurrentLlmJobs,
|
|
3473
|
+
drain_mode: setupConfig.drainMode,
|
|
3474
|
+
load_reporting_enabled: setupConfig.loadReportingEnabled,
|
|
3475
|
+
hardware_telemetry_enabled: setupConfig.hardwareTelemetryEnabled,
|
|
3476
|
+
generic_job_max_concurrency: setupConfig.genericJobMaxConcurrency
|
|
1715
3477
|
});
|
|
1716
3478
|
const nodeId = optionalText(bootstrap.node?.node_id);
|
|
1717
3479
|
const runtimeToken = optionalText(bootstrap.runtime_token);
|
|
@@ -1726,6 +3488,7 @@ export class SelfHostedNodeRuntime {
|
|
|
1726
3488
|
machine_fingerprint: machineFingerprint,
|
|
1727
3489
|
direct_base_url: setupConfig.directBaseUrl || null,
|
|
1728
3490
|
runtime_token: undefined,
|
|
3491
|
+
artifact_store_path: setupConfig.artifactStorePath || defaultArtifactStorePath(),
|
|
1729
3492
|
config_version: bootstrap.config_version,
|
|
1730
3493
|
heartbeat_interval_seconds: heartbeatInterval,
|
|
1731
3494
|
heartbeat_timeout_seconds: bootstrap.heartbeat_timeout_seconds,
|
|
@@ -1739,6 +3502,15 @@ export class SelfHostedNodeRuntime {
|
|
|
1739
3502
|
node_version: setupConfig.nodeVersion,
|
|
1740
3503
|
request_timeout_ms: setupConfig.requestTimeoutMs,
|
|
1741
3504
|
job_timeout_ms: setupConfig.jobTimeoutMs,
|
|
3505
|
+
max_concurrent_jobs: setupConfig.maxConcurrentJobs,
|
|
3506
|
+
max_concurrent_llm_jobs: setupConfig.maxConcurrentLlmJobs,
|
|
3507
|
+
generic_jobs_enabled: setupConfig.genericJobsEnabled,
|
|
3508
|
+
generic_job_timeout_ms: setupConfig.genericJobTimeoutMs,
|
|
3509
|
+
generic_job_max_concurrency: setupConfig.genericJobMaxConcurrency,
|
|
3510
|
+
capability_probe_timeout_ms: setupConfig.capabilityProbeTimeoutMs || DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS,
|
|
3511
|
+
drain_mode: setupConfig.drainMode,
|
|
3512
|
+
load_reporting_enabled: setupConfig.loadReportingEnabled,
|
|
3513
|
+
hardware_telemetry_enabled: setupConfig.hardwareTelemetryEnabled,
|
|
1742
3514
|
expose_all_models: setupConfig.exposeAllModels,
|
|
1743
3515
|
exposure_policy: setupConfig.exposeAllModels ? "all" : "none",
|
|
1744
3516
|
model_allowlist: setupConfig.modelAllowlist,
|
|
@@ -1761,6 +3533,7 @@ export class SelfHostedNodeRuntime {
|
|
|
1761
3533
|
ollamaBaseUrl: setupConfig.ollamaBaseUrl,
|
|
1762
3534
|
statePath: setupConfig.statePath,
|
|
1763
3535
|
runtimeTokenPath: setupConfig.runtimeTokenPath,
|
|
3536
|
+
artifactStorePath: setupConfig.artifactStorePath || defaultArtifactStorePath(),
|
|
1764
3537
|
invocationSigningSecret: null,
|
|
1765
3538
|
listenHost: DEFAULT_LISTEN_HOST,
|
|
1766
3539
|
listenPort: DEFAULT_LISTEN_PORT,
|
|
@@ -1768,6 +3541,15 @@ export class SelfHostedNodeRuntime {
|
|
|
1768
3541
|
heartbeatIntervalSeconds: heartbeatInterval,
|
|
1769
3542
|
requestTimeoutMs: setupConfig.requestTimeoutMs,
|
|
1770
3543
|
jobTimeoutMs: setupConfig.jobTimeoutMs,
|
|
3544
|
+
maxConcurrentJobs: setupConfig.maxConcurrentJobs,
|
|
3545
|
+
maxConcurrentLlmJobs: setupConfig.maxConcurrentLlmJobs,
|
|
3546
|
+
genericJobsEnabled: setupConfig.genericJobsEnabled,
|
|
3547
|
+
genericJobTimeoutMs: setupConfig.genericJobTimeoutMs,
|
|
3548
|
+
genericJobMaxConcurrency: setupConfig.genericJobMaxConcurrency,
|
|
3549
|
+
capabilityProbeTimeoutMs: setupConfig.capabilityProbeTimeoutMs || DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS,
|
|
3550
|
+
drainMode: setupConfig.drainMode,
|
|
3551
|
+
loadReportingEnabled: setupConfig.loadReportingEnabled,
|
|
3552
|
+
hardwareTelemetryEnabled: setupConfig.hardwareTelemetryEnabled,
|
|
1771
3553
|
exposeAllModels: setupConfig.exposeAllModels,
|
|
1772
3554
|
modelAllowlist: setupConfig.modelAllowlist,
|
|
1773
3555
|
modelBlocklist: setupConfig.modelBlocklist
|
|
@@ -1795,6 +3577,53 @@ export class SelfHostedNodeRuntime {
|
|
|
1795
3577
|
const models = await this.mcoda.listAgents(this.config);
|
|
1796
3578
|
return { source: "mcoda", status: "online", models, version: null, failureCount: 0 };
|
|
1797
3579
|
}
|
|
3580
|
+
async probeCapabilities() {
|
|
3581
|
+
const timeoutMs = capabilityProbeTimeoutMs(this.config);
|
|
3582
|
+
const [gpu, docker, blender, ffmpeg] = await Promise.all([
|
|
3583
|
+
probeNvidiaGpuCapabilities(this.capabilityRunner, timeoutMs),
|
|
3584
|
+
probeDockerCapabilities(this.capabilityRunner, timeoutMs),
|
|
3585
|
+
probeVersionedSoftware(this.capabilityRunner, "blender", "blender", ["--version"], timeoutMs),
|
|
3586
|
+
probeVersionedSoftware(this.capabilityRunner, "ffmpeg", "ffmpeg", ["-version"], timeoutMs)
|
|
3587
|
+
]);
|
|
3588
|
+
const software = {
|
|
3589
|
+
docker: docker.docker,
|
|
3590
|
+
"docker-nvidia": docker.dockerNvidia,
|
|
3591
|
+
blender,
|
|
3592
|
+
ffmpeg
|
|
3593
|
+
};
|
|
3594
|
+
const runnerCatalog = buildRunnerCapabilityCatalog(this.config, this.genericRunners).filter((entry) => runnerCapabilityRequirementsAvailable(entry, {
|
|
3595
|
+
gpu,
|
|
3596
|
+
software,
|
|
3597
|
+
genericJobsEnabled: this.config.genericJobsEnabled
|
|
3598
|
+
}));
|
|
3599
|
+
const snapshotWithoutId = {
|
|
3600
|
+
schema_version: MSWARM_CAPABILITY_SCHEMA_VERSION,
|
|
3601
|
+
captured_at: new Date().toISOString(),
|
|
3602
|
+
node_id: this.config.nodeId,
|
|
3603
|
+
platform: platform(),
|
|
3604
|
+
arch: process.arch,
|
|
3605
|
+
generic_jobs_enabled: this.config.genericJobsEnabled,
|
|
3606
|
+
job_types: uniqueSortedStrings(runnerCatalog.map((entry) => entry.job_type)),
|
|
3607
|
+
trust_modes: uniqueSortedStrings(runnerCatalog.flatMap((entry) => entry.trust_modes)),
|
|
3608
|
+
gpu,
|
|
3609
|
+
software,
|
|
3610
|
+
runner_catalog: runnerCatalog
|
|
3611
|
+
};
|
|
3612
|
+
const snapshot = {
|
|
3613
|
+
...snapshotWithoutId,
|
|
3614
|
+
snapshot_id: buildCapabilitySnapshotId(snapshotWithoutId)
|
|
3615
|
+
};
|
|
3616
|
+
const diagnostics = capabilityDiagnostics(snapshot);
|
|
3617
|
+
return diagnostics ? { ...snapshot, diagnostics } : snapshot;
|
|
3618
|
+
}
|
|
3619
|
+
async publicCapabilityProjection() {
|
|
3620
|
+
return projectMswarmPublicCapabilities(await this.probeCapabilities());
|
|
3621
|
+
}
|
|
3622
|
+
async buildCapabilityHeartbeatPayload(runtimeToken) {
|
|
3623
|
+
const snapshot = await this.probeCapabilities();
|
|
3624
|
+
const privateCatalogEntry = buildMswarmPrivateCapabilityCatalogEntry(snapshot);
|
|
3625
|
+
return signCapabilityPayload({ privateCatalogEntry, runtimeToken });
|
|
3626
|
+
}
|
|
1798
3627
|
async ensureEnrolled() {
|
|
1799
3628
|
const currentState = await readSelfHostedNodeState(this.config.statePath);
|
|
1800
3629
|
const persistedRuntimeToken = await readSelfHostedRuntimeToken(this.config.runtimeTokenPath);
|
|
@@ -1827,6 +3656,14 @@ export class SelfHostedNodeRuntime {
|
|
|
1827
3656
|
node_version: this.config.nodeVersion,
|
|
1828
3657
|
request_timeout_ms: this.config.requestTimeoutMs,
|
|
1829
3658
|
job_timeout_ms: this.config.jobTimeoutMs,
|
|
3659
|
+
max_concurrent_jobs: this.config.maxConcurrentJobs,
|
|
3660
|
+
max_concurrent_llm_jobs: this.config.maxConcurrentLlmJobs,
|
|
3661
|
+
generic_jobs_enabled: this.config.genericJobsEnabled,
|
|
3662
|
+
generic_job_timeout_ms: this.config.genericJobTimeoutMs,
|
|
3663
|
+
generic_job_max_concurrency: this.config.genericJobMaxConcurrency,
|
|
3664
|
+
drain_mode: this.config.drainMode === true,
|
|
3665
|
+
load_reporting_enabled: this.config.loadReportingEnabled !== false,
|
|
3666
|
+
hardware_telemetry_enabled: this.config.hardwareTelemetryEnabled === true,
|
|
1830
3667
|
expose_all_models: this.config.exposeAllModels,
|
|
1831
3668
|
exposure_policy: this.config.exposeAllModels ? "all" : "none",
|
|
1832
3669
|
model_allowlist: this.config.modelAllowlist,
|
|
@@ -1837,27 +3674,213 @@ export class SelfHostedNodeRuntime {
|
|
|
1837
3674
|
return { runtimeToken, state: nextState, enrolled: true };
|
|
1838
3675
|
}
|
|
1839
3676
|
async resolveMcodaAgentForJob(job) {
|
|
1840
|
-
const
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
|
|
3677
|
+
const selectedSourceAgentSlug = optionalText(job.source_agent_slug);
|
|
3678
|
+
const selectedAgentSlug = optionalText(job.agent_slug);
|
|
3679
|
+
const selectedModel = optionalText(job.model) || optionalText(job.openai_request.model);
|
|
3680
|
+
const selected = selectedSourceAgentSlug || selectedAgentSlug || selectedModel;
|
|
1844
3681
|
if (!selected) {
|
|
1845
|
-
throw new
|
|
3682
|
+
throw new SelfHostedPreStartJobError("selected_agent_unavailable", "mcoda source agent slug is required");
|
|
1846
3683
|
}
|
|
1847
3684
|
const rawAgents = await this.mcoda.listRawAgents();
|
|
3685
|
+
const strictSelectedAgent = selectedSourceAgentSlug || selectedAgentSlug;
|
|
1848
3686
|
const agent = rawAgents.find((entry) => {
|
|
1849
3687
|
const slug = optionalText(entry.slug);
|
|
3688
|
+
if (strictSelectedAgent) {
|
|
3689
|
+
return slug === strictSelectedAgent;
|
|
3690
|
+
}
|
|
1850
3691
|
const defaultModel = mcodaAgentDefaultModel(entry);
|
|
1851
3692
|
return slug === selected || defaultModel === selected;
|
|
1852
3693
|
});
|
|
1853
|
-
if (!agent
|
|
1854
|
-
throw new
|
|
3694
|
+
if (!agent) {
|
|
3695
|
+
throw new SelfHostedPreStartJobError("selected_agent_unavailable", `selected local mcoda agent ${selected} is not available on this node`);
|
|
3696
|
+
}
|
|
3697
|
+
const mapped = mapMcodaAgentToSelfHostedModel(agent, this.config);
|
|
3698
|
+
if (!mapped?.exposed) {
|
|
3699
|
+
throw new SelfHostedPreStartJobError("selected_agent_unavailable", `selected local mcoda agent ${selected} is not exposed by this node`);
|
|
3700
|
+
}
|
|
3701
|
+
if (mapped.health_status && mapped.health_status !== "healthy" && mapped.health_status !== "unknown") {
|
|
3702
|
+
throw new SelfHostedPreStartJobError("selected_agent_unhealthy", `selected local mcoda agent ${selected} is ${mapped.health_status}`);
|
|
1855
3703
|
}
|
|
1856
3704
|
return mapMcodaAgentToCodaliAgent(agent, selected);
|
|
1857
3705
|
}
|
|
3706
|
+
async executeGenericJob(envelope, options = {}) {
|
|
3707
|
+
const startedAt = Date.now();
|
|
3708
|
+
this.beginExecutionTelemetry("generic_job");
|
|
3709
|
+
const events = [];
|
|
3710
|
+
let sequence = 0;
|
|
3711
|
+
const emitEvent = async (event) => {
|
|
3712
|
+
const next = {
|
|
3713
|
+
job_id: envelope.job_id,
|
|
3714
|
+
sequence,
|
|
3715
|
+
timestamp: new Date().toISOString(),
|
|
3716
|
+
...event
|
|
3717
|
+
};
|
|
3718
|
+
sequence += 1;
|
|
3719
|
+
events.push(next);
|
|
3720
|
+
await options.onEvent?.(next);
|
|
3721
|
+
};
|
|
3722
|
+
const failed = async (code, message, validationIssues) => {
|
|
3723
|
+
await emitEvent({
|
|
3724
|
+
type: code === "cancelled" ? "cancelled" : "failed",
|
|
3725
|
+
message,
|
|
3726
|
+
data: { code }
|
|
3727
|
+
});
|
|
3728
|
+
const status = code === "cancelled" ? "cancelled" : "failed";
|
|
3729
|
+
const result = {
|
|
3730
|
+
job_id: envelope.job_id,
|
|
3731
|
+
status,
|
|
3732
|
+
error: {
|
|
3733
|
+
code,
|
|
3734
|
+
message,
|
|
3735
|
+
retryable: code === "timeout"
|
|
3736
|
+
},
|
|
3737
|
+
finished_at: new Date().toISOString()
|
|
3738
|
+
};
|
|
3739
|
+
this.finishExecutionTelemetry({
|
|
3740
|
+
executionClass: "generic_job",
|
|
3741
|
+
startedAt,
|
|
3742
|
+
ok: false,
|
|
3743
|
+
code
|
|
3744
|
+
});
|
|
3745
|
+
return {
|
|
3746
|
+
job_id: envelope.job_id,
|
|
3747
|
+
request_id: envelope.request_id,
|
|
3748
|
+
status,
|
|
3749
|
+
result,
|
|
3750
|
+
events,
|
|
3751
|
+
...(validationIssues?.length ? { validation_issues: validationIssues } : {}),
|
|
3752
|
+
timing: { local_latency_ms: Date.now() - startedAt }
|
|
3753
|
+
};
|
|
3754
|
+
};
|
|
3755
|
+
if (!this.config.genericJobsEnabled) {
|
|
3756
|
+
return failed("feature_disabled", "Generic node jobs are disabled on this node.");
|
|
3757
|
+
}
|
|
3758
|
+
if (envelope.node_id !== this.config.nodeId) {
|
|
3759
|
+
return failed("validation_failed", "generic job node_id does not match this node");
|
|
3760
|
+
}
|
|
3761
|
+
const validation = validateMswarmGenericJobRequest(envelope.job, {
|
|
3762
|
+
registeredJobCatalog: registeredOwnerLocalGenericJobCatalog()
|
|
3763
|
+
});
|
|
3764
|
+
if (!validation.ok || !validation.value) {
|
|
3765
|
+
return failed("validation_failed", "generic job request failed validation", validation.issues);
|
|
3766
|
+
}
|
|
3767
|
+
const job = validation.value;
|
|
3768
|
+
const runner = runnerForGenericJob(job, this.genericRunners);
|
|
3769
|
+
if (!runner) {
|
|
3770
|
+
return failed("runner_unavailable", `No generic job runner is registered for ${job.job_type}.`);
|
|
3771
|
+
}
|
|
3772
|
+
if (job.job_type === RENDER_BLENDER_JOB_TYPE || job.job_type === CUDA_RUN_JOB_TYPE) {
|
|
3773
|
+
const capabilityMismatch = genericJobCapabilityMismatch(job, await this.probeCapabilities());
|
|
3774
|
+
if (capabilityMismatch) {
|
|
3775
|
+
return failed(capabilityMismatch.code, capabilityMismatch.message);
|
|
3776
|
+
}
|
|
3777
|
+
}
|
|
3778
|
+
let artifactContext;
|
|
3779
|
+
try {
|
|
3780
|
+
artifactContext = await this.artifactStore.prepareJobWorkspace(envelope.job_id, job);
|
|
3781
|
+
}
|
|
3782
|
+
catch (error) {
|
|
3783
|
+
return failed("validation_failed", error instanceof Error ? error.message : String(error || "generic job artifact preparation failed"));
|
|
3784
|
+
}
|
|
3785
|
+
const controller = new AbortController();
|
|
3786
|
+
const timeoutMs = genericJobTimeoutMs(job, this.config.genericJobTimeoutMs || this.config.jobTimeoutMs);
|
|
3787
|
+
const onAbort = () => {
|
|
3788
|
+
if (!controller.signal.aborted) {
|
|
3789
|
+
controller.abort(options.signal?.reason || "cancelled");
|
|
3790
|
+
}
|
|
3791
|
+
};
|
|
3792
|
+
if (options.signal?.aborted) {
|
|
3793
|
+
controller.abort(options.signal.reason || "cancelled");
|
|
3794
|
+
}
|
|
3795
|
+
options.signal?.addEventListener("abort", onAbort, { once: true });
|
|
3796
|
+
const timeout = setTimeout(() => {
|
|
3797
|
+
if (!controller.signal.aborted) {
|
|
3798
|
+
controller.abort("timeout");
|
|
3799
|
+
}
|
|
3800
|
+
}, timeoutMs);
|
|
3801
|
+
try {
|
|
3802
|
+
await emitEvent({
|
|
3803
|
+
type: "started",
|
|
3804
|
+
message: `Running ${job.job_type}`,
|
|
3805
|
+
data: {
|
|
3806
|
+
runner: runner.id,
|
|
3807
|
+
sandbox_profile: artifactContext.sandbox.name,
|
|
3808
|
+
timeout_ms: timeoutMs
|
|
3809
|
+
}
|
|
3810
|
+
});
|
|
3811
|
+
const runnerResult = await runner.run({
|
|
3812
|
+
job,
|
|
3813
|
+
signal: controller.signal,
|
|
3814
|
+
emitEvent,
|
|
3815
|
+
artifacts: artifactContext,
|
|
3816
|
+
sandbox: artifactContext.sandbox
|
|
3817
|
+
});
|
|
3818
|
+
const status = runnerResult.status || "succeeded";
|
|
3819
|
+
const outputContext = status === "succeeded"
|
|
3820
|
+
? artifactContext
|
|
3821
|
+
: {
|
|
3822
|
+
...artifactContext,
|
|
3823
|
+
outputSpecs: artifactContext.outputSpecs.map((output) => ({ ...output, required: false }))
|
|
3824
|
+
};
|
|
3825
|
+
const outputArtifacts = await this.artifactStore.collectOutputs(outputContext, envelope.job_id);
|
|
3826
|
+
for (const artifact of outputArtifacts) {
|
|
3827
|
+
await emitEvent({
|
|
3828
|
+
type: "artifact",
|
|
3829
|
+
message: "output artifact collected",
|
|
3830
|
+
data: { artifact }
|
|
3831
|
+
});
|
|
3832
|
+
}
|
|
3833
|
+
const result = {
|
|
3834
|
+
...runnerResult,
|
|
3835
|
+
job_id: envelope.job_id,
|
|
3836
|
+
status,
|
|
3837
|
+
artifacts: [...(runnerResult.artifacts || []), ...outputArtifacts],
|
|
3838
|
+
started_at: runnerResult.started_at || new Date(startedAt).toISOString(),
|
|
3839
|
+
finished_at: runnerResult.finished_at || new Date().toISOString()
|
|
3840
|
+
};
|
|
3841
|
+
await emitEvent({
|
|
3842
|
+
type: status === "succeeded" ? "completed" : "failed",
|
|
3843
|
+
message: status === "succeeded" ? "generic job completed" : runnerResult.error?.message || "generic job failed",
|
|
3844
|
+
data: {
|
|
3845
|
+
status,
|
|
3846
|
+
exit_code: result.exit_code,
|
|
3847
|
+
runner: runner.id
|
|
3848
|
+
}
|
|
3849
|
+
});
|
|
3850
|
+
this.finishExecutionTelemetry({
|
|
3851
|
+
executionClass: "generic_job",
|
|
3852
|
+
startedAt,
|
|
3853
|
+
ok: status === "succeeded",
|
|
3854
|
+
code: runnerResult.error?.code || status
|
|
3855
|
+
});
|
|
3856
|
+
return {
|
|
3857
|
+
job_id: envelope.job_id,
|
|
3858
|
+
request_id: envelope.request_id,
|
|
3859
|
+
status,
|
|
3860
|
+
result,
|
|
3861
|
+
events,
|
|
3862
|
+
timing: { local_latency_ms: Date.now() - startedAt }
|
|
3863
|
+
};
|
|
3864
|
+
}
|
|
3865
|
+
catch (error) {
|
|
3866
|
+
const code = isGenericAbortError(error, controller.signal) ? abortErrorCode(controller.signal) : "runner_error";
|
|
3867
|
+
const message = code === "timeout" || code === "cancelled"
|
|
3868
|
+
? abortErrorMessage(controller.signal)
|
|
3869
|
+
: error instanceof Error
|
|
3870
|
+
? error.message
|
|
3871
|
+
: String(error);
|
|
3872
|
+
return failed(code, message);
|
|
3873
|
+
}
|
|
3874
|
+
finally {
|
|
3875
|
+
clearTimeout(timeout);
|
|
3876
|
+
options.signal?.removeEventListener("abort", onAbort);
|
|
3877
|
+
}
|
|
3878
|
+
}
|
|
1858
3879
|
async executeJob(job, options = {}) {
|
|
1859
3880
|
const startedAt = Date.now();
|
|
3881
|
+
this.beginExecutionTelemetry("llm");
|
|
1860
3882
|
let selectedAgent;
|
|
3883
|
+
let jobStarted = false;
|
|
1861
3884
|
const progressEvents = [];
|
|
1862
3885
|
const streamEvents = [];
|
|
1863
3886
|
const recordProgress = async (event) => {
|
|
@@ -1868,13 +3891,35 @@ export class SelfHostedNodeRuntime {
|
|
|
1868
3891
|
streamEvents.push(chunk);
|
|
1869
3892
|
await options.onOpenAIChunk?.(chunk);
|
|
1870
3893
|
};
|
|
3894
|
+
const acknowledgeStarted = async (agent) => {
|
|
3895
|
+
if (jobStarted) {
|
|
3896
|
+
return;
|
|
3897
|
+
}
|
|
3898
|
+
await options.onStarted?.({
|
|
3899
|
+
job_id: job.job_id,
|
|
3900
|
+
request_id: job.request_id,
|
|
3901
|
+
node_id: job.node_id,
|
|
3902
|
+
agent_slug: optionalText(job.agent_slug) || agent?.slug || "",
|
|
3903
|
+
source_agent_slug: optionalText(job.source_agent_slug) || agent?.slug || null,
|
|
3904
|
+
model: optionalText(job.model) || optionalText(job.openai_request.model)
|
|
3905
|
+
});
|
|
3906
|
+
jobStarted = true;
|
|
3907
|
+
};
|
|
1871
3908
|
if (job.node_id !== this.config.nodeId) {
|
|
1872
|
-
|
|
3909
|
+
const result = {
|
|
1873
3910
|
job_id: job.job_id,
|
|
1874
3911
|
request_id: job.request_id,
|
|
1875
3912
|
status: "failed",
|
|
3913
|
+
pre_start_failure: true,
|
|
1876
3914
|
error: { code: "validation_failed", message: "job node_id does not match this node" }
|
|
1877
3915
|
};
|
|
3916
|
+
this.finishExecutionTelemetry({
|
|
3917
|
+
executionClass: "llm",
|
|
3918
|
+
startedAt,
|
|
3919
|
+
ok: false,
|
|
3920
|
+
code: "validation_failed"
|
|
3921
|
+
});
|
|
3922
|
+
return result;
|
|
1878
3923
|
}
|
|
1879
3924
|
try {
|
|
1880
3925
|
if (job.provider === "ollama") {
|
|
@@ -1887,7 +3932,8 @@ export class SelfHostedNodeRuntime {
|
|
|
1887
3932
|
options.num_predict = job.openai_request.max_tokens;
|
|
1888
3933
|
if (job.openai_request.stop !== undefined)
|
|
1889
3934
|
options.stop = job.openai_request.stop;
|
|
1890
|
-
|
|
3935
|
+
await acknowledgeStarted();
|
|
3936
|
+
const ollamaResult = await this.jobOllama.chat({
|
|
1891
3937
|
model: job.model || job.openai_request.model,
|
|
1892
3938
|
messages: job.openai_request.messages,
|
|
1893
3939
|
options,
|
|
@@ -1900,7 +3946,7 @@ export class SelfHostedNodeRuntime {
|
|
|
1900
3946
|
created: Math.floor(Date.now() / 1000),
|
|
1901
3947
|
model: job.openai_request.model,
|
|
1902
3948
|
choices: [
|
|
1903
|
-
{ index: 0, delta: { content:
|
|
3949
|
+
{ index: 0, delta: { content: ollamaResult.content }, finish_reason: null }
|
|
1904
3950
|
]
|
|
1905
3951
|
});
|
|
1906
3952
|
await emitOpenAIChunk({
|
|
@@ -1913,22 +3959,28 @@ export class SelfHostedNodeRuntime {
|
|
|
1913
3959
|
]
|
|
1914
3960
|
});
|
|
1915
3961
|
}
|
|
1916
|
-
|
|
3962
|
+
const invocationResult = {
|
|
1917
3963
|
job_id: job.job_id,
|
|
1918
3964
|
request_id: job.request_id,
|
|
1919
3965
|
status: "success",
|
|
1920
3966
|
openai_response: buildOpenAIChatCompletion({
|
|
1921
3967
|
requestId: job.request_id,
|
|
1922
3968
|
model: job.openai_request.model,
|
|
1923
|
-
content:
|
|
1924
|
-
promptTokens:
|
|
1925
|
-
completionTokens:
|
|
1926
|
-
metadata: { provider: "ollama", raw:
|
|
3969
|
+
content: ollamaResult.content,
|
|
3970
|
+
promptTokens: ollamaResult.promptTokens,
|
|
3971
|
+
completionTokens: ollamaResult.completionTokens,
|
|
3972
|
+
metadata: { provider: "ollama", raw: ollamaResult.raw }
|
|
1927
3973
|
}),
|
|
1928
3974
|
...(streamEvents.length ? { stream_events: streamEvents } : {}),
|
|
1929
3975
|
...(progressEvents.length ? { progress_events: progressEvents } : {}),
|
|
1930
3976
|
timing: { local_latency_ms: Date.now() - startedAt }
|
|
1931
3977
|
};
|
|
3978
|
+
this.finishExecutionTelemetry({
|
|
3979
|
+
executionClass: "llm",
|
|
3980
|
+
startedAt,
|
|
3981
|
+
ok: true
|
|
3982
|
+
});
|
|
3983
|
+
return invocationResult;
|
|
1932
3984
|
}
|
|
1933
3985
|
const taskPreview = messagesToPrompt(job.openai_request.messages);
|
|
1934
3986
|
if (!taskPreview) {
|
|
@@ -1938,6 +3990,7 @@ export class SelfHostedNodeRuntime {
|
|
|
1938
3990
|
selectedAgent = agent;
|
|
1939
3991
|
validateRequiredDocdexContext(job, options.attachedMswarmApiKey);
|
|
1940
3992
|
const attachedMswarmApiKey = attachedMswarmApiKeyForDocdex(job, options.attachedMswarmApiKey);
|
|
3993
|
+
await acknowledgeStarted(agent);
|
|
1941
3994
|
await recordProgress({
|
|
1942
3995
|
type: "agent_selected",
|
|
1943
3996
|
job_id: job.job_id,
|
|
@@ -1976,7 +4029,7 @@ export class SelfHostedNodeRuntime {
|
|
|
1976
4029
|
}
|
|
1977
4030
|
});
|
|
1978
4031
|
const tokens = usageTokens(response.usage);
|
|
1979
|
-
|
|
4032
|
+
const result = {
|
|
1980
4033
|
job_id: job.job_id,
|
|
1981
4034
|
request_id: job.request_id,
|
|
1982
4035
|
status: "success",
|
|
@@ -2002,6 +4055,12 @@ export class SelfHostedNodeRuntime {
|
|
|
2002
4055
|
...(progressEvents.length ? { progress_events: progressEvents } : {}),
|
|
2003
4056
|
timing: { local_latency_ms: Date.now() - startedAt }
|
|
2004
4057
|
};
|
|
4058
|
+
this.finishExecutionTelemetry({
|
|
4059
|
+
executionClass: "llm",
|
|
4060
|
+
startedAt,
|
|
4061
|
+
ok: true
|
|
4062
|
+
});
|
|
4063
|
+
return result;
|
|
2005
4064
|
}
|
|
2006
4065
|
catch (error) {
|
|
2007
4066
|
const message = redactRuntimeSecretValues(error instanceof Error ? error.message : String(error), [selectedAgent?.apiKey, options.attachedMswarmApiKey]);
|
|
@@ -2014,10 +4073,11 @@ export class SelfHostedNodeRuntime {
|
|
|
2014
4073
|
: /permission|policy|denied/i.test(message)
|
|
2015
4074
|
? "policy_denied"
|
|
2016
4075
|
: "upstream_error");
|
|
2017
|
-
|
|
4076
|
+
const result = {
|
|
2018
4077
|
job_id: job.job_id,
|
|
2019
4078
|
request_id: job.request_id,
|
|
2020
4079
|
status: "failed",
|
|
4080
|
+
...(!jobStarted ? { pre_start_failure: true } : {}),
|
|
2021
4081
|
error: {
|
|
2022
4082
|
code,
|
|
2023
4083
|
message
|
|
@@ -2026,6 +4086,13 @@ export class SelfHostedNodeRuntime {
|
|
|
2026
4086
|
...(progressEvents.length ? { progress_events: progressEvents } : {}),
|
|
2027
4087
|
timing: { local_latency_ms: Date.now() - startedAt }
|
|
2028
4088
|
};
|
|
4089
|
+
this.finishExecutionTelemetry({
|
|
4090
|
+
executionClass: "llm",
|
|
4091
|
+
startedAt,
|
|
4092
|
+
ok: false,
|
|
4093
|
+
code
|
|
4094
|
+
});
|
|
4095
|
+
return result;
|
|
2029
4096
|
}
|
|
2030
4097
|
}
|
|
2031
4098
|
async runOnce() {
|
|
@@ -2050,11 +4117,49 @@ export class SelfHostedNodeRuntime {
|
|
|
2050
4117
|
models = [];
|
|
2051
4118
|
version = null;
|
|
2052
4119
|
}
|
|
4120
|
+
const discoveryLatencyMs = Date.now() - startedAt;
|
|
4121
|
+
const capabilityPayload = await this.buildCapabilityHeartbeatPayload(enrollment.runtimeToken);
|
|
4122
|
+
const loadTelemetry = this.buildLoadTelemetry({
|
|
4123
|
+
models,
|
|
4124
|
+
discoveryLatencyMs,
|
|
4125
|
+
discoveryFailureCount: recentFailureCount,
|
|
4126
|
+
capabilityPayload
|
|
4127
|
+
});
|
|
4128
|
+
const exposedModelCount = models.filter((model) => model.exposed !== false).length;
|
|
4129
|
+
const loadReportingEnabled = this.config.loadReportingEnabled !== false;
|
|
4130
|
+
const capacityPayload = loadReportingEnabled
|
|
4131
|
+
? {
|
|
4132
|
+
protocol_version: loadTelemetry.runtime_protocol_version,
|
|
4133
|
+
runtime_protocol_version: loadTelemetry.runtime_protocol_version,
|
|
4134
|
+
load_balancer_protocol_version: loadTelemetry.load_balancer_protocol_version,
|
|
4135
|
+
catalog_metadata_version: loadTelemetry.catalog_metadata_version,
|
|
4136
|
+
catalog_fingerprint: loadTelemetry.catalog_fingerprint,
|
|
4137
|
+
max_concurrency: loadTelemetry.max_concurrency,
|
|
4138
|
+
max_concurrent_llm_jobs: loadTelemetry.max_concurrent_llm_jobs,
|
|
4139
|
+
max_concurrent_generic_jobs: loadTelemetry.max_concurrent_generic_jobs,
|
|
4140
|
+
active_jobs: loadTelemetry.active_jobs,
|
|
4141
|
+
queued_jobs: loadTelemetry.queued_jobs,
|
|
4142
|
+
free_slots: loadTelemetry.free_slots,
|
|
4143
|
+
drain_mode: loadTelemetry.drain_mode,
|
|
4144
|
+
execution_class_capacity: loadTelemetry.execution_class_capacity
|
|
4145
|
+
}
|
|
4146
|
+
: {
|
|
4147
|
+
active_jobs: loadTelemetry.active_jobs,
|
|
4148
|
+
queued_jobs: loadTelemetry.queued_jobs
|
|
4149
|
+
};
|
|
2053
4150
|
const heartbeatPayload = {
|
|
2054
4151
|
node_id: this.config.nodeId,
|
|
2055
4152
|
node_version: this.config.nodeVersion,
|
|
4153
|
+
runtime_protocol_version: SELF_HOSTED_RUNTIME_PROTOCOL_VERSION,
|
|
2056
4154
|
config_version: enrollment.state.config_version ?? null,
|
|
2057
4155
|
status,
|
|
4156
|
+
runtime: {
|
|
4157
|
+
protocol_version: SELF_HOSTED_RUNTIME_PROTOCOL_VERSION,
|
|
4158
|
+
relay_mode: this.config.relayMode || "outbound",
|
|
4159
|
+
load_reporting_enabled: loadReportingEnabled,
|
|
4160
|
+
hardware_telemetry_enabled: this.config.hardwareTelemetryEnabled === true,
|
|
4161
|
+
drain_mode: this.config.drainMode === true
|
|
4162
|
+
},
|
|
2058
4163
|
discovery: {
|
|
2059
4164
|
source: discoverySource,
|
|
2060
4165
|
mcoda_status: discoverySource === "mcoda" && status === "online" ? "ok" : status === "degraded" ? "error" : null
|
|
@@ -2068,19 +4173,24 @@ export class SelfHostedNodeRuntime {
|
|
|
2068
4173
|
status: null,
|
|
2069
4174
|
version: null
|
|
2070
4175
|
},
|
|
2071
|
-
capacity:
|
|
2072
|
-
active_jobs: 0,
|
|
2073
|
-
queued_jobs: 0
|
|
2074
|
-
},
|
|
4176
|
+
capacity: capacityPayload,
|
|
2075
4177
|
health: {
|
|
2076
|
-
avg_latency_ms:
|
|
2077
|
-
recent_failure_count:
|
|
4178
|
+
avg_latency_ms: loadTelemetry.avg_latency_ms ?? discoveryLatencyMs,
|
|
4179
|
+
recent_failure_count: loadTelemetry.recent_failure_count,
|
|
4180
|
+
recent_failures: loadTelemetry.recent_failures,
|
|
2078
4181
|
last_success_at: status === "online" ? new Date().toISOString() : null
|
|
2079
4182
|
},
|
|
2080
|
-
|
|
4183
|
+
local_agent_catalog: {
|
|
4184
|
+
revision: loadTelemetry.catalog_fingerprint,
|
|
4185
|
+
metadata_version: loadTelemetry.catalog_metadata_version,
|
|
4186
|
+
model_count: models.length,
|
|
4187
|
+
exposed_model_count: exposedModelCount
|
|
4188
|
+
},
|
|
4189
|
+
models,
|
|
4190
|
+
capabilities: capabilityPayload,
|
|
4191
|
+
...(loadTelemetry.hardware_pressure ? { hardware_pressure: loadTelemetry.hardware_pressure } : {})
|
|
2081
4192
|
};
|
|
2082
4193
|
const heartbeatResponse = await this.gateway.heartbeat(enrollment.runtimeToken, heartbeatPayload);
|
|
2083
|
-
const exposedModelCount = models.filter((model) => model.exposed !== false).length;
|
|
2084
4194
|
return {
|
|
2085
4195
|
enrolled: enrollment.enrolled,
|
|
2086
4196
|
status,
|
|
@@ -2088,6 +4198,7 @@ export class SelfHostedNodeRuntime {
|
|
|
2088
4198
|
discovery_source: discoverySource,
|
|
2089
4199
|
mcoda_agent_count: discoverySource === "mcoda" ? exposedModelCount : undefined,
|
|
2090
4200
|
ollama_version: version,
|
|
4201
|
+
capacity: loadTelemetry,
|
|
2091
4202
|
heartbeat_response: heartbeatResponse
|
|
2092
4203
|
};
|
|
2093
4204
|
}
|
|
@@ -2122,9 +4233,17 @@ export class SelfHostedNodeRuntime {
|
|
|
2122
4233
|
}
|
|
2123
4234
|
async pollAndExecuteJob(waitMs = DEFAULT_JOB_POLL_WAIT_MS) {
|
|
2124
4235
|
const enrollment = await this.ensureEnrolled();
|
|
4236
|
+
const pollCapacity = this.buildLoadTelemetry({ models: [] });
|
|
2125
4237
|
const response = await this.gateway.pollJob(enrollment.runtimeToken, {
|
|
2126
4238
|
node_id: this.config.nodeId,
|
|
2127
|
-
capacity: {
|
|
4239
|
+
capacity: {
|
|
4240
|
+
active_jobs: pollCapacity.active_jobs,
|
|
4241
|
+
queued_jobs: pollCapacity.queued_jobs,
|
|
4242
|
+
max_jobs: pollCapacity.max_concurrency,
|
|
4243
|
+
max_concurrency: pollCapacity.max_concurrency,
|
|
4244
|
+
free_slots: pollCapacity.free_slots,
|
|
4245
|
+
drain_mode: pollCapacity.drain_mode
|
|
4246
|
+
},
|
|
2128
4247
|
wait_ms: waitMs
|
|
2129
4248
|
});
|
|
2130
4249
|
const job = response.job || null;
|
|
@@ -2152,6 +4271,14 @@ export class SelfHostedNodeRuntime {
|
|
|
2152
4271
|
};
|
|
2153
4272
|
const result = await this.executeJob(job, {
|
|
2154
4273
|
attachedMswarmApiKey: optionalText(response.attached_mswarm_api_key) || undefined,
|
|
4274
|
+
onStarted: async (event) => {
|
|
4275
|
+
await this.gateway.postJobStart(enrollment.runtimeToken, job.job_id, {
|
|
4276
|
+
node_id: this.config.nodeId,
|
|
4277
|
+
agent_slug: event.agent_slug || job.agent_slug,
|
|
4278
|
+
source_agent_slug: event.source_agent_slug || job.source_agent_slug || null,
|
|
4279
|
+
model: event.model || job.model || job.openai_request.model
|
|
4280
|
+
});
|
|
4281
|
+
},
|
|
2155
4282
|
onOpenAIChunk: async (chunk) => {
|
|
2156
4283
|
if (job.openai_request.stream !== true || streamEventForwardingFailed) {
|
|
2157
4284
|
return;
|