@mcoda/mswarm 0.1.75 → 0.1.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -0
- package/dist/invocation-token.d.ts +48 -0
- package/dist/invocation-token.d.ts.map +1 -1
- package/dist/invocation-token.js +109 -0
- package/dist/invocation-token.js.map +1 -1
- package/dist/runtime.d.ts +109 -0
- package/dist/runtime.d.ts.map +1 -1
- package/dist/runtime.js +1730 -6
- package/dist/runtime.js.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +1401 -4
- package/dist/server.js.map +1 -1
- package/package.json +4 -4
package/dist/runtime.js
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
import { chmod, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
2
|
-
import { dirname, join } from "node:path";
|
|
1
|
+
import { chmod, lstat, mkdir, readdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
2
|
+
import { dirname, isAbsolute, join, relative, resolve } from "node:path";
|
|
3
3
|
import { hostname, homedir, platform, userInfo } from "node:os";
|
|
4
4
|
import { spawn } from "node:child_process";
|
|
5
|
-
import { createHash, randomUUID } from "node:crypto";
|
|
5
|
+
import { createHash, createHmac, randomUUID } from "node:crypto";
|
|
6
6
|
import { MswarmCodaliExecutor } from "./codali-executor.js";
|
|
7
|
+
import { MSWARM_CAPABILITY_SCHEMA_VERSION, assertMswarmSafeRelativePath, validateMswarmArchiveEntry, buildMswarmCapabilityNames, buildMswarmPrivateCapabilityCatalogEntry, buildMswarmLocalArtifactUri, buildMswarmSandboxProfile, defaultMswarmArtifactAccessPolicy, defaultMswarmArtifactRetentionPolicy, projectMswarmPublicCapabilities, validateMswarmGenericJobRequest } from "@mcoda/shared";
|
|
7
8
|
const DEFAULT_GATEWAY_BASE_URL = "http://127.0.0.1:8080";
|
|
8
9
|
const DEFAULT_SETUP_GATEWAY_BASE_URL = "https://api.mswarm.org";
|
|
9
10
|
const DEFAULT_OLLAMA_BASE_URL = "http://127.0.0.1:11434";
|
|
@@ -14,11 +15,86 @@ const DEFAULT_SELF_HOSTED_NODE_VERSION = "0.1.70";
|
|
|
14
15
|
const DEFAULT_REQUEST_TIMEOUT_MS = 10000;
|
|
15
16
|
const DEFAULT_JOB_TIMEOUT_MS = 3600000;
|
|
16
17
|
const DEFAULT_SERVICE_COMMAND_TIMEOUT_MS = 60000;
|
|
18
|
+
const DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS = 2000;
|
|
17
19
|
const DEFAULT_MCODA_BIN = "mcoda";
|
|
18
20
|
const DEFAULT_MCODA_LIST_ARGS = ["agent", "list", "--json", "--refresh-health"];
|
|
19
21
|
const DEFAULT_COMMAND_MAX_BUFFER = 16 * 1024 * 1024;
|
|
22
|
+
const DEFAULT_LOCAL_ARTIFACT_MAX_BYTES = 512 * 1024 * 1024;
|
|
20
23
|
const DEFAULT_JOB_POLL_WAIT_MS = 25000;
|
|
21
24
|
const DEFAULT_STREAM_EVENT_BATCH_SIZE = 8;
|
|
25
|
+
// Job type / runner identifier pairs for the owner-local generic jobs declared below.
const OWNER_LOCAL_TEST_ECHO_JOB_TYPE = "tenant.test-echo";
const TEST_ECHO_RUNNER_ID = "test.echo";
const RENDER_BLENDER_JOB_TYPE = "render.blender";
const BLENDER_RENDER_RUNNER_ID = "blender.render";
const CUDA_RUN_JOB_TYPE = "cuda.run";
const CUDA_PACKAGE_RUNNER_ID = "cuda.package";

// Container images listed in the cuda.run policy's `allowed_images`.
const APPROVED_NVIDIA_CUDA_IMAGES = new Set(["nvidia/cuda:12.4.1-devel-ubuntu22.04"]);

/**
 * Catalog entries for each owner-local generic job type: the JSON schema of
 * its `args`, its policy, the runner id, and (where present) the capabilities
 * it requires.
 */
const OWNER_LOCAL_GENERIC_JOB_CATALOG = [
    // Echo job: the only entry whose schema allows additional properties.
    {
        job_type: OWNER_LOCAL_TEST_ECHO_JOB_TYPE,
        args_schema: {
            type: "object",
            additionalProperties: true,
            properties: {
                message: { type: "string" },
                delay_ms: { type: "number", minimum: 0 },
                repeat: { type: "number", minimum: 1 },
                fail: { type: "boolean" },
            },
        },
        policy: { trust_mode: "owner-local", network: "none", allow_raw_command: false },
        runner: TEST_ECHO_RUNNER_ID,
    },
    // Blender render job: closed argument set.
    {
        job_type: RENDER_BLENDER_JOB_TYPE,
        args_schema: {
            type: "object",
            additionalProperties: false,
            properties: {
                frames: { type: ["string", "number"] },
                engine: { enum: ["cycles", "eevee", "workbench"] },
                resolution: { type: "string", pattern: "^[1-9][0-9]{0,4}x[1-9][0-9]{0,4}$" },
                output_format: { enum: ["png", "jpeg", "open_exr"] },
                scene: { type: "string" },
                camera: { type: "string" },
            },
        },
        policy: { trust_mode: "owner-local", network: "none", allow_raw_command: false },
        runner: BLENDER_RENDER_RUNNER_ID,
        required_capabilities: ["software.blender"],
    },
    // CUDA package job: all three arguments are required.
    {
        job_type: CUDA_RUN_JOB_TYPE,
        args_schema: {
            type: "object",
            additionalProperties: false,
            required: ["manifest_path", "profile", "target"],
            properties: {
                manifest_path: { type: "string" },
                profile: { type: "string" },
                target: { type: "string" },
            },
        },
        policy: {
            trust_mode: "owner-local",
            network: "none",
            allow_raw_command: false,
            allowed_images: [...APPROVED_NVIDIA_CUDA_IMAGES],
        },
        runner: CUDA_PACKAGE_RUNNER_ID,
        required_capabilities: ["gpu.nvidia", "software.docker", "docker.nvidia"],
    },
];
|
|
22
98
|
const SERVICE_LABEL = "com.mcoda.mswarm.self-hosted-node";
|
|
23
99
|
const SYSTEMD_SERVICE_NAME = "mswarm-self-hosted-node.service";
|
|
24
100
|
const WINDOWS_TASK_NAME = "MswarmSelfHostedNode";
|
|
@@ -327,6 +403,9 @@ function defaultStatePath() {
|
|
|
327
403
|
function defaultRuntimeTokenPath() {
|
|
328
404
|
return join(homedir(), ".mswarm", "self-hosted-node", "node.key");
|
|
329
405
|
}
|
|
406
|
+
/**
 * Default on-disk location of the local artifact store:
 * `<home>/.mswarm/self-hosted-node/artifacts`.
 *
 * @returns {string} Path built from the current user's home directory.
 */
function defaultArtifactStorePath() {
    const nodeHome = join(homedir(), ".mswarm", "self-hosted-node");
    return join(nodeHome, "artifacts");
}
|
|
330
409
|
export async function readOrCreateSelfHostedMachineId(machineIdPath = defaultMachineIdPath()) {
|
|
331
410
|
try {
|
|
332
411
|
const existing = (await readFile(machineIdPath, "utf8")).trim();
|
|
@@ -537,6 +616,7 @@ function serviceEnvironment(config, env, homeDir) {
|
|
|
537
616
|
MSWARM_GATEWAY_BASE_URL: config.gatewayBaseUrl,
|
|
538
617
|
MSWARM_SELF_HOSTED_NODE_STATE_PATH: config.statePath,
|
|
539
618
|
MSWARM_SELF_HOSTED_NODE_KEY_PATH: config.runtimeTokenPath,
|
|
619
|
+
MSWARM_SELF_HOSTED_ARTIFACT_STORE_PATH: config.artifactStorePath || null,
|
|
540
620
|
MSWARM_SELF_HOSTED_RELAY_MODE: config.relayMode || "outbound",
|
|
541
621
|
MSWARM_SELF_HOSTED_DIRECT_BASE_URL: config.directBaseUrl || null,
|
|
542
622
|
MSWARM_SELF_HOSTED_DISCOVERY_MODE: config.discoveryMode,
|
|
@@ -550,7 +630,13 @@ function serviceEnvironment(config, env, homeDir) {
|
|
|
550
630
|
MSWARM_SELF_HOSTED_MODEL_BLOCKLIST: config.modelBlocklist.join(","),
|
|
551
631
|
MSWARM_SELF_HOSTED_HEARTBEAT_INTERVAL_SECONDS: String(config.heartbeatIntervalSeconds),
|
|
552
632
|
MSWARM_SELF_HOSTED_REQUEST_TIMEOUT_MS: String(config.requestTimeoutMs),
|
|
553
|
-
MSWARM_SELF_HOSTED_JOB_TIMEOUT_MS: String(config.jobTimeoutMs)
|
|
633
|
+
MSWARM_SELF_HOSTED_JOB_TIMEOUT_MS: String(config.jobTimeoutMs),
|
|
634
|
+
MSWARM_SELF_HOSTED_GENERIC_JOBS_ENABLED: config.genericJobsEnabled ? "true" : "false",
|
|
635
|
+
MSWARM_SELF_HOSTED_GENERIC_JOB_TIMEOUT_MS: String(config.genericJobTimeoutMs),
|
|
636
|
+
MSWARM_SELF_HOSTED_GENERIC_JOB_MAX_CONCURRENCY: String(config.genericJobMaxConcurrency),
|
|
637
|
+
MSWARM_SELF_HOSTED_CAPABILITY_PROBE_TIMEOUT_MS: config.capabilityProbeTimeoutMs
|
|
638
|
+
? String(config.capabilityProbeTimeoutMs)
|
|
639
|
+
: null
|
|
554
640
|
};
|
|
555
641
|
return Object.fromEntries(Object.entries(values).filter((entry) => typeof entry[1] === "string" && entry[1] !== ""));
|
|
556
642
|
}
|
|
@@ -970,6 +1056,9 @@ export async function readSelfHostedNodeConfig(env = process.env) {
|
|
|
970
1056
|
ollamaBaseUrl: trimTrailingSlash(ollamaBaseUrl),
|
|
971
1057
|
statePath,
|
|
972
1058
|
runtimeTokenPath,
|
|
1059
|
+
artifactStorePath: optionalText(env.MSWARM_SELF_HOSTED_ARTIFACT_STORE_PATH) ||
|
|
1060
|
+
state.artifact_store_path ||
|
|
1061
|
+
defaultArtifactStorePath(),
|
|
973
1062
|
invocationSigningSecret: optionalText(env.MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET) ||
|
|
974
1063
|
optionalText(env.MSWARM_SELF_HOSTED_RELAY_SIGNING_SECRET),
|
|
975
1064
|
listenHost: optionalText(env.MSWARM_SELF_HOSTED_LISTEN_HOST) || DEFAULT_LISTEN_HOST,
|
|
@@ -981,6 +1070,10 @@ export async function readSelfHostedNodeConfig(env = process.env) {
|
|
|
981
1070
|
heartbeatIntervalSeconds: parsePositiveInteger(env.MSWARM_SELF_HOSTED_HEARTBEAT_INTERVAL_SECONDS, state.heartbeat_interval_seconds || DEFAULT_HEARTBEAT_INTERVAL_SECONDS),
|
|
982
1071
|
requestTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_REQUEST_TIMEOUT_MS, state.request_timeout_ms || DEFAULT_REQUEST_TIMEOUT_MS),
|
|
983
1072
|
jobTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_JOB_TIMEOUT_MS, state.job_timeout_ms || DEFAULT_JOB_TIMEOUT_MS),
|
|
1073
|
+
genericJobsEnabled: parseBoolean(env.MSWARM_SELF_HOSTED_GENERIC_JOBS_ENABLED ?? env.MSWARM_SELF_HOSTED_GENERIC_JOBS, state.generic_jobs_enabled === true),
|
|
1074
|
+
genericJobTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_GENERIC_JOB_TIMEOUT_MS, state.generic_job_timeout_ms || state.job_timeout_ms || DEFAULT_JOB_TIMEOUT_MS),
|
|
1075
|
+
genericJobMaxConcurrency: parsePositiveInteger(env.MSWARM_SELF_HOSTED_GENERIC_JOB_MAX_CONCURRENCY, state.generic_job_max_concurrency || 1),
|
|
1076
|
+
capabilityProbeTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_CAPABILITY_PROBE_TIMEOUT_MS, state.capability_probe_timeout_ms || DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS),
|
|
984
1077
|
exposeAllModels: resolveDaemonExposeAllModels(env, state),
|
|
985
1078
|
modelAllowlist: parseList(env.MSWARM_SELF_HOSTED_MODEL_ALLOWLIST || state.model_allowlist),
|
|
986
1079
|
modelBlocklist: parseList(env.MSWARM_SELF_HOSTED_MODEL_BLOCKLIST || state.model_blocklist)
|
|
@@ -1022,6 +1115,9 @@ export async function readOwnerSetupConfig(argv = process.argv.slice(3), env = p
|
|
|
1022
1115
|
discoveryMode: parseDiscoveryMode(env.MSWARM_SELF_HOSTED_DISCOVERY_MODE),
|
|
1023
1116
|
statePath,
|
|
1024
1117
|
runtimeTokenPath,
|
|
1118
|
+
artifactStorePath: optionalText(options["artifact-store-path"]) ||
|
|
1119
|
+
optionalText(env.MSWARM_SELF_HOSTED_ARTIFACT_STORE_PATH) ||
|
|
1120
|
+
defaultArtifactStorePath(),
|
|
1025
1121
|
machineIdPath: optionalText(env.MSWARM_SELF_HOSTED_MACHINE_ID_PATH) || defaultMachineIdPath(),
|
|
1026
1122
|
mcodaBin: optionalText(env.MSWARM_SELF_HOSTED_MCODA_BIN) || DEFAULT_MCODA_BIN,
|
|
1027
1123
|
mcodaListArgs: parseArgs(env.MSWARM_SELF_HOSTED_MCODA_LIST_ARGS, DEFAULT_MCODA_LIST_ARGS),
|
|
@@ -1030,6 +1126,10 @@ export async function readOwnerSetupConfig(argv = process.argv.slice(3), env = p
|
|
|
1030
1126
|
heartbeatIntervalSeconds: parsePositiveInteger(env.MSWARM_SELF_HOSTED_HEARTBEAT_INTERVAL_SECONDS, DEFAULT_HEARTBEAT_INTERVAL_SECONDS),
|
|
1031
1127
|
requestTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_REQUEST_TIMEOUT_MS, DEFAULT_REQUEST_TIMEOUT_MS),
|
|
1032
1128
|
jobTimeoutMs: parsePositiveInteger(options["job-timeout-ms"] || env.MSWARM_SELF_HOSTED_JOB_TIMEOUT_MS, DEFAULT_JOB_TIMEOUT_MS),
|
|
1129
|
+
genericJobsEnabled: parseBoolean(options["enable-generic-jobs"] || env.MSWARM_SELF_HOSTED_GENERIC_JOBS_ENABLED || env.MSWARM_SELF_HOSTED_GENERIC_JOBS, false),
|
|
1130
|
+
genericJobTimeoutMs: parsePositiveInteger(options["generic-job-timeout-ms"] || env.MSWARM_SELF_HOSTED_GENERIC_JOB_TIMEOUT_MS, DEFAULT_JOB_TIMEOUT_MS),
|
|
1131
|
+
genericJobMaxConcurrency: parsePositiveInteger(options["generic-job-max-concurrency"] || env.MSWARM_SELF_HOSTED_GENERIC_JOB_MAX_CONCURRENCY, 1),
|
|
1132
|
+
capabilityProbeTimeoutMs: parsePositiveInteger(env.MSWARM_SELF_HOSTED_CAPABILITY_PROBE_TIMEOUT_MS, DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS),
|
|
1033
1133
|
exposeAllModels: resolveOwnerSetupExposeAllModels(options, env),
|
|
1034
1134
|
modelAllowlist: allowlist,
|
|
1035
1135
|
modelBlocklist: blocklist,
|
|
@@ -1148,10 +1248,17 @@ async function defaultCommandRunner(command, args, options) {
|
|
|
1148
1248
|
let stdout = "";
|
|
1149
1249
|
let stderr = "";
|
|
1150
1250
|
let settled = false;
|
|
1251
|
+
const abort = () => {
|
|
1252
|
+
if (settled)
|
|
1253
|
+
return;
|
|
1254
|
+
child.kill("SIGTERM");
|
|
1255
|
+
finish(new Error("command aborted"));
|
|
1256
|
+
};
|
|
1151
1257
|
const timer = setTimeout(() => {
|
|
1152
1258
|
if (settled)
|
|
1153
1259
|
return;
|
|
1154
1260
|
settled = true;
|
|
1261
|
+
options.signal?.removeEventListener("abort", abort);
|
|
1155
1262
|
child.kill("SIGTERM");
|
|
1156
1263
|
reject(new Error(`command timed out after ${options.timeoutMs}ms: ${command}`));
|
|
1157
1264
|
}, options.timeoutMs);
|
|
@@ -1160,6 +1267,7 @@ async function defaultCommandRunner(command, args, options) {
|
|
|
1160
1267
|
return;
|
|
1161
1268
|
settled = true;
|
|
1162
1269
|
clearTimeout(timer);
|
|
1270
|
+
options.signal?.removeEventListener("abort", abort);
|
|
1163
1271
|
if (error) {
|
|
1164
1272
|
reject(error);
|
|
1165
1273
|
return;
|
|
@@ -1188,6 +1296,11 @@ async function defaultCommandRunner(command, args, options) {
|
|
|
1188
1296
|
}
|
|
1189
1297
|
finish();
|
|
1190
1298
|
});
|
|
1299
|
+
if (options.signal?.aborted) {
|
|
1300
|
+
abort();
|
|
1301
|
+
return;
|
|
1302
|
+
}
|
|
1303
|
+
options.signal?.addEventListener("abort", abort, { once: true });
|
|
1191
1304
|
if (options.input) {
|
|
1192
1305
|
child.stdin.write(options.input);
|
|
1193
1306
|
}
|
|
@@ -1530,6 +1643,1387 @@ function buildCodaliPolicy(job) {
|
|
|
1530
1643
|
maxOutputTokens: job.policy?.max_output_tokens ?? job.openai_request.max_tokens,
|
|
1531
1644
|
};
|
|
1532
1645
|
}
|
|
1646
|
+
/**
 * Reads a numeric job argument, returning `fallback` when it is missing or
 * not a finite number.
 *
 * Only numbers and non-blank strings are converted. `Number()` turns `null`,
 * `""`, booleans and empty arrays into finite values (0 or 1), which would
 * bypass the caller's fallback for an argument that was never supplied.
 *
 * @param {unknown} value - Raw argument value.
 * @param {number} fallback - Value returned when `value` is unusable.
 * @returns {number} The parsed finite number, or `fallback`.
 */
function numberArg(value, fallback) {
    const isNumber = typeof value === "number";
    const isNonBlankString = typeof value === "string" && value.trim() !== "";
    if (!isNumber && !isNonBlankString) {
        return fallback;
    }
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : fallback;
}
|
|
1650
|
+
/**
 * Converts a raw argument into a whole number of milliseconds clamped to
 * the range `[0, max]`.
 *
 * @param {unknown} value - Raw argument value, parsed via `numberArg`.
 * @param {number} fallback - Used when `value` is not a finite number.
 * @param {number} max - Upper bound in milliseconds.
 * @returns {number} Floored, clamped duration.
 */
function boundedMilliseconds(value, fallback, max) {
    const wholeMs = Math.floor(numberArg(value, fallback));
    const capped = Math.min(max, wholeMs);
    return Math.max(0, capped);
}
|
|
1653
|
+
/**
 * Classifies an abort by the signal's reason.
 *
 * @param {{ reason?: unknown }} signal - Abort signal; only `reason` is read.
 * @returns {"timeout" | "cancelled"} `"timeout"` when the reason is exactly
 *   the string `"timeout"`, otherwise `"cancelled"`.
 */
function abortErrorCode(signal) {
    if (signal.reason === "timeout") {
        return "timeout";
    }
    return "cancelled";
}
|
|
1656
|
+
/**
 * Error message matching the abort classification from `abortErrorCode`.
 *
 * @param {{ reason?: unknown }} signal - Abort signal.
 * @returns {string} `"generic job timed out"` or `"generic job cancelled"`.
 */
function abortErrorMessage(signal) {
    const timedOut = abortErrorCode(signal) === "timeout";
    if (timedOut) {
        return "generic job timed out";
    }
    return "generic job cancelled";
}
|
|
1659
|
+
/**
 * Waits `ms` milliseconds, rejecting early if `signal` aborts.
 *
 * Returns immediately, without inspecting the signal, when `ms <= 0`.
 *
 * @param {number} ms - Delay in milliseconds.
 * @param {AbortSignal} signal - Abort signal observed during the wait.
 * @returns {Promise<void>}
 * @throws {Error} With the `abortErrorMessage` text when the signal is
 *   already aborted or aborts before the delay elapses.
 */
async function sleepWithAbort(ms, signal) {
    if (ms <= 0) {
        return;
    }
    if (signal.aborted) {
        throw new Error(abortErrorMessage(signal));
    }
    await new Promise((wake, fail) => {
        let timer;
        // Whichever of timer/abort fires first tears down the other.
        function release() {
            clearTimeout(timer);
            signal.removeEventListener("abort", handleAbort);
        }
        function handleAbort() {
            release();
            fail(new Error(abortErrorMessage(signal)));
        }
        timer = setTimeout(() => {
            release();
            wake();
        }, ms);
        signal.addEventListener("abort", handleAbort, { once: true });
    });
}
|
|
1681
|
+
/**
 * Turns a job id into a name usable as a single directory component.
 *
 * Every character outside `[a-zA-Z0-9_.-]` becomes `_`; an empty result is
 * replaced by `"job"`. The candidate is then passed through
 * `assertMswarmSafeRelativePath` and that function's return value is returned.
 *
 * @param {string} jobId - Job identifier.
 * @returns {string} Value returned by `assertMswarmSafeRelativePath`.
 */
function safeLocalArtifactJobId(jobId) {
    const sanitized = jobId.replace(/[^a-zA-Z0-9_.-]/g, "_");
    const candidate = sanitized === "" ? "job" : sanitized;
    return assertMswarmSafeRelativePath(candidate, "job_id");
}
|
|
1685
|
+
/**
 * Turns an artifact name into a file name usable inside a job directory.
 *
 * Every character outside `[a-zA-Z0-9_.-]` becomes `_`. When `value` is
 * missing, not a string, or sanitizes to an empty string, `fallback` is used.
 * Previously a non-string `value` raised a TypeError on `.replace`, so the
 * fallback could never apply to an absent name.
 *
 * @param {unknown} value - Artifact name supplied with the job.
 * @param {string} fallback - Name used when `value` yields nothing usable.
 * @returns {string} Value returned by `assertMswarmSafeRelativePath`.
 */
function safeLocalArtifactName(value, fallback) {
    const raw = typeof value === "string" ? value : "";
    const normalized = raw.replace(/[^a-zA-Z0-9_.-]/g, "_") || fallback;
    return assertMswarmSafeRelativePath(normalized, "artifact_name");
}
|
|
1689
|
+
/**
 * Resolves `relativePath` against `root` and guarantees the result stays
 * inside `root`.
 *
 * The path escapes only when the normalized relative path from root to target
 * is exactly `..`, begins with a `..` path segment, or is absolute (which
 * `path.relative` yields across Windows drives). A plain `startsWith("..")`
 * test would also reject legitimate in-root entries such as `..cache/file`.
 *
 * @param {string} root - Directory the result must stay within.
 * @param {string} relativePath - Path to resolve beneath `root`.
 * @returns {string} Absolute path inside (or equal to) the resolved root.
 * @throws {Error} `path_escape_not_allowed` when the target is outside root.
 */
function resolveWithinRoot(root, relativePath) {
    const rootPath = resolve(root);
    const target = resolve(rootPath, relativePath);
    const delta = relative(rootPath, target);
    // Both separators are matched so the check is the same on POSIX and Windows.
    const climbsOut = delta === ".." || /^\.\.[\\/]/.test(delta);
    if (climbsOut || isAbsolute(delta)) {
        throw new Error("path_escape_not_allowed");
    }
    return target;
}
|
|
1698
|
+
/**
 * SHA-256 digest of `buffer` as a lowercase hexadecimal string.
 *
 * @param {string | Buffer | Uint8Array} buffer - Data to hash.
 * @returns {string} 64-character hex digest.
 */
function sha256Hex(buffer) {
    const hasher = createHash("sha256");
    hasher.update(buffer);
    return hasher.digest("hex");
}
|
|
1701
|
+
/**
 * Picks the tightest byte limit among the supplied candidates.
 *
 * Only finite numbers greater than zero count; everything else is ignored.
 *
 * @param {...unknown} values - Candidate limits.
 * @returns {number} The smallest valid candidate, or
 *   `DEFAULT_LOCAL_ARTIFACT_MAX_BYTES` when none is valid.
 */
function positiveByteLimit(...values) {
    let smallest = Infinity;
    let found = false;
    for (const candidate of values) {
        const usable = typeof candidate === "number" && Number.isFinite(candidate) && candidate > 0;
        if (!usable) {
            continue;
        }
        found = true;
        if (candidate < smallest) {
            smallest = candidate;
        }
    }
    return found ? smallest : DEFAULT_LOCAL_ARTIFACT_MAX_BYTES;
}
|
|
1705
|
+
/**
 * Parses an `artifact://local/<jobId>/<path...>` URI.
 *
 * The pathname is percent-decoded and split on `/`; the first non-empty
 * segment is the job id and the remaining segments form the artifact path.
 * Both parts are passed through `assertMswarmSafeRelativePath`.
 *
 * @param {string} uri - Candidate artifact URI.
 * @returns {{ jobId: string, path: string } | null} The parsed parts, or
 *   `null` when the URI cannot be parsed, is not `artifact://local/...`, has
 *   fewer than two path segments, or any step (including validation) throws.
 */
function parseLocalArtifactUri(uri) {
    try {
        const { protocol, hostname, pathname } = new URL(uri);
        const isLocalArtifact = protocol === "artifact:" && hostname === "local";
        if (!isLocalArtifact) {
            return null;
        }
        const segments = decodeURIComponent(pathname)
            .split("/")
            .filter((segment) => segment.length > 0);
        if (segments.length < 2) {
            return null;
        }
        const jobId = segments[0];
        const artifactPath = segments.slice(1).join("/");
        return {
            jobId: assertMswarmSafeRelativePath(jobId, "artifact_job_id"),
            path: assertMswarmSafeRelativePath(artifactPath, "artifact_path")
        };
    }
    catch {
        // Any failure means "not a usable local artifact URI".
        return null;
    }
}
|
|
1725
|
+
/**
 * Rejects a path whose ancestor directories (every component except the last)
 * include a symbolic link.
 *
 * `lstat` on the full path refuses to follow only the final component, so a
 * symlinked intermediate directory would otherwise let a path that looks like
 * it is under `rootDir` resolve somewhere else on disk.
 *
 * @param {string} rootDir - Directory the relative path is anchored at.
 * @param {string} relativePath - Relative path; `/` and `\` both separate segments.
 * @param {string} errorCode - Message of the Error thrown on a symlinked ancestor.
 * @returns {Promise<void>} Resolves when no ancestor is a symlink; also
 *   resolves when an ancestor does not exist, leaving the caller's own
 *   `lstat` to report ENOENT.
 */
async function assertNoSymlinkedAncestors(rootDir, relativePath, errorCode) {
    const ancestors = relativePath.split(/[\\/]+/).filter(Boolean).slice(0, -1);
    let current = rootDir;
    for (const segment of ancestors) {
        current = join(current, segment);
        let info;
        try {
            info = await lstat(current);
        }
        catch (error) {
            if (error.code === "ENOENT") {
                return;
            }
            throw error;
        }
        if (info.isSymbolicLink()) {
            throw new Error(errorCode);
        }
    }
}
/**
 * File-system backed artifact store used for generic jobs.
 *
 * Each job gets `<rootDir>/<safeJobId>/inputs` and `.../outputs`. Inputs are
 * copied in from other jobs' `outputs` directories when referenced through an
 * `artifact://local/...` URI; declared outputs are collected after the job
 * and described with their size and SHA-256.
 */
export class MswarmLocalArtifactStore {
    /**
     * @param {{ rootDir?: string, now?: () => Date }} [input] - `rootDir`
     *   defaults to `defaultArtifactStorePath()`; `now` supplies timestamps
     *   and defaults to the current time.
     */
    constructor(input = {}) {
        this.rootDir = input.rootDir || defaultArtifactStorePath();
        this.now = input.now || (() => new Date());
    }
    /**
     * Recreates the job's working directory, materializes its inputs and
     * validates its declared outputs.
     *
     * Any existing directory for the job is removed first.
     *
     * @param {string} jobId - Job identifier.
     * @param {object} job - Job request; reads `inputs`, `outputs`, `policy`,
     *   `limits`, `job_type` and `resources`.
     * @returns {Promise<object>} `{ store, workDir, inputDir, outputDir,
     *   registeredInputs, outputSpecs, sandbox }`.
     */
    async prepareJobWorkspace(jobId, job) {
        const safeJobId = safeLocalArtifactJobId(jobId);
        const workDir = resolveWithinRoot(this.rootDir, safeJobId);
        const inputDir = resolveWithinRoot(workDir, "inputs");
        const outputDir = resolveWithinRoot(workDir, "outputs");
        await rm(workDir, { recursive: true, force: true });
        await mkdir(inputDir, { recursive: true });
        await mkdir(outputDir, { recursive: true });
        const store = {
            backend: "local-dev",
            root_uri: `artifact://local/${safeJobId}`
        };
        const registeredInputs = await Promise.all((job.inputs || []).map((input, index) => this.registerInput(jobId, job, input, index, inputDir, store)));
        const outputSpecs = (job.outputs || []).map((output) => ({
            ...output,
            path: assertMswarmSafeRelativePath(output.path, "output_path")
        }));
        const sandbox = buildMswarmSandboxProfile({
            policy: job.policy,
            limits: job.limits,
            containerized: job.policy.trust_mode === "tenant-owned" || job.job_type === CUDA_RUN_JOB_TYPE,
            gpu: job.resources?.gpu ? "nvidia" : "none"
        });
        return {
            store,
            workDir,
            inputDir,
            outputDir,
            registeredInputs,
            outputSpecs,
            sandbox
        };
    }
    /**
     * Collects every declared output of a finished job.
     *
     * @param {object} context - Workspace returned by `prepareJobWorkspace`.
     * @param {string} jobId - Job identifier.
     * @returns {Promise<object[]>} Artifact descriptors in declaration order.
     * @throws {Error} `output_size_limit_exceeded` once the running total of
     *   artifact sizes exceeds the sandbox `max_output_bytes` limit.
     */
    async collectOutputs(context, jobId) {
        const artifacts = [];
        let totalBytes = 0;
        for (const output of context.outputSpecs) {
            const collected = await this.collectDeclaredOutput(context, jobId, output);
            for (const artifact of collected) {
                totalBytes += artifact.size_bytes || 0;
                const totalLimit = positiveByteLimit(context.sandbox.limits.max_output_bytes);
                if (totalBytes > totalLimit) {
                    throw new Error("output_size_limit_exceeded");
                }
                artifacts.push(artifact);
            }
        }
        return artifacts;
    }
    /**
     * Registers one job input and, when it references a local artifact,
     * copies that file into the job's input directory.
     *
     * A missing source file is tolerated unless `input.required === true`.
     * An input whose URI is not a local artifact URI is registered without a
     * `local_path`, unless it is required.
     *
     * @param {string} jobId - Job identifier.
     * @param {object} job - Job request; reads `policy`.
     * @param {object} input - Input spec; reads `name`, `mount_path`,
     *   `required` and `artifact`.
     * @param {number} index - Position of the input, used for the fallback name.
     * @param {string} inputDir - Job input directory.
     * @param {object} store - Store descriptor attached to the result.
     * @returns {Promise<object>} Registered input artifact descriptor.
     * @throws {Error} `input_artifact_size_limit_exceeded`,
     *   `input_artifact_symlink_not_allowed`, `input_artifact_must_be_file`,
     *   `input_artifact_checksum_mismatch` or `input_artifact_unavailable`.
     */
    async registerInput(jobId, job, input, index, inputDir, store) {
        const mountPath = input.mount_path
            ? assertMswarmSafeRelativePath(input.mount_path, "input_mount_path")
            : safeLocalArtifactName(input.name, `input-${index}`);
        const targetPath = resolveWithinRoot(inputDir, mountPath);
        const maxArtifactBytes = positiveByteLimit(job.policy.max_artifact_bytes);
        // Number.isFinite is false for undefined, so no separate check is needed.
        const declaredSize = input.artifact.size_bytes;
        if (Number.isFinite(declaredSize) && declaredSize > maxArtifactBytes) {
            throw new Error("input_artifact_size_limit_exceeded");
        }
        const source = parseLocalArtifactUri(input.artifact.uri);
        let localPath;
        if (source) {
            const sourceJobDir = resolveWithinRoot(this.rootDir, source.jobId);
            const sourceRelativePath = join("outputs", source.path);
            const sourcePath = resolveWithinRoot(sourceJobDir, sourceRelativePath);
            try {
                // lstat below only guards the final component; a symlinked
                // directory earlier in the path could point outside the store.
                await assertNoSymlinkedAncestors(sourceJobDir, sourceRelativePath, "input_artifact_symlink_not_allowed");
                const sourceStat = await lstat(sourcePath);
                if (!sourceStat.isFile()) {
                    throw new Error("input_artifact_must_be_file");
                }
                if (sourceStat.size > maxArtifactBytes) {
                    throw new Error("input_artifact_size_limit_exceeded");
                }
                const bytes = await readFile(sourcePath);
                if (input.artifact.sha256 && input.artifact.sha256 !== sha256Hex(bytes)) {
                    throw new Error("input_artifact_checksum_mismatch");
                }
                await mkdir(dirname(targetPath), { recursive: true });
                await writeFile(targetPath, bytes);
                localPath = targetPath;
            }
            catch (error) {
                // Only "source does not exist" is tolerated, and only for optional inputs.
                if (error.code !== "ENOENT" || input.required === true) {
                    throw error;
                }
            }
        }
        else if (input.required === true) {
            throw new Error("input_artifact_unavailable");
        }
        const registeredAt = this.now().toISOString();
        return {
            ...input.artifact,
            id: input.artifact.id || `input_${sha256Hex(Buffer.from(`${jobId}:${input.name}:${input.artifact.uri}`)).slice(0, 16)}`,
            job_id: jobId,
            name: input.name,
            scope: "input",
            registered_at: registeredAt,
            store,
            access: defaultMswarmArtifactAccessPolicy(job.policy.trust_mode === "tenant-owned" ? "tenant-scoped" : "owner-local"),
            retention: defaultMswarmArtifactRetentionPolicy(),
            ...(localPath ? { local_path: localPath } : {})
        };
    }
    /**
     * Collects one declared output, which may be a file or a directory.
     *
     * @param {object} context - Workspace returned by `prepareJobWorkspace`.
     * @param {string} jobId - Job identifier.
     * @param {object} output - Output spec; reads `path` and `required`.
     * @returns {Promise<object[]>} Artifact descriptors; empty when the path
     *   does not exist and the output is not required.
     * @throws {Error} `output_symlink_not_allowed` when the path or one of its
     *   ancestor directories is a symlink; `output_entry_type_not_allowed`
     *   for anything that is neither a file nor a directory.
     */
    async collectDeclaredOutput(context, jobId, output) {
        const normalizedPath = assertMswarmSafeRelativePath(output.path, "output_path");
        const targetPath = resolveWithinRoot(context.outputDir, normalizedPath);
        try {
            // Entries found while walking a directory are checked one by one,
            // but the declared path's own ancestors need an explicit check.
            await assertNoSymlinkedAncestors(context.outputDir, normalizedPath, "output_symlink_not_allowed");
            const targetStat = await lstat(targetPath);
            if (targetStat.isSymbolicLink()) {
                throw new Error("output_symlink_not_allowed");
            }
            if (targetStat.isDirectory()) {
                return this.collectOutputDirectory(context, jobId, output, normalizedPath);
            }
            if (targetStat.isFile()) {
                return [await this.collectOutputFile(context, jobId, output, normalizedPath, targetPath)];
            }
            throw new Error("output_entry_type_not_allowed");
        }
        catch (error) {
            if (error.code === "ENOENT" && output.required !== true) {
                return [];
            }
            throw error;
        }
    }
    /**
     * Recursively collects every regular file beneath an output directory.
     *
     * @param {object} context - Workspace returned by `prepareJobWorkspace`.
     * @param {string} jobId - Job identifier.
     * @param {object} output - Output spec the directory belongs to.
     * @param {string} relativeDir - Directory path relative to the output root.
     * @returns {Promise<object[]>} Artifact descriptors for all nested files.
     * @throws {Error} `output_symlink_not_allowed` or
     *   `output_entry_type_not_allowed` for disallowed entries.
     */
    async collectOutputDirectory(context, jobId, output, relativeDir) {
        const dirPath = resolveWithinRoot(context.outputDir, relativeDir);
        const entries = await readdir(dirPath, { withFileTypes: true });
        const artifacts = [];
        for (const entry of entries) {
            const childRelativePath = assertMswarmSafeRelativePath(`${relativeDir}/${entry.name}`, "output_path");
            const childPath = resolveWithinRoot(context.outputDir, childRelativePath);
            if (entry.isSymbolicLink()) {
                throw new Error("output_symlink_not_allowed");
            }
            if (entry.isDirectory()) {
                artifacts.push(...(await this.collectOutputDirectory(context, jobId, output, childRelativePath)));
            }
            else if (entry.isFile()) {
                artifacts.push(await this.collectOutputFile(context, jobId, output, childRelativePath, childPath));
            }
            else {
                throw new Error("output_entry_type_not_allowed");
            }
        }
        return artifacts;
    }
    /**
     * Builds the artifact descriptor for a single output file.
     *
     * The size limit is checked from `lstat` before the file is read for hashing.
     *
     * @param {object} context - Workspace returned by `prepareJobWorkspace`.
     * @param {string} jobId - Job identifier.
     * @param {object} output - Output spec; reads `path`, `name`, `content_type`.
     * @param {string} relativePath - File path relative to the output root.
     * @param {string} filePath - Absolute path of the file.
     * @returns {Promise<object>} Artifact descriptor with size and SHA-256.
     * @throws {Error} `output_entry_type_not_allowed` or
     *   `output_artifact_size_limit_exceeded`.
     */
    async collectOutputFile(context, jobId, output, relativePath, filePath) {
        const stat = await lstat(filePath);
        if (!stat.isFile()) {
            throw new Error("output_entry_type_not_allowed");
        }
        const perArtifactLimit = positiveByteLimit(context.sandbox.limits.max_artifact_bytes, context.sandbox.limits.max_output_bytes);
        if (stat.size > perArtifactLimit) {
            throw new Error("output_artifact_size_limit_exceeded");
        }
        const bytes = await readFile(filePath);
        return {
            id: `output_${sha256Hex(Buffer.from(`${jobId}:${relativePath}`)).slice(0, 16)}`,
            job_id: jobId,
            name: output.path === relativePath ? output.name : `${output.name}/${relativePath}`,
            uri: buildMswarmLocalArtifactUri(jobId, relativePath),
            content_type: output.content_type,
            size_bytes: stat.size,
            sha256: sha256Hex(bytes),
            scope: "output",
            registered_at: this.now().toISOString(),
            store: context.store,
            access: defaultMswarmArtifactAccessPolicy(context.sandbox.trust_mode === "tenant-owned" ? "tenant-scoped" : "owner-local"),
            retention: defaultMswarmArtifactRetentionPolicy()
        };
    }
}
|
|
1904
|
+
/**
 * Runner for the test echo job: emits the configured message a number of
 * times, optionally delayed, or fails on demand.
 */
export class MswarmTestEchoRunner {
    constructor() {
        this.id = TEST_ECHO_RUNNER_ID;
    }
    /**
     * Executes one echo job.
     *
     * Reads `message` (default `"ok"`), `repeat` (clamped to 1..20),
     * `delay_ms` (clamped to 0..30000) and `fail` from `context.job.args`.
     *
     * @param {object} context - Run context; reads `job`, `signal` and `emitEvent`.
     * @returns {Promise<object>} Result with status `"succeeded"`, timestamps
     *   and echo metrics.
     * @throws {Error} With `message` when `args.fail === true`, or with the
     *   abort message when the signal aborts during the run.
     */
    async run(context) {
        // Captured before any work so started_at and finished_at bracket the run;
        // previously both were stamped at return time and were always equal.
        const startedAt = new Date().toISOString();
        const args = context.job.args || {};
        const message = optionalText(args.message) || "ok";
        const repeat = Math.max(1, Math.min(20, Math.floor(numberArg(args.repeat, 1))));
        const delayMs = boundedMilliseconds(args.delay_ms, 0, 30000);
        if (args.fail === true) {
            throw new Error(message);
        }
        for (let index = 0; index < repeat; index += 1) {
            if (context.signal.aborted) {
                throw new Error(abortErrorMessage(context.signal));
            }
            if (delayMs > 0) {
                await sleepWithAbort(delayMs, context.signal);
            }
            await context.emitEvent({
                type: "stdout",
                message,
                data: {
                    runner: this.id,
                    index,
                    repeat
                }
            });
        }
        await context.emitEvent({
            type: "progress",
            message: "echo complete",
            data: {
                completed: repeat,
                total: repeat
            }
        });
        return {
            job_id: context.job.idempotency_key || "local-generic-job",
            status: "succeeded",
            exit_code: 0,
            started_at: startedAt,
            finished_at: new Date().toISOString(),
            metrics: {
                runner: this.id,
                echoed: repeat,
                message
            }
        };
    }
}
|
|
1955
|
+
// Lower-case engine labels accepted in job args, mapped to the identifier
// handed to Blender.
// NOTE(review): Blender's EEVEE identifier has differed between releases —
// confirm "BLENDER_EEVEE_NEXT" against the Blender versions this node supports.
const BLENDER_ENGINE_ARGS = {
    "cycles": "CYCLES",
    "eevee": "BLENDER_EEVEE_NEXT",
    "workbench": "BLENDER_WORKBENCH",
};

// Lower-case output format labels accepted in job args, mapped to the
// identifier handed to Blender.
const BLENDER_OUTPUT_FORMAT_ARGS = {
    "png": "PNG",
    "jpeg": "JPEG",
    "open_exr": "OPEN_EXR",
};
|
|
1965
|
+
/**
 * Accepts only values that are already numbers, safe integers and greater
 * than zero.
 *
 * @param {unknown} value - Candidate value; strings are not converted.
 * @returns {number | null} The value itself, or `null` when it does not qualify.
 */
function positiveSafeInteger(value) {
    const qualifies = typeof value === "number" && Number.isSafeInteger(value) && value > 0;
    return qualifies ? value : null;
}
|
|
1971
|
+
/**
 * Interprets the `frames` argument of a render.blender job.
 *
 * Accepts nothing (frame 1), a positive integer, or a string of the form
 * `N` or `START-END` where each number has 1 to 7 digits and no leading zero.
 * A range may span at most 10001 frames; a range whose ends are equal is
 * reported as a single frame.
 *
 * @param {unknown} value - Raw `frames` argument.
 * @returns {{mode: "frame", frame: number, label: string, total: number} |
 *   {mode: "range", start: number, end: number, label: string, total: number}}
 * @throws {Error} When the value is not a frame number or a valid range.
 */
function parseBlenderFrameSelection(value) {
    const singleFrame = (frame) => ({ mode: "frame", frame, label: String(frame), total: 1 });
    if (value === undefined || value === null) {
        return singleFrame(1);
    }
    const numericFrame = positiveSafeInteger(value);
    if (numericFrame !== null) {
        return singleFrame(numericFrame);
    }
    const raw = optionalText(value);
    const match = raw?.match(/^([1-9]\d{0,6})(?:-([1-9]\d{0,6}))?$/);
    if (!match) {
        throw new Error("render.blender args.frames must be a positive frame number or start-end range");
    }
    const [, startText, endText] = match;
    const start = Number(startText);
    const end = endText ? Number(endText) : start;
    const rangeIsValid = Number.isSafeInteger(start)
        && Number.isSafeInteger(end)
        && start > 0
        && end > 0
        && end >= start;
    if (!rangeIsValid) {
        throw new Error("render.blender args.frames must use a valid positive frame range");
    }
    const span = end - start;
    if (span > 10000) {
        throw new Error("render.blender args.frames range exceeds the maximum supported 10001 frames");
    }
    if (span === 0) {
        return singleFrame(start);
    }
    return { mode: "range", start, end, label: `${start}-${end}`, total: span + 1 };
}
|
|
1998
|
+
/**
 * Validates the `engine` argument of a render.blender job.
 *
 * The lookup is restricted to the table's own keys. A plain property read
 * would also find inherited members such as `constructor` or `__proto__`
 * and let them pass as engines.
 *
 * @param {unknown} value - Raw `engine` argument; matched case-insensitively.
 * @returns {{ label: string, blender: string } | undefined} The lower-case
 *   label and Blender identifier, or `undefined` when no engine was given.
 * @throws {Error} When the engine is not cycles, eevee, or workbench.
 */
function normalizeBlenderEngine(value) {
    const raw = optionalText(value);
    if (!raw)
        return undefined;
    const key = raw.toLowerCase();
    const isKnownEngine = Object.prototype.hasOwnProperty.call(BLENDER_ENGINE_ARGS, key);
    if (!isKnownEngine) {
        throw new Error("render.blender args.engine must be cycles, eevee, or workbench");
    }
    return { label: key, blender: BLENDER_ENGINE_ARGS[key] };
}
|
|
2009
|
+
/**
 * Validates the `output_format` argument of a render.blender job.
 *
 * Defaults to `png` when no format is given. The lookup is restricted to the
 * table's own keys so inherited members such as `constructor` are rejected
 * instead of being treated as a format.
 *
 * @param {unknown} value - Raw `output_format` argument; matched case-insensitively.
 * @returns {{ label: string, blender: string, extension: string }} Lower-case
 *   label, Blender identifier and file extension (`exr`, `jpg` or `png`).
 * @throws {Error} When the format is not png, jpeg, or open_exr.
 */
function normalizeBlenderOutputFormat(value) {
    const key = (optionalText(value) || "png").toLowerCase();
    const isKnownFormat = Object.prototype.hasOwnProperty.call(BLENDER_OUTPUT_FORMAT_ARGS, key);
    if (!isKnownFormat) {
        throw new Error("render.blender args.output_format must be png, jpeg, or open_exr");
    }
    const blender = BLENDER_OUTPUT_FORMAT_ARGS[key];
    return { label: key, blender, extension: key === "open_exr" ? "exr" : key === "jpeg" ? "jpg" : "png" };
}
|
|
2017
|
+
/**
 * Parses the `resolution` argument of a render.blender job.
 *
 * The value must read `WIDTHxHEIGHT` (the separator is matched
 * case-insensitively), each side 1 to 5 digits with no leading zero and at
 * most 16384.
 *
 * @param {unknown} value - Raw `resolution` argument.
 * @returns {{ width: number, height: number, label: string } | undefined}
 *   Parsed size with a normalized `WIDTHxHEIGHT` label, or `undefined` when
 *   the argument is `undefined` or `null`.
 * @throws {Error} When the format is wrong or a side exceeds 16384.
 */
function parseBlenderResolution(value) {
    if (value == null) {
        return undefined;
    }
    const raw = optionalText(value);
    const match = raw?.match(/^([1-9]\d{0,4})x([1-9]\d{0,4})$/i);
    if (!match) {
        throw new Error("render.blender args.resolution must use WIDTHxHEIGHT");
    }
    const [, widthText, heightText] = match;
    const width = Number(widthText);
    const height = Number(heightText);
    if (Math.max(width, height) > 16384) {
        throw new Error("render.blender args.resolution exceeds 16384x16384");
    }
    return { width, height, label: `${width}x${height}` };
}
|
|
2032
|
+
/**
 * Validates a Blender object name taken from job args.
 *
 * @param {unknown} value - Raw argument value.
 * @param {string} label - Argument name used in the error message.
 * @returns {string | undefined} The name, or `undefined` when none was given.
 * @throws {Error} When the name is longer than 128 characters or contains a
 *   NUL, carriage return or line feed.
 */
function safeBlenderSceneName(value, label) {
    const raw = optionalText(value);
    if (!raw) {
        return undefined;
    }
    const tooLong = raw.length > 128;
    const hasControlChars = /[\0\r\n]/.test(raw);
    if (tooLong || hasControlChars) {
        throw new Error(`render.blender args.${label} is not a safe Blender object name`);
    }
    return raw;
}
|
|
2041
|
+
/**
 * Picks the local path of the scene file to render: the registered input named
 * "scene" when present, otherwise the first registered input.
 *
 * @param {object} context Job context exposing `artifacts.registeredInputs`.
 * @returns {string} Local path of the chosen input.
 * @throws {Error} When the chosen input has no `local_path`.
 */
function blenderSceneInputPath(context) {
    const inputs = context.artifacts.registeredInputs;
    const namedScene = inputs.find((input) => input.name === "scene");
    const chosen = namedScene || inputs[0];
    if (chosen?.local_path) {
        return chosen.local_path;
    }
    throw new Error("render.blender requires a materialized scene input artifact");
}
|
|
2048
|
+
/**
 * Builds the `--render-output` pattern (`<output dir>/frame_####`) from the
 * first declared output spec.
 *
 * @param {object} context Job context exposing `artifacts.outputSpecs` and
 *   `artifacts.outputDir`.
 * @returns {string} Output pattern resolved inside the job output directory.
 * @throws {Error} When no output is declared or the declared path's last
 *   segment ends in a 1-8 character alphanumeric extension (treated as a file).
 */
function blenderOutputPattern(context) {
    const firstSpec = context.artifacts.outputSpecs[0];
    if (!firstSpec) {
        throw new Error("render.blender requires a declared output directory");
    }
    const relativeDir = assertMswarmSafeRelativePath(firstSpec.path, "render_blender_output_path");
    const segments = relativeDir.split("/").filter(Boolean);
    const lastSegment = segments.length > 0 ? segments[segments.length - 1] : relativeDir;
    const looksLikeFile = /\.[a-zA-Z0-9]{1,8}$/.test(lastSegment);
    if (looksLikeFile) {
        throw new Error("render.blender output path must be a directory, not a file path");
    }
    return resolveWithinRoot(context.artifacts.outputDir, `${relativeDir}/frame_####`);
}
|
|
2060
|
+
/**
 * Replaces host-local job paths in a message with stable placeholders so that
 * emitted events and error messages do not reveal the worker's directory layout.
 *
 * Replacements are applied longest path first. Otherwise a parent directory
 * (for example the work dir) is substituted before the paths nested under it,
 * and the more specific "[job-input]" / "[job-inputs]" / "[job-outputs]"
 * placeholders would never match. This mirrors `redactCudaLocalPaths`.
 *
 * @param {object} context Job context exposing `artifacts.workDir`,
 *   `artifacts.inputDir`, `artifacts.outputDir` and `artifacts.registeredInputs`.
 * @param {string} value Text to redact.
 * @returns {string} The text with every known local path replaced.
 */
function redactBlenderLocalPaths(context, value) {
    const replacements = [
        ...context.artifacts.registeredInputs.map((input) => [input.local_path, "[job-input]"]),
        [context.artifacts.inputDir, "[job-inputs]"],
        [context.artifacts.outputDir, "[job-outputs]"],
        [context.artifacts.workDir, "[job-workdir]"]
    ].filter(([source]) => Boolean(source));
    replacements.sort((left, right) => right[0].length - left[0].length);
    let output = value;
    for (const [source, replacement] of replacements) {
        output = output.split(source).join(replacement);
    }
    return output;
}
|
|
2075
|
+
/**
 * Forwards captured Blender output as job events, one event per non-empty
 * trimmed line, capped at the first 200 such lines. Local paths are redacted.
 *
 * @param {object} context Job context exposing `emitEvent`.
 * @param {string} type Event type to emit (the runner passes "stdout"/"stderr").
 * @param {string} value Captured output text.
 * @returns {Promise<void>}
 */
async function emitBlenderOutput(context, type, value) {
    const rawLines = value.split(/\r?\n/);
    let emitted = 0;
    for (const rawLine of rawLines) {
        if (emitted >= 200) {
            break;
        }
        const line = rawLine.trim();
        if (!line) {
            continue;
        }
        emitted += 1;
        const message = redactBlenderLocalPaths(context, line);
        await context.emitEvent({ type, message, data: { runner: BLENDER_RENDER_RUNNER_ID } });
    }
}
|
|
2085
|
+
/**
 * Scans Blender output for frame markers (`Fra:N` or `Frame N`) and emits one
 * "progress" event per distinct frame that falls inside the requested frames.
 *
 * @param {object} context Job context exposing `emitEvent`.
 * @param {string} output Combined Blender output text.
 * @param {object} frames Frame selection; reads `mode`, `start`/`end` (range
 *   mode) or `frame`, plus `total`.
 * @returns {Promise<void>}
 */
async function emitBlenderProgress(context, output, frames) {
    const isRange = frames.mode === "range";
    const lowerBound = isRange ? frames.start : frames.frame;
    const upperBound = isRange ? frames.end : frames.frame;
    const reported = new Set();
    for (const line of output.split(/\r?\n/)) {
        const marker = line.match(/\bFra:(\d+)\b/i) || line.match(/\bFrame\s+(\d+)\b/i);
        if (!marker) {
            continue;
        }
        const frame = Number(marker[1]);
        if (!Number.isSafeInteger(frame)) {
            continue;
        }
        if (frame < lowerBound || frame > upperBound) {
            continue;
        }
        if (reported.has(frame)) {
            continue;
        }
        reported.add(frame);
        const data = {
            runner: BLENDER_RENDER_RUNNER_ID,
            frame,
            completed: reported.size,
            total: frames.total
        };
        await context.emitEvent({ type: "progress", message: `rendered frame ${frame}`, data });
    }
}
|
|
2110
|
+
/**
 * Builds a failed, non-retryable job result for the render.blender runner.
 *
 * @param {object} job Job request; `idempotency_key` is used as the job id,
 *   falling back to "render.blender".
 * @param {string} code Error code.
 * @param {string} message Error message.
 * @param {string} startedAt Start timestamp to report.
 * @returns {object} Result with status "failed" and exit code 1.
 */
function blenderFailureResult(job, code, message, startedAt) {
    const jobId = job.idempotency_key || "render.blender";
    const finishedAt = new Date().toISOString();
    const error = { code, message, retryable: false };
    return {
        job_id: jobId,
        status: "failed",
        exit_code: 1,
        started_at: startedAt,
        finished_at: finishedAt,
        error
    };
}
|
|
2124
|
+
/**
 * Names the Blender Cycles compute backend requested for GPU renders.
 *
 * @returns {string} Always "CUDA".
 */
function blenderGpuComputeDeviceType() {
    // The current GPU probe only reports NVIDIA devices as available, which is
    // why CUDA is the single backend this runner can safely ask Blender for.
    const computeBackend = "CUDA";
    return computeBackend;
}
|
|
2129
|
+
/**
 * Generic-job runner that renders a materialized Blender scene by invoking the
 * `blender` executable in background mode through the injected command runner.
 */
export class MswarmBlenderRenderRunner {
    /**
     * @param {Function} [runner] Command runner called as
     *   `runner(command, args, { timeoutMs, maxBuffer, signal })`.
     */
    constructor(runner = defaultCommandRunner) {
        this.id = BLENDER_RENDER_RUNNER_ID;
        this.runner = runner;
    }
    /**
     * Validates the job arguments, runs Blender and reports the outcome.
     *
     * @param {object} context Job context (`job`, `artifacts`, `sandbox`,
     *   `signal`, `emitEvent`).
     * @returns {Promise<object>} A "succeeded" result with render metrics, or a
     *   failed result with code "policy_denied", "validation_failed" or
     *   "runner_failed".
     * @throws {Error} When the job is already aborted on entry, or when the
     *   command fails while the signal is aborted (the error is rethrown).
     */
    async run(context) {
        const startedAt = new Date().toISOString();
        if (context.signal.aborted) {
            throw new Error(abortErrorMessage(context.signal));
        }
        if (context.job.policy.trust_mode !== "owner-local") {
            return blenderFailureResult(context.job, "policy_denied", "render.blender is owner-local only until containerized Blender execution is available", startedAt);
        }
        const gpuRequested = Boolean(context.job.resources?.gpu);
        let plan;
        try {
            const jobArgs = context.job.args || {};
            // Property order matters: it is the order in which validation runs
            // and therefore decides which error is reported first.
            plan = {
                scenePath: blenderSceneInputPath(context),
                frames: parseBlenderFrameSelection(jobArgs.frames),
                engine: normalizeBlenderEngine(jobArgs.engine),
                outputFormat: normalizeBlenderOutputFormat(jobArgs.output_format),
                resolution: parseBlenderResolution(jobArgs.resolution),
                sceneName: safeBlenderSceneName(jobArgs.scene, "scene"),
                cameraName: safeBlenderSceneName(jobArgs.camera, "camera"),
                outputPattern: blenderOutputPattern(context)
            };
            await mkdir(dirname(plan.outputPattern), { recursive: true });
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error || "render.blender validation failed");
            return blenderFailureResult(context.job, "validation_failed", reason, startedAt);
        }
        const { scenePath, frames, engine, outputFormat, resolution, sceneName, cameraName, outputPattern } = plan;
        // Python snippets executed by Blender before rendering; each one is a
        // single statement so they can be joined with "; ".
        const pythonStatements = [
            ...(resolution
                ? [
                    `bpy.context.scene.render.resolution_x=${resolution.width}`,
                    `bpy.context.scene.render.resolution_y=${resolution.height}`
                ]
                : []),
            ...(cameraName
                ? [
                    `camera=bpy.data.objects.get(${JSON.stringify(cameraName)})`,
                    "bpy.context.scene.camera=camera if camera is not None else bpy.context.scene.camera"
                ]
                : []),
            ...(gpuRequested
                ? [
                    "cycles_addon=bpy.context.preferences.addons.get('cycles')",
                    "cycles_prefs=cycles_addon.preferences if cycles_addon is not None else None",
                    `setattr(cycles_prefs,'compute_device_type',${JSON.stringify(blenderGpuComputeDeviceType())}) if cycles_prefs is not None and hasattr(cycles_prefs,'compute_device_type') else None`,
                    "getattr(cycles_prefs,'get_devices',lambda: None)() if cycles_prefs is not None else None",
                    "setattr(bpy.context.scene.cycles,'device','GPU') if hasattr(bpy.context.scene,'cycles') else None",
                    "[setattr(device,'use',True) for device in getattr(cycles_prefs,'devices',[]) if hasattr(device,'use')] if cycles_prefs is not None else None"
                ]
                : [])
        ];
        const frameArgs = frames.mode === "range"
            ? ["-s", String(frames.start), "-e", String(frames.end), "-a"]
            : ["--render-frame", String(frames.frame)];
        const blenderArgs = [
            "-b",
            scenePath,
            ...(sceneName ? ["--scene", sceneName] : []),
            ...(engine ? ["--engine", engine.blender] : []),
            ...(pythonStatements.length > 0 ? ["--python-expr", `import bpy; ${pythonStatements.join("; ")}`] : []),
            "--render-output",
            outputPattern,
            "--render-format",
            outputFormat.blender,
            ...frameArgs
        ];
        // Same description is attached to the start event and the final metrics.
        const describeRender = () => ({
            runner: this.id,
            frames: frames.label,
            engine: engine?.label || "scene-default",
            output_format: outputFormat.label,
            ...(resolution ? { resolution: resolution.label } : {}),
            gpu_requested: gpuRequested,
            render_device: gpuRequested ? "gpu" : "scene-default"
        });
        await context.emitEvent({
            type: "progress",
            message: "blender render starting",
            data: describeRender()
        });
        const requestedTimeoutMs = Math.floor((context.sandbox.limits.timeout_sec || DEFAULT_JOB_TIMEOUT_MS / 1000) * 1000);
        const timeoutMs = Math.max(1000, Math.min(DEFAULT_JOB_TIMEOUT_MS, requestedTimeoutMs));
        const requestedBuffer = Math.max(1024 * 1024, context.job.limits?.max_stdout_bytes || 0, context.job.limits?.max_stderr_bytes || 0);
        const maxBuffer = Math.min(DEFAULT_COMMAND_MAX_BUFFER, requestedBuffer);
        try {
            const result = await this.runner("blender", blenderArgs, { timeoutMs, maxBuffer, signal: context.signal });
            await emitBlenderOutput(context, "stdout", result.stdout);
            await emitBlenderOutput(context, "stderr", result.stderr);
            await emitBlenderProgress(context, `${result.stdout}\n${result.stderr}`, frames);
            return {
                job_id: context.job.idempotency_key || "render.blender",
                status: "succeeded",
                exit_code: 0,
                started_at: startedAt,
                finished_at: new Date().toISOString(),
                metrics: describeRender()
            };
        }
        catch (error) {
            // Cancellation is propagated to the caller instead of being
            // reported as a render failure.
            if (context.signal.aborted) {
                throw error;
            }
            const reason = error instanceof Error ? error.message : String(error || "Blender render failed");
            return blenderFailureResult(context.job, "runner_failed", redactBlenderLocalPaths(context, reason), startedAt);
        }
    }
}
|
|
2250
|
+
// Profile/target identifiers: 1-128 characters, starting with an alphanumeric
// and continuing with alphanumerics, "_", "." or "-".
const SAFE_CUDA_IDENTIFIER = /^[a-zA-Z0-9][a-zA-Z0-9_.-]{0,127}$/;
// Compiler flags and program arguments: 1-200 characters drawn from a fixed
// set that leaves out whitespace, quotes and shell metacharacters.
const SAFE_CUDA_TOKEN = /^[a-zA-Z0-9_@%+=:,./-]{1,200}$/;
// Manifest keys rejected outright by assertNoUnsafeCudaManifestKeys.
const UNSAFE_CUDA_MANIFEST_KEYS = new Set([
    // command execution
    "command", "cmd", "shell", "entrypoint", "docker_args",
    // host filesystem exposure
    "mount", "mounts", "volumes", "binds",
    // devices and privileges
    "device", "devices", "privileged",
    // networking
    "network", "host_network"
]);
|
|
2268
|
+
/**
 * Builds a failed, non-retryable job result for the cuda.run runner.
 *
 * @param {object} job Job request; `idempotency_key` is used as the job id,
 *   falling back to "cuda.run".
 * @param {string} code Error code.
 * @param {string} message Error message.
 * @param {string} startedAt Start timestamp to report.
 * @returns {object} Result with status "failed" and exit code 1.
 */
function cudaFailureResult(job, code, message, startedAt) {
    const jobId = job.idempotency_key || "cuda.run";
    const finishedAt = new Date().toISOString();
    const error = { code, message, retryable: false };
    return {
        job_id: jobId,
        status: "failed",
        exit_code: 1,
        started_at: startedAt,
        finished_at: finishedAt,
        error
    };
}
|
|
2282
|
+
/**
 * Validates a required identifier against `SAFE_CUDA_IDENTIFIER`.
 *
 * @param {unknown} value Raw identifier.
 * @param {string} label Prefix for the error code.
 * @returns {string} The validated identifier.
 * @throws {Error} `<label>_invalid` when the value is missing or malformed.
 */
function safeCudaIdentifier(value, label) {
    const text = optionalText(value);
    const isValid = Boolean(text) && SAFE_CUDA_IDENTIFIER.test(text);
    if (isValid) {
        return text;
    }
    throw new Error(`${label}_invalid`);
}
|
|
2289
|
+
/**
 * Normalizes a value with `optionalText` and validates it as a safe relative
 * path via `assertMswarmSafeRelativePath`.
 *
 * @param {unknown} value Raw path value.
 * @param {string} label Label forwarded to the path validator.
 * @returns {*} Whatever `assertMswarmSafeRelativePath` returns for the path.
 */
function safeCudaRelativePath(value, label) {
    const text = optionalText(value);
    return assertMswarmSafeRelativePath(text, label);
}
|
|
2292
|
+
/**
 * Validates a single flag/argument token against `SAFE_CUDA_TOKEN` and an
 * explicit deny-list of shell metacharacters and line breaks.
 *
 * @param {unknown} value Raw token.
 * @param {string} label Prefix for the error code.
 * @returns {string} The validated token.
 * @throws {Error} `<label>_invalid` when the token is missing or unsafe.
 */
function safeCudaToken(value, label) {
    const text = optionalText(value);
    const matchesAllowList = Boolean(text) && SAFE_CUDA_TOKEN.test(text);
    // The deny-list is only consulted once the allow-list has matched.
    const isSafe = matchesAllowList && !/[`$;&|<>\r\n]/.test(text);
    if (isSafe) {
        return text;
    }
    throw new Error(`${label}_invalid`);
}
|
|
2299
|
+
/**
 * Validates an optional array of tokens with `safeCudaToken`.
 *
 * @param {unknown} value `undefined` (treated as no tokens) or an array.
 * @param {string} label Prefix for error codes; entries use `<label>_<index>`.
 * @returns {string[]} The validated tokens.
 * @throws {Error} `<label>_must_be_array` for any other non-array value, or the
 *   error raised by `safeCudaToken` for an invalid entry.
 */
function safeCudaTokenList(value, label) {
    if (value === undefined) {
        return [];
    }
    if (Array.isArray(value)) {
        return value.map((entry, index) => safeCudaToken(entry, `${label}_${index}`));
    }
    throw new Error(`${label}_must_be_array`);
}
|
|
2307
|
+
/**
 * Rejects a manifest record whose own enumerable keys include any entry of
 * `UNSAFE_CUDA_MANIFEST_KEYS`. Only the record's top-level keys are checked.
 *
 * @param {object} record Manifest (or profile/target) object.
 * @param {string} label Prefix for the error code.
 * @throws {Error} `<label>_<key>_not_allowed` for the first offending key.
 */
function assertNoUnsafeCudaManifestKeys(record, label) {
    const offendingKey = Object.keys(record).find((key) => UNSAFE_CUDA_MANIFEST_KEYS.has(key));
    if (offendingKey !== undefined) {
        throw new Error(`${label}_${offendingKey}_not_allowed`);
    }
}
|
|
2314
|
+
/**
 * Extracts and validates the cuda.run job arguments.
 *
 * @param {object} job Job request; reads `args.manifest_path`, `args.profile`
 *   and `args.target`.
 * @returns {{manifestPath: *, profile: string, target: string}}
 * @throws {Error} Whatever the individual validators raise; they run in the
 *   order manifest path, profile, target.
 */
function parseCudaRunArgs(job) {
    const { manifest_path: rawManifestPath, profile: rawProfile, target: rawTarget } = job.args || {};
    const manifestPath = safeCudaRelativePath(rawManifestPath, "cuda_manifest_path");
    const profile = safeCudaIdentifier(rawProfile, "cuda_profile");
    const target = safeCudaIdentifier(rawTarget, "cuda_target");
    return { manifestPath, profile, target };
}
|
|
2322
|
+
/**
 * Locates the package archive among the registered job inputs: the input named
 * "package" that has a local path, otherwise the first input with a local path
 * that is not named "manifest".
 *
 * @param {object} context Job context exposing `artifacts.registeredInputs`
 *   and `artifacts.inputDir`.
 * @returns {{hostPath: string, inputPath: *}} Host path of the archive and its
 *   validated path relative to the input directory.
 * @throws {Error} When no archive input exists or its relative path does not
 *   end in `.tar.gz` / `.tgz` (case-insensitive).
 */
function cudaPackageArchive(context) {
    const { registeredInputs, inputDir } = context.artifacts;
    const namedPackage = registeredInputs.find((input) => input.name === "package" && input.local_path);
    const archiveInput = namedPackage
        || registeredInputs.find((input) => input.local_path && input.name !== "manifest");
    if (!archiveInput?.local_path) {
        throw new Error("cuda_package_artifact_required");
    }
    const relativeToInputs = relative(inputDir, archiveInput.local_path);
    const inputPath = assertMswarmSafeRelativePath(relativeToInputs, "cuda_package_input_path");
    const isTarGz = /(\.tar\.gz|\.tgz)$/i.test(inputPath);
    if (!isTarGz) {
        throw new Error("cuda_package_archive_must_be_targz");
    }
    return { hostPath: archiveInput.local_path, inputPath };
}
|
|
2334
|
+
/**
 * Creates the error reported for a rejected package archive.
 *
 * @param {string} [reason] Rejection reason; a falsy value becomes "invalid".
 * @returns {Error} Error whose message is `cuda_package_archive_<reason>`.
 */
function cudaArchiveValidationError(reason) {
    const suffix = reason || "invalid";
    return new Error(`cuda_package_archive_${suffix}`);
}
|
|
2337
|
+
/**
 * Classifies a line of verbose `tar` listing output by its first
 * non-whitespace character.
 *
 * @param {string} line One line of `tar -tv` output.
 * @returns {"directory" | "file" | "symlink" | "hardlink" | "device" | "other"}
 *   "file" is also returned for a line with no leading character at all.
 */
function cudaTarVerboseEntryType(line) {
    const marker = line.trimStart()[0];
    switch (marker) {
        case "d":
            return "directory";
        case "-":
            return "file";
        case "l":
            return "symlink";
        case "h":
            return "hardlink";
        case "b":
        case "c":
            return "device";
        default:
            return marker ? "other" : "file";
    }
}
|
|
2351
|
+
/**
 * Inspects the package archive with two `tar` listings before it is used:
 * a name listing whose entries are checked with `validateMswarmArchiveEntry`,
 * then a verbose listing in which any entry that is not a regular file or a
 * directory causes rejection.
 *
 * @param {object} context Job context; `signal` is forwarded to the runner.
 * @param {Function} runner Command runner `runner(command, args, options)`.
 * @param {{hostPath: string}} archive Archive descriptor.
 * @returns {Promise<void>}
 * @throws {Error} A `cuda_package_archive_<reason>` error for an invalid entry,
 *   an empty archive, or a disallowed entry type.
 */
async function validateCudaPackageArchive(context, runner, archive) {
    const listOptions = { timeoutMs: 5000, maxBuffer: 512 * 1024, signal: context.signal };
    const nonBlank = (line) => line.trim().length > 0;

    const nameListing = await runner("tar", ["-tzf", archive.hostPath], listOptions);
    const entryPaths = nameListing.stdout.split(/\r?\n/).filter(nonBlank).map((line) => line.trim());
    for (const entryPath of entryPaths) {
        // The name listing carries no type information; a trailing slash is the
        // only hint that an entry is a directory.
        const type = entryPath.endsWith("/") ? "directory" : "file";
        const verdict = validateMswarmArchiveEntry({ path: entryPath, type });
        if (!verdict.ok) {
            throw cudaArchiveValidationError(verdict.reason);
        }
    }
    if (entryPaths.length === 0) {
        throw cudaArchiveValidationError("empty");
    }

    const verboseListing = await runner("tar", ["-tvzf", archive.hostPath], listOptions);
    const disallowedType = verboseListing.stdout
        .split(/\r?\n/)
        .filter(nonBlank)
        .map((line) => cudaTarVerboseEntryType(line))
        .find((type) => type !== "file" && type !== "directory");
    if (disallowedType !== undefined) {
        const verdict = validateMswarmArchiveEntry({ path: "entry", type: disallowedType });
        throw cudaArchiveValidationError(verdict.reason);
    }
}
|
|
2386
|
+
/**
 * Loads the package manifest text. A regular file at `args.manifestPath`
 * inside the job input directory wins; otherwise the manifest is extracted to
 * stdout from the package archive with `tar`.
 *
 * @param {object} context Job context exposing `artifacts.inputDir` and `signal`.
 * @param {Function} runner Command runner `runner(command, args, options)`.
 * @param {{manifestPath: string}} args Parsed cuda.run arguments.
 * @returns {Promise<string>} Manifest text.
 * @throws {Error} Any `lstat`/`readFile` failure other than ENOENT, errors from
 *   `cudaPackageArchive`, and runner failures.
 */
async function readCudaManifestText(context, runner, args) {
    const directManifestPath = resolveWithinRoot(context.artifacts.inputDir, args.manifestPath);
    try {
        // lstat (not stat): a symlink at this path is not a regular file and
        // therefore falls through to the archive instead of being followed.
        const directStat = await lstat(directManifestPath);
        if (directStat.isFile()) {
            return await readFile(directManifestPath, "utf8");
        }
    }
    catch (error) {
        // A missing direct manifest is expected; anything else is a real failure.
        if (error.code !== "ENOENT") {
            throw error;
        }
    }
    const archive = cudaPackageArchive(context);
    // "--" ends option parsing so a job-supplied manifest path that begins with
    // "-" is always treated as an archive member name, never as a tar option.
    const extracted = await runner("tar", ["-xOf", archive.hostPath, "--", args.manifestPath], {
        timeoutMs: 5000,
        maxBuffer: 256 * 1024,
        signal: context.signal
    });
    return extracted.stdout;
}
|
|
2407
|
+
/**
 * Parses the package manifest JSON and resolves the requested profile/target
 * into the concrete build-and-run selection used by the CUDA runner.
 *
 * Checks run in this order: JSON parse, object shape, unsafe top-level keys,
 * schema version, publisher policy, profile/target lookup, unsafe keys in the
 * profile and target, image approval, image policy, compiler, source, output.
 *
 * @param {string} text Manifest JSON text.
 * @param {{profile: string, target: string}} args Parsed cuda.run arguments.
 * @param {object} policy Job policy; reads `allowed_package_publishers` and
 *   `allowed_images`.
 * @returns {object} `{ schemaVersion, packageName, publisher, image, compiler,
 *   source, output, flags, runArgs }`.
 * @throws {Error} A `cuda_*` error code for the first failed check, or the
 *   `SyntaxError` raised by `JSON.parse`.
 */
function parseCudaPackageManifest(text, args, policy) {
    const manifest = objectRecord(JSON.parse(text));
    if (!manifest) {
        throw new Error("cuda_manifest_must_be_object");
    }
    assertNoUnsafeCudaManifestKeys(manifest, "cuda_manifest");

    const schemaVersion = optionalText(manifest.schema_version);
    if (schemaVersion !== "2026-06-14") {
        throw new Error("cuda_manifest_schema_version_invalid");
    }

    const packageInfo = objectRecord(manifest.package);
    const publisher = optionalText(packageInfo?.publisher);
    const allowedPublishers = policy.allowed_package_publishers;
    // The publisher is only enforced when the policy lists at least one.
    if (allowedPublishers?.length && (!publisher || !allowedPublishers.includes(publisher))) {
        throw new Error("cuda_manifest_publisher_not_allowed");
    }

    const profile = objectRecord(objectRecord(manifest.profiles)?.[args.profile]);
    const target = objectRecord(objectRecord(manifest.targets)?.[args.target]);
    if (!profile) {
        throw new Error("cuda_manifest_profile_not_found");
    }
    if (!target) {
        throw new Error("cuda_manifest_target_not_found");
    }
    assertNoUnsafeCudaManifestKeys(profile, "cuda_manifest_profile");
    assertNoUnsafeCudaManifestKeys(target, "cuda_manifest_target");

    const image = optionalText(profile.image);
    const imageApproved = Boolean(image) && APPROVED_NVIDIA_CUDA_IMAGES.has(image);
    if (!imageApproved) {
        throw new Error("cuda_image_not_approved");
    }
    // A policy without `allowed_images` permits no image at all.
    const imageAllowed = policy.allowed_images?.includes(image);
    if (!imageAllowed) {
        throw new Error("cuda_image_not_allowed_by_policy");
    }

    const compiler = optionalText(profile.compiler) || "nvcc";
    if (compiler !== "nvcc") {
        throw new Error("cuda_compiler_not_allowed");
    }

    const source = safeCudaRelativePath(target.source, "cuda_target_source");
    if (!source.endsWith(".cu")) {
        throw new Error("cuda_target_source_must_be_cu");
    }
    const requestedOutput = optionalText(target.output) || `bin/${args.target}`;
    const output = safeCudaRelativePath(requestedOutput, "cuda_target_output");

    const profileFlags = safeCudaTokenList(profile.flags, "cuda_profile_flags");
    const targetFlags = safeCudaTokenList(target.flags, "cuda_target_flags");
    const runArgs = safeCudaTokenList(target.args, "cuda_target_args");
    return {
        schemaVersion,
        packageName: optionalText(packageInfo?.name) ?? undefined,
        publisher: publisher ?? undefined,
        image,
        compiler,
        source,
        output,
        flags: [...profileFlags, ...targetFlags],
        runArgs
    };
}
|
|
2465
|
+
/**
 * Replaces host-local job paths in a message with placeholders. Longer paths
 * are substituted first so a nested path is not partially rewritten by the
 * replacement of one of its parent directories.
 *
 * @param {object} context Job context exposing `artifacts.registeredInputs`,
 *   `artifacts.inputDir`, `artifacts.outputDir` and `artifacts.workDir`.
 * @param {string} value Text to redact.
 * @returns {string} The redacted text.
 */
function redactCudaLocalPaths(context, value) {
    const { registeredInputs, inputDir, outputDir, workDir } = context.artifacts;
    const pairs = registeredInputs.map((input) => [input.local_path, "[job-input]"]);
    pairs.push([inputDir, "[job-inputs]"], [outputDir, "[job-outputs]"], [workDir, "[job-workdir]"]);
    pairs.sort((left, right) => (right[0]?.length || 0) - (left[0]?.length || 0));
    return pairs.reduce(
        (text, [source, replacement]) => (source ? text.split(source).join(replacement) : text),
        value
    );
}
|
|
2481
|
+
/**
 * Forwards captured container output as job events, one event per non-empty
 * trimmed line, capped at the first 200 such lines. Local paths are redacted.
 *
 * @param {object} context Job context exposing `emitEvent`.
 * @param {string} type Event type to emit (the runner passes "stdout"/"stderr").
 * @param {string} value Captured output text.
 * @returns {Promise<void>}
 */
async function emitCudaOutput(context, type, value) {
    const rawLines = value.split(/\r?\n/);
    let emitted = 0;
    for (const rawLine of rawLines) {
        if (emitted >= 200) {
            break;
        }
        const line = rawLine.trim();
        if (!line) {
            continue;
        }
        emitted += 1;
        const message = redactCudaLocalPaths(context, line);
        await context.emitEvent({ type, message, data: { runner: CUDA_PACKAGE_RUNNER_ID } });
    }
}
|
|
2491
|
+
/**
 * Generates the bash script executed inside the CUDA container: unpack the
 * package archive, compile the selected source with nvcc, then run the binary.
 *
 * @param {object} input `archiveInputPath` (archive path relative to
 *   /workspace/inputs) and `selection` (`output`, `flags`, `source`, `runArgs`).
 * @returns {string} Script text, one command per line.
 */
function buildCudaRunnerScript(input) {
    const { selection, archiveInputPath } = input;
    const sourceRoot = "/workspace/work/src";
    const binaryPath = `/workspace/work/${selection.output}`;
    const archivePath = `/workspace/inputs/${archiveInputPath}`;
    const compileWords = ["/usr/local/cuda/bin/nvcc"]
        .concat(selection.flags)
        .concat(["-o", binaryPath, `${sourceRoot}/${selection.source}`]);
    const runWords = [binaryPath].concat(selection.runArgs);
    // Every word of the compile/run commands is shell-quoted individually.
    const compileCommand = compileWords.map(quotePosixShellValue).join(" ");
    const runCommand = runWords.map(quotePosixShellValue).join(" ");
    const lines = [];
    lines.push("set -euo pipefail");
    lines.push("mkdir -p /workspace/work/src /workspace/outputs");
    lines.push(`tar -xzf ${quotePosixShellValue(archivePath)} -C /workspace/work/src`);
    lines.push(`mkdir -p ${quotePosixShellValue(dirname(binaryPath))}`);
    lines.push("cd /workspace/work/src");
    lines.push(compileCommand);
    lines.push(runCommand);
    return lines.join("\n");
}
|
|
2515
|
+
/**
 * Formats a Docker `-v` bind-mount specification.
 *
 * @param {string} hostPath Path on the host.
 * @param {string} containerPath Mount point inside the container.
 * @param {string} mode Mount mode (the callers in this file pass "ro" or "rw").
 * @returns {string} `<hostPath>:<containerPath>:<mode>`.
 */
function dockerBindMount(hostPath, containerPath, mode) {
    return "".concat(hostPath, ":", containerPath, ":", mode);
}
|
|
2518
|
+
/**
 * Formats a gigabyte quantity as a Docker size value.
 *
 * Values of at least 1 GB keep the original whole-gigabyte form. A positive
 * value below 1 GB is expressed in megabytes instead, because flooring it to
 * whole gigabytes yields "0", which no longer expresses the requested limit.
 */
function cudaDockerSizeLimit(gigabytes, gigaSuffix, megaSuffix) {
    const wholeGigabytes = Math.floor(gigabytes);
    if (gigabytes > 0 && wholeGigabytes < 1) {
        return `${Math.max(1, Math.floor(gigabytes * 1024))}${megaSuffix}`;
    }
    return `${wholeGigabytes}${gigaSuffix}`;
}
/**
 * Assembles the `docker run` argument list for a CUDA package job: no image
 * pull, no network, NVIDIA runtime, read-only root filesystem, all
 * capabilities dropped, and bind mounts for inputs (ro), outputs (rw), the
 * work directory (rw) and the generated runner script (ro).
 *
 * @param {object} input `context` (job, sandbox, artifacts), `selection`
 *   (reads `image`), `scriptPath` and `workPath`.
 * @returns {string[]} Arguments to pass to the `docker` executable.
 */
function buildCudaDockerArgs(input) {
    const resources = input.context.job.resources;
    const artifacts = input.context.artifacts;
    const gpuCount = Math.max(1, resources?.gpu?.count || 1);
    const args = [
        "run",
        "--rm",
        "--pull",
        "never",
        "--network",
        "none",
        "--runtime",
        "nvidia",
        "--gpus",
        `count=${gpuCount}`,
        "--user",
        input.context.sandbox.container.user,
        "--read-only",
        "--cap-drop",
        "ALL",
        "--security-opt",
        "no-new-privileges",
        "--workdir",
        "/workspace",
        "--env",
        "CUDA_CACHE_PATH=/workspace/work/.cuda-cache",
        "--tmpfs",
        "/tmp:rw,nosuid,nodev,size=64m"
    ];
    // Limits are only passed for finite, non-zero requests.
    if (Number.isFinite(resources?.memory_gb) && resources?.memory_gb) {
        args.push("--memory", cudaDockerSizeLimit(resources.memory_gb, "g", "m"));
    }
    if (Number.isFinite(resources?.disk_gb) && resources?.disk_gb) {
        args.push("--storage-opt", `size=${cudaDockerSizeLimit(resources.disk_gb, "G", "M")}`);
    }
    args.push(
        "-v", dockerBindMount(artifacts.inputDir, "/workspace/inputs", "ro"),
        "-v", dockerBindMount(artifacts.outputDir, "/workspace/outputs", "rw"),
        "-v", dockerBindMount(input.workPath, "/workspace/work", "rw"),
        "-v", dockerBindMount(input.scriptPath, "/workspace/__mcoda_cuda_run.sh", "ro"),
        input.selection.image,
        "/bin/bash",
        "/workspace/__mcoda_cuda_run.sh"
    );
    return args;
}
|
|
2554
|
+
/**
 * Generic-job runner that compiles and runs a CUDA source package inside a
 * locked-down Docker container, driven by the package's manifest.
 */
export class MswarmCudaPackageRunner {
    /**
     * @param {Function} [runner] Command runner called as
     *   `runner(command, args, { timeoutMs, maxBuffer, signal })`; used for
     *   both the `tar` inspection calls and the `docker` invocation.
     */
    constructor(runner = defaultCommandRunner) {
        this.id = CUDA_PACKAGE_RUNNER_ID;
        this.runner = runner;
    }
    /**
     * Validates policy, archive and manifest, writes the runner script and
     * executes it in the container.
     *
     * @param {object} context Job context (`job`, `artifacts`, `sandbox`,
     *   `signal`, `emitEvent`).
     * @returns {Promise<object>} A "succeeded" result with metrics, or a failed
     *   result with code "policy_denied", "validation_failed" or "runner_failed".
     * @throws {Error} When the job is aborted on entry, or when any step fails
     *   while the signal is aborted (the error is rethrown so that cancellation
     *   is never reported as a job failure).
     */
    async run(context) {
        const startedAt = new Date().toISOString();
        if (context.signal.aborted) {
            throw new Error(abortErrorMessage(context.signal));
        }
        if (context.job.policy.network !== "none") {
            return cudaFailureResult(context.job, "policy_denied", "cuda.run requires network policy none", startedAt);
        }
        // Anything other than an explicit `false` is treated as "raw commands allowed".
        if (context.job.policy.allow_raw_command !== false) {
            return cudaFailureResult(context.job, "policy_denied", "cuda.run does not allow raw commands", startedAt);
        }
        if (!context.job.resources?.gpu) {
            return cudaFailureResult(context.job, "validation_failed", "cuda.run requires GPU resources", startedAt);
        }
        if (!context.job.outputs?.length) {
            return cudaFailureResult(context.job, "validation_failed", "cuda.run requires declared outputs", startedAt);
        }
        let args;
        let archive;
        let selection;
        let scriptPath;
        let workPath;
        try {
            args = parseCudaRunArgs(context.job);
            archive = cudaPackageArchive(context);
            await validateCudaPackageArchive(context, this.runner, archive);
            const manifestText = await readCudaManifestText(context, this.runner, args);
            selection = parseCudaPackageManifest(manifestText, args, context.job.policy);
            scriptPath = resolveWithinRoot(context.artifacts.workDir, "__mcoda_cuda_run.sh");
            workPath = resolveWithinRoot(context.artifacts.workDir, "cuda-work");
            await mkdir(workPath, { recursive: true });
            // The container runs as `sandbox.container.user`, so the directories
            // it writes to are opened up on the host side.
            await chmod(workPath, 0o777);
            await chmod(context.artifacts.outputDir, 0o777);
            await writeFile(scriptPath, buildCudaRunnerScript({ archiveInputPath: archive.inputPath, selection }), { mode: 0o644 });
        }
        catch (error) {
            // The tar probes above receive the abort signal. A cancellation
            // during validation must propagate like one during execution rather
            // than being recorded as a non-retryable validation failure.
            if (context.signal.aborted) {
                throw error;
            }
            return cudaFailureResult(context.job, "validation_failed", redactCudaLocalPaths(context, error instanceof Error ? error.message : String(error || "cuda.run validation failed")), startedAt);
        }
        const dockerArgs = buildCudaDockerArgs({
            context,
            selection,
            archiveInputPath: archive.inputPath,
            scriptPath,
            workPath
        });
        await context.emitEvent({
            type: "progress",
            message: "cuda package container starting",
            data: {
                runner: this.id,
                image: selection.image,
                profile: args.profile,
                target: args.target,
                gpu_count: Math.max(1, context.job.resources.gpu.count || 1),
                network: "none",
                container_user: context.sandbox.container.user
            }
        });
        const timeoutMs = Math.max(1000, Math.min(DEFAULT_JOB_TIMEOUT_MS, Math.floor((context.sandbox.limits.timeout_sec || DEFAULT_JOB_TIMEOUT_MS / 1000) * 1000)));
        const maxBuffer = Math.min(DEFAULT_COMMAND_MAX_BUFFER, Math.max(1024 * 1024, context.job.limits?.max_stdout_bytes || 0, context.job.limits?.max_stderr_bytes || 0));
        try {
            const result = await this.runner("docker", dockerArgs, {
                timeoutMs,
                maxBuffer,
                signal: context.signal
            });
            await emitCudaOutput(context, "stdout", result.stdout);
            await emitCudaOutput(context, "stderr", result.stderr);
            await context.emitEvent({
                type: "progress",
                message: "cuda package container completed",
                data: {
                    runner: this.id,
                    profile: args.profile,
                    target: args.target
                }
            });
            return {
                job_id: context.job.idempotency_key || "cuda.run",
                status: "succeeded",
                exit_code: 0,
                started_at: startedAt,
                finished_at: new Date().toISOString(),
                metrics: {
                    runner: this.id,
                    image: selection.image,
                    profile: args.profile,
                    target: args.target,
                    package: selection.packageName,
                    publisher: selection.publisher,
                    gpu_count: Math.max(1, context.job.resources.gpu.count || 1),
                    network: "none",
                    container_user: context.sandbox.container.user
                }
            };
        }
        catch (error) {
            if (context.signal.aborted) {
                throw error;
            }
            return cudaFailureResult(context.job, "runner_failed", redactCudaLocalPaths(context, error instanceof Error ? error.message : String(error || "cuda.run failed")), startedAt);
        }
    }
}
|
|
2663
|
+
/**
 * Creates the built-in generic job runners: test echo, Blender render and
 * CUDA package, in that order.
 *
 * @param {Function} [runner] Command runner shared by the Blender and CUDA runners.
 * @returns {object[]} The three runner instances.
 */
function createDefaultGenericJobRunners(runner = defaultCommandRunner) {
    const echoRunner = new MswarmTestEchoRunner();
    const blenderRunner = new MswarmBlenderRenderRunner(runner);
    const cudaRunner = new MswarmCudaPackageRunner(runner);
    return [echoRunner, blenderRunner, cudaRunner];
}
|
|
2666
|
+
/**
 * Returns the distinct non-empty strings of an array in default sort order.
 *
 * @param {Array<unknown>} values Candidate values; non-strings and empty
 *   strings are dropped.
 * @returns {string[]} Sorted, de-duplicated strings.
 */
function uniqueSortedStrings(values) {
    const distinct = values.reduce((seen, value) => {
        if (typeof value === "string" && value.length > 0) {
            seen.add(value);
        }
        return seen;
    }, new Set());
    return [...distinct].sort();
}
|
|
2669
|
+
/**
 * Resolves the capability-probe timeout from the runtime config, falling back
 * to `DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS` through `parsePositiveInteger`.
 *
 * @param {object} config Runtime config; reads `capabilityProbeTimeoutMs`.
 * @returns {*} Whatever `parsePositiveInteger` yields for the configured value.
 */
function capabilityProbeTimeoutMs(config) {
    const { capabilityProbeTimeoutMs: configured } = config;
    return parsePositiveInteger(configured, DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS);
}
|
|
2672
|
+
/**
 * Turns a thrown value from a capability probe into a message string.
 *
 * @param {unknown} error Thrown value.
 * @returns {string} The error's non-empty `message`, otherwise the stringified
 *   value, or "capability probe failed" when the value is falsy.
 */
function capabilityCommandFailureMessage(error) {
    const hasMessage = error instanceof Error && Boolean(error.message);
    return hasMessage ? error.message : String(error || "capability probe failed");
}
|
|
2677
|
+
/**
 * Heuristically decides whether a probe failed because the command itself is
 * not installed, by searching the failure message and stderr (both
 * lower-cased) for "not found"-style markers.
 *
 * @param {unknown} error Thrown value from the command runner.
 * @param {string} [stderr] Captured stderr, if any.
 * @returns {boolean} `true` when a missing-command marker is present.
 */
function isMissingCapabilityCommand(error, stderr = "") {
    const failureText = capabilityCommandFailureMessage(error);
    const haystack = `${failureText}\n${stderr}`.toLowerCase();
    const missingCommandPattern = /enoent|not found|command not found|no such file|executable file not found/;
    return missingCommandPattern.test(haystack);
}
|
|
2681
|
+
/**
 * Runs a capability probe command and converts the outcome into a result
 * object instead of throwing.
 *
 * @param {Function} runner Command runner `runner(command, args, options)`.
 * @param {string} command Executable name.
 * @param {string[]} args Command arguments.
 * @param {number} timeoutMs Timeout forwarded to the runner.
 * @returns {Promise<object>} `{ ok: true, stdout, stderr }` on success, or
 *   `{ ok: false, missing, message }` when the runner rejects.
 */
async function runCapabilityCommand(runner, command, args, timeoutMs) {
    // Probe output is capped at 512 KiB, or less if the global cap is smaller.
    const maxBuffer = Math.min(DEFAULT_COMMAND_MAX_BUFFER, 512 * 1024);
    let result;
    try {
        result = await runner(command, args, { timeoutMs, maxBuffer });
        return { ok: true, stdout: result.stdout, stderr: result.stderr };
    }
    catch (error) {
        const missing = isMissingCapabilityCommand(error);
        const message = capabilityCommandFailureMessage(error);
        return { ok: false, missing, message };
    }
}
|
|
2697
|
+
/**
 * Converts an `nvidia-smi` memory value to gigabytes with one decimal place.
 * The number is divided by 1024, i.e. it is taken to be in megabyte-scale
 * units; every character other than digits and "." is stripped first.
 *
 * @param {string | undefined} value Raw memory column.
 * @returns {number | undefined} Gigabytes, or `undefined` when the value is
 *   missing, unparsable, or not positive.
 */
function parseNvidiaSmiMemoryGb(value) {
    if (!value) {
        return undefined;
    }
    const numericText = value.replace(/[^\d.]/g, "");
    const megabytes = Number(numericText);
    const usable = Number.isFinite(megabytes) && megabytes > 0;
    return usable ? Math.round((megabytes / 1024) * 10) / 10 : undefined;
}
|
|
2705
|
+
/**
 * Parses CSV rows from `nvidia-smi --query-gpu=...` into a GPU capability
 * summary. Columns are read positionally as index, name, memory, driver
 * version, compute capability and (when present) CUDA version.
 *
 * @param {string} stdout Raw command output, one GPU per line.
 * @returns {object} `{ status, count, vendors, devices }` plus `cuda_versions`,
 *   `max_vram_gb` and `message` when applicable; status is "available" when at
 *   least one row was parsed, otherwise "missing".
 */
function parseNvidiaGpuProbe(stdout) {
    const devices = [];
    const cudaVersions = new Set();
    let maxVramGb;
    for (const rawLine of stdout.split(/\r?\n/)) {
        const row = rawLine.trim();
        if (!row) {
            continue;
        }
        const columns = row.split(",").map((part) => part.trim());
        const [index, name, memoryMb, driverVersion, computeCapability, cudaVersion] = columns;
        const vramGb = parseNvidiaSmiMemoryGb(memoryMb);
        // Optional fields are added in a fixed order and only when present.
        const device = { id: index ? `gpu-${index}` : `gpu-${devices.length}`, vendor: "nvidia" };
        if (name) {
            device.name = name;
        }
        if (vramGb) {
            device.vram_gb = vramGb;
        }
        if (driverVersion) {
            device.driver_version = driverVersion;
        }
        if (cudaVersion) {
            device.cuda_version = cudaVersion;
            cudaVersions.add(cudaVersion);
        }
        if (computeCapability) {
            device.compute_capability = computeCapability;
        }
        device.capabilities = ["cuda"];
        devices.push(device);
        if (Number.isFinite(device.vram_gb)) {
            maxVramGb = maxVramGb === undefined ? device.vram_gb : Math.max(maxVramGb, device.vram_gb || 0);
        }
    }
    const found = devices.length > 0;
    const probe = {
        status: found ? "available" : "missing",
        count: devices.length,
        vendors: found ? ["nvidia"] : [],
        devices
    };
    if (cudaVersions.size > 0) {
        probe.cuda_versions = Array.from(cudaVersions).sort();
    }
    if (maxVramGb !== undefined) {
        probe.max_vram_gb = maxVramGb;
    }
    if (!found) {
        probe.message = "nvidia-smi returned no GPU rows";
    }
    return probe;
}
|
|
2745
|
+
/**
 * Pulls the version number that follows "CUDA Version:" out of command output.
 *
 * @param {string} stdout Text to search (case-insensitive).
 * @returns {string | undefined} The `major` or `major.minor` version, or
 *   `undefined` when the label is not present.
 */
function parseNvidiaSmiCudaVersion(stdout) {
    const banner = stdout.match(/CUDA\s+Version:\s*([0-9]+(?:\.[0-9]+)?)/i);
    if (banner === null) {
        return undefined;
    }
    return banner[1];
}
|
|
2748
|
+
/**
 * Probes NVIDIA GPUs by running `nvidia-smi` twice: once for the per-GPU
 * inventory and once with no arguments to read the CUDA version from its
 * default output.
 *
 * @param {Function} runner Command runner handed to `runCapabilityCommand`.
 * @param {number} timeoutMs Timeout applied to each command.
 * @returns {Promise<object>} GPU probe. When the inventory command fails the
 *   probe is empty with status "missing" or "error". When only the second
 *   command fails, or it shows no CUDA version, the inventory probe is
 *   returned unchanged.
 */
async function probeNvidiaGpuCapabilities(runner, timeoutMs) {
    const inventoryArgs = [
        "--query-gpu=index,name,memory.total,driver_version,compute_cap",
        "--format=csv,noheader,nounits"
    ];
    const inventory = await runCapabilityCommand(runner, "nvidia-smi", inventoryArgs, timeoutMs);
    if (!inventory.ok) {
        const status = inventory.missing ? "missing" : "error";
        return { status, count: 0, vendors: [], devices: [], message: inventory.message };
    }
    const probe = parseNvidiaGpuProbe(inventory.stdout);
    const banner = await runCapabilityCommand(runner, "nvidia-smi", [], timeoutMs);
    const cudaVersion = banner.ok
        ? parseNvidiaSmiCudaVersion(banner.stdout || banner.stderr)
        : undefined;
    if (!cudaVersion) {
        return probe;
    }
    // Merge the banner version into whatever the inventory already reported.
    const knownVersions = new Set(probe.cuda_versions || []);
    knownVersions.add(cudaVersion);
    const devices = probe.devices.map((device) => ({
        ...device,
        cuda_version: device.cuda_version || cudaVersion
    }));
    return {
        ...probe,
        cuda_versions: Array.from(knownVersions).sort(),
        devices
    };
}
|
|
2778
|
+
/**
 * Builds a software probe record with status "missing".
 *
 * @param {string} name Probe name.
 * @param {string} [message] Optional detail; the `message` key is only added
 *   when this is truthy.
 * @returns {{name: string, status: "missing", message?: string}}
 */
function missingSoftwareProbe(name, message) {
    const probe = { name, status: "missing" };
    if (message) {
        probe.message = message;
    }
    return probe;
}
|
|
2785
|
+
/**
 * Builds a software probe record with status "error".
 *
 * @param {string} name Probe name.
 * @param {string} message Failure detail; always stored under `message`,
 *   even when undefined.
 * @returns {{name: string, status: "error", message: string}}
 */
function errorSoftwareProbe(name, message) {
    const status = "error";
    return { name, status, message };
}
|
|
2792
|
+
/**
 * Extracts a version string from the first non-blank line of a tool's output.
 *
 * For "blender" the token after "Blender" is returned; for "ffmpeg" the token
 * after "ffmpeg version". For any other tool the whole trimmed first line is
 * returned.
 *
 * @param {string} stdout Command output.
 * @param {string} tool Tool identifier selecting the extraction pattern.
 * @returns {string | undefined} The version, or `undefined` when nothing matches.
 */
function extractToolVersion(stdout, tool) {
    let firstLine = "";
    for (const rawLine of stdout.split(/\r?\n/)) {
        const candidate = rawLine.trim();
        if (candidate.length > 0) {
            firstLine = candidate;
            break;
        }
    }
    switch (tool) {
        case "blender":
            return firstLine.match(/Blender\s+([^\s]+)/i)?.[1];
        case "ffmpeg":
            return firstLine.match(/ffmpeg\s+version\s+([^\s]+)/i)?.[1];
        default:
            return firstLine || undefined;
    }
}
|
|
2802
|
+
/**
 * Runs a tool's version command and reports whether the tool is usable.
 *
 * @param {Function} runner Command runner handed to `runCapabilityCommand`.
 * @param {string} name Probe name; also passed to `extractToolVersion` to
 *   select the version pattern.
 * @param {string} command Executable to run.
 * @param {string[]} args Arguments for the executable.
 * @param {number} timeoutMs Command timeout.
 * @returns {Promise<object>} An "available" probe (with `version` when one
 *   could be extracted), or a "missing"/"error" probe carrying the failure
 *   message.
 */
async function probeVersionedSoftware(runner, name, command, args, timeoutMs) {
    const outcome = await runCapabilityCommand(runner, command, args, timeoutMs);
    if (outcome.ok) {
        const version = extractToolVersion(outcome.stdout || outcome.stderr, name);
        const probe = { name, status: "available" };
        return version ? { ...probe, version } : probe;
    }
    if (outcome.missing) {
        return missingSoftwareProbe(name, outcome.message);
    }
    return errorSoftwareProbe(name, outcome.message);
}
|
|
2815
|
+
/**
 * Probes Docker and its NVIDIA runtime by asking `docker info` for the
 * registered runtimes as JSON.
 *
 * @param {Function} runner Command runner handed to `runCapabilityCommand`.
 * @param {number} timeoutMs Command timeout.
 * @returns {Promise<{docker: object, dockerNvidia: object}>} Two probes. When
 *   the command fails both share its status and message. When its output
 *   cannot be parsed both are "error" probes. Otherwise Docker is "available"
 *   and the NVIDIA probe reflects whether a runtime named "nvidia"
 *   (case-insensitive) is listed.
 */
async function probeDockerCapabilities(runner, timeoutMs) {
    const inventory = await runCapabilityCommand(runner, "docker", ["info", "--format", "{{json .Runtimes}}"], timeoutMs);
    if (!inventory.ok) {
        const docker = inventory.missing
            ? missingSoftwareProbe("docker", inventory.message)
            : errorSoftwareProbe("docker", inventory.message);
        const dockerNvidia = { name: "docker-nvidia", status: docker.status, message: inventory.message };
        return { docker, dockerNvidia };
    }
    let nvidiaRegistered;
    try {
        // Empty output is treated as an empty runtime map.
        const runtimes = JSON.parse(inventory.stdout || "{}");
        nvidiaRegistered = Object.keys(runtimes).some((runtimeName) => runtimeName.toLowerCase() === "nvidia");
    }
    catch (error) {
        const message = capabilityCommandFailureMessage(error);
        return {
            docker: errorSoftwareProbe("docker", `Unable to parse docker runtime inventory: ${message}`),
            dockerNvidia: errorSoftwareProbe("docker-nvidia", `Unable to parse docker runtime inventory: ${message}`)
        };
    }
    const dockerNvidia = nvidiaRegistered
        ? { name: "docker-nvidia", status: "available", version: "nvidia" }
        : missingSoftwareProbe("docker-nvidia", "Docker is available but the nvidia runtime is not registered");
    return {
        docker: { name: "docker", status: "available" },
        dockerNvidia
    };
}
|
|
2845
|
+
/**
 * Lists every probe in a capability snapshot whose status is not "available".
 *
 * The GPU probe (named "gpu") comes first, followed by the software probes in
 * the iteration order of `snapshot.software`.
 *
 * @param {object} snapshot Snapshot with `gpu` and `software` probes.
 * @returns {Array<{name: string, status: string, message: *}> | undefined}
 *   The diagnostics, or `undefined` when everything is available.
 */
function capabilityDiagnostics(snapshot) {
    const describe = (name, probe) => ({ name, status: probe.status, message: probe.message });
    const gpuIssues = snapshot.gpu.status !== "available" ? [describe("gpu", snapshot.gpu)] : [];
    const softwareIssues = Object.values(snapshot.software)
        .filter((probe) => probe.status !== "available")
        .map((probe) => describe(probe.name, probe));
    const diagnostics = [...gpuIssues, ...softwareIssues];
    return diagnostics.length ? diagnostics : undefined;
}
|
|
2865
|
+
/**
 * Derives a snapshot identifier from the snapshot's JSON serialisation.
 *
 * @param {object} snapshot Value to fingerprint; serialised with `JSON.stringify`,
 *   so key order affects the result.
 * @returns {string} "caps_" followed by the first 16 hex characters of the
 *   SHA-256 digest.
 */
function buildCapabilitySnapshotId(snapshot) {
    const hasher = createHash("sha256");
    hasher.update(JSON.stringify(snapshot));
    const fingerprint = hasher.digest("hex").substring(0, 16);
    return `caps_${fingerprint}`;
}
|
|
2869
|
+
/**
 * Builds the runner catalog advertised in a capability snapshot.
 *
 * @param {object} config Runtime config; only `genericJobsEnabled` is read.
 * @param {Map<string, object>} runners Registered runners keyed by runner id.
 * @returns {Array<object>} One record per catalog entry whose runner is
 *   registered, or an empty array when generic jobs are disabled.
 */
function buildRunnerCapabilityCatalog(config, runners) {
    const catalog = [];
    if (!config.genericJobsEnabled) {
        return catalog;
    }
    for (const entry of OWNER_LOCAL_GENERIC_JOB_CATALOG) {
        if (!runners.has(entry.runner)) {
            continue;
        }
        catalog.push({
            job_type: entry.job_type,
            runner: entry.runner,
            trust_modes: uniqueSortedStrings([entry.policy.trust_mode]),
            required_capabilities: entry.required_capabilities || []
        });
    }
    return catalog;
}
|
|
2882
|
+
/**
 * Decides whether a catalog entry's required capabilities are all present.
 *
 * @param {object} entry Catalog entry; `required_capabilities` is optional.
 * @param {object} input Probe results: `genericJobsEnabled`, `gpu`, `software`.
 * @returns {boolean} `false` when generic jobs are disabled, `true` when the
 *   entry requires nothing, otherwise whether every required capability is
 *   among the names `buildMswarmCapabilityNames` derives from the probes.
 */
function runnerCapabilityRequirementsAvailable(entry, input) {
    if (!input.genericJobsEnabled) {
        return false;
    }
    const required = entry.required_capabilities;
    if (!required?.length) {
        return true;
    }
    // Throwaway snapshot: only the gpu/software probes carry real data here.
    const available = new Set(buildMswarmCapabilityNames({
        schema_version: MSWARM_CAPABILITY_SCHEMA_VERSION,
        snapshot_id: "caps_requirement_check",
        captured_at: new Date(0).toISOString(),
        generic_jobs_enabled: input.genericJobsEnabled,
        job_types: [],
        trust_modes: [],
        gpu: input.gpu,
        software: input.software,
        runner_catalog: []
    }));
    for (const capability of required) {
        if (!available.has(capability)) {
            return false;
        }
    }
    return true;
}
|
|
2901
|
+
/**
 * Selects the catalog entries whose job type starts with "tenant." or
 * "package.".
 *
 * @returns {Array<object>} The matching entries, in catalog order.
 */
function registeredOwnerLocalGenericJobCatalog() {
    const registeredPrefixes = ["tenant.", "package."];
    return OWNER_LOCAL_GENERIC_JOB_CATALOG.filter((entry) => registeredPrefixes.some((prefix) => entry.job_type.startsWith(prefix)));
}
|
|
2904
|
+
/**
 * Encodes a buffer as unpadded URL-safe base64.
 *
 * @param {Buffer} buffer Bytes to encode.
 * @returns {string} Base64 text with "=" removed, "+" replaced by "-" and
 *   "/" replaced by "_".
 */
function base64UrlEncodeRuntime(buffer) {
    const standard = buffer.toString("base64");
    return standard.replace(/[=+/]/g, (symbol) => {
        if (symbol === "+") {
            return "-";
        }
        return symbol === "/" ? "_" : "";
    });
}
|
|
2907
|
+
/**
 * Wraps a private capability catalog entry in a signed payload.
 *
 * The signature is an HMAC-SHA256, keyed with the runtime token, over the
 * `JSON.stringify` output of the unsigned payload, encoded as URL-safe base64.
 *
 * @param {{privateCatalogEntry: object, runtimeToken: string}} input
 * @returns {object} The unsigned fields plus a `signature` object carrying
 *   the algorithm label, value, signing time, and key id.
 */
function signCapabilityPayload(input) {
    const entry = input.privateCatalogEntry;
    const unsignedPayload = {
        schema_version: MSWARM_CAPABILITY_SCHEMA_VERSION,
        snapshot_id: entry.snapshot_id,
        private_catalog_entry: entry,
        scheduler_match: entry.scheduler_match,
        public_projection: entry.public_projection
    };
    const mac = createHmac("sha256", input.runtimeToken);
    mac.update(JSON.stringify(unsignedPayload));
    const value = base64UrlEncodeRuntime(mac.digest());
    const signature = {
        alg: "HS256",
        value,
        signed_at: new Date().toISOString(),
        key_id: "self_hosted_runtime_token"
    };
    return { ...unsignedPayload, signature };
}
|
|
2926
|
+
/**
 * Finds the registered runner for a job via the first catalog entry whose
 * job type matches.
 *
 * @param {{job_type: string}} job Job to route.
 * @param {Map<string, object>} runners Registered runners keyed by runner id.
 * @returns {object | null} The runner, or `null` when the job type is not in
 *   the catalog or its runner is not registered.
 */
function runnerForGenericJob(job, runners) {
    for (const entry of OWNER_LOCAL_GENERIC_JOB_CATALOG) {
        if (entry.job_type === job.job_type) {
            return runners.get(entry.runner) || null;
        }
    }
    return null;
}
|
|
2930
|
+
/**
 * Compares two dot-separated version strings segment by segment.
 *
 * Non-digit characters are dropped from each segment, and missing or
 * non-numeric segments count as 0.
 *
 * @param {string | undefined} left First version.
 * @param {string | undefined} right Second version.
 * @returns {number} The numeric difference of the first differing segment
 *   (positive when `left` is newer), or 0 when equal or when either argument
 *   is empty.
 */
function compareDottedVersion(left, right) {
    if (!(left && right)) {
        return 0;
    }
    const toSegments = (version) => version
        .split(".")
        .map((part) => Number(part.replace(/[^\d]/g, "")) || 0);
    const leftSegments = toSegments(left);
    const rightSegments = toSegments(right);
    let position = 0;
    while (position < leftSegments.length || position < rightSegments.length) {
        const difference = (leftSegments[position] || 0) - (rightSegments[position] || 0);
        if (difference !== 0) {
            return difference;
        }
        position += 1;
    }
    return 0;
}
|
|
2943
|
+
/**
 * Checks whether a snapshot reports a CUDA version at least as new as the
 * requested minimum.
 *
 * @param {object} snapshot Capability snapshot; reads `gpu.cuda_versions`
 *   and each `gpu.devices[].cuda_version`.
 * @param {string | undefined} minVersion Minimum version; an empty value
 *   means no requirement.
 * @returns {boolean} `true` when there is no requirement or any reported
 *   version compares greater than or equal to `minVersion`.
 */
function snapshotHasCudaVersion(snapshot, minVersion) {
    if (!minVersion) {
        return true;
    }
    const nodeWide = snapshot.gpu.cuda_versions || [];
    const perDevice = snapshot.gpu.devices
        .map((device) => device.cuda_version)
        .filter((value) => Boolean(value));
    for (const version of [...nodeWide, ...perDevice]) {
        if (compareDottedVersion(version, minVersion) >= 0) {
            return true;
        }
    }
    return false;
}
|
|
2952
|
+
/**
 * Explains why a node's capability snapshot cannot serve a generic job.
 *
 * Checks run in this order and the first failure wins: generic jobs enabled;
 * Blender present for render jobs; NVIDIA GPU and Docker NVIDIA runtime
 * present for CUDA jobs; job type advertised by the snapshot; then, when the
 * job requests GPU resources, GPU count, vendor, minimum VRAM and minimum
 * CUDA version.
 *
 * @param {object} job Validated generic job request.
 * @param {object} snapshot Capability snapshot of this node.
 * @returns {{code: "no_capable_node", message: string} | null} The mismatch,
 *   or `null` when the snapshot satisfies the job.
 */
export function genericJobCapabilityMismatch(job, snapshot) {
    const noCapableNode = (message) => ({ code: "no_capable_node", message });
    if (!snapshot.generic_jobs_enabled) {
        return noCapableNode("Generic jobs are disabled on this node.");
    }
    const isBlenderJob = job.job_type === RENDER_BLENDER_JOB_TYPE;
    if (isBlenderJob && snapshot.software.blender.status !== "available") {
        return noCapableNode("Blender is not available on this node.");
    }
    if (job.job_type === CUDA_RUN_JOB_TYPE) {
        const hasNvidiaGpu = snapshot.gpu.status === "available" && snapshot.gpu.vendors.includes("nvidia");
        if (!hasNvidiaGpu) {
            return noCapableNode("No NVIDIA GPU is available on this node.");
        }
        const hasNvidiaDocker = snapshot.software.docker.status === "available" &&
            snapshot.software["docker-nvidia"].status === "available";
        if (!hasNvidiaDocker) {
            return noCapableNode("Docker with the NVIDIA runtime is not available on this node.");
        }
    }
    if (!snapshot.job_types.includes(job.job_type)) {
        return noCapableNode(`No capable owner-local node is available for ${job.job_type}.`);
    }
    const requestedGpu = job.resources?.gpu;
    if (!requestedGpu) {
        return null;
    }
    // A GPU request without an explicit count asks for one device.
    const requestedCount = Math.max(1, requestedGpu.count || 1);
    if (snapshot.gpu.status !== "available" || snapshot.gpu.count < requestedCount) {
        return noCapableNode(`Requested ${requestedCount} GPU(s), but this node reports ${snapshot.gpu.count}.`);
    }
    if (requestedGpu.vendor && !snapshot.gpu.vendors.includes(requestedGpu.vendor)) {
        return noCapableNode(`Requested GPU vendor ${requestedGpu.vendor} is not available on this node.`);
    }
    const minVramGb = requestedGpu.min_vram_gb;
    // Number.isFinite is false for undefined, so this also covers "not requested".
    if (Number.isFinite(minVramGb)) {
        const nodeVramGb = snapshot.gpu.max_vram_gb;
        if (!Number.isFinite(nodeVramGb) || (nodeVramGb || 0) < minVramGb) {
            return noCapableNode(`Requested GPU VRAM ${minVramGb}GB exceeds this node capability.`);
        }
    }
    if (!snapshotHasCudaVersion(snapshot, requestedGpu.cuda_min_version)) {
        return noCapableNode(`Requested CUDA ${requestedGpu.cuda_min_version} is not available on this node.`);
    }
    return null;
}
|
|
3013
|
+
/**
 * Resolves the timeout for a generic job in milliseconds.
 *
 * @param {object} job Job request; `limits.timeout_sec` is optional.
 * @param {number} fallbackMs Node-level timeout, which also acts as the cap.
 * @returns {number} `fallbackMs` when the job sets no usable limit. Otherwise
 *   the job's limit in milliseconds, capped at `fallbackMs` and never below 1.
 */
function genericJobTimeoutMs(job, fallbackMs) {
    const limitSeconds = positiveInteger(job.limits?.timeout_sec);
    if (limitSeconds) {
        const cappedMs = Math.min(fallbackMs, limitSeconds * 1000);
        return Math.max(1, cappedMs);
    }
    return fallbackMs;
}
|
|
3020
|
+
/**
 * Tells whether a failure should be treated as an abort (cancel or timeout).
 *
 * @param {*} error The caught value.
 * @param {{aborted: boolean}} signal Abort signal of the running job.
 * @returns {boolean} `true` when the signal is aborted, or when `error` is an
 *   `Error` whose message mentions cancellation, abort or timeout.
 */
function isGenericAbortError(error, signal) {
    if (signal.aborted) {
        return true;
    }
    return error instanceof Error &&
        /cancelled|canceled|aborted|timed out|timeout/i.test(error.message);
}
|
|
1533
3027
|
function usageTokens(usage) {
|
|
1534
3028
|
return {
|
|
1535
3029
|
promptTokens: positiveInteger(usage?.inputTokens),
|
|
@@ -1690,6 +3184,13 @@ export class SelfHostedNodeRuntime {
|
|
|
1690
3184
|
fetchImpl: deps?.fetchImpl,
|
|
1691
3185
|
timeoutMs: config.jobTimeoutMs
|
|
1692
3186
|
});
|
|
3187
|
+
this.capabilityRunner = deps?.capabilityRunner || defaultCommandRunner;
|
|
3188
|
+
this.genericRunners = new Map((deps?.genericRunners || createDefaultGenericJobRunners(this.capabilityRunner)).map((runner) => [runner.id, runner]));
|
|
3189
|
+
this.artifactStore =
|
|
3190
|
+
deps?.artifactStore ||
|
|
3191
|
+
new MswarmLocalArtifactStore({
|
|
3192
|
+
rootDir: config.artifactStorePath || defaultArtifactStorePath()
|
|
3193
|
+
});
|
|
1693
3194
|
}
|
|
1694
3195
|
static async setup(setupConfig, deps) {
|
|
1695
3196
|
const gateway = deps?.gateway ||
|
|
@@ -1711,7 +3212,8 @@ export class SelfHostedNodeRuntime {
|
|
|
1711
3212
|
expose_all_models: setupConfig.exposeAllModels,
|
|
1712
3213
|
model_allowlist: setupConfig.modelAllowlist,
|
|
1713
3214
|
model_blocklist: setupConfig.modelBlocklist,
|
|
1714
|
-
heartbeat_interval_seconds: setupConfig.heartbeatIntervalSeconds
|
|
3215
|
+
heartbeat_interval_seconds: setupConfig.heartbeatIntervalSeconds,
|
|
3216
|
+
generic_job_max_concurrency: setupConfig.genericJobMaxConcurrency
|
|
1715
3217
|
});
|
|
1716
3218
|
const nodeId = optionalText(bootstrap.node?.node_id);
|
|
1717
3219
|
const runtimeToken = optionalText(bootstrap.runtime_token);
|
|
@@ -1726,6 +3228,7 @@ export class SelfHostedNodeRuntime {
|
|
|
1726
3228
|
machine_fingerprint: machineFingerprint,
|
|
1727
3229
|
direct_base_url: setupConfig.directBaseUrl || null,
|
|
1728
3230
|
runtime_token: undefined,
|
|
3231
|
+
artifact_store_path: setupConfig.artifactStorePath || defaultArtifactStorePath(),
|
|
1729
3232
|
config_version: bootstrap.config_version,
|
|
1730
3233
|
heartbeat_interval_seconds: heartbeatInterval,
|
|
1731
3234
|
heartbeat_timeout_seconds: bootstrap.heartbeat_timeout_seconds,
|
|
@@ -1739,6 +3242,10 @@ export class SelfHostedNodeRuntime {
|
|
|
1739
3242
|
node_version: setupConfig.nodeVersion,
|
|
1740
3243
|
request_timeout_ms: setupConfig.requestTimeoutMs,
|
|
1741
3244
|
job_timeout_ms: setupConfig.jobTimeoutMs,
|
|
3245
|
+
generic_jobs_enabled: setupConfig.genericJobsEnabled,
|
|
3246
|
+
generic_job_timeout_ms: setupConfig.genericJobTimeoutMs,
|
|
3247
|
+
generic_job_max_concurrency: setupConfig.genericJobMaxConcurrency,
|
|
3248
|
+
capability_probe_timeout_ms: setupConfig.capabilityProbeTimeoutMs || DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS,
|
|
1742
3249
|
expose_all_models: setupConfig.exposeAllModels,
|
|
1743
3250
|
exposure_policy: setupConfig.exposeAllModels ? "all" : "none",
|
|
1744
3251
|
model_allowlist: setupConfig.modelAllowlist,
|
|
@@ -1761,6 +3268,7 @@ export class SelfHostedNodeRuntime {
|
|
|
1761
3268
|
ollamaBaseUrl: setupConfig.ollamaBaseUrl,
|
|
1762
3269
|
statePath: setupConfig.statePath,
|
|
1763
3270
|
runtimeTokenPath: setupConfig.runtimeTokenPath,
|
|
3271
|
+
artifactStorePath: setupConfig.artifactStorePath || defaultArtifactStorePath(),
|
|
1764
3272
|
invocationSigningSecret: null,
|
|
1765
3273
|
listenHost: DEFAULT_LISTEN_HOST,
|
|
1766
3274
|
listenPort: DEFAULT_LISTEN_PORT,
|
|
@@ -1768,6 +3276,10 @@ export class SelfHostedNodeRuntime {
|
|
|
1768
3276
|
heartbeatIntervalSeconds: heartbeatInterval,
|
|
1769
3277
|
requestTimeoutMs: setupConfig.requestTimeoutMs,
|
|
1770
3278
|
jobTimeoutMs: setupConfig.jobTimeoutMs,
|
|
3279
|
+
genericJobsEnabled: setupConfig.genericJobsEnabled,
|
|
3280
|
+
genericJobTimeoutMs: setupConfig.genericJobTimeoutMs,
|
|
3281
|
+
genericJobMaxConcurrency: setupConfig.genericJobMaxConcurrency,
|
|
3282
|
+
capabilityProbeTimeoutMs: setupConfig.capabilityProbeTimeoutMs || DEFAULT_CAPABILITY_PROBE_TIMEOUT_MS,
|
|
1771
3283
|
exposeAllModels: setupConfig.exposeAllModels,
|
|
1772
3284
|
modelAllowlist: setupConfig.modelAllowlist,
|
|
1773
3285
|
modelBlocklist: setupConfig.modelBlocklist
|
|
@@ -1795,6 +3307,53 @@ export class SelfHostedNodeRuntime {
|
|
|
1795
3307
|
const models = await this.mcoda.listAgents(this.config);
|
|
1796
3308
|
return { source: "mcoda", status: "online", models, version: null, failureCount: 0 };
|
|
1797
3309
|
}
|
|
3310
|
+
async probeCapabilities() {
|
|
3311
|
+
const timeoutMs = capabilityProbeTimeoutMs(this.config);
|
|
3312
|
+
const [gpu, docker, blender, ffmpeg] = await Promise.all([
|
|
3313
|
+
probeNvidiaGpuCapabilities(this.capabilityRunner, timeoutMs),
|
|
3314
|
+
probeDockerCapabilities(this.capabilityRunner, timeoutMs),
|
|
3315
|
+
probeVersionedSoftware(this.capabilityRunner, "blender", "blender", ["--version"], timeoutMs),
|
|
3316
|
+
probeVersionedSoftware(this.capabilityRunner, "ffmpeg", "ffmpeg", ["-version"], timeoutMs)
|
|
3317
|
+
]);
|
|
3318
|
+
const software = {
|
|
3319
|
+
docker: docker.docker,
|
|
3320
|
+
"docker-nvidia": docker.dockerNvidia,
|
|
3321
|
+
blender,
|
|
3322
|
+
ffmpeg
|
|
3323
|
+
};
|
|
3324
|
+
const runnerCatalog = buildRunnerCapabilityCatalog(this.config, this.genericRunners).filter((entry) => runnerCapabilityRequirementsAvailable(entry, {
|
|
3325
|
+
gpu,
|
|
3326
|
+
software,
|
|
3327
|
+
genericJobsEnabled: this.config.genericJobsEnabled
|
|
3328
|
+
}));
|
|
3329
|
+
const snapshotWithoutId = {
|
|
3330
|
+
schema_version: MSWARM_CAPABILITY_SCHEMA_VERSION,
|
|
3331
|
+
captured_at: new Date().toISOString(),
|
|
3332
|
+
node_id: this.config.nodeId,
|
|
3333
|
+
platform: platform(),
|
|
3334
|
+
arch: process.arch,
|
|
3335
|
+
generic_jobs_enabled: this.config.genericJobsEnabled,
|
|
3336
|
+
job_types: uniqueSortedStrings(runnerCatalog.map((entry) => entry.job_type)),
|
|
3337
|
+
trust_modes: uniqueSortedStrings(runnerCatalog.flatMap((entry) => entry.trust_modes)),
|
|
3338
|
+
gpu,
|
|
3339
|
+
software,
|
|
3340
|
+
runner_catalog: runnerCatalog
|
|
3341
|
+
};
|
|
3342
|
+
const snapshot = {
|
|
3343
|
+
...snapshotWithoutId,
|
|
3344
|
+
snapshot_id: buildCapabilitySnapshotId(snapshotWithoutId)
|
|
3345
|
+
};
|
|
3346
|
+
const diagnostics = capabilityDiagnostics(snapshot);
|
|
3347
|
+
return diagnostics ? { ...snapshot, diagnostics } : snapshot;
|
|
3348
|
+
}
|
|
3349
|
+
async publicCapabilityProjection() {
|
|
3350
|
+
return projectMswarmPublicCapabilities(await this.probeCapabilities());
|
|
3351
|
+
}
|
|
3352
|
+
async buildCapabilityHeartbeatPayload(runtimeToken) {
|
|
3353
|
+
const snapshot = await this.probeCapabilities();
|
|
3354
|
+
const privateCatalogEntry = buildMswarmPrivateCapabilityCatalogEntry(snapshot);
|
|
3355
|
+
return signCapabilityPayload({ privateCatalogEntry, runtimeToken });
|
|
3356
|
+
}
|
|
1798
3357
|
async ensureEnrolled() {
|
|
1799
3358
|
const currentState = await readSelfHostedNodeState(this.config.statePath);
|
|
1800
3359
|
const persistedRuntimeToken = await readSelfHostedRuntimeToken(this.config.runtimeTokenPath);
|
|
@@ -1827,6 +3386,9 @@ export class SelfHostedNodeRuntime {
|
|
|
1827
3386
|
node_version: this.config.nodeVersion,
|
|
1828
3387
|
request_timeout_ms: this.config.requestTimeoutMs,
|
|
1829
3388
|
job_timeout_ms: this.config.jobTimeoutMs,
|
|
3389
|
+
generic_jobs_enabled: this.config.genericJobsEnabled,
|
|
3390
|
+
generic_job_timeout_ms: this.config.genericJobTimeoutMs,
|
|
3391
|
+
generic_job_max_concurrency: this.config.genericJobMaxConcurrency,
|
|
1830
3392
|
expose_all_models: this.config.exposeAllModels,
|
|
1831
3393
|
exposure_policy: this.config.exposeAllModels ? "all" : "none",
|
|
1832
3394
|
model_allowlist: this.config.modelAllowlist,
|
|
@@ -1855,6 +3417,166 @@ export class SelfHostedNodeRuntime {
|
|
|
1855
3417
|
}
|
|
1856
3418
|
return mapMcodaAgentToCodaliAgent(agent, selected);
|
|
1857
3419
|
}
|
|
3420
|
+
async executeGenericJob(envelope, options = {}) {
|
|
3421
|
+
const startedAt = Date.now();
|
|
3422
|
+
const events = [];
|
|
3423
|
+
let sequence = 0;
|
|
3424
|
+
const emitEvent = async (event) => {
|
|
3425
|
+
const next = {
|
|
3426
|
+
job_id: envelope.job_id,
|
|
3427
|
+
sequence,
|
|
3428
|
+
timestamp: new Date().toISOString(),
|
|
3429
|
+
...event
|
|
3430
|
+
};
|
|
3431
|
+
sequence += 1;
|
|
3432
|
+
events.push(next);
|
|
3433
|
+
await options.onEvent?.(next);
|
|
3434
|
+
};
|
|
3435
|
+
const failed = async (code, message, validationIssues) => {
|
|
3436
|
+
await emitEvent({
|
|
3437
|
+
type: code === "cancelled" ? "cancelled" : "failed",
|
|
3438
|
+
message,
|
|
3439
|
+
data: { code }
|
|
3440
|
+
});
|
|
3441
|
+
const status = code === "cancelled" ? "cancelled" : "failed";
|
|
3442
|
+
const result = {
|
|
3443
|
+
job_id: envelope.job_id,
|
|
3444
|
+
status,
|
|
3445
|
+
error: {
|
|
3446
|
+
code,
|
|
3447
|
+
message,
|
|
3448
|
+
retryable: code === "timeout"
|
|
3449
|
+
},
|
|
3450
|
+
finished_at: new Date().toISOString()
|
|
3451
|
+
};
|
|
3452
|
+
return {
|
|
3453
|
+
job_id: envelope.job_id,
|
|
3454
|
+
request_id: envelope.request_id,
|
|
3455
|
+
status,
|
|
3456
|
+
result,
|
|
3457
|
+
events,
|
|
3458
|
+
...(validationIssues?.length ? { validation_issues: validationIssues } : {}),
|
|
3459
|
+
timing: { local_latency_ms: Date.now() - startedAt }
|
|
3460
|
+
};
|
|
3461
|
+
};
|
|
3462
|
+
if (!this.config.genericJobsEnabled) {
|
|
3463
|
+
return failed("feature_disabled", "Generic node jobs are disabled on this node.");
|
|
3464
|
+
}
|
|
3465
|
+
if (envelope.node_id !== this.config.nodeId) {
|
|
3466
|
+
return failed("validation_failed", "generic job node_id does not match this node");
|
|
3467
|
+
}
|
|
3468
|
+
const validation = validateMswarmGenericJobRequest(envelope.job, {
|
|
3469
|
+
registeredJobCatalog: registeredOwnerLocalGenericJobCatalog()
|
|
3470
|
+
});
|
|
3471
|
+
if (!validation.ok || !validation.value) {
|
|
3472
|
+
return failed("validation_failed", "generic job request failed validation", validation.issues);
|
|
3473
|
+
}
|
|
3474
|
+
const job = validation.value;
|
|
3475
|
+
const runner = runnerForGenericJob(job, this.genericRunners);
|
|
3476
|
+
if (!runner) {
|
|
3477
|
+
return failed("runner_unavailable", `No generic job runner is registered for ${job.job_type}.`);
|
|
3478
|
+
}
|
|
3479
|
+
if (job.job_type === RENDER_BLENDER_JOB_TYPE || job.job_type === CUDA_RUN_JOB_TYPE) {
|
|
3480
|
+
const capabilityMismatch = genericJobCapabilityMismatch(job, await this.probeCapabilities());
|
|
3481
|
+
if (capabilityMismatch) {
|
|
3482
|
+
return failed(capabilityMismatch.code, capabilityMismatch.message);
|
|
3483
|
+
}
|
|
3484
|
+
}
|
|
3485
|
+
let artifactContext;
|
|
3486
|
+
try {
|
|
3487
|
+
artifactContext = await this.artifactStore.prepareJobWorkspace(envelope.job_id, job);
|
|
3488
|
+
}
|
|
3489
|
+
catch (error) {
|
|
3490
|
+
return failed("validation_failed", error instanceof Error ? error.message : String(error || "generic job artifact preparation failed"));
|
|
3491
|
+
}
|
|
3492
|
+
const controller = new AbortController();
|
|
3493
|
+
const timeoutMs = genericJobTimeoutMs(job, this.config.genericJobTimeoutMs || this.config.jobTimeoutMs);
|
|
3494
|
+
const onAbort = () => {
|
|
3495
|
+
if (!controller.signal.aborted) {
|
|
3496
|
+
controller.abort(options.signal?.reason || "cancelled");
|
|
3497
|
+
}
|
|
3498
|
+
};
|
|
3499
|
+
if (options.signal?.aborted) {
|
|
3500
|
+
controller.abort(options.signal.reason || "cancelled");
|
|
3501
|
+
}
|
|
3502
|
+
options.signal?.addEventListener("abort", onAbort, { once: true });
|
|
3503
|
+
const timeout = setTimeout(() => {
|
|
3504
|
+
if (!controller.signal.aborted) {
|
|
3505
|
+
controller.abort("timeout");
|
|
3506
|
+
}
|
|
3507
|
+
}, timeoutMs);
|
|
3508
|
+
try {
|
|
3509
|
+
await emitEvent({
|
|
3510
|
+
type: "started",
|
|
3511
|
+
message: `Running ${job.job_type}`,
|
|
3512
|
+
data: {
|
|
3513
|
+
runner: runner.id,
|
|
3514
|
+
sandbox_profile: artifactContext.sandbox.name,
|
|
3515
|
+
timeout_ms: timeoutMs
|
|
3516
|
+
}
|
|
3517
|
+
});
|
|
3518
|
+
const runnerResult = await runner.run({
|
|
3519
|
+
job,
|
|
3520
|
+
signal: controller.signal,
|
|
3521
|
+
emitEvent,
|
|
3522
|
+
artifacts: artifactContext,
|
|
3523
|
+
sandbox: artifactContext.sandbox
|
|
3524
|
+
});
|
|
3525
|
+
const status = runnerResult.status || "succeeded";
|
|
3526
|
+
const outputContext = status === "succeeded"
|
|
3527
|
+
? artifactContext
|
|
3528
|
+
: {
|
|
3529
|
+
...artifactContext,
|
|
3530
|
+
outputSpecs: artifactContext.outputSpecs.map((output) => ({ ...output, required: false }))
|
|
3531
|
+
};
|
|
3532
|
+
const outputArtifacts = await this.artifactStore.collectOutputs(outputContext, envelope.job_id);
|
|
3533
|
+
for (const artifact of outputArtifacts) {
|
|
3534
|
+
await emitEvent({
|
|
3535
|
+
type: "artifact",
|
|
3536
|
+
message: "output artifact collected",
|
|
3537
|
+
data: { artifact }
|
|
3538
|
+
});
|
|
3539
|
+
}
|
|
3540
|
+
const result = {
|
|
3541
|
+
...runnerResult,
|
|
3542
|
+
job_id: envelope.job_id,
|
|
3543
|
+
status,
|
|
3544
|
+
artifacts: [...(runnerResult.artifacts || []), ...outputArtifacts],
|
|
3545
|
+
started_at: runnerResult.started_at || new Date(startedAt).toISOString(),
|
|
3546
|
+
finished_at: runnerResult.finished_at || new Date().toISOString()
|
|
3547
|
+
};
|
|
3548
|
+
await emitEvent({
|
|
3549
|
+
type: status === "succeeded" ? "completed" : "failed",
|
|
3550
|
+
message: status === "succeeded" ? "generic job completed" : runnerResult.error?.message || "generic job failed",
|
|
3551
|
+
data: {
|
|
3552
|
+
status,
|
|
3553
|
+
exit_code: result.exit_code,
|
|
3554
|
+
runner: runner.id
|
|
3555
|
+
}
|
|
3556
|
+
});
|
|
3557
|
+
return {
|
|
3558
|
+
job_id: envelope.job_id,
|
|
3559
|
+
request_id: envelope.request_id,
|
|
3560
|
+
status,
|
|
3561
|
+
result,
|
|
3562
|
+
events,
|
|
3563
|
+
timing: { local_latency_ms: Date.now() - startedAt }
|
|
3564
|
+
};
|
|
3565
|
+
}
|
|
3566
|
+
catch (error) {
|
|
3567
|
+
const code = isGenericAbortError(error, controller.signal) ? abortErrorCode(controller.signal) : "runner_error";
|
|
3568
|
+
const message = code === "timeout" || code === "cancelled"
|
|
3569
|
+
? abortErrorMessage(controller.signal)
|
|
3570
|
+
: error instanceof Error
|
|
3571
|
+
? error.message
|
|
3572
|
+
: String(error);
|
|
3573
|
+
return failed(code, message);
|
|
3574
|
+
}
|
|
3575
|
+
finally {
|
|
3576
|
+
clearTimeout(timeout);
|
|
3577
|
+
options.signal?.removeEventListener("abort", onAbort);
|
|
3578
|
+
}
|
|
3579
|
+
}
|
|
1858
3580
|
async executeJob(job, options = {}) {
|
|
1859
3581
|
const startedAt = Date.now();
|
|
1860
3582
|
let selectedAgent;
|
|
@@ -2050,6 +3772,7 @@ export class SelfHostedNodeRuntime {
|
|
|
2050
3772
|
models = [];
|
|
2051
3773
|
version = null;
|
|
2052
3774
|
}
|
|
3775
|
+
const capabilityPayload = await this.buildCapabilityHeartbeatPayload(enrollment.runtimeToken);
|
|
2053
3776
|
const heartbeatPayload = {
|
|
2054
3777
|
node_id: this.config.nodeId,
|
|
2055
3778
|
node_version: this.config.nodeVersion,
|
|
@@ -2077,7 +3800,8 @@ export class SelfHostedNodeRuntime {
|
|
|
2077
3800
|
recent_failure_count: recentFailureCount,
|
|
2078
3801
|
last_success_at: status === "online" ? new Date().toISOString() : null
|
|
2079
3802
|
},
|
|
2080
|
-
models
|
|
3803
|
+
models,
|
|
3804
|
+
capabilities: capabilityPayload
|
|
2081
3805
|
};
|
|
2082
3806
|
const heartbeatResponse = await this.gateway.heartbeat(enrollment.runtimeToken, heartbeatPayload);
|
|
2083
3807
|
const exposedModelCount = models.filter((model) => model.exposed !== false).length;
|