@mcoda/mswarm 0.1.76 → 0.1.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -0
- package/dist/invocation-token.d.ts +48 -0
- package/dist/invocation-token.d.ts.map +1 -1
- package/dist/invocation-token.js +109 -0
- package/dist/invocation-token.js.map +1 -1
- package/dist/runtime.d.ts +109 -0
- package/dist/runtime.d.ts.map +1 -1
- package/dist/runtime.js +1730 -6
- package/dist/runtime.js.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +1401 -4
- package/dist/server.js.map +1 -1
- package/package.json +4 -4
package/dist/server.js
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { realpathSync } from "node:fs";
|
|
3
|
-
import { readFile } from "node:fs/promises";
|
|
3
|
+
import { lstat, mkdir, readFile, writeFile } from "node:fs/promises";
|
|
4
4
|
import { fileURLToPath } from "node:url";
|
|
5
|
-
import { resolve } from "node:path";
|
|
5
|
+
import { dirname, resolve, sep } from "node:path";
|
|
6
|
+
import { createHash, randomUUID } from "node:crypto";
|
|
6
7
|
import Fastify from "fastify";
|
|
7
|
-
import {
|
|
8
|
-
import {
|
|
8
|
+
import { MSWARM_JOB_LIFECYCLE_SCHEMA_VERSION, assertMswarmSafeRelativePath, buildMswarmLocalArtifactUri, defaultMswarmArtifactAccessPolicy, defaultMswarmArtifactRetentionPolicy, buildMswarmGenericJobAuditEvent, buildMswarmGenericJobEnvelopeDescriptor, isMswarmLifecycleStateTransitionAllowed, isMswarmTerminalLifecycleState, normalizeMswarmGenericJobIdempotencyKey } from "@mcoda/shared";
|
|
9
|
+
import { verifySelfHostedCapabilityToken, verifySelfHostedGenericJobOpsToken, verifySelfHostedGenericJobToken, verifySelfHostedInvocationToken } from "./invocation-token.js";
|
|
10
|
+
import { controlSelfHostedNodeService, installSelfHostedNodeService, readOwnerSetupConfig, readSelfHostedNodeConfig, resolveSelfHostedNodeServiceLayout, SelfHostedNodeRuntime, uninstallSelfHostedNodeService, genericJobCapabilityMismatch } from "./runtime.js";
|
|
9
11
|
const SELF_HOSTED_NODE_PROCESS_TITLE = "mswarm-node";
|
|
10
12
|
function applySelfHostedNodeProcessTitle() {
|
|
11
13
|
const title = process.env.MSWARM_SELF_HOSTED_PROCESS_TITLE?.trim() || SELF_HOSTED_NODE_PROCESS_TITLE;
|
|
@@ -44,7 +46,12 @@ Environment:
|
|
|
44
46
|
MSWARM_SELF_HOSTED_OLLAMA_BASE_URL Ollama base URL, defaults to http://127.0.0.1:11434
|
|
45
47
|
MSWARM_SELF_HOSTED_NODE_STATE_PATH Config/state file, defaults to ~/.mswarm/self-hosted-node/config.json
|
|
46
48
|
MSWARM_SELF_HOSTED_NODE_KEY_PATH Runtime token file, defaults to ~/.mswarm/self-hosted-node/node.key
|
|
49
|
+
MSWARM_SELF_HOSTED_ARTIFACT_STORE_PATH Local generic-job artifact store, defaults to ~/.mswarm/self-hosted-node/artifacts
|
|
47
50
|
MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET Shared direct-job signing secret
|
|
51
|
+
MSWARM_SELF_HOSTED_GENERIC_JOBS_ENABLED Enable owner-local generic jobs, defaults to false
|
|
52
|
+
MSWARM_SELF_HOSTED_GENERIC_JOB_TIMEOUT_MS Generic job timeout, defaults to self-hosted job timeout
|
|
53
|
+
MSWARM_SELF_HOSTED_GENERIC_JOB_MAX_CONCURRENCY Generic job concurrency, defaults to 1
|
|
54
|
+
MSWARM_SELF_HOSTED_CAPABILITY_PROBE_TIMEOUT_MS Capability probe timeout, defaults to 2000
|
|
48
55
|
MSWARM_SELF_HOSTED_LISTEN_HOST Direct node bind host, defaults to 127.0.0.1
|
|
49
56
|
MSWARM_SELF_HOSTED_LISTEN_PORT Direct node bind port, defaults to 18083
|
|
50
57
|
MSWARM_SELF_HOSTED_MODEL_ALLOWLIST Comma-separated local agent slugs/model names to expose
|
|
@@ -63,6 +70,10 @@ Setup options:
|
|
|
63
70
|
--block <SLUGS> Comma-separated blocklist
|
|
64
71
|
--expose-all Expose all healthy non-embedding local agents (default)
|
|
65
72
|
--no-expose-all Expose only allowlisted local agents
|
|
73
|
+
--enable-generic-jobs Enable owner-local generic job endpoint for development
|
|
74
|
+
--generic-job-timeout-ms <N> Generic job timeout for owner-local development
|
|
75
|
+
--generic-job-max-concurrency <N> Generic job concurrency for owner-local development
|
|
76
|
+
--artifact-store-path <PATH> Local generic-job artifact store path
|
|
66
77
|
--start Start foreground daemon after setup
|
|
67
78
|
|
|
68
79
|
Log options:
|
|
@@ -149,17 +160,913 @@ function assertJobMatchesClaims(job, claims) {
|
|
|
149
160
|
throw new Error("model does not match invocation token");
|
|
150
161
|
}
|
|
151
162
|
}
|
|
163
|
+
/**
 * Ensures a generic job envelope agrees with the claims of its invocation token.
 *
 * @param {object} job - Envelope exposing `node_id`, `job_id`, `request_id` and a nested `job` payload.
 * @param {object} claims - Token claims to compare against.
 * @throws {Error} For the first field that differs; the message names that field.
 */
function assertGenericJobMatchesClaims(job, claims) {
    const payload = job.job;
    // Evaluated in order, so the first mismatching field decides the error message.
    const comparisons = [
        [job.node_id, claims.node_id, "generic job node_id does not match invocation token"],
        [job.job_id, claims.job_id, "generic job_id does not match invocation token"],
        [job.request_id, claims.request_id, "generic request_id does not match invocation token"],
        [payload?.schema_version, claims.schema_version, "generic schema_version does not match invocation token"],
        [payload?.job_type, claims.job_type, "generic job_type does not match invocation token"]
    ];
    for (const [actual, expected, message] of comparisons) {
        if (actual !== expected) {
            throw new Error(message);
        }
    }
}
|
|
180
|
+
/**
 * Reports whether a host name or address refers to the local machine.
 *
 * Accepts `localhost`, the IPv6 loopback (`::1`, with or without brackets) and any
 * IPv4 address in 127.0.0.0/8. Matching is case-insensitive and ignores surrounding
 * whitespace.
 *
 * @param {unknown} value - Host to classify; non-string values are never local.
 * @returns {boolean} True only for a loopback host.
 */
function isOwnerLocalHost(value) {
    // Guard against non-string config values instead of throwing on `.trim()`.
    if (typeof value !== "string") {
        return false;
    }
    const normalized = value.trim().toLowerCase();
    if (normalized === "localhost" || normalized === "::1" || normalized === "[::1]") {
        return true;
    }
    const octets = /^127\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(normalized);
    // Each captured octet must be a real byte value; "127.0.0.999" is not an address.
    return octets !== null && octets.slice(1).every((octet) => Number(octet) <= 255);
}
|
|
188
|
+
/**
 * Reports whether generic jobs may be served in owner-local mode.
 *
 * Generic jobs must be enabled, and the node must satisfy the same loopback
 * requirements as the owner-local node API (loopback listen host and, when a
 * direct base URL is configured, a loopback host in that URL).
 *
 * @param {object} config - Node configuration (`genericJobsEnabled`, `listenHost`, `directBaseUrl`).
 * @returns {boolean} True when generic jobs are enabled and the node is owner-local.
 */
function isOwnerLocalGenericMode(config) {
    if (!config.genericJobsEnabled) {
        return false;
    }
    // The loopback checks are identical to the node API mode; reuse them rather than duplicating.
    return isOwnerLocalNodeApiMode(config);
}
|
|
202
|
+
/**
 * Reports whether the node API is reachable only from the local machine.
 *
 * @param {object} config - Node configuration (`listenHost`, optional `directBaseUrl`).
 * @returns {boolean} True when the listen host is loopback and any configured
 *   direct base URL also points at a loopback host; false if that URL cannot be parsed.
 */
function isOwnerLocalNodeApiMode(config) {
    const listensLocally = isOwnerLocalHost(config.listenHost);
    if (!listensLocally) {
        return false;
    }
    const advertisedUrl = config.directBaseUrl;
    if (!advertisedUrl) {
        // Nothing is advertised, so the loopback bind alone decides.
        return true;
    }
    let advertisedHost;
    try {
        advertisedHost = new URL(advertisedUrl).hostname;
    }
    catch {
        return false;
    }
    return isOwnerLocalHost(advertisedHost);
}
|
|
216
|
+
/**
 * Reports whether the request's Accept header lists `text/event-stream`.
 *
 * @param {object} headers - Request headers, read through `headerText`.
 * @returns {boolean} True when any comma-separated Accept entry starts with `text/event-stream`.
 */
function acceptsGenericEventStream(headers) {
    const acceptHeader = headerText(headers, "accept") || "";
    for (const mediaRange of acceptHeader.split(",")) {
        if (mediaRange.trim().toLowerCase().startsWith("text/event-stream")) {
            return true;
        }
    }
    return false;
}
|
|
152
220
|
/**
 * Writes one JSON-encoded chunk to the stream as a server-sent `data:` frame.
 *
 * @param {{ write: Function }} raw - Writable response stream.
 * @param {unknown} chunk - Value serialized with `JSON.stringify`.
 */
function writeSelfHostedSseChunk(raw, chunk) {
    const serialized = JSON.stringify(chunk);
    const frame = `data: ${serialized}\n\n`;
    raw.write(frame);
}
|
|
155
223
|
/**
 * Writes the `[DONE]` sentinel frame to the stream.
 *
 * @param {{ write: Function }} raw - Writable response stream.
 */
function writeSelfHostedSseDone(raw) {
    const doneFrame = "data: [DONE]\n\n";
    raw.write(doneFrame);
}
|
|
226
|
+
/**
 * Writes a generic job event to the stream as a named server-sent event.
 *
 * The event name comes from `event.type` (falling back to `message`) and the
 * whole event object is sent as the JSON `data:` payload.
 *
 * @param {{ write: Function }} raw - Writable response stream.
 * @param {object} event - Job event; `type` selects the SSE event name.
 */
function writeGenericJobSseEvent(raw, event) {
    // SSE fields are line-delimited: a CR/LF inside the name would end the field early
    // and let the remainder be parsed as extra fields, so collapse line breaks to a space.
    const eventName = String(event.type || "message").replace(/[\r\n]+/g, " ");
    raw.write(`event: ${eventName}\n`);
    raw.write(`data: ${JSON.stringify(event)}\n\n`);
}
|
|
230
|
+
/**
 * Maps a failed generic job result to an HTTP status code.
 *
 * @param {object} result - Execution result with `status` and a nested `result.error?.code`.
 * @returns {number} 400 for `validation_failed`, 408 for `timeout`, 409 when the
 *   status is `cancelled`, otherwise 502.
 */
function genericJobFailureStatusCode(result) {
    const errorCode = result.result.error?.code;
    // The error code takes precedence over the cancelled status.
    if (errorCode === "validation_failed") {
        return 400;
    }
    if (errorCode === "timeout") {
        return 408;
    }
    if (result.status === "cancelled") {
        return 409;
    }
    return 502;
}
|
|
240
|
+
// Upper bound (128 MiB) on the decoded size, in bytes, of a single artifact upload.
const MAX_OWNER_LOCAL_ARTIFACT_UPLOAD_BYTES = 128 * 1024 * 1024;
|
|
241
|
+
/**
 * Normalizes an optional string input.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string|undefined} The trimmed string, or `undefined` when the value is
 *   not a string or is blank.
 */
function optionalString(value) {
    if (typeof value !== "string") {
        return undefined;
    }
    const trimmed = value.trim();
    return trimmed.length > 0 ? trimmed : undefined;
}
|
|
244
|
+
/**
 * Validates and decodes the JSON body of an artifact upload.
 *
 * Expected fields: `path` (validated by `assertMswarmSafeRelativePath`),
 * `content_base64` (required), and optional `name`, `content_type`,
 * `size_bytes`, `sha256`.
 *
 * @param {unknown} body - Parsed request body; anything other than a plain object is treated as empty.
 * @returns {{ name: string, path: string, contentType: (string|undefined), sha256: string, bytes: Buffer }}
 *   Decoded artifact with its computed SHA-256 hex digest.
 * @throws {Error} `content_base64_required`, `content_base64_invalid`,
 *   `artifact_upload_size_limit_exceeded`, `artifact_upload_size_mismatch`, or
 *   `artifact_upload_checksum_mismatch`; path validation errors come from
 *   `assertMswarmSafeRelativePath`.
 */
function decodeArtifactUploadBody(body) {
    const payload = (body && typeof body === "object" && !Array.isArray(body) ? body : {});
    const artifactPath = assertMswarmSafeRelativePath(payload.path, "artifact_path");
    const rawBase64 = optionalString(payload.content_base64);
    if (!rawBase64) {
        throw new Error("content_base64_required");
    }
    const normalizedBase64 = rawBase64.replace(/\s/g, "");
    if (!/^[a-zA-Z0-9+/]+={0,2}$/.test(normalizedBase64) || normalizedBase64.length % 4 === 1) {
        throw new Error("content_base64_invalid");
    }
    // Reject oversized uploads before allocating the decoded buffer: n base64
    // characters (at most two of them padding) decode to at least floor(3n/4) - 2 bytes.
    const minimumDecodedBytes = Math.floor((normalizedBase64.length * 3) / 4) - 2;
    if (minimumDecodedBytes > MAX_OWNER_LOCAL_ARTIFACT_UPLOAD_BYTES) {
        throw new Error("artifact_upload_size_limit_exceeded");
    }
    const bytes = Buffer.from(normalizedBase64, "base64");
    // Exact check; the estimate above is only a lower bound.
    if (bytes.length > MAX_OWNER_LOCAL_ARTIFACT_UPLOAD_BYTES) {
        throw new Error("artifact_upload_size_limit_exceeded");
    }
    if (typeof payload.size_bytes === "number" && Number.isFinite(payload.size_bytes) && payload.size_bytes !== bytes.length) {
        throw new Error("artifact_upload_size_mismatch");
    }
    const sha256 = sha256Hex(bytes);
    const expectedSha = optionalString(payload.sha256);
    // Hex digests are case-insensitive; the computed digest is lowercase.
    if (expectedSha && expectedSha.toLowerCase() !== sha256) {
        throw new Error("artifact_upload_checksum_mismatch");
    }
    return {
        name: optionalString(payload.name) || artifactPath.split("/").pop() || "artifact",
        path: artifactPath,
        contentType: optionalString(payload.content_type),
        sha256,
        bytes
    };
}
|
|
275
|
+
/**
 * Resolves the directory that holds a job's uploaded artifacts.
 *
 * @param {object} config - Node configuration; `artifactStorePath` is the base directory (falls back to `"."`).
 * @param {string} jobId - Job identifier; characters outside `[a-zA-Z0-9_.-]` become `_`.
 * @returns {string} Absolute path of the per-job artifact directory.
 * @throws {Error} When `assertMswarmSafeRelativePath` rejects the sanitized id.
 */
function artifactUploadRoot(config, jobId) {
    const sanitizedId = jobId.replace(/[^a-zA-Z0-9_.-]/g, "_");
    const safeJobId = assertMswarmSafeRelativePath(sanitizedId, "job_id");
    const storeRoot = config.artifactStorePath || ".";
    return resolve(storeRoot, safeJobId);
}
|
|
279
|
+
/**
 * Resolves an artifact's relative path inside its job directory.
 *
 * @param {object} config - Node configuration, passed to `artifactUploadRoot`.
 * @param {string} jobId - Job identifier.
 * @param {string} relativePath - Artifact path relative to the job directory.
 * @returns {string} Absolute target path, equal to or nested under the job directory.
 * @throws {Error} `artifact_path_escape_rejected` when the resolved path leaves the job directory.
 */
function resolveArtifactUploadTarget(config, jobId, relativePath) {
    const jobRoot = artifactUploadRoot(config, jobId);
    const candidate = resolve(jobRoot, relativePath);
    if (candidate === jobRoot) {
        return candidate;
    }
    // Compare against the root plus a separator so a sibling such as "<root>-x" is not accepted.
    const boundary = jobRoot.endsWith(sep) ? jobRoot : `${jobRoot}${sep}`;
    if (candidate.startsWith(boundary)) {
        return candidate;
    }
    throw new Error("artifact_path_escape_rejected");
}
|
|
288
|
+
/**
 * Rejects artifact paths that would traverse, or land on, a symbolic link.
 *
 * Walks `relativePath` one `/`-separated segment at a time beneath `root`,
 * inspecting each existing entry with `lstat`. Segments that do not exist yet are
 * skipped. The final segment is checked for being a symlink as well, so a
 * pre-existing link at the target location is rejected too.
 *
 * @param {string} root - Directory the relative path is resolved against.
 * @param {string} relativePath - `/`-separated path of the artifact under `root`.
 * @returns {Promise<void>} Resolves when no existing segment is a symlink.
 * @throws {Error} `artifact_path_symlink_rejected` for a symlinked segment,
 *   `artifact_path_parent_not_directory` when an existing parent is not a directory,
 *   or the underlying `lstat` error for anything other than `ENOENT`.
 */
async function assertNoArtifactSymlinkSegments(root, relativePath) {
    const segments = relativePath.split("/");
    const leafIndex = segments.length - 1;
    let cursor = root;
    for (let index = 0; index < segments.length; index += 1) {
        cursor = resolve(cursor, segments[index]);
        let info;
        try {
            info = await lstat(cursor);
        }
        catch (error) {
            // A missing entry cannot be a symlink; it may be created later.
            if (error?.code === "ENOENT") {
                continue;
            }
            throw error;
        }
        if (info.isSymbolicLink()) {
            throw new Error("artifact_path_symlink_rejected");
        }
        // Only parents must be directories; the leaf may be an existing regular file.
        if (index !== leafIndex && !info.isDirectory()) {
            throw new Error("artifact_path_parent_not_directory");
        }
    }
}
|
|
309
|
+
/**
 * Computes the SHA-256 digest of the given data.
 *
 * @param {Buffer|string} buffer - Data to hash.
 * @returns {string} Hex-encoded digest.
 */
function sha256Hex(buffer) {
    const hasher = createHash("sha256");
    hasher.update(buffer);
    return hasher.digest("hex");
}
|
|
312
|
+
/**
 * Authorizes a generic job request against the node configuration and bearer token.
 *
 * Checks, in order: generic jobs enabled (404), owner-local mode (403), signing
 * secret configured (503), bearer token present (401), token verifies (401).
 *
 * @param {object} config - Node configuration.
 * @param {object} headers - Request headers the bearer token is extracted from.
 * @returns {{ ok: true, token: string, claims: object } | { ok: false, statusCode: number, payload: object }}
 *   Verified token and claims on success, otherwise the HTTP status and error payload to send.
 */
function verifyOwnerLocalGenericJobRequest(config, headers) {
    // Builds the uniform failure shape shared by every rejection below.
    const reject = (statusCode, error, code, message) => ({
        ok: false,
        statusCode,
        payload: { error, code, message }
    });
    if (!config.genericJobsEnabled) {
        return reject(404, "not_found", "feature_disabled", "Generic node jobs are disabled on this node");
    }
    if (!isOwnerLocalGenericMode(config)) {
        return reject(403, "forbidden", "owner_local_required", "Generic node jobs are only available in owner-local direct mode");
    }
    if (!config.invocationSigningSecret) {
        return reject(503, "service_unavailable", "missing_config", "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for generic jobs");
    }
    const token = extractBearerToken(headers);
    if (!token) {
        return reject(401, "unauthorized", "unauthorized", "Missing generic job token");
    }
    let claims;
    try {
        claims = verifySelfHostedGenericJobToken({
            token,
            secret: config.invocationSigningSecret
        });
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : "Invalid generic job token";
        return reject(401, "unauthorized", "unauthorized", reason);
    }
    return { ok: true, token, claims };
}
|
|
380
|
+
/**
 * Authorizes a generic job operations request against the node configuration and bearer token.
 *
 * Checks, in order: generic jobs enabled (404), owner-local mode (403), signing
 * secret configured (503), bearer token present (401), token verifies (401), and
 * the token's `node_id` matches this node (400).
 *
 * @param {object} config - Node configuration.
 * @param {object} headers - Request headers the bearer token is extracted from.
 * @returns {{ ok: true, claims: object } | { ok: false, statusCode: number, payload: object }}
 *   Verified claims on success, otherwise the HTTP status and error payload to send.
 */
function verifyOwnerLocalGenericJobOpsRequest(config, headers) {
    // Builds the uniform failure shape shared by every rejection below.
    const reject = (statusCode, error, code, message) => ({
        ok: false,
        statusCode,
        payload: { error, code, message }
    });
    if (!config.genericJobsEnabled) {
        return reject(404, "not_found", "feature_disabled", "Generic node jobs are disabled on this node");
    }
    if (!isOwnerLocalGenericMode(config)) {
        return reject(403, "forbidden", "owner_local_required", "Generic node operations are only available in owner-local direct mode");
    }
    if (!config.invocationSigningSecret) {
        return reject(503, "service_unavailable", "missing_config", "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for generic job operations");
    }
    const token = extractBearerToken(headers);
    if (!token) {
        return reject(401, "unauthorized", "unauthorized", "Missing generic job ops token");
    }
    let claims;
    try {
        claims = verifySelfHostedGenericJobOpsToken({
            token,
            secret: config.invocationSigningSecret
        });
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : "Invalid generic job ops token";
        return reject(401, "unauthorized", "unauthorized", reason);
    }
    if (claims.node_id !== config.nodeId) {
        return reject(400, "bad_request", "validation_failed", "generic job ops token does not match this node");
    }
    return { ok: true, claims };
}
|
|
456
|
+
/**
 * Ensures token claims refer to this node and to the requested lifecycle job.
 *
 * @param {string} jobId - Job identifier from the request.
 * @param {object} config - Node configuration providing `nodeId`.
 * @param {object} claims - Token claims providing `node_id` and `job_id`.
 * @throws {Error} When either the node or the job differs.
 */
function assertLifecycleJobIdMatchesClaims(jobId, config, claims) {
    const sameNode = claims.node_id === config.nodeId;
    const sameJob = claims.job_id === jobId;
    if (sameNode && sameJob) {
        return;
    }
    throw new Error("generic job token does not match this node or job");
}
|
|
461
|
+
/**
 * Picks the tenant a generic job belongs to.
 *
 * @param {object} job - Envelope whose `job.metadata.tenant_id` may name the tenant.
 * @returns {string} The trimmed tenant id, or `"owner-local"` when it is absent, blank or not a string.
 */
function tenantIdForGenericJob(job) {
    const declared = job.job.metadata?.tenant_id;
    if (typeof declared === "string") {
        const trimmed = declared.trim();
        if (trimmed) {
            return trimmed;
        }
    }
    return "owner-local";
}
|
|
466
|
+
/**
 * Resolves how many generic jobs may run at once.
 *
 * @param {object} config - Node configuration; `genericJobMaxConcurrency` is the requested limit.
 * @returns {number} The configured limit rounded down, never below 1; 1 when the
 *   setting is missing, non-numeric, non-finite or not positive.
 */
function genericJobMaxConcurrency(config) {
    const configured = config.genericJobMaxConcurrency;
    if (!Number.isFinite(configured) || configured <= 0) {
        return 1;
    }
    // A fractional setting below 1 would floor to 0 and stop all dispatching, so clamp to 1.
    return Math.max(1, Math.floor(configured));
}
|
|
470
|
+
/**
 * Parses a numeric query parameter and clamps it to an upper bound.
 *
 * @param {unknown} value - Raw query value; for an array the last element is used.
 * @param {number} fallback - Returned when the value is missing, unparsable or negative.
 * @param {number} max - Largest value that may be returned for a parsed number.
 * @returns {number} The parsed value rounded down and capped at `max`, or `fallback`.
 */
function clampOpsQueryNumber(value, fallback, max) {
    const candidate = Array.isArray(value) ? value[value.length - 1] : value;
    let numeric = NaN;
    if (typeof candidate === "number") {
        numeric = candidate;
    }
    else if (typeof candidate === "string") {
        numeric = Number.parseInt(candidate, 10);
    }
    const usable = Number.isFinite(numeric) && numeric >= 0;
    return usable ? Math.min(max, Math.floor(numeric)) : fallback;
}
|
|
478
|
+
/**
 * Extracts audit paging options from a request query.
 *
 * Both snake_case (`audit_limit`, `audit_offset`) and camelCase keys are accepted;
 * the snake_case form wins when both are present.
 *
 * @param {unknown} query - Parsed query; anything other than a plain object is treated as empty.
 * @returns {{ auditLimit: number, auditOffset: number }} Limit (default 50, max 250)
 *   and offset (default 0, max 10000).
 */
function opsQueryOptions(query) {
    const isPlainRecord = Boolean(query) && typeof query === "object" && !Array.isArray(query);
    const record = isPlainRecord ? query : {};
    const auditLimit = clampOpsQueryNumber(record.audit_limit ?? record.auditLimit, 50, 250);
    const auditOffset = clampOpsQueryNumber(record.audit_offset ?? record.auditOffset, 0, 10000);
    return { auditLimit, auditOffset };
}
|
|
485
|
+
/**
 * Sums the reported sizes of a job record's artifacts.
 *
 * @param {object} record - Lifecycle record with an optional `artifacts` array.
 * @returns {number} Total of each artifact's `size_bytes`, counting missing sizes as 0.
 */
function artifactBytes(record) {
    let total = 0;
    (record.artifacts || []).forEach((artifact) => {
        total += artifact.size_bytes || 0;
    });
    return total;
}
|
|
488
|
+
/**
 * Measures the UTF-8 size of log messages, optionally for a single stream.
 *
 * @param {Array<object>} logs - Log entries with `stream` and `message`.
 * @param {string} [stream] - When given, only entries from this stream are counted.
 * @returns {number} Total byte length of the counted messages.
 */
function logBytes(logs, stream) {
    let total = 0;
    logs.forEach((log) => {
        if (stream && log.stream !== stream) {
            return;
        }
        total += Buffer.byteLength(log.message || "", "utf8");
    });
    return total;
}
|
|
493
|
+
/**
 * Finds the most recent progress value reported by a job.
 *
 * Scans events from newest to oldest for a `progress` event whose data carries a
 * finite number under `progress_percent`, `percent` or `progress` (in that priority).
 *
 * @param {Iterable<object>} events - Job events in chronological order.
 * @returns {number|undefined} The value clamped to 0-100, or `undefined` when none is found.
 */
function progressPercent(events) {
    const ordered = [...events];
    for (let index = ordered.length - 1; index >= 0; index -= 1) {
        const event = ordered[index];
        const isProgress = event.type === "progress" && Boolean(event.data) && typeof event.data === "object";
        if (!isProgress) {
            continue;
        }
        const payload = event.data;
        const reported = payload.progress_percent ?? payload.percent ?? payload.progress;
        if (typeof reported === "number" && Number.isFinite(reported)) {
            return Math.max(0, Math.min(100, reported));
        }
    }
    return undefined;
}
|
|
506
|
+
/**
 * Computes GPU-seconds consumed by a finished job.
 *
 * @param {object} record - Lifecycle record with `started_at`, `finished_at`, `job`
 *   and an optional `reservation`.
 * @returns {number} Elapsed seconds multiplied by the GPU count (from the reservation,
 *   else the job's resource request, else 1), rounded to three decimals; 0 when the
 *   timestamps are missing, unparsable or not increasing.
 */
function gpuSeconds(record) {
    const startedAt = record.started_at;
    const finishedAt = record.finished_at;
    if (!startedAt || !finishedAt) {
        return 0;
    }
    const startedMs = Date.parse(startedAt);
    const finishedMs = Date.parse(finishedAt);
    const validWindow = Number.isFinite(startedMs) && Number.isFinite(finishedMs) && finishedMs > startedMs;
    if (!validWindow) {
        return 0;
    }
    const requestedGpus = record.reservation?.resources?.gpu_count || record.job.resources?.gpu?.count || 1;
    const gpuCount = Math.max(1, Math.floor(requestedGpus));
    const elapsedSeconds = (finishedMs - startedMs) / 1000;
    // Round to millisecond precision.
    return Math.round(elapsedSeconds * gpuCount * 1000) / 1000;
}
|
|
518
|
+
/**
 * Computes the SHA-256 digest of a token.
 *
 * @param {string} token - Token text to hash.
 * @returns {string} Hex-encoded digest.
 */
function tokenSha256(token) {
    const hasher = createHash("sha256");
    hasher.update(token);
    return hasher.digest("hex");
}
|
|
521
|
+
/**
 * Builds the initial retry policy for a generic job.
 *
 * Reads `job.job.metadata.retry.max_retries`; the value is rounded down and
 * clamped to the range 0-3. Anything that is not a usable number yields 0.
 *
 * @param {object} job - Envelope whose `job.metadata.retry` may request retries.
 * @returns {{ max_retries: number, retry_count: number, retryable_error_codes: string[] }}
 *   Fresh policy with a zero retry count; only `timeout` is retryable.
 */
function lifecycleRetryPolicy(job) {
    const retry = job.job.metadata?.retry;
    const retryRecord = retry && typeof retry === "object" && !Array.isArray(retry) ? retry : null;
    const requested = retryRecord?.max_retries;
    // NaN has typeof "number" and would survive the clamp as NaN, so exclude it explicitly.
    const maxRetries = typeof requested === "number" && !Number.isNaN(requested)
        ? Math.max(0, Math.min(3, Math.floor(requested)))
        : 0;
    return {
        max_retries: maxRetries,
        retry_count: 0,
        retryable_error_codes: ["timeout"]
    };
}
|
|
533
|
+
class OwnerLocalGenericJobLifecycleScheduler {
|
|
534
|
+
constructor(runtime, config) {
|
|
535
|
+
this.runtime = runtime;
|
|
536
|
+
this.config = config;
|
|
537
|
+
this.jobs = new Map();
|
|
538
|
+
this.idempotency = new Map();
|
|
539
|
+
this.dispatching = false;
|
|
540
|
+
}
|
|
541
|
+
create(job, claims, token) {
|
|
542
|
+
const tenantId = tenantIdForGenericJob(job);
|
|
543
|
+
const idempotencyKey = normalizeMswarmGenericJobIdempotencyKey({
|
|
544
|
+
tenantId,
|
|
545
|
+
idempotencyKey: job.job.idempotency_key,
|
|
546
|
+
jobId: job.job_id,
|
|
547
|
+
requestId: job.request_id
|
|
548
|
+
});
|
|
549
|
+
const existingId = this.idempotency.get(idempotencyKey);
|
|
550
|
+
if (existingId) {
|
|
551
|
+
const existing = this.mustGetEntry(existingId);
|
|
552
|
+
if (existing.record.job_id !== job.job_id || existing.record.request_id !== job.request_id) {
|
|
553
|
+
throw new Error("idempotency_key_conflict");
|
|
554
|
+
}
|
|
555
|
+
this.audit(existing, "job_idempotent_reused", { idempotency_key: idempotencyKey });
|
|
556
|
+
return { snapshot: this.snapshot(existing), reused: true };
|
|
557
|
+
}
|
|
558
|
+
if (this.jobs.has(job.job_id)) {
|
|
559
|
+
throw new Error("job_id_conflict");
|
|
560
|
+
}
|
|
561
|
+
const now = new Date().toISOString();
|
|
562
|
+
const entry = {
|
|
563
|
+
claims,
|
|
564
|
+
tokenSha256: tokenSha256(token),
|
|
565
|
+
events: [],
|
|
566
|
+
logs: [],
|
|
567
|
+
audit: [],
|
|
568
|
+
record: {
|
|
569
|
+
schema_version: MSWARM_JOB_LIFECYCLE_SCHEMA_VERSION,
|
|
570
|
+
job_id: job.job_id,
|
|
571
|
+
request_id: job.request_id,
|
|
572
|
+
tenant_id: tenantId,
|
|
573
|
+
node_id: this.config.nodeId,
|
|
574
|
+
state: "queued",
|
|
575
|
+
job: job.job,
|
|
576
|
+
idempotency_key: idempotencyKey,
|
|
577
|
+
created_at: now,
|
|
578
|
+
updated_at: now,
|
|
579
|
+
queued_at: now,
|
|
580
|
+
retry: lifecycleRetryPolicy(job)
|
|
581
|
+
}
|
|
582
|
+
};
|
|
583
|
+
this.jobs.set(job.job_id, entry);
|
|
584
|
+
this.idempotency.set(idempotencyKey, job.job_id);
|
|
585
|
+
this.audit(entry, "job_created", { idempotency_key: idempotencyKey });
|
|
586
|
+
this.audit(entry, "job_queued");
|
|
587
|
+
queueMicrotask(() => {
|
|
588
|
+
void this.dispatchQueued();
|
|
589
|
+
});
|
|
590
|
+
return { snapshot: this.snapshot(entry), reused: false };
|
|
591
|
+
}
|
|
592
|
+
get(jobId) {
|
|
593
|
+
const entry = this.jobs.get(jobId);
|
|
594
|
+
return entry ? this.snapshot(entry) : null;
|
|
595
|
+
}
|
|
596
|
+
async ops(options) {
|
|
597
|
+
const capabilities = await this.runtime.publicCapabilityProjection();
|
|
598
|
+
const entries = Array.from(this.jobs.values());
|
|
599
|
+
const totalsByState = {};
|
|
600
|
+
for (const entry of entries) {
|
|
601
|
+
totalsByState[entry.record.state] = (totalsByState[entry.record.state] || 0) + 1;
|
|
602
|
+
}
|
|
603
|
+
const activeJobs = this.activeEntries().length;
|
|
604
|
+
const queuedJobs = this.queuedEntries().length;
|
|
605
|
+
const terminalJobs = entries.filter((entry) => isMswarmTerminalLifecycleState(entry.record.state)).length;
|
|
606
|
+
const stdoutBytes = entries.reduce((total, entry) => total + logBytes(entry.logs, "stdout"), 0);
|
|
607
|
+
const stderrBytes = entries.reduce((total, entry) => total + logBytes(entry.logs, "stderr"), 0);
|
|
608
|
+
const allAudit = entries
|
|
609
|
+
.flatMap((entry) => entry.audit)
|
|
610
|
+
.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
|
|
611
|
+
const jobs = entries
|
|
612
|
+
.map((entry) => this.opsJobSummary(entry))
|
|
613
|
+
.sort((a, b) => b.updated_at.localeCompare(a.updated_at));
|
|
614
|
+
return {
|
|
615
|
+
schema_version: MSWARM_JOB_LIFECYCLE_SCHEMA_VERSION,
|
|
616
|
+
generated_at: new Date().toISOString(),
|
|
617
|
+
node: {
|
|
618
|
+
node_id: this.config.nodeId,
|
|
619
|
+
listen_host: this.config.listenHost,
|
|
620
|
+
listen_port: this.config.listenPort,
|
|
621
|
+
owner_local: isOwnerLocalGenericMode(this.config),
|
|
622
|
+
generic_jobs_enabled: this.config.genericJobsEnabled,
|
|
623
|
+
artifact_store_configured: Boolean(this.config.artifactStorePath),
|
|
624
|
+
max_concurrent_jobs: genericJobMaxConcurrency(this.config)
|
|
625
|
+
},
|
|
626
|
+
capabilities: capabilities,
|
|
627
|
+
queue: {
|
|
628
|
+
jobs,
|
|
629
|
+
totals_by_state: totalsByState,
|
|
630
|
+
active_jobs: activeJobs,
|
|
631
|
+
queued_jobs: queuedJobs,
|
|
632
|
+
terminal_jobs: terminalJobs
|
|
633
|
+
},
|
|
634
|
+
quota: {
|
|
635
|
+
max_concurrent_jobs: genericJobMaxConcurrency(this.config),
|
|
636
|
+
active_jobs: activeJobs,
|
|
637
|
+
queued_jobs: queuedJobs,
|
|
638
|
+
available_slots: Math.max(0, genericJobMaxConcurrency(this.config) - activeJobs),
|
|
639
|
+
production_enforced: false,
|
|
640
|
+
limits: {
|
|
641
|
+
generic_job_timeout_ms: this.config.genericJobTimeoutMs,
|
|
642
|
+
job_timeout_ms: this.config.jobTimeoutMs,
|
|
643
|
+
request_timeout_ms: this.config.requestTimeoutMs,
|
|
644
|
+
artifact_store_configured: Boolean(this.config.artifactStorePath)
|
|
645
|
+
}
|
|
646
|
+
},
|
|
647
|
+
usage: {
|
|
648
|
+
total_jobs: entries.length,
|
|
649
|
+
active_jobs: activeJobs,
|
|
650
|
+
terminal_jobs: terminalJobs,
|
|
651
|
+
succeeded_jobs: totalsByState.succeeded || 0,
|
|
652
|
+
failed_jobs: totalsByState.failed || 0,
|
|
653
|
+
cancelled_jobs: totalsByState.cancelled || 0,
|
|
654
|
+
blocked_jobs: totalsByState.blocked || 0,
|
|
655
|
+
expired_jobs: totalsByState.expired || 0,
|
|
656
|
+
gpu_seconds: Math.round(entries.reduce((total, entry) => total + gpuSeconds(entry.record), 0) * 1000) / 1000,
|
|
657
|
+
artifact_count: entries.reduce((total, entry) => total + (entry.record.artifacts || []).length, 0),
|
|
658
|
+
artifact_bytes: entries.reduce((total, entry) => total + artifactBytes(entry.record), 0),
|
|
659
|
+
event_count: entries.reduce((total, entry) => total + entry.events.length, 0),
|
|
660
|
+
audit_event_count: allAudit.length,
|
|
661
|
+
stdout_bytes: stdoutBytes,
|
|
662
|
+
stderr_bytes: stderrBytes,
|
|
663
|
+
log_bytes: stdoutBytes + stderrBytes
|
|
664
|
+
},
|
|
665
|
+
audit: {
|
|
666
|
+
total: allAudit.length,
|
|
667
|
+
offset: options.auditOffset,
|
|
668
|
+
limit: options.auditLimit,
|
|
669
|
+
events: allAudit.slice(options.auditOffset, options.auditOffset + options.auditLimit)
|
|
670
|
+
}
|
|
671
|
+
};
|
|
672
|
+
}
|
|
673
|
+
cancel(jobId, claims) {
|
|
674
|
+
const entry = this.mustGetEntry(jobId);
|
|
675
|
+
if (entry.claims.request_id !== claims.request_id ||
|
|
676
|
+
entry.claims.schema_version !== claims.schema_version ||
|
|
677
|
+
entry.claims.job_type !== claims.job_type) {
|
|
678
|
+
throw new Error("generic cancellation token does not match the lifecycle job");
|
|
679
|
+
}
|
|
680
|
+
this.audit(entry, "job_cancel_requested");
|
|
681
|
+
if (isMswarmTerminalLifecycleState(entry.record.state)) {
|
|
682
|
+
return this.snapshot(entry);
|
|
683
|
+
}
|
|
684
|
+
if (entry.controller && !entry.controller.signal.aborted) {
|
|
685
|
+
entry.controller.abort("cancelled");
|
|
686
|
+
return this.snapshot(entry);
|
|
687
|
+
}
|
|
688
|
+
this.transition(entry, "cancelled", {
|
|
689
|
+
finished_at: new Date().toISOString(),
|
|
690
|
+
result: {
|
|
691
|
+
job_id: entry.record.job_id,
|
|
692
|
+
status: "cancelled",
|
|
693
|
+
error: {
|
|
694
|
+
code: "cancelled",
|
|
695
|
+
message: "generic job cancelled before dispatch"
|
|
696
|
+
}
|
|
697
|
+
}
|
|
698
|
+
});
|
|
699
|
+
this.audit(entry, "job_cancelled");
|
|
700
|
+
this.releaseReservation(entry);
|
|
701
|
+
return this.snapshot(entry);
|
|
702
|
+
}
|
|
703
|
+
retry(jobId, claims) {
|
|
704
|
+
const entry = this.mustGetEntry(jobId);
|
|
705
|
+
if (entry.claims.request_id !== claims.request_id ||
|
|
706
|
+
entry.claims.schema_version !== claims.schema_version ||
|
|
707
|
+
entry.claims.job_type !== claims.job_type) {
|
|
708
|
+
throw new Error("generic retry token does not match the lifecycle job");
|
|
709
|
+
}
|
|
710
|
+
if (!isMswarmTerminalLifecycleState(entry.record.state)) {
|
|
711
|
+
throw new Error("job_retry_requires_terminal_state");
|
|
712
|
+
}
|
|
713
|
+
if (entry.record.state === "succeeded") {
|
|
714
|
+
throw new Error("job_retry_not_allowed_for_succeeded_jobs");
|
|
715
|
+
}
|
|
716
|
+
this.releaseReservation(entry);
|
|
717
|
+
const now = new Date().toISOString();
|
|
718
|
+
const retryCount = entry.record.retry.retry_count + 1;
|
|
719
|
+
entry.record = {
|
|
720
|
+
...entry.record,
|
|
721
|
+
state: "queued",
|
|
722
|
+
updated_at: now,
|
|
723
|
+
queued_at: now,
|
|
724
|
+
scheduled_at: undefined,
|
|
725
|
+
started_at: undefined,
|
|
726
|
+
finished_at: undefined,
|
|
727
|
+
reservation: undefined,
|
|
728
|
+
envelope: undefined,
|
|
729
|
+
backpressure: undefined,
|
|
730
|
+
result: undefined,
|
|
731
|
+
artifacts: undefined,
|
|
732
|
+
retry: {
|
|
733
|
+
...entry.record.retry,
|
|
734
|
+
retry_count: retryCount,
|
|
735
|
+
next_retry_at: now
|
|
736
|
+
}
|
|
737
|
+
};
|
|
738
|
+
this.audit(entry, "job_retry_scheduled", { retry_count: retryCount, manual: true });
|
|
739
|
+
queueMicrotask(() => {
|
|
740
|
+
void this.dispatchQueued();
|
|
741
|
+
});
|
|
742
|
+
return this.snapshot(entry);
|
|
743
|
+
}
|
|
744
|
+
async dispatchQueued() {
|
|
745
|
+
if (this.dispatching) {
|
|
746
|
+
return;
|
|
747
|
+
}
|
|
748
|
+
this.dispatching = true;
|
|
749
|
+
try {
|
|
750
|
+
while (this.activeEntries().length < genericJobMaxConcurrency(this.config)) {
|
|
751
|
+
const entry = this.nextDispatchableEntry();
|
|
752
|
+
if (!entry) {
|
|
753
|
+
return;
|
|
754
|
+
}
|
|
755
|
+
const activeTenant = this.activeTenantId();
|
|
756
|
+
if (activeTenant && activeTenant !== entry.record.tenant_id) {
|
|
757
|
+
this.setBackpressure(entry, "tenant_reserved", "Node is reserved for another tenant until active jobs finish.");
|
|
758
|
+
return;
|
|
759
|
+
}
|
|
760
|
+
const capabilityOk = await this.recheckCapabilities(entry);
|
|
761
|
+
if (!capabilityOk) {
|
|
762
|
+
continue;
|
|
763
|
+
}
|
|
764
|
+
this.schedule(entry);
|
|
765
|
+
void this.runScheduled(entry);
|
|
766
|
+
}
|
|
767
|
+
for (const entry of this.queuedEntries()) {
|
|
768
|
+
this.setBackpressure(entry, "node_at_capacity", "Node is at generic job concurrency limit.", 1000);
|
|
769
|
+
}
|
|
770
|
+
}
|
|
771
|
+
finally {
|
|
772
|
+
this.dispatching = false;
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
activeEntries() {
|
|
776
|
+
return Array.from(this.jobs.values()).filter((entry) => entry.record.state === "scheduled" || entry.record.state === "running");
|
|
777
|
+
}
|
|
778
|
+
queuedEntries() {
|
|
779
|
+
return Array.from(this.jobs.values()).filter((entry) => entry.record.state === "queued" || entry.record.state === "retrying");
|
|
780
|
+
}
|
|
781
|
+
nextDispatchableEntry() {
|
|
782
|
+
return this.queuedEntries().sort((a, b) => a.record.created_at.localeCompare(b.record.created_at))[0] || null;
|
|
783
|
+
}
|
|
784
|
+
activeTenantId() {
|
|
785
|
+
const active = this.activeEntries().find((entry) => entry.record.reservation && !entry.record.reservation.released_at);
|
|
786
|
+
return active?.record.tenant_id || null;
|
|
787
|
+
}
|
|
788
|
+
async recheckCapabilities(entry) {
|
|
789
|
+
const snapshot = await this.runtime.probeCapabilities();
|
|
790
|
+
const capabilityMismatch = genericJobCapabilityMismatch(entry.record.job, snapshot);
|
|
791
|
+
if (capabilityMismatch) {
|
|
792
|
+
this.transition(entry, "blocked", {
|
|
793
|
+
finished_at: new Date().toISOString(),
|
|
794
|
+
backpressure: {
|
|
795
|
+
reason: "no_capable_node",
|
|
796
|
+
message: capabilityMismatch.message
|
|
797
|
+
},
|
|
798
|
+
result: {
|
|
799
|
+
job_id: entry.record.job_id,
|
|
800
|
+
status: "failed",
|
|
801
|
+
error: {
|
|
802
|
+
code: capabilityMismatch.code,
|
|
803
|
+
message: capabilityMismatch.message,
|
|
804
|
+
retryable: true
|
|
805
|
+
}
|
|
806
|
+
}
|
|
807
|
+
});
|
|
808
|
+
this.audit(entry, "job_blocked", { reason: capabilityMismatch.code });
|
|
809
|
+
return false;
|
|
810
|
+
}
|
|
811
|
+
return true;
|
|
812
|
+
}
|
|
813
|
+
schedule(entry) {
|
|
814
|
+
const now = new Date().toISOString();
|
|
815
|
+
const reservation = {
|
|
816
|
+
node_id: this.config.nodeId,
|
|
817
|
+
tenant_id: entry.record.tenant_id,
|
|
818
|
+
job_id: entry.record.job_id,
|
|
819
|
+
request_id: entry.record.request_id,
|
|
820
|
+
reserved_at: now,
|
|
821
|
+
resources: {
|
|
822
|
+
...(entry.record.job.resources?.gpu?.count ? { gpu_count: entry.record.job.resources.gpu.count } : {}),
|
|
823
|
+
...(entry.record.job.resources?.cpu?.cores ? { cpu_cores: entry.record.job.resources.cpu.cores } : {}),
|
|
824
|
+
...(entry.record.job.resources?.memory_gb ? { memory_gb: entry.record.job.resources.memory_gb } : {}),
|
|
825
|
+
...(entry.record.job.resources?.disk_gb ? { disk_gb: entry.record.job.resources.disk_gb } : {})
|
|
826
|
+
}
|
|
827
|
+
};
|
|
828
|
+
const expiresAt = new Date(Date.now() + 5 * 60000).toISOString();
|
|
829
|
+
this.transition(entry, "scheduled", {
|
|
830
|
+
node_id: this.config.nodeId,
|
|
831
|
+
scheduled_at: now,
|
|
832
|
+
reservation,
|
|
833
|
+
backpressure: undefined,
|
|
834
|
+
envelope: buildMswarmGenericJobEnvelopeDescriptor({
|
|
835
|
+
jobId: entry.record.job_id,
|
|
836
|
+
requestId: entry.record.request_id,
|
|
837
|
+
nodeId: this.config.nodeId,
|
|
838
|
+
job: entry.record.job,
|
|
839
|
+
issuedAt: now,
|
|
840
|
+
expiresAt,
|
|
841
|
+
tokenSha256: entry.tokenSha256
|
|
842
|
+
})
|
|
843
|
+
});
|
|
844
|
+
this.audit(entry, "reservation_created", { resources: reservation.resources });
|
|
845
|
+
this.audit(entry, "envelope_issued", { expires_at: expiresAt });
|
|
846
|
+
this.audit(entry, "job_scheduled");
|
|
847
|
+
}
|
|
848
|
+
async runScheduled(entry) {
|
|
849
|
+
const controller = new AbortController();
|
|
850
|
+
entry.controller = controller;
|
|
851
|
+
this.transition(entry, "running", {
|
|
852
|
+
started_at: new Date().toISOString()
|
|
853
|
+
});
|
|
854
|
+
this.audit(entry, "job_started");
|
|
855
|
+
const envelope = {
|
|
856
|
+
job_id: entry.record.job_id,
|
|
857
|
+
request_id: entry.record.request_id,
|
|
858
|
+
node_id: this.config.nodeId,
|
|
859
|
+
job: entry.record.job
|
|
860
|
+
};
|
|
861
|
+
const result = await this.runtime.executeGenericJob(envelope, {
|
|
862
|
+
signal: controller.signal,
|
|
863
|
+
onEvent: async (event) => {
|
|
864
|
+
this.recordEvent(entry, event);
|
|
865
|
+
}
|
|
866
|
+
});
|
|
867
|
+
entry.controller = undefined;
|
|
868
|
+
if (result.status === "failed" && this.shouldRetry(entry, result.result.error?.code)) {
|
|
869
|
+
this.scheduleRetry(entry, result.result);
|
|
870
|
+
await this.dispatchQueued();
|
|
871
|
+
return;
|
|
872
|
+
}
|
|
873
|
+
const terminalState = result.status === "succeeded" ? "succeeded" : result.status === "cancelled" ? "cancelled" : "failed";
|
|
874
|
+
this.transition(entry, terminalState, {
|
|
875
|
+
finished_at: new Date().toISOString(),
|
|
876
|
+
result: result.result,
|
|
877
|
+
artifacts: result.result.artifacts || []
|
|
878
|
+
});
|
|
879
|
+
this.audit(entry, terminalState === "cancelled" ? "job_cancelled" : "job_completed", { status: terminalState });
|
|
880
|
+
this.releaseReservation(entry);
|
|
881
|
+
await this.dispatchQueued();
|
|
882
|
+
}
|
|
883
|
+
shouldRetry(entry, errorCode) {
|
|
884
|
+
if (!errorCode || entry.record.retry.retry_count >= entry.record.retry.max_retries) {
|
|
885
|
+
return false;
|
|
886
|
+
}
|
|
887
|
+
const retryable = entry.record.retry.retryable_error_codes || [];
|
|
888
|
+
return retryable.includes(errorCode);
|
|
889
|
+
}
|
|
890
|
+
scheduleRetry(entry, result) {
|
|
891
|
+
const retryCount = entry.record.retry.retry_count + 1;
|
|
892
|
+
const nextRetryAt = new Date().toISOString();
|
|
893
|
+
this.transition(entry, "retrying", {
|
|
894
|
+
finished_at: new Date().toISOString(),
|
|
895
|
+
result,
|
|
896
|
+
retry: {
|
|
897
|
+
...entry.record.retry,
|
|
898
|
+
retry_count: retryCount,
|
|
899
|
+
next_retry_at: nextRetryAt
|
|
900
|
+
}
|
|
901
|
+
});
|
|
902
|
+
this.audit(entry, "job_retry_scheduled", { retry_count: retryCount, next_retry_at: nextRetryAt });
|
|
903
|
+
this.releaseReservation(entry);
|
|
904
|
+
this.transition(entry, "queued", {
|
|
905
|
+
queued_at: nextRetryAt,
|
|
906
|
+
scheduled_at: undefined,
|
|
907
|
+
started_at: undefined,
|
|
908
|
+
finished_at: undefined,
|
|
909
|
+
reservation: undefined,
|
|
910
|
+
envelope: undefined,
|
|
911
|
+
backpressure: undefined
|
|
912
|
+
});
|
|
913
|
+
}
|
|
914
|
+
recordEvent(entry, event) {
|
|
915
|
+
entry.events.push(event);
|
|
916
|
+
if (event.type === "stdout" || event.type === "stderr") {
|
|
917
|
+
entry.logs.push({
|
|
918
|
+
job_id: event.job_id,
|
|
919
|
+
sequence: event.sequence,
|
|
920
|
+
timestamp: event.timestamp,
|
|
921
|
+
stream: event.type,
|
|
922
|
+
message: event.message || "",
|
|
923
|
+
truncated: false
|
|
924
|
+
});
|
|
925
|
+
}
|
|
926
|
+
this.audit(entry, "job_event_recorded", { type: event.type, sequence: event.sequence });
|
|
927
|
+
}
|
|
928
|
+
releaseReservation(entry) {
|
|
929
|
+
if (!entry.record.reservation || entry.record.reservation.released_at) {
|
|
930
|
+
return;
|
|
931
|
+
}
|
|
932
|
+
entry.record.reservation = {
|
|
933
|
+
...entry.record.reservation,
|
|
934
|
+
released_at: new Date().toISOString()
|
|
935
|
+
};
|
|
936
|
+
this.audit(entry, "reservation_released");
|
|
937
|
+
}
|
|
938
|
+
setBackpressure(entry, reason, message, retryAfterMs) {
|
|
939
|
+
entry.record.backpressure = {
|
|
940
|
+
reason,
|
|
941
|
+
message,
|
|
942
|
+
...(retryAfterMs ? { retry_after_ms: retryAfterMs } : {})
|
|
943
|
+
};
|
|
944
|
+
entry.record.updated_at = new Date().toISOString();
|
|
945
|
+
}
|
|
946
|
+
transition(entry, state, patch = {}) {
|
|
947
|
+
if (!isMswarmLifecycleStateTransitionAllowed(entry.record.state, state)) {
|
|
948
|
+
throw new Error(`invalid lifecycle transition from ${entry.record.state} to ${state}`);
|
|
949
|
+
}
|
|
950
|
+
entry.record = {
|
|
951
|
+
...entry.record,
|
|
952
|
+
...patch,
|
|
953
|
+
state,
|
|
954
|
+
updated_at: new Date().toISOString()
|
|
955
|
+
};
|
|
956
|
+
}
|
|
957
|
+
audit(entry, action, details) {
|
|
958
|
+
entry.audit.push(buildMswarmGenericJobAuditEvent({
|
|
959
|
+
auditId: `audit_${randomUUID()}`,
|
|
960
|
+
jobId: entry.record.job_id,
|
|
961
|
+
requestId: entry.record.request_id,
|
|
962
|
+
tenantId: entry.record.tenant_id,
|
|
963
|
+
nodeId: this.config.nodeId,
|
|
964
|
+
action,
|
|
965
|
+
timestamp: new Date().toISOString(),
|
|
966
|
+
details
|
|
967
|
+
}));
|
|
968
|
+
}
|
|
969
|
+
snapshot(entry) {
|
|
970
|
+
return {
|
|
971
|
+
job: entry.record,
|
|
972
|
+
events: [...entry.events],
|
|
973
|
+
logs: [...entry.logs],
|
|
974
|
+
artifacts: [...(entry.record.artifacts || [])],
|
|
975
|
+
audit: [...entry.audit]
|
|
976
|
+
};
|
|
977
|
+
}
|
|
978
|
+
opsJobSummary(entry) {
|
|
979
|
+
const lastEvent = entry.events[entry.events.length - 1];
|
|
980
|
+
return {
|
|
981
|
+
job_id: entry.record.job_id,
|
|
982
|
+
request_id: entry.record.request_id,
|
|
983
|
+
tenant_id: entry.record.tenant_id,
|
|
984
|
+
node_id: entry.record.node_id,
|
|
985
|
+
state: entry.record.state,
|
|
986
|
+
job_type: entry.record.job.job_type,
|
|
987
|
+
schema_version: entry.record.job.schema_version,
|
|
988
|
+
created_at: entry.record.created_at,
|
|
989
|
+
updated_at: entry.record.updated_at,
|
|
990
|
+
queued_at: entry.record.queued_at,
|
|
991
|
+
scheduled_at: entry.record.scheduled_at,
|
|
992
|
+
started_at: entry.record.started_at,
|
|
993
|
+
finished_at: entry.record.finished_at,
|
|
994
|
+
retry_count: entry.record.retry.retry_count,
|
|
995
|
+
max_retries: entry.record.retry.max_retries,
|
|
996
|
+
progress_percent: progressPercent(entry.events),
|
|
997
|
+
last_event_type: lastEvent?.type,
|
|
998
|
+
last_event_message: lastEvent?.message,
|
|
999
|
+
artifact_count: (entry.record.artifacts || []).length,
|
|
1000
|
+
artifact_bytes: artifactBytes(entry.record),
|
|
1001
|
+
log_bytes: logBytes(entry.logs)
|
|
1002
|
+
};
|
|
1003
|
+
}
|
|
1004
|
+
mustGetEntry(jobId) {
|
|
1005
|
+
const entry = this.jobs.get(jobId);
|
|
1006
|
+
if (!entry) {
|
|
1007
|
+
throw new Error("job_not_found");
|
|
1008
|
+
}
|
|
1009
|
+
return entry;
|
|
1010
|
+
}
|
|
1011
|
+
}
|
|
158
1012
|
export function buildSelfHostedNodeApp(runtime, config) {
|
|
159
1013
|
const app = Fastify({ logger: false });
|
|
1014
|
+
const activeGenericJobs = new Map();
|
|
1015
|
+
const lifecycle = new OwnerLocalGenericJobLifecycleScheduler(runtime, config);
|
|
160
1016
|
app.get("/healthz", async (_request, reply) => {
|
|
161
1017
|
reply.send({ service: "mswarm-self-hosted-node", status: "ok", node_id: config.nodeId });
|
|
162
1018
|
});
|
|
1019
|
+
app.get("/v1/swarm/self-hosted/node/capabilities", async (request, reply) => {
|
|
1020
|
+
if (!isOwnerLocalNodeApiMode(config)) {
|
|
1021
|
+
reply.status(403).send({
|
|
1022
|
+
error: "forbidden",
|
|
1023
|
+
code: "owner_local_required",
|
|
1024
|
+
message: "Node capabilities are only available in owner-local direct mode"
|
|
1025
|
+
});
|
|
1026
|
+
return;
|
|
1027
|
+
}
|
|
1028
|
+
if (!config.invocationSigningSecret) {
|
|
1029
|
+
reply.status(503).send({
|
|
1030
|
+
error: "service_unavailable",
|
|
1031
|
+
code: "missing_config",
|
|
1032
|
+
message: "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for capability reads"
|
|
1033
|
+
});
|
|
1034
|
+
return;
|
|
1035
|
+
}
|
|
1036
|
+
const token = extractBearerToken(request.headers);
|
|
1037
|
+
if (!token) {
|
|
1038
|
+
reply.status(401).send({
|
|
1039
|
+
error: "unauthorized",
|
|
1040
|
+
code: "unauthorized",
|
|
1041
|
+
message: "Missing capability token"
|
|
1042
|
+
});
|
|
1043
|
+
return;
|
|
1044
|
+
}
|
|
1045
|
+
let claims;
|
|
1046
|
+
try {
|
|
1047
|
+
claims = verifySelfHostedCapabilityToken({
|
|
1048
|
+
token,
|
|
1049
|
+
secret: config.invocationSigningSecret
|
|
1050
|
+
});
|
|
1051
|
+
}
|
|
1052
|
+
catch (error) {
|
|
1053
|
+
reply.status(401).send({
|
|
1054
|
+
error: "unauthorized",
|
|
1055
|
+
code: "unauthorized",
|
|
1056
|
+
message: error instanceof Error ? error.message : "Invalid capability token"
|
|
1057
|
+
});
|
|
1058
|
+
return;
|
|
1059
|
+
}
|
|
1060
|
+
if (claims.node_id !== config.nodeId) {
|
|
1061
|
+
reply.status(400).send({
|
|
1062
|
+
error: "bad_request",
|
|
1063
|
+
code: "validation_failed",
|
|
1064
|
+
message: "capability token node_id does not match this node"
|
|
1065
|
+
});
|
|
1066
|
+
return;
|
|
1067
|
+
}
|
|
1068
|
+
reply.send(await runtime.publicCapabilityProjection());
|
|
1069
|
+
});
|
|
163
1070
|
app.post("/v1/swarm/self-hosted/node/jobs", async (request, reply) => {
|
|
164
1071
|
if (!config.invocationSigningSecret) {
|
|
165
1072
|
reply.status(503).send({
|
|
@@ -262,6 +1169,496 @@ export function buildSelfHostedNodeApp(runtime, config) {
|
|
|
262
1169
|
}
|
|
263
1170
|
reply.send(result);
|
|
264
1171
|
});
|
|
1172
|
+
app.post("/v1/swarm/self-hosted/node/generic-jobs", async (request, reply) => {
|
|
1173
|
+
if (!config.genericJobsEnabled) {
|
|
1174
|
+
reply.status(404).send({
|
|
1175
|
+
error: "not_found",
|
|
1176
|
+
code: "feature_disabled",
|
|
1177
|
+
message: "Generic node jobs are disabled on this node"
|
|
1178
|
+
});
|
|
1179
|
+
return;
|
|
1180
|
+
}
|
|
1181
|
+
if (!isOwnerLocalGenericMode(config)) {
|
|
1182
|
+
reply.status(403).send({
|
|
1183
|
+
error: "forbidden",
|
|
1184
|
+
code: "owner_local_required",
|
|
1185
|
+
message: "Generic node jobs are only available in owner-local direct mode"
|
|
1186
|
+
});
|
|
1187
|
+
return;
|
|
1188
|
+
}
|
|
1189
|
+
if (!config.invocationSigningSecret) {
|
|
1190
|
+
reply.status(503).send({
|
|
1191
|
+
error: "service_unavailable",
|
|
1192
|
+
code: "missing_config",
|
|
1193
|
+
message: "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for generic jobs"
|
|
1194
|
+
});
|
|
1195
|
+
return;
|
|
1196
|
+
}
|
|
1197
|
+
const token = extractBearerToken(request.headers);
|
|
1198
|
+
if (!token) {
|
|
1199
|
+
reply.status(401).send({
|
|
1200
|
+
error: "unauthorized",
|
|
1201
|
+
code: "unauthorized",
|
|
1202
|
+
message: "Missing generic job token"
|
|
1203
|
+
});
|
|
1204
|
+
return;
|
|
1205
|
+
}
|
|
1206
|
+
let claims;
|
|
1207
|
+
try {
|
|
1208
|
+
claims = verifySelfHostedGenericJobToken({
|
|
1209
|
+
token,
|
|
1210
|
+
secret: config.invocationSigningSecret
|
|
1211
|
+
});
|
|
1212
|
+
}
|
|
1213
|
+
catch (error) {
|
|
1214
|
+
reply.status(401).send({
|
|
1215
|
+
error: "unauthorized",
|
|
1216
|
+
code: "unauthorized",
|
|
1217
|
+
message: error instanceof Error ? error.message : "Invalid generic job token"
|
|
1218
|
+
});
|
|
1219
|
+
return;
|
|
1220
|
+
}
|
|
1221
|
+
const job = request.body;
|
|
1222
|
+
try {
|
|
1223
|
+
assertGenericJobMatchesClaims(job, claims);
|
|
1224
|
+
}
|
|
1225
|
+
catch (error) {
|
|
1226
|
+
reply.status(400).send({
|
|
1227
|
+
error: "bad_request",
|
|
1228
|
+
code: "validation_failed",
|
|
1229
|
+
message: error instanceof Error ? error.message : "Invalid generic job"
|
|
1230
|
+
});
|
|
1231
|
+
return;
|
|
1232
|
+
}
|
|
1233
|
+
const wantsEventStream = acceptsGenericEventStream(request.headers);
|
|
1234
|
+
if (activeGenericJobs.has(job.job_id)) {
|
|
1235
|
+
reply.status(409).send({
|
|
1236
|
+
error: "conflict",
|
|
1237
|
+
code: "job_already_running",
|
|
1238
|
+
message: "Generic job is already running on this node"
|
|
1239
|
+
});
|
|
1240
|
+
return;
|
|
1241
|
+
}
|
|
1242
|
+
const abortController = new AbortController();
|
|
1243
|
+
activeGenericJobs.set(job.job_id, { controller: abortController, claims });
|
|
1244
|
+
if (wantsEventStream) {
|
|
1245
|
+
reply.hijack();
|
|
1246
|
+
reply.raw.writeHead(200, {
|
|
1247
|
+
"content-type": "text/event-stream; charset=utf-8",
|
|
1248
|
+
"cache-control": "no-cache, no-transform",
|
|
1249
|
+
connection: "keep-alive",
|
|
1250
|
+
"x-accel-buffering": "no"
|
|
1251
|
+
});
|
|
1252
|
+
const onClose = () => {
|
|
1253
|
+
if (!abortController.signal.aborted) {
|
|
1254
|
+
abortController.abort("cancelled");
|
|
1255
|
+
}
|
|
1256
|
+
};
|
|
1257
|
+
reply.raw.once("close", onClose);
|
|
1258
|
+
const keepAlive = setInterval(() => {
|
|
1259
|
+
if (!reply.raw.destroyed && !reply.raw.writableEnded) {
|
|
1260
|
+
reply.raw.write(": keep-alive\n\n");
|
|
1261
|
+
}
|
|
1262
|
+
}, 15000);
|
|
1263
|
+
try {
|
|
1264
|
+
await runtime.executeGenericJob(job, {
|
|
1265
|
+
signal: abortController.signal,
|
|
1266
|
+
onEvent: async (event) => {
|
|
1267
|
+
writeGenericJobSseEvent(reply.raw, { ...event });
|
|
1268
|
+
}
|
|
1269
|
+
});
|
|
1270
|
+
writeSelfHostedSseDone(reply.raw);
|
|
1271
|
+
}
|
|
1272
|
+
catch (error) {
|
|
1273
|
+
writeGenericJobSseEvent(reply.raw, {
|
|
1274
|
+
job_id: job.job_id,
|
|
1275
|
+
type: "failed",
|
|
1276
|
+
sequence: 0,
|
|
1277
|
+
timestamp: new Date().toISOString(),
|
|
1278
|
+
message: error instanceof Error ? error.message : String(error),
|
|
1279
|
+
data: { code: "upstream_error" }
|
|
1280
|
+
});
|
|
1281
|
+
writeSelfHostedSseDone(reply.raw);
|
|
1282
|
+
}
|
|
1283
|
+
finally {
|
|
1284
|
+
clearInterval(keepAlive);
|
|
1285
|
+
reply.raw.removeListener("close", onClose);
|
|
1286
|
+
if (activeGenericJobs.get(job.job_id)?.controller === abortController) {
|
|
1287
|
+
activeGenericJobs.delete(job.job_id);
|
|
1288
|
+
}
|
|
1289
|
+
if (!reply.raw.destroyed && !reply.raw.writableEnded) {
|
|
1290
|
+
reply.raw.end();
|
|
1291
|
+
}
|
|
1292
|
+
}
|
|
1293
|
+
return;
|
|
1294
|
+
}
|
|
1295
|
+
const result = await runtime.executeGenericJob(job, { signal: abortController.signal }).finally(() => {
|
|
1296
|
+
if (activeGenericJobs.get(job.job_id)?.controller === abortController) {
|
|
1297
|
+
activeGenericJobs.delete(job.job_id);
|
|
1298
|
+
}
|
|
1299
|
+
});
|
|
1300
|
+
if (result.status === "succeeded") {
|
|
1301
|
+
reply.send(result);
|
|
1302
|
+
return;
|
|
1303
|
+
}
|
|
1304
|
+
reply.status(genericJobFailureStatusCode(result)).send(result);
|
|
1305
|
+
});
|
|
1306
|
+
app.post("/v1/swarm/self-hosted/node/generic-jobs/:job_id/cancel", async (request, reply) => {
|
|
1307
|
+
if (!config.genericJobsEnabled) {
|
|
1308
|
+
reply.status(404).send({
|
|
1309
|
+
error: "not_found",
|
|
1310
|
+
code: "feature_disabled",
|
|
1311
|
+
message: "Generic node jobs are disabled on this node"
|
|
1312
|
+
});
|
|
1313
|
+
return;
|
|
1314
|
+
}
|
|
1315
|
+
if (!isOwnerLocalGenericMode(config)) {
|
|
1316
|
+
reply.status(403).send({
|
|
1317
|
+
error: "forbidden",
|
|
1318
|
+
code: "owner_local_required",
|
|
1319
|
+
message: "Generic node jobs are only available in owner-local direct mode"
|
|
1320
|
+
});
|
|
1321
|
+
return;
|
|
1322
|
+
}
|
|
1323
|
+
if (!config.invocationSigningSecret) {
|
|
1324
|
+
reply.status(503).send({
|
|
1325
|
+
error: "service_unavailable",
|
|
1326
|
+
code: "missing_config",
|
|
1327
|
+
message: "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for generic jobs"
|
|
1328
|
+
});
|
|
1329
|
+
return;
|
|
1330
|
+
}
|
|
1331
|
+
const token = extractBearerToken(request.headers);
|
|
1332
|
+
if (!token) {
|
|
1333
|
+
reply.status(401).send({
|
|
1334
|
+
error: "unauthorized",
|
|
1335
|
+
code: "unauthorized",
|
|
1336
|
+
message: "Missing generic job token"
|
|
1337
|
+
});
|
|
1338
|
+
return;
|
|
1339
|
+
}
|
|
1340
|
+
let claims;
|
|
1341
|
+
try {
|
|
1342
|
+
claims = verifySelfHostedGenericJobToken({
|
|
1343
|
+
token,
|
|
1344
|
+
secret: config.invocationSigningSecret
|
|
1345
|
+
});
|
|
1346
|
+
}
|
|
1347
|
+
catch (error) {
|
|
1348
|
+
reply.status(401).send({
|
|
1349
|
+
error: "unauthorized",
|
|
1350
|
+
code: "unauthorized",
|
|
1351
|
+
message: error instanceof Error ? error.message : "Invalid generic job token"
|
|
1352
|
+
});
|
|
1353
|
+
return;
|
|
1354
|
+
}
|
|
1355
|
+
const jobId = String(request.params.job_id || "").trim();
|
|
1356
|
+
if (!jobId) {
|
|
1357
|
+
reply.status(400).send({
|
|
1358
|
+
error: "bad_request",
|
|
1359
|
+
code: "validation_failed",
|
|
1360
|
+
message: "generic job_id is required"
|
|
1361
|
+
});
|
|
1362
|
+
return;
|
|
1363
|
+
}
|
|
1364
|
+
if (claims.node_id !== config.nodeId || claims.job_id !== jobId) {
|
|
1365
|
+
reply.status(400).send({
|
|
1366
|
+
error: "bad_request",
|
|
1367
|
+
code: "validation_failed",
|
|
1368
|
+
message: "generic cancellation token does not match this node or job"
|
|
1369
|
+
});
|
|
1370
|
+
return;
|
|
1371
|
+
}
|
|
1372
|
+
const activeJob = activeGenericJobs.get(jobId);
|
|
1373
|
+
if (!activeJob) {
|
|
1374
|
+
reply.status(404).send({
|
|
1375
|
+
error: "not_found",
|
|
1376
|
+
code: "job_not_running",
|
|
1377
|
+
message: "Generic job is not running on this node"
|
|
1378
|
+
});
|
|
1379
|
+
return;
|
|
1380
|
+
}
|
|
1381
|
+
if (activeJob.claims.request_id !== claims.request_id ||
|
|
1382
|
+
activeJob.claims.schema_version !== claims.schema_version ||
|
|
1383
|
+
activeJob.claims.job_type !== claims.job_type) {
|
|
1384
|
+
reply.status(400).send({
|
|
1385
|
+
error: "bad_request",
|
|
1386
|
+
code: "validation_failed",
|
|
1387
|
+
message: "generic cancellation token does not match the active request"
|
|
1388
|
+
});
|
|
1389
|
+
return;
|
|
1390
|
+
}
|
|
1391
|
+
if (!activeJob.controller.signal.aborted) {
|
|
1392
|
+
activeJob.controller.abort("cancelled");
|
|
1393
|
+
}
|
|
1394
|
+
reply.status(202).send({
|
|
1395
|
+
job_id: jobId,
|
|
1396
|
+
request_id: activeJob.claims.request_id,
|
|
1397
|
+
status: "cancelling"
|
|
1398
|
+
});
|
|
1399
|
+
});
|
|
1400
|
+
app.post("/v1/swarm/self-hosted/node/generic-job-control/jobs", async (request, reply) => {
|
|
1401
|
+
const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
|
|
1402
|
+
if (!auth.ok) {
|
|
1403
|
+
reply.status(auth.statusCode).send(auth.payload);
|
|
1404
|
+
return;
|
|
1405
|
+
}
|
|
1406
|
+
const job = request.body;
|
|
1407
|
+
try {
|
|
1408
|
+
assertGenericJobMatchesClaims(job, auth.claims);
|
|
1409
|
+
assertLifecycleJobIdMatchesClaims(job.job_id, config, auth.claims);
|
|
1410
|
+
const result = lifecycle.create(job, auth.claims, auth.token);
|
|
1411
|
+
reply.status(result.reused ? 200 : 202).send(result.snapshot);
|
|
1412
|
+
}
|
|
1413
|
+
catch (error) {
|
|
1414
|
+
const message = error instanceof Error ? error.message : "Invalid generic lifecycle job";
|
|
1415
|
+
reply.status(message.includes("conflict") ? 409 : 400).send({
|
|
1416
|
+
error: message.includes("conflict") ? "conflict" : "bad_request",
|
|
1417
|
+
code: message.includes("conflict") ? message : "validation_failed",
|
|
1418
|
+
message
|
|
1419
|
+
});
|
|
1420
|
+
}
|
|
1421
|
+
});
|
|
1422
|
+
// Upload one input artifact for a lifecycle job.
//
// After authenticating the request and checking that the token is scoped to
// this job, the body is decoded, the target path is resolved under the job's
// upload root, parent directories are created, and the path is checked for
// symlink segments. The file is then created exclusively: an existing target
// is never overwritten and is reported as `artifact_upload_target_exists`.
// Any failure inside the try block is answered with 400 `validation_failed`.
app.post("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/artifacts", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
        const upload = decodeArtifactUploadBody(request.body);
        const root = artifactUploadRoot(config, jobId);
        const target = resolveArtifactUploadTarget(config, jobId, upload.path);
        await mkdir(dirname(target), { recursive: true });
        await assertNoArtifactSymlinkSegments(root, upload.path);
        try {
            // "wx" creates the file exclusively, so the existence check and the
            // write are one atomic step. A separate lstat probe followed by a
            // plain write leaves a window in which a file or symlink could be
            // planted at the target and then overwritten or followed.
            await writeFile(target, upload.bytes, { mode: 0o600, flag: "wx" });
        }
        catch (error) {
            if (error?.code === "EEXIST") {
                throw new Error("artifact_upload_target_exists");
            }
            throw error;
        }
        reply.status(201).send({
            job_id: jobId,
            artifact: {
                id: `upload_${upload.sha256.slice(0, 16)}`,
                uri: buildMswarmLocalArtifactUri(jobId, upload.path),
                name: upload.name,
                content_type: upload.contentType,
                size_bytes: upload.bytes.length,
                sha256: upload.sha256,
                scope: "input",
                access: defaultMswarmArtifactAccessPolicy("owner-local"),
                retention: defaultMswarmArtifactRetentionPolicy()
            }
        });
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle artifact upload";
        reply.status(400).send({
            error: "bad_request",
            code: "validation_failed",
            message
        });
    }
});
|
|
1469
|
+
app.get("/v1/swarm/self-hosted/node/generic-job-control/ops", async (request, reply) => {
|
|
1470
|
+
const auth = verifyOwnerLocalGenericJobOpsRequest(config, request.headers);
|
|
1471
|
+
if (!auth.ok) {
|
|
1472
|
+
reply.status(auth.statusCode).send(auth.payload);
|
|
1473
|
+
return;
|
|
1474
|
+
}
|
|
1475
|
+
reply.send(await lifecycle.ops(opsQueryOptions(request.query)));
|
|
1476
|
+
});
|
|
1477
|
+
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id", async (request, reply) => {
|
|
1478
|
+
const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
|
|
1479
|
+
if (!auth.ok) {
|
|
1480
|
+
reply.status(auth.statusCode).send(auth.payload);
|
|
1481
|
+
return;
|
|
1482
|
+
}
|
|
1483
|
+
const jobId = String(request.params.job_id || "").trim();
|
|
1484
|
+
try {
|
|
1485
|
+
assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
|
|
1486
|
+
}
|
|
1487
|
+
catch (error) {
|
|
1488
|
+
reply.status(400).send({
|
|
1489
|
+
error: "bad_request",
|
|
1490
|
+
code: "validation_failed",
|
|
1491
|
+
message: error instanceof Error ? error.message : "Invalid generic lifecycle job token"
|
|
1492
|
+
});
|
|
1493
|
+
return;
|
|
1494
|
+
}
|
|
1495
|
+
const snapshot = lifecycle.get(jobId);
|
|
1496
|
+
if (!snapshot) {
|
|
1497
|
+
reply.status(404).send({
|
|
1498
|
+
error: "not_found",
|
|
1499
|
+
code: "job_not_found",
|
|
1500
|
+
message: "Generic lifecycle job was not found"
|
|
1501
|
+
});
|
|
1502
|
+
return;
|
|
1503
|
+
}
|
|
1504
|
+
reply.send(snapshot);
|
|
1505
|
+
});
|
|
1506
|
+
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/events", async (request, reply) => {
|
|
1507
|
+
const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
|
|
1508
|
+
if (!auth.ok) {
|
|
1509
|
+
reply.status(auth.statusCode).send(auth.payload);
|
|
1510
|
+
return;
|
|
1511
|
+
}
|
|
1512
|
+
const jobId = String(request.params.job_id || "").trim();
|
|
1513
|
+
try {
|
|
1514
|
+
assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
|
|
1515
|
+
}
|
|
1516
|
+
catch (error) {
|
|
1517
|
+
reply.status(400).send({
|
|
1518
|
+
error: "bad_request",
|
|
1519
|
+
code: "validation_failed",
|
|
1520
|
+
message: error instanceof Error ? error.message : "Invalid generic lifecycle job token"
|
|
1521
|
+
});
|
|
1522
|
+
return;
|
|
1523
|
+
}
|
|
1524
|
+
const snapshot = lifecycle.get(jobId);
|
|
1525
|
+
if (!snapshot) {
|
|
1526
|
+
reply.status(404).send({
|
|
1527
|
+
error: "not_found",
|
|
1528
|
+
code: "job_not_found",
|
|
1529
|
+
message: "Generic lifecycle job was not found"
|
|
1530
|
+
});
|
|
1531
|
+
return;
|
|
1532
|
+
}
|
|
1533
|
+
reply.send({ job_id: jobId, events: snapshot.events });
|
|
1534
|
+
});
|
|
1535
|
+
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/logs", async (request, reply) => {
|
|
1536
|
+
const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
|
|
1537
|
+
if (!auth.ok) {
|
|
1538
|
+
reply.status(auth.statusCode).send(auth.payload);
|
|
1539
|
+
return;
|
|
1540
|
+
}
|
|
1541
|
+
const jobId = String(request.params.job_id || "").trim();
|
|
1542
|
+
try {
|
|
1543
|
+
assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
|
|
1544
|
+
}
|
|
1545
|
+
catch (error) {
|
|
1546
|
+
reply.status(400).send({
|
|
1547
|
+
error: "bad_request",
|
|
1548
|
+
code: "validation_failed",
|
|
1549
|
+
message: error instanceof Error ? error.message : "Invalid generic lifecycle job token"
|
|
1550
|
+
});
|
|
1551
|
+
return;
|
|
1552
|
+
}
|
|
1553
|
+
const snapshot = lifecycle.get(jobId);
|
|
1554
|
+
if (!snapshot) {
|
|
1555
|
+
reply.status(404).send({
|
|
1556
|
+
error: "not_found",
|
|
1557
|
+
code: "job_not_found",
|
|
1558
|
+
message: "Generic lifecycle job was not found"
|
|
1559
|
+
});
|
|
1560
|
+
return;
|
|
1561
|
+
}
|
|
1562
|
+
reply.send({ job_id: jobId, logs: snapshot.logs });
|
|
1563
|
+
});
|
|
1564
|
+
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/artifacts", async (request, reply) => {
|
|
1565
|
+
const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
|
|
1566
|
+
if (!auth.ok) {
|
|
1567
|
+
reply.status(auth.statusCode).send(auth.payload);
|
|
1568
|
+
return;
|
|
1569
|
+
}
|
|
1570
|
+
const jobId = String(request.params.job_id || "").trim();
|
|
1571
|
+
try {
|
|
1572
|
+
assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
|
|
1573
|
+
}
|
|
1574
|
+
catch (error) {
|
|
1575
|
+
reply.status(400).send({
|
|
1576
|
+
error: "bad_request",
|
|
1577
|
+
code: "validation_failed",
|
|
1578
|
+
message: error instanceof Error ? error.message : "Invalid generic lifecycle job token"
|
|
1579
|
+
});
|
|
1580
|
+
return;
|
|
1581
|
+
}
|
|
1582
|
+
const snapshot = lifecycle.get(jobId);
|
|
1583
|
+
if (!snapshot) {
|
|
1584
|
+
reply.status(404).send({
|
|
1585
|
+
error: "not_found",
|
|
1586
|
+
code: "job_not_found",
|
|
1587
|
+
message: "Generic lifecycle job was not found"
|
|
1588
|
+
});
|
|
1589
|
+
return;
|
|
1590
|
+
}
|
|
1591
|
+
reply.send({ job_id: jobId, artifacts: snapshot.artifacts });
|
|
1592
|
+
});
|
|
1593
|
+
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/audit", async (request, reply) => {
|
|
1594
|
+
const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
|
|
1595
|
+
if (!auth.ok) {
|
|
1596
|
+
reply.status(auth.statusCode).send(auth.payload);
|
|
1597
|
+
return;
|
|
1598
|
+
}
|
|
1599
|
+
const jobId = String(request.params.job_id || "").trim();
|
|
1600
|
+
try {
|
|
1601
|
+
assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
|
|
1602
|
+
}
|
|
1603
|
+
catch (error) {
|
|
1604
|
+
reply.status(400).send({
|
|
1605
|
+
error: "bad_request",
|
|
1606
|
+
code: "validation_failed",
|
|
1607
|
+
message: error instanceof Error ? error.message : "Invalid generic lifecycle job token"
|
|
1608
|
+
});
|
|
1609
|
+
return;
|
|
1610
|
+
}
|
|
1611
|
+
const snapshot = lifecycle.get(jobId);
|
|
1612
|
+
if (!snapshot) {
|
|
1613
|
+
reply.status(404).send({
|
|
1614
|
+
error: "not_found",
|
|
1615
|
+
code: "job_not_found",
|
|
1616
|
+
message: "Generic lifecycle job was not found"
|
|
1617
|
+
});
|
|
1618
|
+
return;
|
|
1619
|
+
}
|
|
1620
|
+
reply.send({ job_id: jobId, audit: snapshot.audit });
|
|
1621
|
+
});
|
|
1622
|
+
app.post("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/cancel", async (request, reply) => {
|
|
1623
|
+
const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
|
|
1624
|
+
if (!auth.ok) {
|
|
1625
|
+
reply.status(auth.statusCode).send(auth.payload);
|
|
1626
|
+
return;
|
|
1627
|
+
}
|
|
1628
|
+
const jobId = String(request.params.job_id || "").trim();
|
|
1629
|
+
try {
|
|
1630
|
+
assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
|
|
1631
|
+
reply.status(202).send(lifecycle.cancel(jobId, auth.claims));
|
|
1632
|
+
}
|
|
1633
|
+
catch (error) {
|
|
1634
|
+
const message = error instanceof Error ? error.message : "Invalid generic lifecycle cancellation";
|
|
1635
|
+
reply.status(message === "job_not_found" ? 404 : 400).send({
|
|
1636
|
+
error: message === "job_not_found" ? "not_found" : "bad_request",
|
|
1637
|
+
code: message === "job_not_found" ? "job_not_found" : "validation_failed",
|
|
1638
|
+
message: message === "job_not_found" ? "Generic lifecycle job was not found" : message
|
|
1639
|
+
});
|
|
1640
|
+
}
|
|
1641
|
+
});
|
|
1642
|
+
app.post("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/retry", async (request, reply) => {
|
|
1643
|
+
const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
|
|
1644
|
+
if (!auth.ok) {
|
|
1645
|
+
reply.status(auth.statusCode).send(auth.payload);
|
|
1646
|
+
return;
|
|
1647
|
+
}
|
|
1648
|
+
const jobId = String(request.params.job_id || "").trim();
|
|
1649
|
+
try {
|
|
1650
|
+
assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
|
|
1651
|
+
reply.status(202).send(lifecycle.retry(jobId, auth.claims));
|
|
1652
|
+
}
|
|
1653
|
+
catch (error) {
|
|
1654
|
+
const message = error instanceof Error ? error.message : "Invalid generic lifecycle retry";
|
|
1655
|
+
reply.status(message === "job_not_found" ? 404 : 400).send({
|
|
1656
|
+
error: message === "job_not_found" ? "not_found" : "bad_request",
|
|
1657
|
+
code: message === "job_not_found" ? "job_not_found" : "validation_failed",
|
|
1658
|
+
message: message === "job_not_found" ? "Generic lifecycle job was not found" : message
|
|
1659
|
+
});
|
|
1660
|
+
}
|
|
1661
|
+
});
|
|
265
1662
|
return app;
|
|
266
1663
|
}
|
|
267
1664
|
export async function main(argv = process.argv) {
|