@mcoda/mswarm 0.1.76 → 0.1.79

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/server.js CHANGED
@@ -1,11 +1,13 @@
1
1
  #!/usr/bin/env node
2
2
  import { realpathSync } from "node:fs";
3
- import { readFile } from "node:fs/promises";
3
+ import { lstat, mkdir, readFile, writeFile } from "node:fs/promises";
4
4
  import { fileURLToPath } from "node:url";
5
- import { resolve } from "node:path";
5
+ import { dirname, resolve, sep } from "node:path";
6
+ import { createHash, randomUUID } from "node:crypto";
6
7
  import Fastify from "fastify";
7
- import { verifySelfHostedInvocationToken } from "./invocation-token.js";
8
- import { controlSelfHostedNodeService, installSelfHostedNodeService, readOwnerSetupConfig, readSelfHostedNodeConfig, resolveSelfHostedNodeServiceLayout, SelfHostedNodeRuntime, uninstallSelfHostedNodeService } from "./runtime.js";
8
+ import { MSWARM_JOB_LIFECYCLE_SCHEMA_VERSION, assertMswarmSafeRelativePath, buildMswarmLocalArtifactUri, defaultMswarmArtifactAccessPolicy, defaultMswarmArtifactRetentionPolicy, buildMswarmGenericJobAuditEvent, buildMswarmGenericJobEnvelopeDescriptor, isMswarmLifecycleStateTransitionAllowed, isMswarmTerminalLifecycleState, normalizeMswarmGenericJobIdempotencyKey } from "@mcoda/shared";
9
+ import { verifySelfHostedCapabilityToken, verifySelfHostedGenericJobOpsToken, verifySelfHostedGenericJobToken, verifySelfHostedInvocationToken } from "./invocation-token.js";
10
+ import { controlSelfHostedNodeService, installSelfHostedNodeService, readOwnerSetupConfig, readSelfHostedNodeConfig, resolveSelfHostedNodeServiceLayout, SelfHostedNodeRuntime, uninstallSelfHostedNodeService, genericJobCapabilityMismatch } from "./runtime.js";
9
11
  const SELF_HOSTED_NODE_PROCESS_TITLE = "mswarm-node";
10
12
  function applySelfHostedNodeProcessTitle() {
11
13
  const title = process.env.MSWARM_SELF_HOSTED_PROCESS_TITLE?.trim() || SELF_HOSTED_NODE_PROCESS_TITLE;
@@ -44,7 +46,17 @@ Environment:
44
46
  MSWARM_SELF_HOSTED_OLLAMA_BASE_URL Ollama base URL, defaults to http://127.0.0.1:11434
45
47
  MSWARM_SELF_HOSTED_NODE_STATE_PATH Config/state file, defaults to ~/.mswarm/self-hosted-node/config.json
46
48
  MSWARM_SELF_HOSTED_NODE_KEY_PATH Runtime token file, defaults to ~/.mswarm/self-hosted-node/node.key
49
+ MSWARM_SELF_HOSTED_ARTIFACT_STORE_PATH Local generic-job artifact store, defaults to ~/.mswarm/self-hosted-node/artifacts
47
50
  MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET Shared direct-job signing secret
51
+ MSWARM_SELF_HOSTED_MAX_CONCURRENT_JOBS Overall advertised job capacity, defaults to 1
52
+ MSWARM_SELF_HOSTED_MAX_CONCURRENT_LLM_JOBS LLM/Codali capacity, defaults to overall capacity
53
+ MSWARM_SELF_HOSTED_DRAIN_MODE Report zero free slots for maintenance
54
+ MSWARM_SELF_HOSTED_LOAD_REPORTING_ENABLED Add load-balancer telemetry, defaults to true
55
+ MSWARM_SELF_HOSTED_HARDWARE_TELEMETRY_ENABLED Opt in to coarse host pressure telemetry
56
+ MSWARM_SELF_HOSTED_GENERIC_JOBS_ENABLED Enable owner-local generic jobs, defaults to false
57
+ MSWARM_SELF_HOSTED_GENERIC_JOB_TIMEOUT_MS Generic job timeout, defaults to self-hosted job timeout
58
+ MSWARM_SELF_HOSTED_GENERIC_JOB_MAX_CONCURRENCY Generic job concurrency, defaults to 1
59
+ MSWARM_SELF_HOSTED_CAPABILITY_PROBE_TIMEOUT_MS Capability probe timeout, defaults to 2000
48
60
  MSWARM_SELF_HOSTED_LISTEN_HOST Direct node bind host, defaults to 127.0.0.1
49
61
  MSWARM_SELF_HOSTED_LISTEN_PORT Direct node bind port, defaults to 18083
50
62
  MSWARM_SELF_HOSTED_MODEL_ALLOWLIST Comma-separated local agent slugs/model names to expose
@@ -63,6 +75,15 @@ Setup options:
63
75
  --block <SLUGS> Comma-separated blocklist
64
76
  --expose-all Expose all healthy non-embedding local agents (default)
65
77
  --no-expose-all Expose only allowlisted local agents
78
+ --max-concurrent-jobs <N> Overall advertised job capacity
79
+ --max-concurrent-llm-jobs <N> LLM/Codali capacity
80
+ --drain Register the node in drain mode
81
+ --disable-load-reporting Keep legacy heartbeat capacity shape only
82
+ --enable-hardware-telemetry Include coarse host pressure telemetry in heartbeats
83
+ --enable-generic-jobs Enable owner-local generic job endpoint for development
84
+ --generic-job-timeout-ms <N> Generic job timeout for owner-local development
85
+ --generic-job-max-concurrency <N> Generic job concurrency for owner-local development
86
+ --artifact-store-path <PATH> Local generic-job artifact store path
66
87
  --start Start foreground daemon after setup
67
88
 
68
89
  Log options:
@@ -149,17 +170,918 @@ function assertJobMatchesClaims(job, claims) {
149
170
  throw new Error("model does not match invocation token");
150
171
  }
151
172
  }
173
/**
 * Verifies that the identity fields of a generic job envelope equal the
 * corresponding claims of the invocation token.
 *
 * @param {object} job - Envelope carrying node_id, job_id, request_id and a nested `job`.
 * @param {object} claims - Verified token claims.
 * @throws {Error} On the first field (checked in a fixed order) that differs.
 */
function assertGenericJobMatchesClaims(job, claims) {
    // [value taken from the envelope, claim key, message raised on mismatch]
    const comparisons = [
        [job.node_id, "node_id", "generic job node_id does not match invocation token"],
        [job.job_id, "job_id", "generic job_id does not match invocation token"],
        [job.request_id, "request_id", "generic request_id does not match invocation token"],
        [job.job?.schema_version, "schema_version", "generic schema_version does not match invocation token"],
        [job.job?.job_type, "job_type", "generic job_type does not match invocation token"]
    ];
    for (const [actual, claimKey, message] of comparisons) {
        if (actual !== claims[claimKey]) {
            throw new Error(message);
        }
    }
}
190
/**
 * Reports whether a host string names the loopback interface.
 *
 * Accepts `localhost`, the IPv6 loopback (`::1`, optionally bracketed) and any
 * IPv4 address in 127.0.0.0/8. Matching is case-insensitive and ignores
 * surrounding whitespace.
 *
 * @param {string | undefined | null} value - Host name or address literal.
 * @returns {boolean} True only for loopback hosts.
 */
function isOwnerLocalHost(value) {
    const normalized = (value || "").trim().toLowerCase();
    if (normalized === "localhost" || normalized === "::1" || normalized === "[::1]") {
        return true;
    }
    const match = /^127\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(normalized);
    if (match === null) {
        return false;
    }
    // \d{1,3} alone would also accept values such as 999; every octet must be a
    // real IPv4 octet for the string to be a loopback address.
    return match.slice(1).every((octet) => Number(octet) <= 255);
}
198
/**
 * Decides whether generic jobs may be served in owner-local mode: the feature
 * flag must be on, the listen host must be loopback, and a configured direct
 * base URL (if any) must also point at a loopback host.
 *
 * @param {object} config - Node configuration (genericJobsEnabled, listenHost, directBaseUrl).
 * @returns {boolean}
 */
function isOwnerLocalGenericMode(config) {
    const enabledOnLoopback = config.genericJobsEnabled && isOwnerLocalHost(config.listenHost);
    if (!enabledOnLoopback) {
        return false;
    }
    const advertisedUrl = config.directBaseUrl;
    if (!advertisedUrl) {
        return true;
    }
    let advertisedHost;
    try {
        advertisedHost = new URL(advertisedUrl).hostname;
    }
    catch {
        // An unparsable base URL cannot be shown to be local.
        return false;
    }
    return isOwnerLocalHost(advertisedHost);
}
212
/**
 * Decides whether the node API is reachable only from the owner's machine: the
 * listen host must be loopback, and a configured direct base URL (if any) must
 * also resolve to a loopback host name.
 *
 * @param {object} config - Node configuration (listenHost, directBaseUrl).
 * @returns {boolean}
 */
function isOwnerLocalNodeApiMode(config) {
    if (!isOwnerLocalHost(config.listenHost)) {
        return false;
    }
    const advertisedUrl = config.directBaseUrl;
    if (!advertisedUrl) {
        return true;
    }
    let advertisedHost;
    try {
        advertisedHost = new URL(advertisedUrl).hostname;
    }
    catch {
        // An unparsable base URL cannot be shown to be local.
        return false;
    }
    return isOwnerLocalHost(advertisedHost);
}
226
/**
 * Reports whether the request's Accept header lists `text/event-stream`.
 *
 * @param {object} headers - Request headers, read through `headerText`.
 * @returns {boolean} True when any comma-separated entry starts with the SSE media type.
 */
function acceptsGenericEventStream(headers) {
    const acceptHeader = headerText(headers, "accept") || "";
    for (const mediaRange of acceptHeader.split(",")) {
        if (mediaRange.trim().toLowerCase().startsWith("text/event-stream")) {
            return true;
        }
    }
    return false;
}
152
230
  function writeSelfHostedSseChunk(raw, chunk) {
153
231
  raw.write(`data: ${JSON.stringify(chunk)}\n\n`);
154
232
  }
155
233
  function writeSelfHostedSseDone(raw) {
156
234
  raw.write("data: [DONE]\n\n");
157
235
  }
236
/**
 * Writes a generic job event as a named SSE frame: an `event:` line carrying
 * the event type (or `message` when absent) followed by a `data:` line with
 * the JSON-encoded event.
 *
 * @param {{ write: Function }} raw - Writable response stream.
 * @param {object} event - Job event; `event.type` names the SSE event.
 */
function writeGenericJobSseEvent(raw, event) {
    // SSE fields are line-delimited, so a CR/LF inside the type would end the
    // `event:` field early and let the remainder be parsed as extra fields.
    // Line breaks are collapsed to a space to keep the frame to one event.
    const eventName = String(event.type || "message").replace(/[\r\n]+/g, " ");
    raw.write(`event: ${eventName}\n`);
    // JSON.stringify escapes line breaks inside strings, so the data line is safe.
    raw.write(`data: ${JSON.stringify(event)}\n\n`);
}
240
/**
 * Maps a non-successful generic job outcome to an HTTP status code.
 *
 * @param {{ status: string, result: { error?: { code?: string } } }} result - Job outcome.
 * @returns {number} 400 for validation failures, 408 for timeouts, 409 for
 *   cancelled jobs, otherwise 502.
 */
function genericJobFailureStatusCode(result) {
    const errorCode = result.result.error?.code;
    if (errorCode === "validation_failed") {
        return 400;
    }
    if (errorCode === "timeout") {
        return 408;
    }
    if (result.status === "cancelled") {
        return 409;
    }
    return 502;
}
250
+ const MAX_OWNER_LOCAL_ARTIFACT_UPLOAD_BYTES = 128 * 1024 * 1024;
251
/**
 * Normalises an optional text value.
 *
 * @param {*} value - Candidate value.
 * @returns {string | undefined} The trimmed string, or undefined when the value
 *   is not a string or is blank after trimming.
 */
function optionalString(value) {
    if (typeof value !== "string") {
        return undefined;
    }
    const trimmed = value.trim();
    return trimmed.length > 0 ? trimmed : undefined;
}
254
/**
 * Validates and decodes an owner-local artifact upload request body.
 *
 * The body must carry a safe relative `path` and base64 `content_base64`.
 * Optional `size_bytes` and `sha256` are checked against the decoded content,
 * and optional `name` / `content_type` are passed through when non-blank.
 *
 * @param {*} body - Parsed request body; non-object values are treated as empty.
 * @returns {{ name: string, path: string, contentType: (string|undefined), sha256: string, bytes: Buffer }}
 * @throws {Error} `content_base64_required`, `content_base64_invalid`,
 *   `artifact_upload_size_limit_exceeded`, `artifact_upload_size_mismatch`,
 *   `artifact_upload_checksum_mismatch`, or whatever the path validator throws.
 */
function decodeArtifactUploadBody(body) {
    const payload = (body && typeof body === "object" && !Array.isArray(body) ? body : {});
    const artifactPath = assertMswarmSafeRelativePath(payload.path, "artifact_path");
    const rawBase64 = optionalString(payload.content_base64);
    if (!rawBase64) {
        throw new Error("content_base64_required");
    }
    const normalizedBase64 = rawBase64.replace(/\s/g, "");
    if (!/^[a-zA-Z0-9+/]+={0,2}$/.test(normalizedBase64) || normalizedBase64.length % 4 === 1) {
        throw new Error("content_base64_invalid");
    }
    // Reject oversized uploads before decoding so an over-limit payload never
    // allocates its decoded buffer. Every 4 base64 characters carry 3 bytes.
    const unpaddedLength = normalizedBase64.replace(/=+$/, "").length;
    if (Math.floor((unpaddedLength * 3) / 4) > MAX_OWNER_LOCAL_ARTIFACT_UPLOAD_BYTES) {
        throw new Error("artifact_upload_size_limit_exceeded");
    }
    const bytes = Buffer.from(normalizedBase64, "base64");
    // Defensive re-check on the actual decoded length.
    if (bytes.length > MAX_OWNER_LOCAL_ARTIFACT_UPLOAD_BYTES) {
        throw new Error("artifact_upload_size_limit_exceeded");
    }
    if (typeof payload.size_bytes === "number" && Number.isFinite(payload.size_bytes) && payload.size_bytes !== bytes.length) {
        throw new Error("artifact_upload_size_mismatch");
    }
    const sha256 = sha256Hex(bytes);
    // Hex digests are case-insensitive; the computed digest is lowercase, so an
    // uppercase client checksum must not be reported as a mismatch.
    const expectedSha = optionalString(payload.sha256)?.toLowerCase();
    if (expectedSha && expectedSha !== sha256) {
        throw new Error("artifact_upload_checksum_mismatch");
    }
    return {
        name: optionalString(payload.name) || artifactPath.split("/").pop() || "artifact",
        path: artifactPath,
        contentType: optionalString(payload.content_type),
        sha256,
        bytes
    };
}
285
/**
 * Computes the absolute directory that holds uploaded artifacts for one job.
 *
 * @param {object} config - Node configuration; `artifactStorePath` falls back to ".".
 * @param {string} jobId - Job identifier; characters outside [a-zA-Z0-9_.-] become "_".
 * @returns {string} Absolute path of the job's artifact directory.
 */
function artifactUploadRoot(config, jobId) {
    const sanitizedJobId = jobId.replace(/[^a-zA-Z0-9_.-]/g, "_");
    const safeJobId = assertMswarmSafeRelativePath(sanitizedJobId, "job_id");
    const storeRoot = config.artifactStorePath || ".";
    return resolve(storeRoot, safeJobId);
}
289
/**
 * Resolves an artifact's relative path inside the job's upload directory and
 * refuses any result that lands outside that directory.
 *
 * @param {object} config - Node configuration.
 * @param {string} jobId - Job identifier.
 * @param {string} relativePath - Artifact path relative to the job directory.
 * @returns {string} Absolute target path.
 * @throws {Error} `artifact_path_escape_rejected` when the target escapes the root.
 */
function resolveArtifactUploadTarget(config, jobId, relativePath) {
    const root = artifactUploadRoot(config, jobId);
    const target = resolve(root, relativePath);
    if (target === root) {
        return target;
    }
    // Compare against the root with a trailing separator so that a sibling such
    // as "<root>-other" is not mistaken for a descendant.
    const containment = root.endsWith(sep) ? root : root + sep;
    if (target.startsWith(containment)) {
        return target;
    }
    throw new Error("artifact_path_escape_rejected");
}
298
/**
 * Walks the parent directories of `relativePath` under `root` and rejects the
 * path if an existing parent is a symbolic link or is not a directory. Parents
 * that do not exist yet are skipped. The final path segment is not inspected.
 *
 * @param {string} root - Directory the relative path is resolved against.
 * @param {string} relativePath - "/"-separated path of the artifact.
 * @returns {Promise<void>}
 * @throws {Error} `artifact_path_symlink_rejected`, `artifact_path_parent_not_directory`,
 *   or any lstat failure other than ENOENT.
 */
async function assertNoArtifactSymlinkSegments(root, relativePath) {
    const parentNames = relativePath.split("/");
    parentNames.pop(); // drop the file name; only parents are checked
    let current = root;
    for (const name of parentNames) {
        current = resolve(current, name);
        let stats;
        try {
            stats = await lstat(current);
        }
        catch (error) {
            if (error.code === "ENOENT") {
                continue;
            }
            throw error;
        }
        if (stats.isSymbolicLink()) {
            throw new Error("artifact_path_symlink_rejected");
        }
        if (!stats.isDirectory()) {
            throw new Error("artifact_path_parent_not_directory");
        }
    }
}
319
/**
 * Computes the SHA-256 digest of the given data.
 *
 * @param {Buffer | string} buffer - Data to hash.
 * @returns {string} Lowercase hexadecimal digest.
 */
function sha256Hex(buffer) {
    const hasher = createHash("sha256");
    hasher.update(buffer);
    return hasher.digest("hex");
}
322
/**
 * Gatekeeper for generic job requests. Checks, in order: the feature flag,
 * owner-local mode, presence of the signing secret, presence of a bearer token,
 * and finally the token signature.
 *
 * @param {object} config - Node configuration.
 * @param {object} headers - Request headers.
 * @returns {object} `{ ok: true, token, claims }` on success, otherwise
 *   `{ ok: false, statusCode, payload: { error, code, message } }`.
 */
function verifyOwnerLocalGenericJobRequest(config, headers) {
    const refuse = (statusCode, error, code, message) => ({
        ok: false,
        statusCode,
        payload: { error, code, message }
    });
    if (!config.genericJobsEnabled) {
        return refuse(404, "not_found", "feature_disabled", "Generic node jobs are disabled on this node");
    }
    if (!isOwnerLocalGenericMode(config)) {
        return refuse(403, "forbidden", "owner_local_required", "Generic node jobs are only available in owner-local direct mode");
    }
    if (!config.invocationSigningSecret) {
        return refuse(503, "service_unavailable", "missing_config", "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for generic jobs");
    }
    const token = extractBearerToken(headers);
    if (!token) {
        return refuse(401, "unauthorized", "unauthorized", "Missing generic job token");
    }
    try {
        const claims = verifySelfHostedGenericJobToken({
            token,
            secret: config.invocationSigningSecret
        });
        return { ok: true, token, claims };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic job token";
        return refuse(401, "unauthorized", "unauthorized", message);
    }
}
390
/**
 * Gatekeeper for generic job operations requests. Checks, in order: the feature
 * flag, owner-local mode, presence of the signing secret, presence of a bearer
 * token, the token signature, and that the token was issued for this node.
 *
 * @param {object} config - Node configuration.
 * @param {object} headers - Request headers.
 * @returns {object} `{ ok: true, claims }` on success, otherwise
 *   `{ ok: false, statusCode, payload: { error, code, message } }`.
 */
function verifyOwnerLocalGenericJobOpsRequest(config, headers) {
    const refuse = (statusCode, error, code, message) => ({
        ok: false,
        statusCode,
        payload: { error, code, message }
    });
    if (!config.genericJobsEnabled) {
        return refuse(404, "not_found", "feature_disabled", "Generic node jobs are disabled on this node");
    }
    if (!isOwnerLocalGenericMode(config)) {
        return refuse(403, "forbidden", "owner_local_required", "Generic node operations are only available in owner-local direct mode");
    }
    if (!config.invocationSigningSecret) {
        return refuse(503, "service_unavailable", "missing_config", "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for generic job operations");
    }
    const token = extractBearerToken(headers);
    if (!token) {
        return refuse(401, "unauthorized", "unauthorized", "Missing generic job ops token");
    }
    try {
        const claims = verifySelfHostedGenericJobOpsToken({
            token,
            secret: config.invocationSigningSecret
        });
        // The node check stays inside the try so that any failure while reading
        // the claims is reported as an invalid token, like a bad signature.
        if (claims.node_id !== config.nodeId) {
            return refuse(400, "bad_request", "validation_failed", "generic job ops token does not match this node");
        }
        return { ok: true, claims };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic job ops token";
        return refuse(401, "unauthorized", "unauthorized", message);
    }
}
466
/**
 * Ensures a lifecycle request targets the node and job named in its token.
 *
 * @param {string} jobId - Job id addressed by the request.
 * @param {object} config - Node configuration (nodeId).
 * @param {object} claims - Verified token claims (node_id, job_id).
 * @throws {Error} When either the node id or the job id differs.
 */
function assertLifecycleJobIdMatchesClaims(jobId, config, claims) {
    const sameNode = claims.node_id === config.nodeId;
    const sameJob = claims.job_id === jobId;
    if (!(sameNode && sameJob)) {
        throw new Error("generic job token does not match this node or job");
    }
}
471
/**
 * Picks the tenant a generic job belongs to.
 *
 * @param {{ job: { metadata?: object } }} job - Job envelope.
 * @returns {string} The trimmed `metadata.tenant_id` when it is a non-blank
 *   string, otherwise "owner-local".
 */
function tenantIdForGenericJob(job) {
    const declared = job.job.metadata?.tenant_id;
    if (typeof declared === "string") {
        const trimmed = declared.trim();
        if (trimmed) {
            return trimmed;
        }
    }
    return "owner-local";
}
476
/**
 * Resolves the generic job concurrency limit from configuration.
 *
 * @param {{ genericJobMaxConcurrency?: number }} config - Node configuration.
 * @returns {number} A whole number of at least 1; defaults to 1 when the
 *   configured value is missing, non-numeric, non-finite or not positive.
 */
function genericJobMaxConcurrency(config) {
    const configured = config.genericJobMaxConcurrency;
    if (!Number.isFinite(configured) || configured <= 0) {
        return 1;
    }
    // A positive fraction below 1 would floor to 0, and a limit of 0 means no
    // job can ever be dispatched. Clamp to the documented minimum of 1.
    return Math.max(1, Math.floor(configured));
}
480
/**
 * Parses a numeric query parameter and bounds it.
 *
 * @param {*} value - Raw query value; for a repeated parameter the last entry wins.
 * @param {number} fallback - Returned when the value is absent, unparsable or negative.
 * @param {number} max - Upper bound applied to the parsed value.
 * @returns {number} The floored value capped at `max`, or `fallback`.
 */
function clampOpsQueryNumber(value, fallback, max) {
    const raw = Array.isArray(value) ? value.at(-1) : value;
    let parsed = NaN;
    if (typeof raw === "number") {
        parsed = raw;
    }
    else if (typeof raw === "string") {
        parsed = Number.parseInt(raw, 10);
    }
    const usable = Number.isFinite(parsed) && parsed >= 0;
    return usable ? Math.min(max, Math.floor(parsed)) : fallback;
}
488
/**
 * Extracts the audit paging options from an ops request query string.
 *
 * @param {*} query - Parsed query; non-object values are treated as empty.
 * @returns {{ auditLimit: number, auditOffset: number }} Limit (default 50,
 *   max 250) and offset (default 0, max 10000). Snake-case keys take precedence
 *   over camel-case ones.
 */
function opsQueryOptions(query) {
    const isRecord = query && typeof query === "object" && !Array.isArray(query);
    const record = isRecord ? query : {};
    const requestedLimit = record.audit_limit ?? record.auditLimit;
    const requestedOffset = record.audit_offset ?? record.auditOffset;
    const auditLimit = clampOpsQueryNumber(requestedLimit, 50, 250);
    const auditOffset = clampOpsQueryNumber(requestedOffset, 0, 10000);
    return { auditLimit, auditOffset };
}
495
/**
 * Sums the declared sizes of a job record's artifacts.
 *
 * @param {{ artifacts?: Array<{ size_bytes?: number }> }} record - Lifecycle record.
 * @returns {number} Total of `size_bytes`; missing sizes count as 0.
 */
function artifactBytes(record) {
    let total = 0;
    for (const artifact of record.artifacts || []) {
        total += artifact.size_bytes || 0;
    }
    return total;
}
498
/**
 * Measures the UTF-8 size of log messages.
 *
 * @param {Array<{ stream: string, message?: string }>} logs - Log entries.
 * @param {string} [stream] - When given, only entries of this stream are counted.
 * @returns {number} Total byte length of the selected messages.
 */
function logBytes(logs, stream) {
    let total = 0;
    for (const log of logs) {
        if (stream && log.stream !== stream) {
            continue;
        }
        total += Buffer.byteLength(log.message || "", "utf8");
    }
    return total;
}
503
/**
 * Finds the most recent usable progress value among job events.
 *
 * Events are scanned newest-first; the first `progress` event whose data holds
 * a finite number under `progress_percent`, `percent` or `progress` wins.
 *
 * @param {Iterable<object>} events - Job events in chronological order.
 * @returns {number | undefined} The value clamped to 0..100, or undefined.
 */
function progressPercent(events) {
    const ordered = [...events];
    for (let index = ordered.length - 1; index >= 0; index -= 1) {
        const event = ordered[index];
        const isProgressWithData = event.type === "progress" && event.data && typeof event.data === "object";
        if (!isProgressWithData) {
            continue;
        }
        const payload = event.data;
        const candidate = payload.progress_percent ?? payload.percent ?? payload.progress;
        if (typeof candidate === "number" && Number.isFinite(candidate)) {
            return Math.min(100, Math.max(0, candidate));
        }
    }
    return undefined;
}
516
/**
 * Computes the GPU-seconds attributed to a finished job: run duration in
 * seconds multiplied by the GPU count, rounded to milliseconds.
 *
 * The GPU count comes from the reservation, then from the job's resource
 * request, and is treated as at least 1.
 *
 * @param {object} record - Lifecycle record (started_at, finished_at, reservation, job).
 * @returns {number} GPU-seconds, or 0 when timestamps are missing, unparsable
 *   or not increasing.
 */
function gpuSeconds(record) {
    const { started_at: startedAt, finished_at: finishedAt } = record;
    if (!startedAt || !finishedAt) {
        return 0;
    }
    const startMs = Date.parse(startedAt);
    const endMs = Date.parse(finishedAt);
    const hasPositiveDuration = Number.isFinite(startMs) && Number.isFinite(endMs) && endMs > startMs;
    if (!hasPositiveDuration) {
        return 0;
    }
    const requestedGpus = record.reservation?.resources?.gpu_count || record.job.resources?.gpu?.count || 1;
    const gpuCount = Math.max(1, Math.floor(requestedGpus));
    const elapsedSeconds = (endMs - startMs) / 1000;
    return Math.round(elapsedSeconds * gpuCount * 1000) / 1000;
}
528
/**
 * Fingerprints a bearer token so the raw value need not be stored.
 *
 * @param {string} token - Token text.
 * @returns {string} Lowercase hexadecimal SHA-256 digest of the token.
 */
function tokenSha256(token) {
    const hasher = createHash("sha256");
    hasher.update(token);
    return hasher.digest("hex");
}
531
/**
 * Builds the initial retry policy for a generic job from `metadata.retry`.
 *
 * @param {{ job: { metadata?: { retry?: object } } }} job - Job envelope.
 * @returns {{ max_retries: number, retry_count: number, retryable_error_codes: string[] }}
 *   `max_retries` is the requested value floored and clamped to 0..3 (0 when
 *   absent or not a number); only "timeout" is retryable.
 */
function lifecycleRetryPolicy(job) {
    const retry = job.job.metadata?.retry;
    const retryRecord = retry && typeof retry === "object" && !Array.isArray(retry) ? retry : null;
    const requested = retryRecord?.max_retries;
    // NaN is a number but survives Math.min/Math.max unchanged. A NaN limit
    // would make every `retry_count >= max_retries` comparison false and so
    // remove the retry cap, hence it is treated as "no retries".
    const maxRetries = typeof requested === "number" && !Number.isNaN(requested)
        ? Math.max(0, Math.min(3, Math.floor(requested)))
        : 0;
    return {
        max_retries: maxRetries,
        retry_count: 0,
        retryable_error_codes: ["timeout"]
    };
}
543
+ class OwnerLocalGenericJobLifecycleScheduler {
544
+ constructor(runtime, config) {
545
+ this.runtime = runtime;
546
+ this.config = config;
547
+ this.jobs = new Map();
548
+ this.idempotency = new Map();
549
+ this.dispatching = false;
550
+ }
551
+ syncRuntimeQueueTelemetry() {
552
+ this.runtime.updateLocalQueueTelemetry({ genericQueuedJobs: this.queuedEntries().length });
553
+ }
554
+ create(job, claims, token) {
555
+ const tenantId = tenantIdForGenericJob(job);
556
+ const idempotencyKey = normalizeMswarmGenericJobIdempotencyKey({
557
+ tenantId,
558
+ idempotencyKey: job.job.idempotency_key,
559
+ jobId: job.job_id,
560
+ requestId: job.request_id
561
+ });
562
+ const existingId = this.idempotency.get(idempotencyKey);
563
+ if (existingId) {
564
+ const existing = this.mustGetEntry(existingId);
565
+ if (existing.record.job_id !== job.job_id || existing.record.request_id !== job.request_id) {
566
+ throw new Error("idempotency_key_conflict");
567
+ }
568
+ this.audit(existing, "job_idempotent_reused", { idempotency_key: idempotencyKey });
569
+ return { snapshot: this.snapshot(existing), reused: true };
570
+ }
571
+ if (this.jobs.has(job.job_id)) {
572
+ throw new Error("job_id_conflict");
573
+ }
574
+ const now = new Date().toISOString();
575
+ const entry = {
576
+ claims,
577
+ tokenSha256: tokenSha256(token),
578
+ events: [],
579
+ logs: [],
580
+ audit: [],
581
+ record: {
582
+ schema_version: MSWARM_JOB_LIFECYCLE_SCHEMA_VERSION,
583
+ job_id: job.job_id,
584
+ request_id: job.request_id,
585
+ tenant_id: tenantId,
586
+ node_id: this.config.nodeId,
587
+ state: "queued",
588
+ job: job.job,
589
+ idempotency_key: idempotencyKey,
590
+ created_at: now,
591
+ updated_at: now,
592
+ queued_at: now,
593
+ retry: lifecycleRetryPolicy(job)
594
+ }
595
+ };
596
+ this.jobs.set(job.job_id, entry);
597
+ this.idempotency.set(idempotencyKey, job.job_id);
598
+ this.audit(entry, "job_created", { idempotency_key: idempotencyKey });
599
+ this.audit(entry, "job_queued");
600
+ this.syncRuntimeQueueTelemetry();
601
+ queueMicrotask(() => {
602
+ void this.dispatchQueued();
603
+ });
604
+ return { snapshot: this.snapshot(entry), reused: false };
605
+ }
606
+ get(jobId) {
607
+ const entry = this.jobs.get(jobId);
608
+ return entry ? this.snapshot(entry) : null;
609
+ }
610
+ async ops(options) {
611
+ const capabilities = await this.runtime.publicCapabilityProjection();
612
+ const entries = Array.from(this.jobs.values());
613
+ const totalsByState = {};
614
+ for (const entry of entries) {
615
+ totalsByState[entry.record.state] = (totalsByState[entry.record.state] || 0) + 1;
616
+ }
617
+ const activeJobs = this.activeEntries().length;
618
+ const queuedJobs = this.queuedEntries().length;
619
+ const terminalJobs = entries.filter((entry) => isMswarmTerminalLifecycleState(entry.record.state)).length;
620
+ const stdoutBytes = entries.reduce((total, entry) => total + logBytes(entry.logs, "stdout"), 0);
621
+ const stderrBytes = entries.reduce((total, entry) => total + logBytes(entry.logs, "stderr"), 0);
622
+ const allAudit = entries
623
+ .flatMap((entry) => entry.audit)
624
+ .sort((a, b) => b.timestamp.localeCompare(a.timestamp));
625
+ const jobs = entries
626
+ .map((entry) => this.opsJobSummary(entry))
627
+ .sort((a, b) => b.updated_at.localeCompare(a.updated_at));
628
+ return {
629
+ schema_version: MSWARM_JOB_LIFECYCLE_SCHEMA_VERSION,
630
+ generated_at: new Date().toISOString(),
631
+ node: {
632
+ node_id: this.config.nodeId,
633
+ listen_host: this.config.listenHost,
634
+ listen_port: this.config.listenPort,
635
+ owner_local: isOwnerLocalGenericMode(this.config),
636
+ generic_jobs_enabled: this.config.genericJobsEnabled,
637
+ artifact_store_configured: Boolean(this.config.artifactStorePath),
638
+ max_concurrent_jobs: genericJobMaxConcurrency(this.config)
639
+ },
640
+ capabilities: capabilities,
641
+ queue: {
642
+ jobs,
643
+ totals_by_state: totalsByState,
644
+ active_jobs: activeJobs,
645
+ queued_jobs: queuedJobs,
646
+ terminal_jobs: terminalJobs
647
+ },
648
+ quota: {
649
+ max_concurrent_jobs: genericJobMaxConcurrency(this.config),
650
+ active_jobs: activeJobs,
651
+ queued_jobs: queuedJobs,
652
+ available_slots: Math.max(0, genericJobMaxConcurrency(this.config) - activeJobs),
653
+ production_enforced: false,
654
+ limits: {
655
+ generic_job_timeout_ms: this.config.genericJobTimeoutMs,
656
+ job_timeout_ms: this.config.jobTimeoutMs,
657
+ request_timeout_ms: this.config.requestTimeoutMs,
658
+ artifact_store_configured: Boolean(this.config.artifactStorePath)
659
+ }
660
+ },
661
+ usage: {
662
+ total_jobs: entries.length,
663
+ active_jobs: activeJobs,
664
+ terminal_jobs: terminalJobs,
665
+ succeeded_jobs: totalsByState.succeeded || 0,
666
+ failed_jobs: totalsByState.failed || 0,
667
+ cancelled_jobs: totalsByState.cancelled || 0,
668
+ blocked_jobs: totalsByState.blocked || 0,
669
+ expired_jobs: totalsByState.expired || 0,
670
+ gpu_seconds: Math.round(entries.reduce((total, entry) => total + gpuSeconds(entry.record), 0) * 1000) / 1000,
671
+ artifact_count: entries.reduce((total, entry) => total + (entry.record.artifacts || []).length, 0),
672
+ artifact_bytes: entries.reduce((total, entry) => total + artifactBytes(entry.record), 0),
673
+ event_count: entries.reduce((total, entry) => total + entry.events.length, 0),
674
+ audit_event_count: allAudit.length,
675
+ stdout_bytes: stdoutBytes,
676
+ stderr_bytes: stderrBytes,
677
+ log_bytes: stdoutBytes + stderrBytes
678
+ },
679
+ audit: {
680
+ total: allAudit.length,
681
+ offset: options.auditOffset,
682
+ limit: options.auditLimit,
683
+ events: allAudit.slice(options.auditOffset, options.auditOffset + options.auditLimit)
684
+ }
685
+ };
686
+ }
687
+ cancel(jobId, claims) {
688
+ const entry = this.mustGetEntry(jobId);
689
+ if (entry.claims.request_id !== claims.request_id ||
690
+ entry.claims.schema_version !== claims.schema_version ||
691
+ entry.claims.job_type !== claims.job_type) {
692
+ throw new Error("generic cancellation token does not match the lifecycle job");
693
+ }
694
+ this.audit(entry, "job_cancel_requested");
695
+ if (isMswarmTerminalLifecycleState(entry.record.state)) {
696
+ return this.snapshot(entry);
697
+ }
698
+ if (entry.controller && !entry.controller.signal.aborted) {
699
+ entry.controller.abort("cancelled");
700
+ return this.snapshot(entry);
701
+ }
702
+ this.transition(entry, "cancelled", {
703
+ finished_at: new Date().toISOString(),
704
+ result: {
705
+ job_id: entry.record.job_id,
706
+ status: "cancelled",
707
+ error: {
708
+ code: "cancelled",
709
+ message: "generic job cancelled before dispatch"
710
+ }
711
+ }
712
+ });
713
+ this.audit(entry, "job_cancelled");
714
+ this.releaseReservation(entry);
715
+ return this.snapshot(entry);
716
+ }
717
+ retry(jobId, claims) {
718
+ const entry = this.mustGetEntry(jobId);
719
+ if (entry.claims.request_id !== claims.request_id ||
720
+ entry.claims.schema_version !== claims.schema_version ||
721
+ entry.claims.job_type !== claims.job_type) {
722
+ throw new Error("generic retry token does not match the lifecycle job");
723
+ }
724
+ if (!isMswarmTerminalLifecycleState(entry.record.state)) {
725
+ throw new Error("job_retry_requires_terminal_state");
726
+ }
727
+ if (entry.record.state === "succeeded") {
728
+ throw new Error("job_retry_not_allowed_for_succeeded_jobs");
729
+ }
730
+ this.releaseReservation(entry);
731
+ const now = new Date().toISOString();
732
+ const retryCount = entry.record.retry.retry_count + 1;
733
+ entry.record = {
734
+ ...entry.record,
735
+ state: "queued",
736
+ updated_at: now,
737
+ queued_at: now,
738
+ scheduled_at: undefined,
739
+ started_at: undefined,
740
+ finished_at: undefined,
741
+ reservation: undefined,
742
+ envelope: undefined,
743
+ backpressure: undefined,
744
+ result: undefined,
745
+ artifacts: undefined,
746
+ retry: {
747
+ ...entry.record.retry,
748
+ retry_count: retryCount,
749
+ next_retry_at: now
750
+ }
751
+ };
752
+ this.audit(entry, "job_retry_scheduled", { retry_count: retryCount, manual: true });
753
+ queueMicrotask(() => {
754
+ void this.dispatchQueued();
755
+ });
756
+ return this.snapshot(entry);
757
+ }
758
+ async dispatchQueued() {
759
+ if (this.dispatching) {
760
+ return;
761
+ }
762
+ this.dispatching = true;
763
+ try {
764
+ while (this.activeEntries().length < genericJobMaxConcurrency(this.config)) {
765
+ const entry = this.nextDispatchableEntry();
766
+ if (!entry) {
767
+ return;
768
+ }
769
+ const activeTenant = this.activeTenantId();
770
+ if (activeTenant && activeTenant !== entry.record.tenant_id) {
771
+ this.setBackpressure(entry, "tenant_reserved", "Node is reserved for another tenant until active jobs finish.");
772
+ return;
773
+ }
774
+ const capabilityOk = await this.recheckCapabilities(entry);
775
+ if (!capabilityOk) {
776
+ continue;
777
+ }
778
+ this.schedule(entry);
779
+ void this.runScheduled(entry);
780
+ }
781
+ for (const entry of this.queuedEntries()) {
782
+ this.setBackpressure(entry, "node_at_capacity", "Node is at generic job concurrency limit.", 1000);
783
+ }
784
+ }
785
+ finally {
786
+ this.dispatching = false;
787
+ }
788
+ }
789
+ activeEntries() {
790
+ return Array.from(this.jobs.values()).filter((entry) => entry.record.state === "scheduled" || entry.record.state === "running");
791
+ }
792
+ queuedEntries() {
793
+ return Array.from(this.jobs.values()).filter((entry) => entry.record.state === "queued" || entry.record.state === "retrying");
794
+ }
795
+ nextDispatchableEntry() {
796
+ return this.queuedEntries().sort((a, b) => a.record.created_at.localeCompare(b.record.created_at))[0] || null;
797
+ }
798
+ activeTenantId() {
799
+ const active = this.activeEntries().find((entry) => entry.record.reservation && !entry.record.reservation.released_at);
800
+ return active?.record.tenant_id || null;
801
+ }
802
+ async recheckCapabilities(entry) {
803
+ const snapshot = await this.runtime.probeCapabilities();
804
+ const capabilityMismatch = genericJobCapabilityMismatch(entry.record.job, snapshot);
805
+ if (capabilityMismatch) {
806
+ this.transition(entry, "blocked", {
807
+ finished_at: new Date().toISOString(),
808
+ backpressure: {
809
+ reason: "no_capable_node",
810
+ message: capabilityMismatch.message
811
+ },
812
+ result: {
813
+ job_id: entry.record.job_id,
814
+ status: "failed",
815
+ error: {
816
+ code: capabilityMismatch.code,
817
+ message: capabilityMismatch.message,
818
+ retryable: true
819
+ }
820
+ }
821
+ });
822
+ this.audit(entry, "job_blocked", { reason: capabilityMismatch.code });
823
+ return false;
824
+ }
825
+ return true;
826
+ }
827
+ schedule(entry) {
828
+ const now = new Date().toISOString();
829
+ const reservation = {
830
+ node_id: this.config.nodeId,
831
+ tenant_id: entry.record.tenant_id,
832
+ job_id: entry.record.job_id,
833
+ request_id: entry.record.request_id,
834
+ reserved_at: now,
835
+ resources: {
836
+ ...(entry.record.job.resources?.gpu?.count ? { gpu_count: entry.record.job.resources.gpu.count } : {}),
837
+ ...(entry.record.job.resources?.cpu?.cores ? { cpu_cores: entry.record.job.resources.cpu.cores } : {}),
838
+ ...(entry.record.job.resources?.memory_gb ? { memory_gb: entry.record.job.resources.memory_gb } : {}),
839
+ ...(entry.record.job.resources?.disk_gb ? { disk_gb: entry.record.job.resources.disk_gb } : {})
840
+ }
841
+ };
842
+ const expiresAt = new Date(Date.now() + 5 * 60000).toISOString();
843
+ this.transition(entry, "scheduled", {
844
+ node_id: this.config.nodeId,
845
+ scheduled_at: now,
846
+ reservation,
847
+ backpressure: undefined,
848
+ envelope: buildMswarmGenericJobEnvelopeDescriptor({
849
+ jobId: entry.record.job_id,
850
+ requestId: entry.record.request_id,
851
+ nodeId: this.config.nodeId,
852
+ job: entry.record.job,
853
+ issuedAt: now,
854
+ expiresAt,
855
+ tokenSha256: entry.tokenSha256
856
+ })
857
+ });
858
+ this.audit(entry, "reservation_created", { resources: reservation.resources });
859
+ this.audit(entry, "envelope_issued", { expires_at: expiresAt });
860
+ this.audit(entry, "job_scheduled");
861
+ }
862
+ async runScheduled(entry) {
863
+ const controller = new AbortController();
864
+ entry.controller = controller;
865
+ this.transition(entry, "running", {
866
+ started_at: new Date().toISOString()
867
+ });
868
+ this.audit(entry, "job_started");
869
+ const envelope = {
870
+ job_id: entry.record.job_id,
871
+ request_id: entry.record.request_id,
872
+ node_id: this.config.nodeId,
873
+ job: entry.record.job
874
+ };
875
+ const result = await this.runtime.executeGenericJob(envelope, {
876
+ signal: controller.signal,
877
+ onEvent: async (event) => {
878
+ this.recordEvent(entry, event);
879
+ }
880
+ });
881
+ entry.controller = undefined;
882
+ if (result.status === "failed" && this.shouldRetry(entry, result.result.error?.code)) {
883
+ this.scheduleRetry(entry, result.result);
884
+ await this.dispatchQueued();
885
+ return;
886
+ }
887
+ const terminalState = result.status === "succeeded" ? "succeeded" : result.status === "cancelled" ? "cancelled" : "failed";
888
+ this.transition(entry, terminalState, {
889
+ finished_at: new Date().toISOString(),
890
+ result: result.result,
891
+ artifacts: result.result.artifacts || []
892
+ });
893
+ this.audit(entry, terminalState === "cancelled" ? "job_cancelled" : "job_completed", { status: terminalState });
894
+ this.releaseReservation(entry);
895
+ await this.dispatchQueued();
896
+ }
897
+ shouldRetry(entry, errorCode) {
898
+ if (!errorCode || entry.record.retry.retry_count >= entry.record.retry.max_retries) {
899
+ return false;
900
+ }
901
+ const retryable = entry.record.retry.retryable_error_codes || [];
902
+ return retryable.includes(errorCode);
903
+ }
904
+ scheduleRetry(entry, result) {
905
+ const retryCount = entry.record.retry.retry_count + 1;
906
+ const nextRetryAt = new Date().toISOString();
907
+ this.transition(entry, "retrying", {
908
+ finished_at: new Date().toISOString(),
909
+ result,
910
+ retry: {
911
+ ...entry.record.retry,
912
+ retry_count: retryCount,
913
+ next_retry_at: nextRetryAt
914
+ }
915
+ });
916
+ this.audit(entry, "job_retry_scheduled", { retry_count: retryCount, next_retry_at: nextRetryAt });
917
+ this.releaseReservation(entry);
918
+ this.transition(entry, "queued", {
919
+ queued_at: nextRetryAt,
920
+ scheduled_at: undefined,
921
+ started_at: undefined,
922
+ finished_at: undefined,
923
+ reservation: undefined,
924
+ envelope: undefined,
925
+ backpressure: undefined
926
+ });
927
+ }
928
+ recordEvent(entry, event) {
929
+ entry.events.push(event);
930
+ if (event.type === "stdout" || event.type === "stderr") {
931
+ entry.logs.push({
932
+ job_id: event.job_id,
933
+ sequence: event.sequence,
934
+ timestamp: event.timestamp,
935
+ stream: event.type,
936
+ message: event.message || "",
937
+ truncated: false
938
+ });
939
+ }
940
+ this.audit(entry, "job_event_recorded", { type: event.type, sequence: event.sequence });
941
+ }
942
+ releaseReservation(entry) {
943
+ if (!entry.record.reservation || entry.record.reservation.released_at) {
944
+ return;
945
+ }
946
+ entry.record.reservation = {
947
+ ...entry.record.reservation,
948
+ released_at: new Date().toISOString()
949
+ };
950
+ this.audit(entry, "reservation_released");
951
+ }
952
+ setBackpressure(entry, reason, message, retryAfterMs) {
953
+ entry.record.backpressure = {
954
+ reason,
955
+ message,
956
+ ...(retryAfterMs ? { retry_after_ms: retryAfterMs } : {})
957
+ };
958
+ entry.record.updated_at = new Date().toISOString();
959
+ }
960
+ transition(entry, state, patch = {}) {
961
+ if (!isMswarmLifecycleStateTransitionAllowed(entry.record.state, state)) {
962
+ throw new Error(`invalid lifecycle transition from ${entry.record.state} to ${state}`);
963
+ }
964
+ entry.record = {
965
+ ...entry.record,
966
+ ...patch,
967
+ state,
968
+ updated_at: new Date().toISOString()
969
+ };
970
+ this.syncRuntimeQueueTelemetry();
971
+ }
972
+ audit(entry, action, details) {
973
+ entry.audit.push(buildMswarmGenericJobAuditEvent({
974
+ auditId: `audit_${randomUUID()}`,
975
+ jobId: entry.record.job_id,
976
+ requestId: entry.record.request_id,
977
+ tenantId: entry.record.tenant_id,
978
+ nodeId: this.config.nodeId,
979
+ action,
980
+ timestamp: new Date().toISOString(),
981
+ details
982
+ }));
983
+ }
984
+ snapshot(entry) {
985
+ return {
986
+ job: entry.record,
987
+ events: [...entry.events],
988
+ logs: [...entry.logs],
989
+ artifacts: [...(entry.record.artifacts || [])],
990
+ audit: [...entry.audit]
991
+ };
992
+ }
993
+ opsJobSummary(entry) {
994
+ const lastEvent = entry.events[entry.events.length - 1];
995
+ return {
996
+ job_id: entry.record.job_id,
997
+ request_id: entry.record.request_id,
998
+ tenant_id: entry.record.tenant_id,
999
+ node_id: entry.record.node_id,
1000
+ state: entry.record.state,
1001
+ job_type: entry.record.job.job_type,
1002
+ schema_version: entry.record.job.schema_version,
1003
+ created_at: entry.record.created_at,
1004
+ updated_at: entry.record.updated_at,
1005
+ queued_at: entry.record.queued_at,
1006
+ scheduled_at: entry.record.scheduled_at,
1007
+ started_at: entry.record.started_at,
1008
+ finished_at: entry.record.finished_at,
1009
+ retry_count: entry.record.retry.retry_count,
1010
+ max_retries: entry.record.retry.max_retries,
1011
+ progress_percent: progressPercent(entry.events),
1012
+ last_event_type: lastEvent?.type,
1013
+ last_event_message: lastEvent?.message,
1014
+ artifact_count: (entry.record.artifacts || []).length,
1015
+ artifact_bytes: artifactBytes(entry.record),
1016
+ log_bytes: logBytes(entry.logs)
1017
+ };
1018
+ }
1019
+ mustGetEntry(jobId) {
1020
+ const entry = this.jobs.get(jobId);
1021
+ if (!entry) {
1022
+ throw new Error("job_not_found");
1023
+ }
1024
+ return entry;
1025
+ }
1026
+ }
158
1027
  export function buildSelfHostedNodeApp(runtime, config) {
159
1028
  const app = Fastify({ logger: false });
1029
+ const activeGenericJobs = new Map();
1030
+ const lifecycle = new OwnerLocalGenericJobLifecycleScheduler(runtime, config);
160
1031
  app.get("/healthz", async (_request, reply) => {
161
1032
  reply.send({ service: "mswarm-self-hosted-node", status: "ok", node_id: config.nodeId });
162
1033
  });
1034
// Capability projection for this node. Requires owner-local mode, a
// configured signing secret and a bearer capability token issued for this
// node; each failed precondition answers with its own status/code.
app.get("/v1/swarm/self-hosted/node/capabilities", async (request, reply) => {
    const reject = (statusCode, error, code, message) => {
        reply.status(statusCode).send({ error, code, message });
    };
    if (!isOwnerLocalNodeApiMode(config)) {
        reject(403, "forbidden", "owner_local_required", "Node capabilities are only available in owner-local direct mode");
        return;
    }
    const secret = config.invocationSigningSecret;
    if (!secret) {
        reject(503, "service_unavailable", "missing_config", "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for capability reads");
        return;
    }
    const token = extractBearerToken(request.headers);
    if (!token) {
        reject(401, "unauthorized", "unauthorized", "Missing capability token");
        return;
    }
    let claims;
    try {
        claims = verifySelfHostedCapabilityToken({ token, secret });
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : "Invalid capability token";
        reject(401, "unauthorized", "unauthorized", reason);
        return;
    }
    // A valid token minted for a different node must not read this node's capabilities.
    const issuedForThisNode = claims.node_id === config.nodeId;
    if (!issuedForThisNode) {
        reject(400, "bad_request", "validation_failed", "capability token node_id does not match this node");
        return;
    }
    const projection = await runtime.publicCapabilityProjection();
    reply.send(projection);
});
163
1085
  app.post("/v1/swarm/self-hosted/node/jobs", async (request, reply) => {
164
1086
  if (!config.invocationSigningSecret) {
165
1087
  reply.status(503).send({
@@ -262,6 +1184,496 @@ export function buildSelfHostedNodeApp(runtime, config) {
262
1184
  }
263
1185
  reply.send(result);
264
1186
  });
1187
// Direct (non-lifecycle) execution of a generic job. After the feature,
// mode, secret and token checks, the job is registered in
// `activeGenericJobs` for the duration of the run. Clients that accept an
// event stream get events over SSE; all others get the final result as JSON.
app.post("/v1/swarm/self-hosted/node/generic-jobs", async (request, reply) => {
    const reject = (statusCode, error, code, message) => {
        reply.status(statusCode).send({ error, code, message });
    };
    if (!config.genericJobsEnabled) {
        reject(404, "not_found", "feature_disabled", "Generic node jobs are disabled on this node");
        return;
    }
    if (!isOwnerLocalGenericMode(config)) {
        reject(403, "forbidden", "owner_local_required", "Generic node jobs are only available in owner-local direct mode");
        return;
    }
    const secret = config.invocationSigningSecret;
    if (!secret) {
        reject(503, "service_unavailable", "missing_config", "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for generic jobs");
        return;
    }
    const token = extractBearerToken(request.headers);
    if (!token) {
        reject(401, "unauthorized", "unauthorized", "Missing generic job token");
        return;
    }
    let claims;
    try {
        claims = verifySelfHostedGenericJobToken({ token, secret });
    }
    catch (error) {
        reject(401, "unauthorized", "unauthorized", error instanceof Error ? error.message : "Invalid generic job token");
        return;
    }
    const job = request.body;
    try {
        assertGenericJobMatchesClaims(job, claims);
    }
    catch (error) {
        reject(400, "bad_request", "validation_failed", error instanceof Error ? error.message : "Invalid generic job");
        return;
    }
    const wantsEventStream = acceptsGenericEventStream(request.headers);
    if (activeGenericJobs.has(job.job_id)) {
        reject(409, "conflict", "job_already_running", "Generic job is already running on this node");
        return;
    }
    const abortController = new AbortController();
    activeGenericJobs.set(job.job_id, { controller: abortController, claims });
    // Drop the registry entry only if it is still the one created by this request.
    const releaseActiveJob = () => {
        if (activeGenericJobs.get(job.job_id)?.controller === abortController) {
            activeGenericJobs.delete(job.job_id);
        }
    };
    if (!wantsEventStream) {
        const result = await runtime
            .executeGenericJob(job, { signal: abortController.signal })
            .finally(releaseActiveJob);
        if (result.status === "succeeded") {
            reply.send(result);
            return;
        }
        reply.status(genericJobFailureStatusCode(result)).send(result);
        return;
    }
    // Streaming path: take over the raw response and write SSE frames by hand.
    reply.hijack();
    reply.raw.writeHead(200, {
        "content-type": "text/event-stream; charset=utf-8",
        "cache-control": "no-cache, no-transform",
        connection: "keep-alive",
        "x-accel-buffering": "no"
    });
    const streamIsOpen = () => !reply.raw.destroyed && !reply.raw.writableEnded;
    // The response closing while the job is still running cancels the job.
    const onClose = () => {
        if (!abortController.signal.aborted) {
            abortController.abort("cancelled");
        }
    };
    reply.raw.once("close", onClose);
    const keepAlive = setInterval(() => {
        if (streamIsOpen()) {
            reply.raw.write(": keep-alive\n\n");
        }
    }, 15000);
    try {
        await runtime.executeGenericJob(job, {
            signal: abortController.signal,
            onEvent: async (event) => {
                writeGenericJobSseEvent(reply.raw, { ...event });
            }
        });
        writeSelfHostedSseDone(reply.raw);
    }
    catch (error) {
        // Surface the failure as a final "failed" event before closing the stream.
        const failure = {
            job_id: job.job_id,
            type: "failed",
            sequence: 0,
            timestamp: new Date().toISOString(),
            message: error instanceof Error ? error.message : String(error),
            data: { code: "upstream_error" }
        };
        writeGenericJobSseEvent(reply.raw, failure);
        writeSelfHostedSseDone(reply.raw);
    }
    finally {
        clearInterval(keepAlive);
        reply.raw.removeListener("close", onClose);
        releaseActiveJob();
        if (streamIsOpen()) {
            reply.raw.end();
        }
    }
});
1321
// Cancel a directly executed generic job. The cancellation token must be
// issued for this node and job, and must agree with the claims stored for
// the active run (request_id, schema_version, job_type). On success the
// run's AbortController is aborted and 202 is returned.
app.post("/v1/swarm/self-hosted/node/generic-jobs/:job_id/cancel", async (request, reply) => {
    const reject = (statusCode, error, code, message) => {
        reply.status(statusCode).send({ error, code, message });
    };
    if (!config.genericJobsEnabled) {
        reject(404, "not_found", "feature_disabled", "Generic node jobs are disabled on this node");
        return;
    }
    if (!isOwnerLocalGenericMode(config)) {
        reject(403, "forbidden", "owner_local_required", "Generic node jobs are only available in owner-local direct mode");
        return;
    }
    const secret = config.invocationSigningSecret;
    if (!secret) {
        reject(503, "service_unavailable", "missing_config", "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for generic jobs");
        return;
    }
    const token = extractBearerToken(request.headers);
    if (!token) {
        reject(401, "unauthorized", "unauthorized", "Missing generic job token");
        return;
    }
    let claims;
    try {
        claims = verifySelfHostedGenericJobToken({ token, secret });
    }
    catch (error) {
        reject(401, "unauthorized", "unauthorized", error instanceof Error ? error.message : "Invalid generic job token");
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    if (!jobId) {
        reject(400, "bad_request", "validation_failed", "generic job_id is required");
        return;
    }
    const tokenTargetsThisJob = claims.node_id === config.nodeId && claims.job_id === jobId;
    if (!tokenTargetsThisJob) {
        reject(400, "bad_request", "validation_failed", "generic cancellation token does not match this node or job");
        return;
    }
    const activeJob = activeGenericJobs.get(jobId);
    if (!activeJob) {
        reject(404, "not_found", "job_not_running", "Generic job is not running on this node");
        return;
    }
    // The token must describe the same request that started the active run.
    const bindingFields = ["request_id", "schema_version", "job_type"];
    const claimsDiffer = bindingFields.some((field) => activeJob.claims[field] !== claims[field]);
    if (claimsDiffer) {
        reject(400, "bad_request", "validation_failed", "generic cancellation token does not match the active request");
        return;
    }
    const { controller } = activeJob;
    if (!controller.signal.aborted) {
        controller.abort("cancelled");
    }
    reply.status(202).send({
        job_id: jobId,
        request_id: activeJob.claims.request_id,
        status: "cancelling"
    });
});
1415
// Submit a job to the lifecycle scheduler. Responds 200 when the scheduler
// reports the job as reused and 202 otherwise. Errors whose message
// contains "conflict" become 409; every other error becomes 400.
app.post("/v1/swarm/self-hosted/node/generic-job-control/jobs", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const job = request.body;
    try {
        assertGenericJobMatchesClaims(job, auth.claims);
        assertLifecycleJobIdMatchesClaims(job.job_id, config, auth.claims);
        const { reused, snapshot } = lifecycle.create(job, auth.claims, auth.token);
        const statusCode = reused ? 200 : 202;
        reply.status(statusCode).send(snapshot);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle job";
        const isConflict = message.includes("conflict");
        if (isConflict) {
            // Conflict errors use their own message as the response code.
            reply.status(409).send({ error: "conflict", code: message, message });
        }
        else {
            reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
        }
    }
});
1437
// Upload an input artifact for a lifecycle job.
//
// The request body is decoded into a relative path plus bytes, the target is
// resolved under the job's upload root, and the file is created with mode
// 0o600. An existing target is never overwritten: the request fails with
// "artifact_upload_target_exists". Any error raised along the way is
// reported as 400 validation_failed with the error's message.
app.post("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/artifacts", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
        const upload = decodeArtifactUploadBody(request.body);
        const root = artifactUploadRoot(config, jobId);
        const target = resolveArtifactUploadTarget(config, jobId, upload.path);
        // NOTE(review): directories are created before the symlink-segment
        // check runs; confirm the helper needs them to exist, otherwise the
        // check could be moved ahead of mkdir.
        await mkdir(dirname(target), { recursive: true });
        await assertNoArtifactSymlinkSegments(root, upload.path);
        // Early, friendly rejection when something already sits at the target.
        try {
            await lstat(target);
            throw new Error("artifact_upload_target_exists");
        }
        catch (error) {
            if (error.code !== "ENOENT")
                throw error;
        }
        // The lstat above is only advisory: another request could create the
        // path between the check and the write. "wx" makes creation exclusive,
        // so a path that appeared in the meantime fails with EEXIST instead of
        // being overwritten.
        try {
            await writeFile(target, upload.bytes, { mode: 0o600, flag: "wx" });
        }
        catch (error) {
            if (error?.code === "EEXIST") {
                throw new Error("artifact_upload_target_exists");
            }
            throw error;
        }
        reply.status(201).send({
            job_id: jobId,
            artifact: {
                id: `upload_${upload.sha256.slice(0, 16)}`,
                uri: buildMswarmLocalArtifactUri(jobId, upload.path),
                name: upload.name,
                content_type: upload.contentType,
                size_bytes: upload.bytes.length,
                sha256: upload.sha256,
                scope: "input",
                access: defaultMswarmArtifactAccessPolicy("owner-local"),
                retention: defaultMswarmArtifactRetentionPolicy()
            }
        });
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle artifact upload";
        reply.status(400).send({
            error: "bad_request",
            code: "validation_failed",
            message
        });
    }
});
1484
// Operational overview of the lifecycle scheduler. Uses the ops-token
// verifier rather than the per-job token verifier.
app.get("/v1/swarm/self-hosted/node/generic-job-control/ops", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobOpsRequest(config, request.headers);
    if (auth.ok) {
        const options = opsQueryOptions(request.query);
        const report = await lifecycle.ops(options);
        reply.send(report);
        return;
    }
    reply.status(auth.statusCode).send(auth.payload);
});
1492
// Full snapshot of one lifecycle job. 400 when the token does not match the
// requested job, 404 when the scheduler has no such job.
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle job token";
        reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
        return;
    }
    const snapshot = lifecycle.get(jobId);
    if (snapshot) {
        reply.send(snapshot);
        return;
    }
    reply.status(404).send({
        error: "not_found",
        code: "job_not_found",
        message: "Generic lifecycle job was not found"
    });
});
1521
// Recorded events of one lifecycle job, returned as { job_id, events }.
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/events", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle job token";
        reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
        return;
    }
    const snapshot = lifecycle.get(jobId);
    if (snapshot) {
        const { events } = snapshot;
        reply.send({ job_id: jobId, events });
        return;
    }
    reply.status(404).send({
        error: "not_found",
        code: "job_not_found",
        message: "Generic lifecycle job was not found"
    });
});
1550
// Captured log lines of one lifecycle job, returned as { job_id, logs }.
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/logs", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle job token";
        reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
        return;
    }
    const snapshot = lifecycle.get(jobId);
    if (snapshot) {
        const { logs } = snapshot;
        reply.send({ job_id: jobId, logs });
        return;
    }
    reply.status(404).send({
        error: "not_found",
        code: "job_not_found",
        message: "Generic lifecycle job was not found"
    });
});
1579
// Artifact list of one lifecycle job, returned as { job_id, artifacts }.
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/artifacts", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle job token";
        reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
        return;
    }
    const snapshot = lifecycle.get(jobId);
    if (snapshot) {
        const { artifacts } = snapshot;
        reply.send({ job_id: jobId, artifacts });
        return;
    }
    reply.status(404).send({
        error: "not_found",
        code: "job_not_found",
        message: "Generic lifecycle job was not found"
    });
});
1608
// Audit trail of one lifecycle job, returned as { job_id, audit }.
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/audit", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle job token";
        reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
        return;
    }
    const snapshot = lifecycle.get(jobId);
    if (snapshot) {
        const { audit } = snapshot;
        reply.send({ job_id: jobId, audit });
        return;
    }
    reply.status(404).send({
        error: "not_found",
        code: "job_not_found",
        message: "Generic lifecycle job was not found"
    });
});
1637
// Ask the lifecycle scheduler to cancel a job. 202 with the scheduler's
// response on success; an error with message "job_not_found" becomes 404 and
// every other error becomes 400.
app.post("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/cancel", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
        const outcome = lifecycle.cancel(jobId, auth.claims);
        reply.status(202).send(outcome);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle cancellation";
        if (message === "job_not_found") {
            reply.status(404).send({
                error: "not_found",
                code: "job_not_found",
                message: "Generic lifecycle job was not found"
            });
            return;
        }
        reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
    }
});
1657
// Ask the lifecycle scheduler to retry a job. 202 with the scheduler's
// response on success; an error with message "job_not_found" becomes 404 and
// every other error becomes 400.
app.post("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/retry", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
        const outcome = lifecycle.retry(jobId, auth.claims);
        reply.status(202).send(outcome);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle retry";
        if (message === "job_not_found") {
            reply.status(404).send({
                error: "not_found",
                code: "job_not_found",
                message: "Generic lifecycle job was not found"
            });
            return;
        }
        reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
    }
});
265
1677
  return app;
266
1678
  }
267
1679
  export async function main(argv = process.argv) {