@mcoda/mswarm 0.1.75 → 0.1.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/server.js CHANGED
@@ -1,11 +1,13 @@
1
1
  #!/usr/bin/env node
2
2
  import { realpathSync } from "node:fs";
3
- import { readFile } from "node:fs/promises";
3
+ import { lstat, mkdir, readFile, writeFile } from "node:fs/promises";
4
4
  import { fileURLToPath } from "node:url";
5
- import { resolve } from "node:path";
5
+ import { dirname, resolve, sep } from "node:path";
6
+ import { createHash, randomUUID } from "node:crypto";
6
7
  import Fastify from "fastify";
7
- import { verifySelfHostedInvocationToken } from "./invocation-token.js";
8
- import { controlSelfHostedNodeService, installSelfHostedNodeService, readOwnerSetupConfig, readSelfHostedNodeConfig, resolveSelfHostedNodeServiceLayout, SelfHostedNodeRuntime, uninstallSelfHostedNodeService } from "./runtime.js";
8
+ import { MSWARM_JOB_LIFECYCLE_SCHEMA_VERSION, assertMswarmSafeRelativePath, buildMswarmLocalArtifactUri, defaultMswarmArtifactAccessPolicy, defaultMswarmArtifactRetentionPolicy, buildMswarmGenericJobAuditEvent, buildMswarmGenericJobEnvelopeDescriptor, isMswarmLifecycleStateTransitionAllowed, isMswarmTerminalLifecycleState, normalizeMswarmGenericJobIdempotencyKey } from "@mcoda/shared";
9
+ import { verifySelfHostedCapabilityToken, verifySelfHostedGenericJobOpsToken, verifySelfHostedGenericJobToken, verifySelfHostedInvocationToken } from "./invocation-token.js";
10
+ import { controlSelfHostedNodeService, installSelfHostedNodeService, readOwnerSetupConfig, readSelfHostedNodeConfig, resolveSelfHostedNodeServiceLayout, SelfHostedNodeRuntime, uninstallSelfHostedNodeService, genericJobCapabilityMismatch } from "./runtime.js";
9
11
  const SELF_HOSTED_NODE_PROCESS_TITLE = "mswarm-node";
10
12
  function applySelfHostedNodeProcessTitle() {
11
13
  const title = process.env.MSWARM_SELF_HOSTED_PROCESS_TITLE?.trim() || SELF_HOSTED_NODE_PROCESS_TITLE;
@@ -44,7 +46,12 @@ Environment:
44
46
  MSWARM_SELF_HOSTED_OLLAMA_BASE_URL Ollama base URL, defaults to http://127.0.0.1:11434
45
47
  MSWARM_SELF_HOSTED_NODE_STATE_PATH Config/state file, defaults to ~/.mswarm/self-hosted-node/config.json
46
48
  MSWARM_SELF_HOSTED_NODE_KEY_PATH Runtime token file, defaults to ~/.mswarm/self-hosted-node/node.key
49
+ MSWARM_SELF_HOSTED_ARTIFACT_STORE_PATH Local generic-job artifact store, defaults to ~/.mswarm/self-hosted-node/artifacts
47
50
  MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET Shared direct-job signing secret
51
+ MSWARM_SELF_HOSTED_GENERIC_JOBS_ENABLED Enable owner-local generic jobs, defaults to false
52
+ MSWARM_SELF_HOSTED_GENERIC_JOB_TIMEOUT_MS Generic job timeout, defaults to self-hosted job timeout
53
+ MSWARM_SELF_HOSTED_GENERIC_JOB_MAX_CONCURRENCY Generic job concurrency, defaults to 1
54
+ MSWARM_SELF_HOSTED_CAPABILITY_PROBE_TIMEOUT_MS Capability probe timeout, defaults to 2000
48
55
  MSWARM_SELF_HOSTED_LISTEN_HOST Direct node bind host, defaults to 127.0.0.1
49
56
  MSWARM_SELF_HOSTED_LISTEN_PORT Direct node bind port, defaults to 18083
50
57
  MSWARM_SELF_HOSTED_MODEL_ALLOWLIST Comma-separated local agent slugs/model names to expose
@@ -63,6 +70,10 @@ Setup options:
63
70
  --block <SLUGS> Comma-separated blocklist
64
71
  --expose-all Expose all healthy non-embedding local agents (default)
65
72
  --no-expose-all Expose only allowlisted local agents
73
+ --enable-generic-jobs Enable owner-local generic job endpoint for development
74
+ --generic-job-timeout-ms <N> Generic job timeout for owner-local development
75
+ --generic-job-max-concurrency <N> Generic job concurrency for owner-local development
76
+ --artifact-store-path <PATH> Local generic-job artifact store path
66
77
  --start Start foreground daemon after setup
67
78
 
68
79
  Log options:
@@ -149,17 +160,913 @@ function assertJobMatchesClaims(job, claims) {
149
160
  throw new Error("model does not match invocation token");
150
161
  }
151
162
  }
163
/**
 * Ensures a generic job envelope agrees with the claims of its invocation token.
 *
 * Fields are compared with strict equality in a fixed order (node_id, job_id,
 * request_id, schema_version, job_type); the first difference raises.
 *
 * @param {object} job - Envelope with `node_id`, `job_id`, `request_id` and an optional nested `job`.
 * @param {object} claims - Token claims to compare against.
 * @throws {Error} When any compared field differs.
 */
function assertGenericJobMatchesClaims(job, claims) {
    const comparisons = [
        [job.node_id, claims.node_id, "generic job node_id does not match invocation token"],
        [job.job_id, claims.job_id, "generic job_id does not match invocation token"],
        [job.request_id, claims.request_id, "generic request_id does not match invocation token"],
        [job.job?.schema_version, claims.schema_version, "generic schema_version does not match invocation token"],
        [job.job?.job_type, claims.job_type, "generic job_type does not match invocation token"]
    ];
    for (const [actual, expected, message] of comparisons) {
        if (actual !== expected) {
            throw new Error(message);
        }
    }
}
180
/**
 * Reports whether a host string names the local machine's loopback interface.
 *
 * Accepts `localhost`, the IPv6 loopback (`::1`, with or without brackets) and
 * any dotted-quad address in 127.0.0.0/8. Matching ignores case and
 * surrounding whitespace.
 *
 * @param {string | null | undefined} value - Host name or address to classify.
 * @returns {boolean} True only for loopback hosts.
 */
function isOwnerLocalHost(value) {
    const normalized = (value || "").trim().toLowerCase();
    if (normalized === "localhost" || normalized === "::1" || normalized === "[::1]") {
        return true;
    }
    const match = /^127\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(normalized);
    if (match === null) {
        return false;
    }
    // `\d{1,3}` alone would accept 127.0.0.999; every octet must be a real byte value.
    return match.slice(1).every((octet) => Number(octet) <= 255);
}
188
/**
 * Reports whether generic jobs may be served: the feature flag must be set and
 * the node must be in owner-local mode (loopback listen host and, when a
 * direct base URL is configured, a loopback host in that URL).
 *
 * @param {object} config - Node configuration (`genericJobsEnabled`, `listenHost`, `directBaseUrl`).
 * @returns {boolean} True when generic jobs are enabled and the node is owner-local.
 */
function isOwnerLocalGenericMode(config) {
    // The loopback checks are exactly the ones isOwnerLocalNodeApiMode performs.
    return config.genericJobsEnabled ? isOwnerLocalNodeApiMode(config) : false;
}
202
/**
 * Reports whether the node API is reachable only from the owner's machine.
 *
 * The listen host must be loopback. When a direct base URL is configured its
 * hostname must be loopback too; a URL that fails to parse counts as not local.
 *
 * @param {object} config - Node configuration (`listenHost`, optional `directBaseUrl`).
 * @returns {boolean} True when every configured address is loopback.
 */
function isOwnerLocalNodeApiMode(config) {
    const { listenHost, directBaseUrl } = config;
    if (!isOwnerLocalHost(listenHost)) {
        return false;
    }
    if (!directBaseUrl) {
        return true;
    }
    let advertisedHostIsLocal = false;
    try {
        const { hostname } = new URL(directBaseUrl);
        advertisedHostIsLocal = isOwnerLocalHost(hostname);
    }
    catch {
        advertisedHostIsLocal = false;
    }
    return advertisedHostIsLocal;
}
216
/**
 * Reports whether the request's Accept header lists `text/event-stream`.
 *
 * Each comma-separated entry is reduced to its media type (parameters after
 * `;` are dropped) and compared exactly, ignoring case, so look-alike types
 * such as `text/event-streaming` no longer match.
 *
 * @param {object} headers - Request headers, read through `headerText` (defined
 *   elsewhere in this file; presumably yields a string or nothing — confirm).
 * @returns {boolean} True when an SSE response is acceptable to the client.
 */
function acceptsGenericEventStream(headers) {
    const accept = headerText(headers, "accept") || "";
    return accept.split(",").some((entry) => {
        const [mediaType] = entry.split(";");
        return mediaType.trim().toLowerCase() === "text/event-stream";
    });
}
152
220
  function writeSelfHostedSseChunk(raw, chunk) {
153
221
  raw.write(`data: ${JSON.stringify(chunk)}\n\n`);
154
222
  }
155
223
  function writeSelfHostedSseDone(raw) {
156
224
  raw.write("data: [DONE]\n\n");
157
225
  }
226
/**
 * Writes a generic job event as a named server-sent event: an `event:` line
 * with the event type (or `message` when the type is falsy) followed by a
 * `data:` line holding the JSON-serialized event.
 *
 * @param {{ write: Function }} raw - Writable response stream.
 * @param {object} event - Job event; `event.type` becomes the SSE event name.
 */
function writeGenericJobSseEvent(raw, event) {
    // SSE fields are line-delimited: a CR/LF inside the type would terminate the
    // `event:` line early and let the remainder be parsed as extra fields or
    // frames, so line breaks are collapsed to a single space.
    const eventName = String(event.type || "message").replace(/[\r\n]+/g, " ");
    raw.write(`event: ${eventName}\n`);
    raw.write(`data: ${JSON.stringify(event)}\n\n`);
}
230
/**
 * Chooses the HTTP status code for a generic job that did not succeed.
 *
 * The error code takes precedence over the job status:
 * `validation_failed` → 400, `timeout` → 408, status `cancelled` → 409,
 * anything else → 502.
 *
 * @param {{ status: string, result: { error?: { code?: string } } }} result - Execution outcome.
 * @returns {number} HTTP status code.
 */
function genericJobFailureStatusCode(result) {
    const errorCode = result.result.error?.code;
    if (errorCode === "validation_failed") {
        return 400;
    }
    if (errorCode === "timeout") {
        return 408;
    }
    if (result.status === "cancelled") {
        return 409;
    }
    return 502;
}
240
/** Largest decoded artifact upload accepted, in bytes (128 MiB). */
const MAX_OWNER_LOCAL_ARTIFACT_UPLOAD_BYTES = 128 * 1024 ** 2;
241
/**
 * Normalizes an optional text value.
 *
 * @param {*} value - Candidate value of any type.
 * @returns {string | undefined} The trimmed string, or undefined when `value`
 *   is not a string or is empty after trimming.
 */
function optionalString(value) {
    if (typeof value !== "string") {
        return undefined;
    }
    const trimmed = value.trim();
    return trimmed === "" ? undefined : trimmed;
}
244
/**
 * Validates and decodes an artifact upload request body.
 *
 * Recognized fields on `body` are `path`, `content_base64`, and the optional
 * `name`, `content_type`, `size_bytes` and `sha256`. A body that is not a
 * plain object is treated as empty.
 *
 * @param {*} body - Parsed request body.
 * @returns {{ name: string, path: string, contentType: (string | undefined), sha256: string, bytes: Buffer }}
 *   The decoded content with its path, display name and lowercase hex digest.
 * @throws {Error} `content_base64_required`, `content_base64_invalid`,
 *   `artifact_upload_size_limit_exceeded`, `artifact_upload_size_mismatch`,
 *   `artifact_upload_checksum_mismatch`, or whatever
 *   `assertMswarmSafeRelativePath` raises for the path.
 */
function decodeArtifactUploadBody(body) {
    const payload = (body && typeof body === "object" && !Array.isArray(body) ? body : {});
    const artifactPath = assertMswarmSafeRelativePath(payload.path, "artifact_path");
    const rawBase64 = optionalString(payload.content_base64);
    if (!rawBase64) {
        throw new Error("content_base64_required");
    }
    const normalizedBase64 = rawBase64.replace(/\s/g, "");
    if (!/^[a-zA-Z0-9+/]+={0,2}$/.test(normalizedBase64) || normalizedBase64.length % 4 === 1) {
        throw new Error("content_base64_invalid");
    }
    // Reject oversized uploads from the encoded length, before allocating the
    // decoded buffer: every 4 base64 characters carry 3 bytes.
    const paddingLength = normalizedBase64.endsWith("==") ? 2 : normalizedBase64.endsWith("=") ? 1 : 0;
    const estimatedBytes = Math.floor(((normalizedBase64.length - paddingLength) * 3) / 4);
    if (estimatedBytes > MAX_OWNER_LOCAL_ARTIFACT_UPLOAD_BYTES) {
        throw new Error("artifact_upload_size_limit_exceeded");
    }
    const bytes = Buffer.from(normalizedBase64, "base64");
    // The decoded length stays the authoritative check.
    if (bytes.length > MAX_OWNER_LOCAL_ARTIFACT_UPLOAD_BYTES) {
        throw new Error("artifact_upload_size_limit_exceeded");
    }
    if (typeof payload.size_bytes === "number" && Number.isFinite(payload.size_bytes) && payload.size_bytes !== bytes.length) {
        throw new Error("artifact_upload_size_mismatch");
    }
    const sha256 = sha256Hex(bytes);
    const expectedSha = optionalString(payload.sha256);
    // Hex digests are case-insensitive; the computed digest is lowercase, so an
    // uppercase client digest must not be reported as a mismatch.
    if (expectedSha && expectedSha.toLowerCase() !== sha256) {
        throw new Error("artifact_upload_checksum_mismatch");
    }
    return {
        name: optionalString(payload.name) || artifactPath.split("/").pop() || "artifact",
        path: artifactPath,
        contentType: optionalString(payload.content_type),
        sha256,
        bytes
    };
}
275
/**
 * Resolves the directory that holds uploaded artifacts for one job.
 *
 * Characters in `jobId` outside `[a-zA-Z0-9_.-]` are replaced with `_`, the
 * result is checked by `assertMswarmSafeRelativePath`, and the directory is
 * resolved under `config.artifactStorePath` (or `"."` when that is unset).
 *
 * @param {object} config - Node configuration (`artifactStorePath`).
 * @param {string} jobId - Job identifier.
 * @returns {string} Absolute path of the job's artifact directory.
 */
function artifactUploadRoot(config, jobId) {
    const sanitizedId = jobId.replace(/[^a-zA-Z0-9_.-]/g, "_");
    const jobDirectory = assertMswarmSafeRelativePath(sanitizedId, "job_id");
    const storeRoot = config.artifactStorePath || ".";
    return resolve(storeRoot, jobDirectory);
}
279
/**
 * Resolves where an uploaded artifact will be written and verifies that the
 * location stays inside the job's artifact directory.
 *
 * @param {object} config - Node configuration passed on to `artifactUploadRoot`.
 * @param {string} jobId - Job identifier.
 * @param {string} relativePath - Artifact path relative to the job directory.
 * @returns {string} Absolute target path (the job directory itself is allowed).
 * @throws {Error} `artifact_path_escape_rejected` when the path resolves outside the job directory.
 */
function resolveArtifactUploadTarget(config, jobId, relativePath) {
    const jobRoot = artifactUploadRoot(config, jobId);
    const candidate = resolve(jobRoot, relativePath);
    if (candidate === jobRoot) {
        return candidate;
    }
    // Compare against the root with a trailing separator so a sibling such as
    // "<root>-other" is not mistaken for a child of "<root>".
    const containedPrefix = jobRoot.endsWith(sep) ? jobRoot : jobRoot + sep;
    if (candidate.startsWith(containedPrefix)) {
        return candidate;
    }
    throw new Error("artifact_path_escape_rejected");
}
288
/**
 * Walks the parent directories of an artifact path and rejects any that is a
 * symbolic link or not a directory.
 *
 * Only the segments before the final one are inspected; the final segment
 * itself is not checked. Segments that do not exist yet are skipped.
 *
 * @param {string} root - Directory the relative path is resolved against.
 * @param {string} relativePath - `/`-separated artifact path.
 * @returns {Promise<void>}
 * @throws {Error} `artifact_path_symlink_rejected`, `artifact_path_parent_not_directory`,
 *   or any `lstat` failure other than ENOENT.
 */
async function assertNoArtifactSymlinkSegments(root, relativePath) {
    const parentSegments = relativePath.split("/");
    parentSegments.pop();
    let current = root;
    for (const name of parentSegments) {
        current = resolve(current, name);
        let stats;
        try {
            stats = await lstat(current);
        }
        catch (error) {
            if (error.code === "ENOENT") {
                // Nothing exists here yet, so there is nothing to inspect.
                continue;
            }
            throw error;
        }
        if (stats.isSymbolicLink()) {
            throw new Error("artifact_path_symlink_rejected");
        }
        if (!stats.isDirectory()) {
            throw new Error("artifact_path_parent_not_directory");
        }
    }
}
309
/**
 * Computes the SHA-256 digest of the given data.
 *
 * @param {Buffer | string} buffer - Data to hash.
 * @returns {string} Lowercase hexadecimal digest.
 */
function sha256Hex(buffer) {
    const hasher = createHash("sha256");
    hasher.update(buffer);
    return hasher.digest("hex");
}
312
/**
 * Authorizes a generic job request against the node configuration and the
 * bearer token found in the request headers.
 *
 * Checks run in order; the first failure decides the response:
 * feature flag (404), owner-local mode (403), signing secret configured (503),
 * bearer token present (401), token verification (401).
 *
 * @param {object} config - Node configuration (`genericJobsEnabled`, `invocationSigningSecret`, ...).
 * @param {object} headers - Request headers handed to `extractBearerToken`.
 * @returns {{ ok: true, token: string, claims: object } |
 *           { ok: false, statusCode: number, payload: { error: string, code: string, message: string } }}
 */
function verifyOwnerLocalGenericJobRequest(config, headers) {
    const reject = (statusCode, error, code, message) => ({
        ok: false,
        statusCode,
        payload: { error, code, message }
    });
    if (!config.genericJobsEnabled) {
        return reject(404, "not_found", "feature_disabled", "Generic node jobs are disabled on this node");
    }
    if (!isOwnerLocalGenericMode(config)) {
        return reject(403, "forbidden", "owner_local_required", "Generic node jobs are only available in owner-local direct mode");
    }
    if (!config.invocationSigningSecret) {
        return reject(503, "service_unavailable", "missing_config", "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for generic jobs");
    }
    const token = extractBearerToken(headers);
    if (!token) {
        return reject(401, "unauthorized", "unauthorized", "Missing generic job token");
    }
    let claims;
    try {
        claims = verifySelfHostedGenericJobToken({
            token,
            secret: config.invocationSigningSecret
        });
    }
    catch (error) {
        // The verifier's own message is surfaced; non-Error throws get a generic one.
        const message = error instanceof Error ? error.message : "Invalid generic job token";
        return reject(401, "unauthorized", "unauthorized", message);
    }
    return { ok: true, token, claims };
}
380
/**
 * Authorizes a generic job operations request against the node configuration
 * and the bearer token found in the request headers.
 *
 * Checks run in order; the first failure decides the response:
 * feature flag (404), owner-local mode (403), signing secret configured (503),
 * bearer token present (401), token verification (401), and finally the
 * token's `node_id` must equal `config.nodeId` (400).
 *
 * @param {object} config - Node configuration (`genericJobsEnabled`, `invocationSigningSecret`, `nodeId`, ...).
 * @param {object} headers - Request headers handed to `extractBearerToken`.
 * @returns {{ ok: true, claims: object } |
 *           { ok: false, statusCode: number, payload: { error: string, code: string, message: string } }}
 */
function verifyOwnerLocalGenericJobOpsRequest(config, headers) {
    const reject = (statusCode, error, code, message) => ({
        ok: false,
        statusCode,
        payload: { error, code, message }
    });
    if (!config.genericJobsEnabled) {
        return reject(404, "not_found", "feature_disabled", "Generic node jobs are disabled on this node");
    }
    if (!isOwnerLocalGenericMode(config)) {
        return reject(403, "forbidden", "owner_local_required", "Generic node operations are only available in owner-local direct mode");
    }
    if (!config.invocationSigningSecret) {
        return reject(503, "service_unavailable", "missing_config", "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for generic job operations");
    }
    const token = extractBearerToken(headers);
    if (!token) {
        return reject(401, "unauthorized", "unauthorized", "Missing generic job ops token");
    }
    let claims;
    try {
        claims = verifySelfHostedGenericJobOpsToken({
            token,
            secret: config.invocationSigningSecret
        });
    }
    catch (error) {
        // The verifier's own message is surfaced; non-Error throws get a generic one.
        const message = error instanceof Error ? error.message : "Invalid generic job ops token";
        return reject(401, "unauthorized", "unauthorized", message);
    }
    if (claims.node_id !== config.nodeId) {
        return reject(400, "bad_request", "validation_failed", "generic job ops token does not match this node");
    }
    return { ok: true, claims };
}
456
/**
 * Ensures token claims were issued for this node and for the addressed job.
 *
 * @param {string} jobId - Job identifier taken from the request.
 * @param {object} config - Node configuration (`nodeId`).
 * @param {object} claims - Token claims (`node_id`, `job_id`).
 * @throws {Error} When either the node or the job differs.
 */
function assertLifecycleJobIdMatchesClaims(jobId, config, claims) {
    const issuedForThisNode = claims.node_id === config.nodeId;
    const issuedForThisJob = claims.job_id === jobId;
    if (!(issuedForThisNode && issuedForThisJob)) {
        throw new Error("generic job token does not match this node or job");
    }
}
461
/**
 * Determines the tenant a generic job belongs to.
 *
 * @param {{ job: { metadata?: object } }} job - Job envelope.
 * @returns {string} The trimmed `job.job.metadata.tenant_id` when it is a
 *   non-blank string, otherwise `"owner-local"`.
 */
function tenantIdForGenericJob(job) {
    const declared = job.job.metadata?.tenant_id;
    const trimmed = typeof declared === "string" ? declared.trim() : "";
    return trimmed === "" ? "owner-local" : trimmed;
}
466
/**
 * Returns how many generic jobs may run at the same time.
 *
 * @param {object} config - Node configuration (`genericJobMaxConcurrency`).
 * @returns {number} The configured value rounded down to a whole number, or 1
 *   when it is missing, not a finite number, or below 1.
 */
function genericJobMaxConcurrency(config) {
    const configured = config.genericJobMaxConcurrency;
    // Values in (0, 1) used to floor to 0, which would keep every job queued
    // forever; anything below one slot now falls back to the default of 1.
    if (typeof configured !== "number" || !Number.isFinite(configured) || configured < 1) {
        return 1;
    }
    return Math.floor(configured);
}
470
/**
 * Parses a numeric query parameter and limits it to `max`.
 *
 * For an array the last element is used. Numbers are taken as-is, strings are
 * parsed as base-10 integers, anything else is invalid.
 *
 * @param {*} value - Raw query value.
 * @param {number} fallback - Returned when the value is invalid or negative.
 * @param {number} max - Upper bound for the result.
 * @returns {number} The floored value capped at `max`, or `fallback`.
 */
function clampOpsQueryNumber(value, fallback, max) {
    const candidate = Array.isArray(value) ? value[value.length - 1] : value;
    let numeric = NaN;
    if (typeof candidate === "number") {
        numeric = candidate;
    }
    else if (typeof candidate === "string") {
        numeric = Number.parseInt(candidate, 10);
    }
    const usable = Number.isFinite(numeric) && numeric >= 0;
    return usable ? Math.min(max, Math.floor(numeric)) : fallback;
}
478
/**
 * Extracts audit paging options from a request query.
 *
 * Both snake_case and camelCase names are accepted, snake_case first. A query
 * that is not a plain object is treated as empty.
 *
 * @param {*} query - Parsed query string.
 * @returns {{ auditLimit: number, auditOffset: number }} Limit (default 50,
 *   at most 250) and offset (default 0, at most 10000).
 */
function opsQueryOptions(query) {
    const isRecord = query && typeof query === "object" && !Array.isArray(query);
    const params = isRecord ? query : {};
    const requestedLimit = params.audit_limit ?? params.auditLimit;
    const requestedOffset = params.audit_offset ?? params.auditOffset;
    return {
        auditLimit: clampOpsQueryNumber(requestedLimit, 50, 250),
        auditOffset: clampOpsQueryNumber(requestedOffset, 0, 10000)
    };
}
485
/**
 * Sums the `size_bytes` of every artifact on a lifecycle record.
 *
 * @param {{ artifacts?: Array<{ size_bytes?: number }> }} record - Lifecycle record.
 * @returns {number} Total size; artifacts without a size count as 0.
 */
function artifactBytes(record) {
    let total = 0;
    (record.artifacts || []).forEach((artifact) => {
        total = total + (artifact.size_bytes || 0);
    });
    return total;
}
488
/**
 * Totals the UTF-8 byte length of log messages.
 *
 * @param {Array<{ stream?: string, message?: string }>} logs - Log entries.
 * @param {string} [stream] - When given, only entries of this stream are counted.
 * @returns {number} Total bytes; entries without a message count as 0.
 */
function logBytes(logs, stream) {
    let total = 0;
    logs.forEach((log) => {
        if (stream && log.stream !== stream) {
            return;
        }
        total = total + Buffer.byteLength(log.message || "", "utf8");
    });
    return total;
}
493
/**
 * Finds the most recently reported progress value among job events.
 *
 * Events are scanned from newest to oldest. A `progress` event contributes the
 * first of `data.progress_percent`, `data.percent`, `data.progress` that is
 * not null/undefined, provided it is a finite number.
 *
 * @param {Iterable<{ type: string, data?: object }>} events - Recorded events, oldest first.
 * @returns {number | undefined} Progress limited to 0..100, or undefined when
 *   no event carries a usable value.
 */
function progressPercent(events) {
    const ordered = [...events];
    for (let index = ordered.length - 1; index >= 0; index -= 1) {
        const event = ordered[index];
        const carriesData = event.type === "progress" && event.data && typeof event.data === "object";
        if (!carriesData) {
            continue;
        }
        const { progress_percent: explicit, percent, progress } = event.data;
        const reported = explicit ?? percent ?? progress;
        if (typeof reported !== "number" || !Number.isFinite(reported)) {
            continue;
        }
        return Math.min(100, Math.max(0, reported));
    }
    return undefined;
}
506
/**
 * Computes GPU-seconds used by a job: run time multiplied by GPU count,
 * rounded to three decimals.
 *
 * The GPU count comes from the reservation's `resources.gpu_count`, then from
 * the job's `resources.gpu.count`, then defaults to 1; it is floored and never
 * below 1.
 *
 * @param {object} record - Lifecycle record (`started_at`, `finished_at`, `reservation`, `job`).
 * @returns {number} GPU-seconds, or 0 when a timestamp is missing or unparseable,
 *   or the interval is not positive.
 */
function gpuSeconds(record) {
    const { started_at: startedAt, finished_at: finishedAt } = record;
    if (!startedAt || !finishedAt) {
        return 0;
    }
    const startMs = Date.parse(startedAt);
    const endMs = Date.parse(finishedAt);
    const hasPositiveInterval = Number.isFinite(startMs) && Number.isFinite(endMs) && endMs > startMs;
    if (!hasPositiveInterval) {
        return 0;
    }
    const requestedGpus = record.reservation?.resources?.gpu_count || record.job.resources?.gpu?.count || 1;
    const gpuCount = Math.max(1, Math.floor(requestedGpus));
    const elapsedSeconds = (endMs - startMs) / 1000;
    return Math.round(elapsedSeconds * gpuCount * 1000) / 1000;
}
518
/**
 * Fingerprints a token as its SHA-256 digest.
 *
 * @param {string | Buffer} token - Token to hash.
 * @returns {string} Lowercase hexadecimal digest.
 */
function tokenSha256(token) {
    const hasher = createHash("sha256");
    hasher.update(token);
    const fingerprint = hasher.digest("hex");
    return fingerprint;
}
521
/**
 * Builds the initial retry policy for a generic job from `job.job.metadata.retry`.
 *
 * @param {{ job: { metadata?: { retry?: object } } }} job - Job envelope.
 * @returns {{ max_retries: number, retry_count: number, retryable_error_codes: string[] }}
 *   `max_retries` is the requested numeric value floored and limited to 0..3
 *   (0 when absent, not a number, or NaN); `retry_count` starts at 0 and only
 *   `timeout` is listed as retryable.
 */
function lifecycleRetryPolicy(job) {
    const retry = job.job.metadata?.retry;
    const retryRecord = retry && typeof retry === "object" && !Array.isArray(retry) ? retry : null;
    const requested = retryRecord?.max_retries;
    // NaN passes a bare `typeof === "number"` check and survives Math.min/Math.max;
    // a NaN limit makes every `retry_count >= max_retries` comparison false,
    // which would remove the cap on retries.
    const maxRetries = typeof requested === "number" && !Number.isNaN(requested)
        ? Math.max(0, Math.min(3, Math.floor(requested)))
        : 0;
    return {
        max_retries: maxRetries,
        retry_count: 0,
        retryable_error_codes: ["timeout"]
    };
}
533
+ class OwnerLocalGenericJobLifecycleScheduler {
534
+ constructor(runtime, config) {
535
+ this.runtime = runtime;
536
+ this.config = config;
537
+ this.jobs = new Map();
538
+ this.idempotency = new Map();
539
+ this.dispatching = false;
540
+ }
541
+ create(job, claims, token) {
542
+ const tenantId = tenantIdForGenericJob(job);
543
+ const idempotencyKey = normalizeMswarmGenericJobIdempotencyKey({
544
+ tenantId,
545
+ idempotencyKey: job.job.idempotency_key,
546
+ jobId: job.job_id,
547
+ requestId: job.request_id
548
+ });
549
+ const existingId = this.idempotency.get(idempotencyKey);
550
+ if (existingId) {
551
+ const existing = this.mustGetEntry(existingId);
552
+ if (existing.record.job_id !== job.job_id || existing.record.request_id !== job.request_id) {
553
+ throw new Error("idempotency_key_conflict");
554
+ }
555
+ this.audit(existing, "job_idempotent_reused", { idempotency_key: idempotencyKey });
556
+ return { snapshot: this.snapshot(existing), reused: true };
557
+ }
558
+ if (this.jobs.has(job.job_id)) {
559
+ throw new Error("job_id_conflict");
560
+ }
561
+ const now = new Date().toISOString();
562
+ const entry = {
563
+ claims,
564
+ tokenSha256: tokenSha256(token),
565
+ events: [],
566
+ logs: [],
567
+ audit: [],
568
+ record: {
569
+ schema_version: MSWARM_JOB_LIFECYCLE_SCHEMA_VERSION,
570
+ job_id: job.job_id,
571
+ request_id: job.request_id,
572
+ tenant_id: tenantId,
573
+ node_id: this.config.nodeId,
574
+ state: "queued",
575
+ job: job.job,
576
+ idempotency_key: idempotencyKey,
577
+ created_at: now,
578
+ updated_at: now,
579
+ queued_at: now,
580
+ retry: lifecycleRetryPolicy(job)
581
+ }
582
+ };
583
+ this.jobs.set(job.job_id, entry);
584
+ this.idempotency.set(idempotencyKey, job.job_id);
585
+ this.audit(entry, "job_created", { idempotency_key: idempotencyKey });
586
+ this.audit(entry, "job_queued");
587
+ queueMicrotask(() => {
588
+ void this.dispatchQueued();
589
+ });
590
+ return { snapshot: this.snapshot(entry), reused: false };
591
+ }
592
+ get(jobId) {
593
+ const entry = this.jobs.get(jobId);
594
+ return entry ? this.snapshot(entry) : null;
595
+ }
596
+ async ops(options) {
597
+ const capabilities = await this.runtime.publicCapabilityProjection();
598
+ const entries = Array.from(this.jobs.values());
599
+ const totalsByState = {};
600
+ for (const entry of entries) {
601
+ totalsByState[entry.record.state] = (totalsByState[entry.record.state] || 0) + 1;
602
+ }
603
+ const activeJobs = this.activeEntries().length;
604
+ const queuedJobs = this.queuedEntries().length;
605
+ const terminalJobs = entries.filter((entry) => isMswarmTerminalLifecycleState(entry.record.state)).length;
606
+ const stdoutBytes = entries.reduce((total, entry) => total + logBytes(entry.logs, "stdout"), 0);
607
+ const stderrBytes = entries.reduce((total, entry) => total + logBytes(entry.logs, "stderr"), 0);
608
+ const allAudit = entries
609
+ .flatMap((entry) => entry.audit)
610
+ .sort((a, b) => b.timestamp.localeCompare(a.timestamp));
611
+ const jobs = entries
612
+ .map((entry) => this.opsJobSummary(entry))
613
+ .sort((a, b) => b.updated_at.localeCompare(a.updated_at));
614
+ return {
615
+ schema_version: MSWARM_JOB_LIFECYCLE_SCHEMA_VERSION,
616
+ generated_at: new Date().toISOString(),
617
+ node: {
618
+ node_id: this.config.nodeId,
619
+ listen_host: this.config.listenHost,
620
+ listen_port: this.config.listenPort,
621
+ owner_local: isOwnerLocalGenericMode(this.config),
622
+ generic_jobs_enabled: this.config.genericJobsEnabled,
623
+ artifact_store_configured: Boolean(this.config.artifactStorePath),
624
+ max_concurrent_jobs: genericJobMaxConcurrency(this.config)
625
+ },
626
+ capabilities: capabilities,
627
+ queue: {
628
+ jobs,
629
+ totals_by_state: totalsByState,
630
+ active_jobs: activeJobs,
631
+ queued_jobs: queuedJobs,
632
+ terminal_jobs: terminalJobs
633
+ },
634
+ quota: {
635
+ max_concurrent_jobs: genericJobMaxConcurrency(this.config),
636
+ active_jobs: activeJobs,
637
+ queued_jobs: queuedJobs,
638
+ available_slots: Math.max(0, genericJobMaxConcurrency(this.config) - activeJobs),
639
+ production_enforced: false,
640
+ limits: {
641
+ generic_job_timeout_ms: this.config.genericJobTimeoutMs,
642
+ job_timeout_ms: this.config.jobTimeoutMs,
643
+ request_timeout_ms: this.config.requestTimeoutMs,
644
+ artifact_store_configured: Boolean(this.config.artifactStorePath)
645
+ }
646
+ },
647
+ usage: {
648
+ total_jobs: entries.length,
649
+ active_jobs: activeJobs,
650
+ terminal_jobs: terminalJobs,
651
+ succeeded_jobs: totalsByState.succeeded || 0,
652
+ failed_jobs: totalsByState.failed || 0,
653
+ cancelled_jobs: totalsByState.cancelled || 0,
654
+ blocked_jobs: totalsByState.blocked || 0,
655
+ expired_jobs: totalsByState.expired || 0,
656
+ gpu_seconds: Math.round(entries.reduce((total, entry) => total + gpuSeconds(entry.record), 0) * 1000) / 1000,
657
+ artifact_count: entries.reduce((total, entry) => total + (entry.record.artifacts || []).length, 0),
658
+ artifact_bytes: entries.reduce((total, entry) => total + artifactBytes(entry.record), 0),
659
+ event_count: entries.reduce((total, entry) => total + entry.events.length, 0),
660
+ audit_event_count: allAudit.length,
661
+ stdout_bytes: stdoutBytes,
662
+ stderr_bytes: stderrBytes,
663
+ log_bytes: stdoutBytes + stderrBytes
664
+ },
665
+ audit: {
666
+ total: allAudit.length,
667
+ offset: options.auditOffset,
668
+ limit: options.auditLimit,
669
+ events: allAudit.slice(options.auditOffset, options.auditOffset + options.auditLimit)
670
+ }
671
+ };
672
+ }
673
+ cancel(jobId, claims) {
674
+ const entry = this.mustGetEntry(jobId);
675
+ if (entry.claims.request_id !== claims.request_id ||
676
+ entry.claims.schema_version !== claims.schema_version ||
677
+ entry.claims.job_type !== claims.job_type) {
678
+ throw new Error("generic cancellation token does not match the lifecycle job");
679
+ }
680
+ this.audit(entry, "job_cancel_requested");
681
+ if (isMswarmTerminalLifecycleState(entry.record.state)) {
682
+ return this.snapshot(entry);
683
+ }
684
+ if (entry.controller && !entry.controller.signal.aborted) {
685
+ entry.controller.abort("cancelled");
686
+ return this.snapshot(entry);
687
+ }
688
+ this.transition(entry, "cancelled", {
689
+ finished_at: new Date().toISOString(),
690
+ result: {
691
+ job_id: entry.record.job_id,
692
+ status: "cancelled",
693
+ error: {
694
+ code: "cancelled",
695
+ message: "generic job cancelled before dispatch"
696
+ }
697
+ }
698
+ });
699
+ this.audit(entry, "job_cancelled");
700
+ this.releaseReservation(entry);
701
+ return this.snapshot(entry);
702
+ }
703
+ retry(jobId, claims) {
704
+ const entry = this.mustGetEntry(jobId);
705
+ if (entry.claims.request_id !== claims.request_id ||
706
+ entry.claims.schema_version !== claims.schema_version ||
707
+ entry.claims.job_type !== claims.job_type) {
708
+ throw new Error("generic retry token does not match the lifecycle job");
709
+ }
710
+ if (!isMswarmTerminalLifecycleState(entry.record.state)) {
711
+ throw new Error("job_retry_requires_terminal_state");
712
+ }
713
+ if (entry.record.state === "succeeded") {
714
+ throw new Error("job_retry_not_allowed_for_succeeded_jobs");
715
+ }
716
+ this.releaseReservation(entry);
717
+ const now = new Date().toISOString();
718
+ const retryCount = entry.record.retry.retry_count + 1;
719
+ entry.record = {
720
+ ...entry.record,
721
+ state: "queued",
722
+ updated_at: now,
723
+ queued_at: now,
724
+ scheduled_at: undefined,
725
+ started_at: undefined,
726
+ finished_at: undefined,
727
+ reservation: undefined,
728
+ envelope: undefined,
729
+ backpressure: undefined,
730
+ result: undefined,
731
+ artifacts: undefined,
732
+ retry: {
733
+ ...entry.record.retry,
734
+ retry_count: retryCount,
735
+ next_retry_at: now
736
+ }
737
+ };
738
+ this.audit(entry, "job_retry_scheduled", { retry_count: retryCount, manual: true });
739
+ queueMicrotask(() => {
740
+ void this.dispatchQueued();
741
+ });
742
+ return this.snapshot(entry);
743
+ }
744
+ async dispatchQueued() {
745
+ if (this.dispatching) {
746
+ return;
747
+ }
748
+ this.dispatching = true;
749
+ try {
750
+ while (this.activeEntries().length < genericJobMaxConcurrency(this.config)) {
751
+ const entry = this.nextDispatchableEntry();
752
+ if (!entry) {
753
+ return;
754
+ }
755
+ const activeTenant = this.activeTenantId();
756
+ if (activeTenant && activeTenant !== entry.record.tenant_id) {
757
+ this.setBackpressure(entry, "tenant_reserved", "Node is reserved for another tenant until active jobs finish.");
758
+ return;
759
+ }
760
+ const capabilityOk = await this.recheckCapabilities(entry);
761
+ if (!capabilityOk) {
762
+ continue;
763
+ }
764
+ this.schedule(entry);
765
+ void this.runScheduled(entry);
766
+ }
767
+ for (const entry of this.queuedEntries()) {
768
+ this.setBackpressure(entry, "node_at_capacity", "Node is at generic job concurrency limit.", 1000);
769
+ }
770
+ }
771
+ finally {
772
+ this.dispatching = false;
773
+ }
774
+ }
775
+ activeEntries() {
776
+ return Array.from(this.jobs.values()).filter((entry) => entry.record.state === "scheduled" || entry.record.state === "running");
777
+ }
778
+ queuedEntries() {
779
+ return Array.from(this.jobs.values()).filter((entry) => entry.record.state === "queued" || entry.record.state === "retrying");
780
+ }
781
+ nextDispatchableEntry() {
782
+ return this.queuedEntries().sort((a, b) => a.record.created_at.localeCompare(b.record.created_at))[0] || null;
783
+ }
784
+ activeTenantId() {
785
+ const active = this.activeEntries().find((entry) => entry.record.reservation && !entry.record.reservation.released_at);
786
+ return active?.record.tenant_id || null;
787
+ }
788
+ async recheckCapabilities(entry) {
789
+ const snapshot = await this.runtime.probeCapabilities();
790
+ const capabilityMismatch = genericJobCapabilityMismatch(entry.record.job, snapshot);
791
+ if (capabilityMismatch) {
792
+ this.transition(entry, "blocked", {
793
+ finished_at: new Date().toISOString(),
794
+ backpressure: {
795
+ reason: "no_capable_node",
796
+ message: capabilityMismatch.message
797
+ },
798
+ result: {
799
+ job_id: entry.record.job_id,
800
+ status: "failed",
801
+ error: {
802
+ code: capabilityMismatch.code,
803
+ message: capabilityMismatch.message,
804
+ retryable: true
805
+ }
806
+ }
807
+ });
808
+ this.audit(entry, "job_blocked", { reason: capabilityMismatch.code });
809
+ return false;
810
+ }
811
+ return true;
812
+ }
813
+ schedule(entry) {
814
+ const now = new Date().toISOString();
815
+ const reservation = {
816
+ node_id: this.config.nodeId,
817
+ tenant_id: entry.record.tenant_id,
818
+ job_id: entry.record.job_id,
819
+ request_id: entry.record.request_id,
820
+ reserved_at: now,
821
+ resources: {
822
+ ...(entry.record.job.resources?.gpu?.count ? { gpu_count: entry.record.job.resources.gpu.count } : {}),
823
+ ...(entry.record.job.resources?.cpu?.cores ? { cpu_cores: entry.record.job.resources.cpu.cores } : {}),
824
+ ...(entry.record.job.resources?.memory_gb ? { memory_gb: entry.record.job.resources.memory_gb } : {}),
825
+ ...(entry.record.job.resources?.disk_gb ? { disk_gb: entry.record.job.resources.disk_gb } : {})
826
+ }
827
+ };
828
+ const expiresAt = new Date(Date.now() + 5 * 60000).toISOString();
829
+ this.transition(entry, "scheduled", {
830
+ node_id: this.config.nodeId,
831
+ scheduled_at: now,
832
+ reservation,
833
+ backpressure: undefined,
834
+ envelope: buildMswarmGenericJobEnvelopeDescriptor({
835
+ jobId: entry.record.job_id,
836
+ requestId: entry.record.request_id,
837
+ nodeId: this.config.nodeId,
838
+ job: entry.record.job,
839
+ issuedAt: now,
840
+ expiresAt,
841
+ tokenSha256: entry.tokenSha256
842
+ })
843
+ });
844
+ this.audit(entry, "reservation_created", { resources: reservation.resources });
845
+ this.audit(entry, "envelope_issued", { expires_at: expiresAt });
846
+ this.audit(entry, "job_scheduled");
847
+ }
848
+ async runScheduled(entry) {
849
+ const controller = new AbortController();
850
+ entry.controller = controller;
851
+ this.transition(entry, "running", {
852
+ started_at: new Date().toISOString()
853
+ });
854
+ this.audit(entry, "job_started");
855
+ const envelope = {
856
+ job_id: entry.record.job_id,
857
+ request_id: entry.record.request_id,
858
+ node_id: this.config.nodeId,
859
+ job: entry.record.job
860
+ };
861
+ const result = await this.runtime.executeGenericJob(envelope, {
862
+ signal: controller.signal,
863
+ onEvent: async (event) => {
864
+ this.recordEvent(entry, event);
865
+ }
866
+ });
867
+ entry.controller = undefined;
868
+ if (result.status === "failed" && this.shouldRetry(entry, result.result.error?.code)) {
869
+ this.scheduleRetry(entry, result.result);
870
+ await this.dispatchQueued();
871
+ return;
872
+ }
873
+ const terminalState = result.status === "succeeded" ? "succeeded" : result.status === "cancelled" ? "cancelled" : "failed";
874
+ this.transition(entry, terminalState, {
875
+ finished_at: new Date().toISOString(),
876
+ result: result.result,
877
+ artifacts: result.result.artifacts || []
878
+ });
879
+ this.audit(entry, terminalState === "cancelled" ? "job_cancelled" : "job_completed", { status: terminalState });
880
+ this.releaseReservation(entry);
881
+ await this.dispatchQueued();
882
+ }
883
+ shouldRetry(entry, errorCode) {
884
+ if (!errorCode || entry.record.retry.retry_count >= entry.record.retry.max_retries) {
885
+ return false;
886
+ }
887
+ const retryable = entry.record.retry.retryable_error_codes || [];
888
+ return retryable.includes(errorCode);
889
+ }
890
+ scheduleRetry(entry, result) {
891
+ const retryCount = entry.record.retry.retry_count + 1;
892
+ const nextRetryAt = new Date().toISOString();
893
+ this.transition(entry, "retrying", {
894
+ finished_at: new Date().toISOString(),
895
+ result,
896
+ retry: {
897
+ ...entry.record.retry,
898
+ retry_count: retryCount,
899
+ next_retry_at: nextRetryAt
900
+ }
901
+ });
902
+ this.audit(entry, "job_retry_scheduled", { retry_count: retryCount, next_retry_at: nextRetryAt });
903
+ this.releaseReservation(entry);
904
+ this.transition(entry, "queued", {
905
+ queued_at: nextRetryAt,
906
+ scheduled_at: undefined,
907
+ started_at: undefined,
908
+ finished_at: undefined,
909
+ reservation: undefined,
910
+ envelope: undefined,
911
+ backpressure: undefined
912
+ });
913
+ }
914
+ recordEvent(entry, event) {
915
+ entry.events.push(event);
916
+ if (event.type === "stdout" || event.type === "stderr") {
917
+ entry.logs.push({
918
+ job_id: event.job_id,
919
+ sequence: event.sequence,
920
+ timestamp: event.timestamp,
921
+ stream: event.type,
922
+ message: event.message || "",
923
+ truncated: false
924
+ });
925
+ }
926
+ this.audit(entry, "job_event_recorded", { type: event.type, sequence: event.sequence });
927
+ }
928
+ releaseReservation(entry) {
929
+ if (!entry.record.reservation || entry.record.reservation.released_at) {
930
+ return;
931
+ }
932
+ entry.record.reservation = {
933
+ ...entry.record.reservation,
934
+ released_at: new Date().toISOString()
935
+ };
936
+ this.audit(entry, "reservation_released");
937
+ }
938
+ setBackpressure(entry, reason, message, retryAfterMs) {
939
+ entry.record.backpressure = {
940
+ reason,
941
+ message,
942
+ ...(retryAfterMs ? { retry_after_ms: retryAfterMs } : {})
943
+ };
944
+ entry.record.updated_at = new Date().toISOString();
945
+ }
946
+ transition(entry, state, patch = {}) {
947
+ if (!isMswarmLifecycleStateTransitionAllowed(entry.record.state, state)) {
948
+ throw new Error(`invalid lifecycle transition from ${entry.record.state} to ${state}`);
949
+ }
950
+ entry.record = {
951
+ ...entry.record,
952
+ ...patch,
953
+ state,
954
+ updated_at: new Date().toISOString()
955
+ };
956
+ }
957
+ audit(entry, action, details) {
958
+ entry.audit.push(buildMswarmGenericJobAuditEvent({
959
+ auditId: `audit_${randomUUID()}`,
960
+ jobId: entry.record.job_id,
961
+ requestId: entry.record.request_id,
962
+ tenantId: entry.record.tenant_id,
963
+ nodeId: this.config.nodeId,
964
+ action,
965
+ timestamp: new Date().toISOString(),
966
+ details
967
+ }));
968
+ }
969
+ snapshot(entry) {
970
+ return {
971
+ job: entry.record,
972
+ events: [...entry.events],
973
+ logs: [...entry.logs],
974
+ artifacts: [...(entry.record.artifacts || [])],
975
+ audit: [...entry.audit]
976
+ };
977
+ }
978
+ opsJobSummary(entry) {
979
+ const lastEvent = entry.events[entry.events.length - 1];
980
+ return {
981
+ job_id: entry.record.job_id,
982
+ request_id: entry.record.request_id,
983
+ tenant_id: entry.record.tenant_id,
984
+ node_id: entry.record.node_id,
985
+ state: entry.record.state,
986
+ job_type: entry.record.job.job_type,
987
+ schema_version: entry.record.job.schema_version,
988
+ created_at: entry.record.created_at,
989
+ updated_at: entry.record.updated_at,
990
+ queued_at: entry.record.queued_at,
991
+ scheduled_at: entry.record.scheduled_at,
992
+ started_at: entry.record.started_at,
993
+ finished_at: entry.record.finished_at,
994
+ retry_count: entry.record.retry.retry_count,
995
+ max_retries: entry.record.retry.max_retries,
996
+ progress_percent: progressPercent(entry.events),
997
+ last_event_type: lastEvent?.type,
998
+ last_event_message: lastEvent?.message,
999
+ artifact_count: (entry.record.artifacts || []).length,
1000
+ artifact_bytes: artifactBytes(entry.record),
1001
+ log_bytes: logBytes(entry.logs)
1002
+ };
1003
+ }
1004
+ mustGetEntry(jobId) {
1005
+ const entry = this.jobs.get(jobId);
1006
+ if (!entry) {
1007
+ throw new Error("job_not_found");
1008
+ }
1009
+ return entry;
1010
+ }
1011
+ }
158
1012
  export function buildSelfHostedNodeApp(runtime, config) {
159
1013
  const app = Fastify({ logger: false });
1014
+ const activeGenericJobs = new Map();
1015
+ const lifecycle = new OwnerLocalGenericJobLifecycleScheduler(runtime, config);
160
1016
  app.get("/healthz", async (_request, reply) => {
161
1017
  reply.send({ service: "mswarm-self-hosted-node", status: "ok", node_id: config.nodeId });
162
1018
  });
1019
// Capability read endpoint. Checks, in order: owner-local API mode (403),
// signing secret configured (503), bearer token present and verifiable (401),
// token node_id equal to this node (400). On success replies with
// runtime.publicCapabilityProjection().
app.get("/v1/swarm/self-hosted/node/capabilities", async (request, reply) => {
    const reject = (statusCode, error, code, message) => {
        reply.status(statusCode).send({ error, code, message });
    };
    if (!isOwnerLocalNodeApiMode(config)) {
        reject(403, "forbidden", "owner_local_required", "Node capabilities are only available in owner-local direct mode");
        return;
    }
    const secret = config.invocationSigningSecret;
    if (!secret) {
        reject(503, "service_unavailable", "missing_config", "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for capability reads");
        return;
    }
    const token = extractBearerToken(request.headers);
    if (!token) {
        reject(401, "unauthorized", "unauthorized", "Missing capability token");
        return;
    }
    let claims;
    try {
        claims = verifySelfHostedCapabilityToken({ token, secret });
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : "Invalid capability token";
        reject(401, "unauthorized", "unauthorized", reason);
        return;
    }
    if (claims.node_id !== config.nodeId) {
        reject(400, "bad_request", "validation_failed", "capability token node_id does not match this node");
        return;
    }
    const projection = await runtime.publicCapabilityProjection();
    reply.send(projection);
});
163
1070
  app.post("/v1/swarm/self-hosted/node/jobs", async (request, reply) => {
164
1071
  if (!config.invocationSigningSecret) {
165
1072
  reply.status(503).send({
@@ -262,6 +1169,496 @@ export function buildSelfHostedNodeApp(runtime, config) {
262
1169
  }
263
1170
  reply.send(result);
264
1171
  });
1172
// Direct generic-job execution endpoint.
// Gate order: feature flag (404), owner-local mode (403), signing secret (503),
// bearer token present/valid (401), body matches token claims (400), job not
// already active on this node (409). The job then runs either as a single
// JSON response or, when the client accepts it, as a server-sent event stream.
app.post("/v1/swarm/self-hosted/node/generic-jobs", async (request, reply) => {
    const reject = (statusCode, error, code, message) => {
        reply.status(statusCode).send({ error, code, message });
    };
    if (!config.genericJobsEnabled) {
        reject(404, "not_found", "feature_disabled", "Generic node jobs are disabled on this node");
        return;
    }
    if (!isOwnerLocalGenericMode(config)) {
        reject(403, "forbidden", "owner_local_required", "Generic node jobs are only available in owner-local direct mode");
        return;
    }
    if (!config.invocationSigningSecret) {
        reject(503, "service_unavailable", "missing_config", "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for generic jobs");
        return;
    }
    const token = extractBearerToken(request.headers);
    if (!token) {
        reject(401, "unauthorized", "unauthorized", "Missing generic job token");
        return;
    }
    let claims;
    try {
        claims = verifySelfHostedGenericJobToken({ token, secret: config.invocationSigningSecret });
    }
    catch (error) {
        reject(401, "unauthorized", "unauthorized", error instanceof Error ? error.message : "Invalid generic job token");
        return;
    }
    const job = request.body;
    try {
        assertGenericJobMatchesClaims(job, claims);
    }
    catch (error) {
        reject(400, "bad_request", "validation_failed", error instanceof Error ? error.message : "Invalid generic job");
        return;
    }
    const wantsEventStream = acceptsGenericEventStream(request.headers);
    if (activeGenericJobs.has(job.job_id)) {
        reject(409, "conflict", "job_already_running", "Generic job is already running on this node");
        return;
    }
    const abortController = new AbortController();
    activeGenericJobs.set(job.job_id, { controller: abortController, claims });
    // Only drop the registry slot if it still belongs to this request.
    const releaseSlot = () => {
        if (activeGenericJobs.get(job.job_id)?.controller === abortController) {
            activeGenericJobs.delete(job.job_id);
        }
    };
    if (!wantsEventStream) {
        const result = await runtime
            .executeGenericJob(job, { signal: abortController.signal })
            .finally(releaseSlot);
        if (result.status === "succeeded") {
            reply.send(result);
        }
        else {
            reply.status(genericJobFailureStatusCode(result)).send(result);
        }
        return;
    }
    // Streaming mode: take over the raw response and emit SSE frames ourselves.
    const stream = reply.raw;
    const streamIsOpen = () => !stream.destroyed && !stream.writableEnded;
    reply.hijack();
    stream.writeHead(200, {
        "content-type": "text/event-stream; charset=utf-8",
        "cache-control": "no-cache, no-transform",
        connection: "keep-alive",
        "x-accel-buffering": "no"
    });
    // A client disconnect cancels the running job.
    const onClose = () => {
        if (!abortController.signal.aborted) {
            abortController.abort("cancelled");
        }
    };
    stream.once("close", onClose);
    const keepAlive = setInterval(() => {
        if (streamIsOpen()) {
            stream.write(": keep-alive\n\n");
        }
    }, 15000);
    try {
        await runtime.executeGenericJob(job, {
            signal: abortController.signal,
            onEvent: async (event) => {
                writeGenericJobSseEvent(stream, { ...event });
            }
        });
        writeSelfHostedSseDone(stream);
    }
    catch (error) {
        // Surface the failure to the client as a terminal "failed" event.
        const failure = {
            job_id: job.job_id,
            type: "failed",
            sequence: 0,
            timestamp: new Date().toISOString(),
            message: error instanceof Error ? error.message : String(error),
            data: { code: "upstream_error" }
        };
        writeGenericJobSseEvent(stream, failure);
        writeSelfHostedSseDone(stream);
    }
    finally {
        clearInterval(keepAlive);
        stream.removeListener("close", onClose);
        releaseSlot();
        if (streamIsOpen()) {
            stream.end();
        }
    }
});
1306
// Cancels a directly-executed generic job that is currently active.
// Applies the same feature/mode/secret/token gates as job submission, then
// requires the token to name this node and job, and to carry the same
// request_id, schema_version and job_type as the request that started it.
// Replies 202 with status "cancelling" once the abort signal has been raised.
app.post("/v1/swarm/self-hosted/node/generic-jobs/:job_id/cancel", async (request, reply) => {
    const reject = (statusCode, error, code, message) => {
        reply.status(statusCode).send({ error, code, message });
    };
    if (!config.genericJobsEnabled) {
        reject(404, "not_found", "feature_disabled", "Generic node jobs are disabled on this node");
        return;
    }
    if (!isOwnerLocalGenericMode(config)) {
        reject(403, "forbidden", "owner_local_required", "Generic node jobs are only available in owner-local direct mode");
        return;
    }
    if (!config.invocationSigningSecret) {
        reject(503, "service_unavailable", "missing_config", "MSWARM_SELF_HOSTED_INVOCATION_SIGNING_SECRET is required for generic jobs");
        return;
    }
    const token = extractBearerToken(request.headers);
    if (!token) {
        reject(401, "unauthorized", "unauthorized", "Missing generic job token");
        return;
    }
    let claims;
    try {
        claims = verifySelfHostedGenericJobToken({ token, secret: config.invocationSigningSecret });
    }
    catch (error) {
        reject(401, "unauthorized", "unauthorized", error instanceof Error ? error.message : "Invalid generic job token");
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    if (!jobId) {
        reject(400, "bad_request", "validation_failed", "generic job_id is required");
        return;
    }
    const tokenTargetsThisJob = claims.node_id === config.nodeId && claims.job_id === jobId;
    if (!tokenTargetsThisJob) {
        reject(400, "bad_request", "validation_failed", "generic cancellation token does not match this node or job");
        return;
    }
    const activeJob = activeGenericJobs.get(jobId);
    if (!activeJob) {
        reject(404, "not_found", "job_not_running", "Generic job is not running on this node");
        return;
    }
    // The cancelling token must describe the same request that started the job.
    const sameRequest = ["request_id", "schema_version", "job_type"]
        .every((field) => activeJob.claims[field] === claims[field]);
    if (!sameRequest) {
        reject(400, "bad_request", "validation_failed", "generic cancellation token does not match the active request");
        return;
    }
    const { controller } = activeJob;
    if (!controller.signal.aborted) {
        controller.abort("cancelled");
    }
    reply.status(202).send({
        job_id: jobId,
        request_id: activeJob.claims.request_id,
        status: "cancelling"
    });
});
1400
// Creates (or re-uses) a lifecycle-managed generic job.
// Replies 202 with the new snapshot, or 200 when lifecycle.create reports the
// job as reused. Errors whose message contains "conflict" map to 409 with the
// message as the code; every other error is a 400 validation failure.
app.post("/v1/swarm/self-hosted/node/generic-job-control/jobs", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const job = request.body;
    try {
        assertGenericJobMatchesClaims(job, auth.claims);
        assertLifecycleJobIdMatchesClaims(job.job_id, config, auth.claims);
        const { reused, snapshot } = lifecycle.create(job, auth.claims, auth.token);
        reply.status(reused ? 200 : 202).send(snapshot);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle job";
        const isConflict = message.includes("conflict");
        const body = isConflict
            ? { error: "conflict", code: message, message }
            : { error: "bad_request", code: "validation_failed", message };
        reply.status(isConflict ? 409 : 400).send(body);
    }
});
1422
// Uploads one input artifact for a lifecycle job and stores it on local disk.
//
// Flow: authenticate, check the job id against the token claims, decode the
// upload body, resolve the target path under the job's upload root, create
// the parent directories, reject symlinked path segments, then write the
// bytes with mode 0o600. Replies 201 with the artifact descriptor; any
// failure is reported as a 400 validation error carrying the error message.
//
// Uploads never overwrite: an existing target yields
// "artifact_upload_target_exists". The write uses the exclusive-create flag
// ("wx") so that guarantee also holds when another request creates the file
// between the existence check and the write.
app.post("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/artifacts", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
        const upload = decodeArtifactUploadBody(request.body);
        const root = artifactUploadRoot(config, jobId);
        const target = resolveArtifactUploadTarget(config, jobId, upload.path);
        await mkdir(dirname(target), { recursive: true });
        await assertNoArtifactSymlinkSegments(root, upload.path);
        // Early, friendly rejection; lstat does not follow a symlink at the target.
        const existing = await lstat(target).catch((error) => {
            if (error?.code === "ENOENT") {
                return undefined;
            }
            throw error;
        });
        if (existing) {
            throw new Error("artifact_upload_target_exists");
        }
        try {
            // "wx" fails with EEXIST instead of overwriting, closing the
            // check-then-write race left open by the lstat probe above.
            await writeFile(target, upload.bytes, { mode: 0o600, flag: "wx" });
        }
        catch (error) {
            if (error?.code === "EEXIST") {
                throw new Error("artifact_upload_target_exists", { cause: error });
            }
            throw error;
        }
        reply.status(201).send({
            job_id: jobId,
            artifact: {
                id: `upload_${upload.sha256.slice(0, 16)}`,
                uri: buildMswarmLocalArtifactUri(jobId, upload.path),
                name: upload.name,
                content_type: upload.contentType,
                size_bytes: upload.bytes.length,
                sha256: upload.sha256,
                scope: "input",
                access: defaultMswarmArtifactAccessPolicy("owner-local"),
                retention: defaultMswarmArtifactRetentionPolicy()
            }
        });
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle artifact upload";
        reply.status(400).send({
            error: "bad_request",
            code: "validation_failed",
            message
        });
    }
});
1469
// Ops overview for lifecycle jobs. Uses the ops-specific request verifier and
// replies with lifecycle.ops(...) for the options parsed from the query string.
app.get("/v1/swarm/self-hosted/node/generic-job-control/ops", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobOpsRequest(config, request.headers);
    if (auth.ok) {
        const options = opsQueryOptions(request.query);
        const report = await lifecycle.ops(options);
        reply.send(report);
        return;
    }
    reply.status(auth.statusCode).send(auth.payload);
});
1477
// Returns the full snapshot of one lifecycle job.
// 400 when the job id does not match the token claims, 404 when the
// lifecycle scheduler has no such job.
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle job token";
        reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
        return;
    }
    const snapshot = lifecycle.get(jobId);
    if (snapshot) {
        reply.send(snapshot);
        return;
    }
    reply.status(404).send({
        error: "not_found",
        code: "job_not_found",
        message: "Generic lifecycle job was not found"
    });
});
1506
// Returns the recorded events of one lifecycle job as { job_id, events }.
// 400 when the job id does not match the token claims, 404 for an unknown job.
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/events", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle job token";
        reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
        return;
    }
    const snapshot = lifecycle.get(jobId);
    if (snapshot) {
        reply.send({ job_id: jobId, events: snapshot.events });
        return;
    }
    reply.status(404).send({
        error: "not_found",
        code: "job_not_found",
        message: "Generic lifecycle job was not found"
    });
});
1535
// Returns the captured log lines of one lifecycle job as { job_id, logs }.
// 400 when the job id does not match the token claims, 404 for an unknown job.
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/logs", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle job token";
        reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
        return;
    }
    const snapshot = lifecycle.get(jobId);
    if (snapshot) {
        reply.send({ job_id: jobId, logs: snapshot.logs });
        return;
    }
    reply.status(404).send({
        error: "not_found",
        code: "job_not_found",
        message: "Generic lifecycle job was not found"
    });
});
1564
// Returns the artifact list of one lifecycle job as { job_id, artifacts }.
// 400 when the job id does not match the token claims, 404 for an unknown job.
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/artifacts", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle job token";
        reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
        return;
    }
    const snapshot = lifecycle.get(jobId);
    if (snapshot) {
        reply.send({ job_id: jobId, artifacts: snapshot.artifacts });
        return;
    }
    reply.status(404).send({
        error: "not_found",
        code: "job_not_found",
        message: "Generic lifecycle job was not found"
    });
});
1593
// Returns the audit trail of one lifecycle job as { job_id, audit }.
// 400 when the job id does not match the token claims, 404 for an unknown job.
app.get("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/audit", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle job token";
        reply.status(400).send({ error: "bad_request", code: "validation_failed", message });
        return;
    }
    const snapshot = lifecycle.get(jobId);
    if (snapshot) {
        reply.send({ job_id: jobId, audit: snapshot.audit });
        return;
    }
    reply.status(404).send({
        error: "not_found",
        code: "job_not_found",
        message: "Generic lifecycle job was not found"
    });
});
1622
// Requests cancellation of a lifecycle job; replies 202 with whatever
// lifecycle.cancel returns. An error whose message is exactly "job_not_found"
// becomes a 404; any other error is reported as a 400 validation failure.
app.post("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/cancel", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
        const outcome = lifecycle.cancel(jobId, auth.claims);
        reply.status(202).send(outcome);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle cancellation";
        if (message === "job_not_found") {
            reply.status(404).send({
                error: "not_found",
                code: "job_not_found",
                message: "Generic lifecycle job was not found"
            });
        }
        else {
            reply.status(400).send({
                error: "bad_request",
                code: "validation_failed",
                message
            });
        }
    }
});
1642
// Requests a retry of a lifecycle job; replies 202 with whatever
// lifecycle.retry returns. An error whose message is exactly "job_not_found"
// becomes a 404; any other error is reported as a 400 validation failure.
app.post("/v1/swarm/self-hosted/node/generic-job-control/jobs/:job_id/retry", async (request, reply) => {
    const auth = verifyOwnerLocalGenericJobRequest(config, request.headers);
    if (!auth.ok) {
        reply.status(auth.statusCode).send(auth.payload);
        return;
    }
    const jobId = String(request.params.job_id || "").trim();
    try {
        assertLifecycleJobIdMatchesClaims(jobId, config, auth.claims);
        const outcome = lifecycle.retry(jobId, auth.claims);
        reply.status(202).send(outcome);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Invalid generic lifecycle retry";
        if (message === "job_not_found") {
            reply.status(404).send({
                error: "not_found",
                code: "job_not_found",
                message: "Generic lifecycle job was not found"
            });
        }
        else {
            reply.status(400).send({
                error: "bad_request",
                code: "validation_failed",
                message
            });
        }
    }
});
265
1662
  return app;
266
1663
  }
267
1664
  export async function main(argv = process.argv) {