@lightcone-ai/daemon 0.15.76 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lightcone-ai/daemon",
3
- "version": "0.15.76",
3
+ "version": "0.16.0",
4
4
  "type": "module",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -1,6 +1,6 @@
1
1
  import { spawn } from 'node:child_process';
2
2
  import path from 'node:path';
3
- import { mkdir, rm, writeFile, access } from 'node:fs/promises';
3
+ import { mkdir, rm, writeFile, access, stat as statAsync } from 'node:fs/promises';
4
4
  import { constants as fsConstants } from 'node:fs';
5
5
  import os from 'node:os';
6
6
  import { randomUUID } from 'node:crypto';
@@ -407,7 +407,18 @@ export async function composeVideoV2({
407
407
  }
408
408
 
409
409
  const totalDuration = await probeDurationSec(outPath);
410
- return { path: outPath, duration_ms: Math.round(totalDuration * 1000) };
410
+
411
+ // Stat the final file before returning so the caller can rely on size and
412
+ // so we can detect the (rare but observed) case where ffmpeg's `close`
413
+ // arrived but the kernel writeback wasn't complete. A 0-byte / tiny mp4
414
+ // here means the burn-subtitles pass produced nothing usable — fail loudly
415
+ // instead of letting a broken file flow into write_workspace_file / submit.
416
+ const finalStat = await statAsync(outPath);
417
+ const sizeBytes = Number(finalStat.size ?? 0);
418
+ if (!Number.isFinite(sizeBytes) || sizeBytes < 1024) {
419
+ throw new Error(`compose_video_v2 produced an invalid output: ${outPath} size=${sizeBytes} bytes`);
420
+ }
421
+ return { path: outPath, duration_ms: Math.round(totalDuration * 1000), size_bytes: sizeBytes };
411
422
  } finally {
412
423
  await rm(tmpDir, { recursive: true, force: true });
413
424
  }
@@ -2,11 +2,14 @@
2
2
  import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
3
3
  import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
4
4
  import { z } from 'zod';
5
- import { createReadStream, existsSync, mkdirSync, readFileSync, writeFileSync } from 'fs';
5
+ import { createReadStream, existsSync, mkdirSync, readFileSync, writeFileSync, statSync } from 'fs';
6
6
  import { createHash, randomUUID } from 'crypto';
7
7
  import path, { extname } from 'path';
8
+ import os from 'os';
8
9
  import { recordUrlNarration } from './_vendor/video/recorder/index.js';
9
- import { writeLocalFileToWorkspace } from './workspace-file-upload.js';
10
+ import { writeLocalFileToWorkspace, resolveWorkspaceFileUploadPlan } from './workspace-file-upload.js';
11
+ import { UploadJobManager } from './upload-job-manager.js';
12
+ import { createUploadServerApi } from './upload-server-api.js';
10
13
  import { runRecordUrlNarrationTool } from './record-url-narration-tool.js';
11
14
  import { runSubmitToLibraryTool } from './submit-to-library-tool.js';
12
15
  import { runRenderTextToImageTool } from './tools/render-text-to-image.js';
@@ -843,6 +846,23 @@ const IS_HOST_AGENT = await resolveHostAgentFlag();
843
846
 
844
847
  const server = new McpServer({ name: 'chat', version: '0.1.0' });
845
848
 
849
+ // ── Upload job manager (async + multipart workspace-file uploads) ────────────
850
+ // Per-agent jobDir so each chat-bridge sees only its own pending jobs. When the
851
+ // chat-bridge process restarts (skill bind, manual restart) it picks up any
852
+ // in-flight uploads from disk and resumes them; see docs/upload-pipeline-design.md.
853
+ const _uploadJobDir = path.join(
854
+ os.homedir(),
855
+ '.lightcone',
856
+ 'upload-jobs',
857
+ AGENT_ID || 'default'
858
+ );
859
+ const uploadJobManager = new UploadJobManager({
860
+ jobDir: _uploadJobDir,
861
+ serverApi: createUploadServerApi({ api: (...args) => api(...args) }),
862
+ log: (msg) => console.error(`[chat-bridge:UploadJobManager] ${msg}`),
863
+ });
864
+ uploadJobManager.start();
865
+
846
866
  // ── check_messages ────────────────────────────────────────────────────────────
847
867
  server.tool('check_messages', 'Check for new messages in your inbox', {}, async () => {
848
868
  const data = await api('GET', '/receive');
@@ -1099,38 +1119,67 @@ server.tool('write_workspace', 'Write a file to the shared workspace. Use this t
1099
1119
  return { content: [{ type: 'text', text: `Saved to workspace: ${path}` }] };
1100
1120
  });
1101
1121
 
1102
- server.tool('write_workspace_file', 'Write a local file directly to the shared workspace. Prefer this over write_workspace for images/PDFs/binary files so large base64 content never enters the model context. The source file may be a relative path under the current agent workspace, or an absolute path inside the agent workspace/workspace shared artifacts/notes/tmp directories.', {
1103
- file_path: z.string().describe('Local file path. Relative paths resolve from the current agent workspace. Absolute paths must stay inside the agent/workspace.'),
1104
- path: z.string().describe('Destination path relative to workspace root, e.g. "artifacts/cover.png"'),
1105
- }, async ({ file_path, path }) => {
1122
+ server.tool('write_workspace_file',
1123
+ 'Write a local file directly to the shared workspace. Prefer this over write_workspace for images/PDFs/binary files so large base64 content never enters the model context.\n\n'
1124
+ + 'Binary uploads are ASYNCHRONOUS: the tool queues an upload job and returns immediately with a jobId. The actual PUT runs in the background with retry + COS multipart (no codex 120s timeout, no single-PUT failures). When you next call submit_to_library on the same workspace path, the server waits up to ~90s for the upload to finish before indexing; if it returns "upload_still_in_progress" just retry submit_to_library a few seconds later — DO NOT re-record/re-compose the video, the source file is fine.\n\n'
1125
+ + 'Text files (.md/.txt/.json/etc.) still upload synchronously since they are tiny.',
1126
+ {
1127
+ file_path: z.string().describe('Local file path. Relative paths resolve from the current agent workspace. Absolute paths must stay inside the agent/workspace.'),
1128
+ path: z.string().describe('Destination path relative to workspace root, e.g. "artifacts/cover.png"'),
1129
+ },
1130
+ async ({ file_path, path: workspacePath }) => {
1106
1131
  if (!currentWorkspaceId) return { content: [{ type: 'text', text: 'No workspace context.' }] };
1107
1132
  const localPath = resolveLocalWorkspaceFile(file_path);
1108
- const result = await writeLocalFileToWorkspace({
1109
- localPath,
1110
- workspacePath: path,
1111
- workspaceId: currentWorkspaceId,
1112
- readFileSyncFn: readFileSync,
1113
- uploadWorkspaceMemory: async ({ workspacePath, workspaceId, content }) => {
1114
- await api('PUT', `/workspace-memory?path=${encodeURIComponent(workspacePath)}&workspaceId=${encodeURIComponent(workspaceId)}`, { content });
1115
- },
1116
- presign: async ({ workspaceId, path: filePath, size, mime, sha256 }) => {
1117
- return api('POST', '/storage/presign', { workspaceId, path: filePath, size, mime, sha256 });
1118
- },
1119
- confirmUpload: async ({ workspaceId, path: filePath, objectKey }) => {
1120
- await api('POST', '/storage/confirm', { workspaceId, path: filePath, objectKey });
1121
- },
1122
- });
1123
1133
 
1124
- const finalMime = result?.mime ?? 'application/octet-stream';
1125
- const finalBytes = Number.isFinite(result?.bytes) ? result.bytes : 0;
1126
- const sizeText = finalBytes > 0 ? formatBytes(finalBytes) : 'unknown size';
1127
- const asyncNote = result?.async ? ' (uploading in background)' : '';
1128
- return {
1129
- content: [{
1130
- type: 'text',
1131
- text: `Saved local file to workspace: ${path} (${finalMime}, ${sizeText})${asyncNote}`,
1132
- }],
1133
- };
1134
+ const plan = resolveWorkspaceFileUploadPlan({ localPath, workspacePath });
1135
+
1136
+ // ── Text path: tiny, sync, unchanged ────────────────────────────────────
1137
+ if (plan.isText) {
1138
+ const result = await writeLocalFileToWorkspace({
1139
+ localPath,
1140
+ workspacePath,
1141
+ workspaceId: currentWorkspaceId,
1142
+ readFileSyncFn: readFileSync,
1143
+ uploadWorkspaceMemory: async ({ workspacePath: wp, workspaceId, content }) => {
1144
+ await api('PUT', `/workspace-memory?path=${encodeURIComponent(wp)}&workspaceId=${encodeURIComponent(workspaceId)}`, { content });
1145
+ },
1146
+ // Binary path below replaces these; keep these stubs in case the plan
1147
+ // misclassifies, to avoid TypeError mid-tool-call.
1148
+ presign: async () => { throw new Error('binary upload should not reach legacy presign'); },
1149
+ confirmUpload: async () => { throw new Error('binary upload should not reach legacy confirmUpload'); },
1150
+ });
1151
+ const bytes = Number.isFinite(result?.bytes) ? result.bytes : 0;
1152
+ return {
1153
+ content: [{
1154
+ type: 'text',
1155
+ text: `Saved local text file to workspace: ${workspacePath} (${plan.mime}, ${formatBytes(bytes)})`,
1156
+ }],
1157
+ };
1158
+ }
1159
+
1160
+ // ── Binary path: async enqueue ──────────────────────────────────────────
1161
+ try {
1162
+ const enq = await uploadJobManager.enqueue({
1163
+ workspaceId: currentWorkspaceId,
1164
+ agentId: AGENT_ID || null,
1165
+ localPath,
1166
+ workspacePath,
1167
+ mime: plan.mime,
1168
+ });
1169
+ const lines = [
1170
+ `Queued upload to workspace: ${workspacePath} (${plan.mime}, ${formatBytes(enq.size)})`,
1171
+ `mode=${enq.mode}${enq.mode === 'multipart' ? ` chunks=${enq.totalChunks}` : ''}`,
1172
+ `job_id=${enq.jobId}`,
1173
+ `status=pending`,
1174
+ `note=Upload runs in background. Call submit_to_library when ready — it waits up to ~90s for the upload to finish. If it returns "upload_still_in_progress", just retry submit_to_library a few seconds later. DO NOT re-record or re-compose.`,
1175
+ ];
1176
+ return { content: [{ type: 'text', text: lines.join('\n') }] };
1177
+ } catch (err) {
1178
+ return {
1179
+ isError: true,
1180
+ content: [{ type: 'text', text: `write_workspace_file failed: ${err?.message ?? err}` }],
1181
+ };
1182
+ }
1134
1183
  });
1135
1184
 
1136
1185
  // ── skill_list ───────────────────────────────────────────────────────────────
@@ -30,17 +30,18 @@ export async function runSubmitToLibraryTool({
30
30
  '/content-library/submit',
31
31
  buildSubmitToLibraryBody(args, currentWorkspaceId)
32
32
  );
33
- // Server returns 2xx + body.error for transient "still processing" cases
34
- // (e.g. HTTP 202 + {error: "video is still uploading, retry shortly"} when
35
- // the workspace file's object_status is still 'uploading'). The HTTP client
36
- // treats any 2xx as success and returns the body verbatim, so we must
37
- // surface body.error here — otherwise the agent reads `data.itemId` as
38
- // undefined and (because the literal "undefined" looks like proof the
39
- // submit silently failed) tends to redo the entire video instead of just
40
- // retrying submit_to_library. Fail loudly with the server's message so the
41
- // agent retries the right step.
33
+ // Server returns 2xx + body.error for transient "still processing" cases:
34
+ // HTTP 202 + {error: 'upload_still_in_progress', message: '...'} when the
35
+ // async upload pipeline (see docs/upload-pipeline-design.md) hasn't
36
+ // finished. The HTTP client treats any 2xx as success and returns the
37
+ // body verbatim, so we must surface body.error here — otherwise the agent
38
+ // reads `data.itemId` as undefined and (because the literal "undefined"
39
+ // looks like proof the submit silently failed) tends to redo the entire
40
+ // video instead of just retrying submit_to_library. Prefer body.message
41
+ // when present (rich human-readable retry hint); fall back to body.error.
42
42
  if (data && typeof data === 'object' && typeof data.error === 'string' && data.error.trim()) {
43
- return toolError(`submit_to_library not ready: ${data.error}. Retry submit_to_library with the same video_path in a few seconds — do NOT re-record or re-compose the video.`);
43
+ const detail = (typeof data.message === 'string' && data.message.trim()) ? data.message : data.error;
44
+ return toolError(`submit_to_library not ready: ${detail}. Retry submit_to_library with the same video_path in a few seconds — do NOT re-record or re-compose the video.`);
44
45
  }
45
46
  if (!data || typeof data.itemId !== 'string' || !data.itemId) {
46
47
  return toolError(`submit_to_library returned no itemId. Server response: ${JSON.stringify(data).slice(0, 300)}. Retry submit_to_library with the same video_path — do NOT re-record or re-compose.`);
@@ -103,6 +103,7 @@ export async function runComposeVideoV2Tool({ segments, outro_paths, format, res
103
103
  'compose_video_v2 completed.',
104
104
  `path=${result.path}`,
105
105
  `duration_ms=${result.duration_ms}`,
106
+ `size_bytes=${result.size_bytes ?? 'unknown'}`,
106
107
  `segments=${segments.length}`,
107
108
  `outro_clips=${(outro_paths ?? []).length}`,
108
109
  ];
@@ -0,0 +1,542 @@
1
+ // Daemon-owned upload job manager.
2
+ //
3
+ // Decouples the agent-facing `write_workspace_file` tool from the actual
4
+ // PUT-to-COS round-trip so:
5
+ // - the tool returns in <50ms (just enqueues a job) rather than holding the
6
+ // codex/claude tool slot for minutes
7
+ // - PUT failures are retried with exponential backoff
8
+ // - large files use COS multipart so a single dropped chunk doesn't restart
9
+ // the whole upload
10
+ // - daemon restarts pick up in-flight jobs from disk and resume
11
+ //
12
+ // Out of scope for this module: the actual COS API calls. Those live behind
13
+ // the injected `serverApi` interface (M3 implements the server endpoints, M4
14
+ // wires this manager to it).
15
+ //
16
+ // Schema and state machine: see docs/upload-pipeline-design.md.
17
+
18
+ import {
19
+ readFileSync, writeFileSync, readdirSync, statSync,
20
+ promises as fsPromises,
21
+ createReadStream,
22
+ mkdirSync, renameSync,
23
+ } from 'node:fs';
24
+ import { open as fsOpen } from 'node:fs/promises';
25
+ import path from 'node:path';
26
+ import crypto, { randomUUID } from 'node:crypto';
27
+ import os from 'node:os';
28
+
29
// Version stamp written into every persisted job file; files carrying any
// other version are ignored when loaded.
export const SCHEMA_VERSION = 1;

// Size of each multipart chunk: 8 MiB.
export const DEFAULT_CHUNK_SIZE = 8 * 1024 ** 2;

// Files at or above this size (5 MiB, the COS minimum part size) go multipart.
export const MULTIPART_THRESHOLD = 5 * 1024 ** 2;

// A job is dead-lettered once it has failed this many times.
export const MAX_JOB_ATTEMPTS = 5;

// How often the scheduler scans the job directory for runnable jobs.
export const DEFAULT_TICK_INTERVAL_MS = 2 * 1000;

// Per-chunk retry budget and the base delay of its exponential backoff.
export const PART_RETRY_ATTEMPTS = 3;
export const PART_RETRY_BASE_MS = 1000;

// Terminal (done / dead_letter) job files are swept after 7 days.
export const TERMINAL_JOB_TTL_MS = 7 * 24 * 60 * 60 * 1000;

// Housekeeping sweep cadence: every 6 hours.
export const HOUSEKEEPING_INTERVAL_MS = 6 * 60 * 60 * 1000;
38
+
39
// Current wall-clock time as an ISO-8601 UTC string (e.g. 2024-01-01T00:00:00.000Z).
function nowIso() {
  const current = new Date();
  return current.toISOString();
}
40
+
41
/**
 * Stream a file through SHA-256 without loading it fully into memory.
 *
 * @param {string} localPath - file to hash
 * @returns {Promise<string>} lowercase hex digest of the file contents
 *   (rejects with the stream's error if the file cannot be read)
 */
async function sha256OfFile(localPath) {
  const hasher = crypto.createHash('sha256');
  for await (const piece of createReadStream(localPath)) {
    hasher.update(piece);
  }
  return hasher.digest('hex');
}
50
+
51
/**
 * Job-level retry delay: quadratic in the number of failed attempts,
 * i.e. attempts² × 30s (30s, 120s, 270s, 480s for attempts 1..4).
 *
 * @param {number} attempts - failed attempts recorded so far
 * @returns {number} delay in milliseconds before the next attempt
 */
export function jobBackoffMs(attempts) {
  const squared = attempts * attempts;
  return squared * 30_000;
}
56
+
57
/**
 * Chunk-level retry delay: PART_RETRY_BASE_MS tripled for every attempt
 * after the first.
 *
 * @param {number} attempt - 1-based attempt number that just failed
 * @returns {number} delay in milliseconds before the next try
 */
function partBackoffMs(attempt) {
  const exponent = attempt - 1;
  const multiplier = 3 ** exponent;
  return PART_RETRY_BASE_MS * multiplier;
}
61
+
62
// Promise-based delay: resolves (with undefined) after roughly `ms` milliseconds.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
63
+
64
// Code tagged onto the internal "manager is stopping" error so _advance can
// tell a graceful shutdown apart from a genuine upload failure.
const STOPPING_ERROR_CODE = 'upload_manager_stopping';

/**
 * UploadJobManager — singleton in daemon process.
 *
 *   const mgr = new UploadJobManager({ serverApi });
 *   mgr.start();
 *   const { jobId } = await mgr.enqueue({ workspaceId, agentId, localPath, workspacePath, mime });
 *   // job runs in background; later:
 *   mgr.getStatus(jobId);
 *   // or shutting down:
 *   mgr.stop();
 *
 * Every job is persisted as `<jobDir>/<jobId>.json` and advanced by a periodic
 * tick. At most one non-terminal job may target a given
 * (workspaceId, workspacePath) pair at a time; the in-memory path lock is
 * rebuilt from disk on start().
 *
 * serverApi shape (M3 implements):
 *   presignSingle({ workspaceId, path, size, mime, sha256 }) → { objectKey, uploadUrl, method?, headers? }
 *   confirmSingle({ workspaceId, path, objectKey }) → { ok: true }
 *   presignMultipart({ workspaceId, path, size, mime, sha256 }) → { objectKey, cosUploadId }
 *   presignPart({ workspaceId, objectKey, cosUploadId, partNumber }) → { url, method?, headers? }
 *   completeMultipart({ workspaceId, path, objectKey, cosUploadId, parts, size, sha256 }) → { ok: true }
 *   abortMultipart({ workspaceId, path, objectKey, cosUploadId }) → { ok: true }
 */
export class UploadJobManager {
  /**
   * @param {Object} [options]
   * @param {string} [options.jobDir] - directory holding one JSON file per job (created if missing)
   * @param {Object} options.serverApi - presign/confirm implementation (required; shape above)
   * @param {Function} [options.fetchFn] - fetch-compatible function used for the actual PUTs
   * @param {() => number} [options.nowFn] - clock returning epoch milliseconds
   * @param {(msg: string) => void} [options.log] - log sink
   * @param {number} [options.tickIntervalMs] - scheduler scan period
   * @param {number} [options.chunkSize] - multipart part size in bytes
   * @param {number} [options.multipartThreshold] - files at or above this size use multipart
   * @throws {Error} when serverApi is missing
   */
  constructor({
    jobDir = path.join(os.homedir(), '.lightcone', 'upload-jobs'),
    serverApi,
    fetchFn = globalThis.fetch,
    nowFn = () => Date.now(),
    log = (msg) => console.error(`[UploadJobManager] ${msg}`),
    tickIntervalMs = DEFAULT_TICK_INTERVAL_MS,
    chunkSize = DEFAULT_CHUNK_SIZE,
    multipartThreshold = MULTIPART_THRESHOLD,
  } = {}) {
    if (!serverApi) throw new Error('UploadJobManager: serverApi is required');
    this.jobDir = jobDir;
    this.serverApi = serverApi;
    this.fetchFn = fetchFn;
    this.nowFn = nowFn;
    this.log = log;
    this.tickIntervalMs = tickIntervalMs;
    this.chunkSize = chunkSize;
    this.multipartThreshold = multipartThreshold;

    this._tickInterval = null;
    this._housekeepingInterval = null;
    this._stopping = false;
    this._activeJobs = new Set(); // jobIds currently advancing
    this._pathLocks = new Map(); // `${workspaceId}|${workspacePath}` → jobId

    mkdirSync(this.jobDir, { recursive: true });
  }

  // ─── public API ──────────────────────────────────────────────────────────

  /**
   * Stat + sha256 the file, choose mode (single vs multipart), persist a
   * pending job, and schedule a tick.
   *
   * The path lock is reserved BEFORE the (asynchronous) hashing step, so two
   * concurrent enqueue() calls for the same destination cannot both succeed.
   * The reservation is released again if hashing or persisting fails.
   *
   * @returns {Promise<{jobId: string, mode: string, totalChunks: number, status: string, size: number}>}
   * @throws {Error} (as a rejection) on missing arguments, path-lock conflict
   *   (`upload_path_locked: …`), or a missing / unreadable / empty file.
   */
  async enqueue({ workspaceId, agentId = null, localPath, workspacePath, mime = 'application/octet-stream' }) {
    if (!workspaceId) throw new Error('enqueue: workspaceId required');
    if (!localPath) throw new Error('enqueue: localPath required');
    if (!workspacePath) throw new Error('enqueue: workspacePath required');

    const pathKey = `${workspaceId}|${workspacePath}`;
    const existing = this._pathLocks.get(pathKey);
    if (existing) {
      throw new Error(`upload_path_locked: another upload to ${workspacePath} is in progress (jobId=${existing})`);
    }

    let st;
    try { st = statSync(localPath); }
    catch (err) { throw new Error(`localPath not readable: ${err.message}`); }
    const size = st.size;
    if (!Number.isFinite(size) || size <= 0) {
      throw new Error(`localPath has invalid size: ${size}`);
    }
    const mtimeMs = st.mtimeMs;

    // Everything up to here is synchronous. Claim the path now, before the
    // first await, otherwise a second enqueue() for the same destination can
    // slip past the lock check while this one is still hashing.
    const jobId = randomUUID();
    this._pathLocks.set(pathKey, jobId);

    const uploadMode = size >= this.multipartThreshold ? 'multipart' : 'single';
    const totalChunks = uploadMode === 'multipart' ? Math.ceil(size / this.chunkSize) : 1;

    try {
      const sha256 = await sha256OfFile(localPath);
      const job = {
        schemaVersion: SCHEMA_VERSION,
        jobId,
        createdAt: nowIso(),
        updatedAt: nowIso(),
        agentId,
        workspaceId,
        localPath,
        workspacePath,
        mime,
        objectKey: null,
        size,
        mtimeMs,
        sha256,
        uploadMode,
        chunkSize: this.chunkSize,
        totalChunks,
        cosUploadId: null,
        doneParts: [],
        status: 'pending',
        attempts: 0,
        lastError: null,
        lastErrorAt: null,
        nextAttemptAt: this.nowFn(),
      };
      this._persist(job);
    } catch (err) {
      // No job made it to disk, so nothing will ever release this reservation
      // for us — drop it before surfacing the error.
      if (this._pathLocks.get(pathKey) === jobId) this._pathLocks.delete(pathKey);
      throw err;
    }

    // Best-effort immediate tick so the very first job doesn't wait for the interval.
    setImmediate(() => this._tick().catch(err => this.log(`tick (post-enqueue) failed: ${err.message}`)));

    return {
      jobId,
      mode: uploadMode,
      totalChunks,
      status: 'pending',
      size,
    };
  }

  /**
   * Read a job's current state from disk.
   * @returns {Object|null} public snapshot, or null if the job file is
   *   missing, unparsable, or has an unsupported schema version.
   */
  getStatus(jobId) {
    const job = this._loadById(jobId);
    return job ? this._publicState(job) : null;
  }

  /**
   * Begin periodic ticking and housekeeping. Idempotent while running.
   */
  start() {
    if (this._tickInterval) return;
    this._stopping = false;
    // Rebuild path locks from on-disk jobs so concurrent enqueue still respects them.
    for (const job of this._listJobs()) {
      if (job.status === 'pending' || job.status === 'uploading') {
        const pathKey = `${job.workspaceId}|${job.workspacePath}`;
        this._pathLocks.set(pathKey, job.jobId);
      }
    }
    this._tickInterval = setInterval(() => {
      this._tick().catch(err => this.log(`tick failed: ${err.message}`));
    }, this.tickIntervalMs);
    // Run housekeeping immediately + on a slow schedule.
    this._housekeep();
    this._housekeepingInterval = setInterval(() => {
      this._housekeep();
    }, HOUSEKEEPING_INTERVAL_MS);
    this.log(`started (jobDir=${this.jobDir}, tick=${this.tickIntervalMs}ms)`);
  }

  /**
   * Signal stop. In-flight `_advance` calls notice via `_stopping` flag and
   * exit between chunks; partial state is persisted so the next start() can
   * resume. A stop is never recorded as a failed attempt. Does NOT abort the
   * underlying fetch in progress — a single chunk may still complete after
   * stop returns.
   */
  stop() {
    this._stopping = true;
    if (this._tickInterval) clearInterval(this._tickInterval);
    this._tickInterval = null;
    if (this._housekeepingInterval) clearInterval(this._housekeepingInterval);
    this._housekeepingInterval = null;
  }

  /**
   * Drop terminal (done / dead_letter) job files older than TERMINAL_JOB_TTL_MS.
   * Keeps recent ones around for observability / debugging. Never throws.
   */
  _housekeep() {
    try {
      const now = this.nowFn();
      let pruned = 0;
      for (const job of this._listJobs()) {
        if (job.status !== 'done' && job.status !== 'dead_letter') continue;
        const updatedAt = job.updatedAt ? Date.parse(job.updatedAt) : 0;
        if (!Number.isFinite(updatedAt)) continue;
        if (now - updatedAt < TERMINAL_JOB_TTL_MS) continue;
        // Fire-and-forget: the file is tiny and a failed unlink is simply
        // retried on the next housekeeping pass.
        fsPromises.unlink(this._jobPath(job.jobId)).catch(() => {});
        pruned += 1;
      }
      if (pruned > 0) this.log(`housekeeping: pruned ${pruned} terminal job(s) older than ${TERMINAL_JOB_TTL_MS / 86_400_000}d`);
    } catch (err) {
      this.log(`housekeeping failed: ${err?.message ?? err}`);
    }
  }

  /**
   * Test helper: wait until no jobs are pending/uploading or until timeout.
   * @throws {Error} 'waitAllSettled: timeout' when the deadline passes
   */
  async waitAllSettled({ timeoutMs = 30_000 } = {}) {
    const start = this.nowFn();
    while (this.nowFn() - start < timeoutMs) {
      const jobs = this._listJobs();
      const inFlight = jobs.some(j => j.status === 'pending' || j.status === 'uploading');
      if (!inFlight && this._activeJobs.size === 0) return;
      await sleep(50);
    }
    throw new Error('waitAllSettled: timeout');
  }

  // ─── internal ────────────────────────────────────────────────────────────

  /**
   * One scheduler pass: kick off `_advance` (without awaiting it) for every
   * non-terminal job that is due and not already being advanced.
   */
  async _tick() {
    if (this._stopping) return;
    const jobs = this._listJobs();
    const now = this.nowFn();
    for (const job of jobs) {
      if (this._stopping) break;
      if (job.status === 'done' || job.status === 'dead_letter') continue;
      if (this._activeJobs.has(job.jobId)) continue;
      if (job.nextAttemptAt && job.nextAttemptAt > now) continue;
      this._activeJobs.add(job.jobId);
      this._advance(job)
        .catch(err => this.log(`advance(${job.jobId}) crashed: ${err.message}`))
        .finally(() => this._activeJobs.delete(job.jobId));
    }
  }

  /**
   * Run one upload attempt for a job. Dead-letters the job if the local file
   * vanished or changed size; records a failure (with backoff) on any other
   * error; leaves the job untouched when interrupted by stop().
   */
  async _advance(job) {
    try {
      // Local-file integrity check before each attempt.
      let st;
      try { st = statSync(job.localPath); }
      catch (err) {
        return this._markDeadLetter(job, `local_file_gone:${err.code ?? err.message}`);
      }
      if (st.size !== job.size) {
        return this._markDeadLetter(job, `local_file_changed: declared=${job.size}, now=${st.size}`);
      }

      if (job.status !== 'uploading') {
        job.status = 'uploading';
        this._persist(job);
      }

      if (job.uploadMode === 'single') {
        await this._advanceSingle(job);
      } else {
        await this._advanceMultipart(job);
      }
    } catch (err) {
      if (err?.code === STOPPING_ERROR_CODE) {
        // Graceful shutdown, not an upload failure: completed parts are
        // already persisted, so leave attempts/backoff alone and let the next
        // start() resume where we left off.
        this.log(`job ${job.jobId} paused for shutdown`);
        return;
      }
      this._recordFailure(job, err);
    }
  }

  /**
   * Single-PUT upload: presign, PUT the whole file, confirm, mark done.
   */
  async _advanceSingle(job) {
    const presign = await this.serverApi.presignSingle({
      workspaceId: job.workspaceId,
      path: job.workspacePath,
      size: job.size,
      mime: job.mime,
      sha256: job.sha256,
    });
    job.objectKey = presign.objectKey;
    this._persist(job);

    const fileBuf = await fsPromises.readFile(job.localPath);
    const resp = await this.fetchFn(presign.uploadUrl, {
      method: presign.method ?? 'PUT',
      headers: {
        'Content-Type': job.mime,
        'Content-Length': String(job.size),
        ...(presign.headers ?? {}),
      },
      body: fileBuf,
    });
    if (!resp.ok) {
      const text = await resp.text().catch(() => '');
      throw new Error(`single PUT failed: HTTP ${resp.status} ${text.slice(0, 200)}`);
    }

    await this.serverApi.confirmSingle({
      workspaceId: job.workspaceId,
      path: job.workspacePath,
      objectKey: job.objectKey,
    });

    this._markDone(job);
  }

  /**
   * Multipart upload: initialise once (the upload id is persisted), upload
   * every part not yet recorded in job.doneParts, then complete. Returns
   * early, without completing, when stop() is observed between parts.
   */
  async _advanceMultipart(job) {
    if (!job.cosUploadId) {
      const init = await this.serverApi.presignMultipart({
        workspaceId: job.workspaceId,
        path: job.workspacePath,
        size: job.size,
        mime: job.mime,
        sha256: job.sha256,
      });
      job.objectKey = init.objectKey;
      job.cosUploadId = init.cosUploadId;
      this._persist(job);
    }

    const doneSet = new Set(job.doneParts.map(p => p.partNumber));
    const fh = await fsOpen(job.localPath, 'r');
    try {
      for (let i = 1; i <= job.totalChunks; i++) {
        if (this._stopping) return; // graceful shutdown mid-job
        if (doneSet.has(i)) continue;

        const offset = (i - 1) * job.chunkSize;
        const remaining = job.size - offset;
        const partLen = Math.min(job.chunkSize, remaining);
        const buf = await this._readExactly(fh, partLen, offset);

        const etag = await this._uploadPartWithRetry(job, i, buf);
        job.doneParts.push({ partNumber: i, etag });
        // Persist after every part so a crash/restart only redoes one chunk.
        this._persist(job);
      }
    } finally {
      await fh.close().catch(() => {});
    }

    if (this._stopping) return;

    await this.serverApi.completeMultipart({
      workspaceId: job.workspaceId,
      path: job.workspacePath,
      objectKey: job.objectKey,
      cosUploadId: job.cosUploadId,
      parts: job.doneParts,
      size: job.size,
      sha256: job.sha256,
    });

    this._markDone(job);
  }

  /**
   * Read exactly `length` bytes starting at `position`. FileHandle.read may
   * return fewer bytes than requested, and returns 0 at EOF; uploading a
   * partially filled (zero-padded) buffer would silently corrupt the object,
   * so keep reading until full and fail loudly on premature EOF.
   *
   * @throws {Error} when the file ends before `length` bytes were read
   */
  async _readExactly(fh, length, position) {
    const buf = Buffer.alloc(length);
    let filled = 0;
    while (filled < length) {
      const { bytesRead } = await fh.read(buf, filled, length - filled, position + filled);
      if (bytesRead === 0) {
        throw new Error(`short read: expected ${length} bytes at offset ${position}, got ${filled}`);
      }
      filled += bytesRead;
    }
    return buf;
  }

  /**
   * PUT one part, retrying up to PART_RETRY_ATTEMPTS times with backoff.
   *
   * @returns {Promise<string>} the part's ETag
   * @throws {Error} after the retry budget is exhausted, or a stop-tagged
   *   error (code STOPPING_ERROR_CODE) if the manager is shutting down
   */
  async _uploadPartWithRetry(job, partNumber, buf) {
    let lastErr;
    for (let attempt = 1; attempt <= PART_RETRY_ATTEMPTS; attempt++) {
      if (this._stopping) {
        throw Object.assign(new Error('stopping'), { code: STOPPING_ERROR_CODE });
      }
      try {
        const presign = await this.serverApi.presignPart({
          workspaceId: job.workspaceId,
          objectKey: job.objectKey,
          cosUploadId: job.cosUploadId,
          partNumber,
        });
        const resp = await this.fetchFn(presign.url, {
          method: presign.method ?? 'PUT',
          headers: {
            'Content-Length': String(buf.length),
            ...(presign.headers ?? {}),
          },
          body: buf,
        });
        if (!resp.ok) {
          const text = await resp.text().catch(() => '');
          throw new Error(`HTTP ${resp.status} ${text.slice(0, 200)}`);
        }
        const etag = resp.headers.get?.('etag') ?? resp.headers.get?.('ETag') ?? presign.etag ?? '';
        if (!etag) throw new Error(`PUT part ${partNumber} missing etag`);
        return etag;
      } catch (err) {
        lastErr = err;
        this.log(`part ${partNumber} attempt ${attempt}/${PART_RETRY_ATTEMPTS} failed: ${err.message}`);
        // No point sleeping through the backoff if we are shutting down; the
        // check at the top of the loop will bail out immediately.
        if (attempt < PART_RETRY_ATTEMPTS && !this._stopping) {
          await sleep(partBackoffMs(attempt));
        }
      }
    }
    throw new Error(`part ${partNumber} exhausted ${PART_RETRY_ATTEMPTS} retries: ${lastErr?.message ?? lastErr}`);
  }

  /** Mark a job as successfully finished and free its path lock. */
  _markDone(job) {
    job.status = 'done';
    job.updatedAt = nowIso();
    job.lastError = null;
    job.lastErrorAt = null;
    job.nextAttemptAt = null;
    this._persist(job);
    this._releaseLock(job);
    this.log(`job ${job.jobId} done (${job.workspacePath}, ${job.size} bytes, ${job.uploadMode})`);
  }

  /** Permanently fail a job (no further retries) and free its path lock. */
  _markDeadLetter(job, reason) {
    // Best-effort COS cleanup so we don't leak storage cost on aborted multipart.
    if (job.uploadMode === 'multipart' && job.cosUploadId && job.objectKey) {
      Promise.resolve(this.serverApi.abortMultipart({
        workspaceId: job.workspaceId,
        path: job.workspacePath,
        objectKey: job.objectKey,
        cosUploadId: job.cosUploadId,
      })).catch(err => this.log(`abort_multipart for ${job.jobId} failed: ${err.message}`));
    }
    job.status = 'dead_letter';
    job.lastError = String(reason);
    job.lastErrorAt = nowIso();
    job.updatedAt = nowIso();
    job.nextAttemptAt = null;
    this._persist(job);
    this._releaseLock(job);
    this.log(`job ${job.jobId} dead_letter: ${reason}`);
  }

  /**
   * Count a failed attempt. Schedules a retry with quadratic backoff, or
   * dead-letters the job once MAX_JOB_ATTEMPTS is reached.
   */
  _recordFailure(job, err) {
    job.attempts = (job.attempts ?? 0) + 1;
    job.lastError = String(err?.message ?? err);
    job.lastErrorAt = nowIso();
    job.updatedAt = nowIso();
    if (job.attempts >= MAX_JOB_ATTEMPTS) {
      this._markDeadLetter(job, `max_attempts_exhausted: ${job.lastError}`);
      return;
    }
    job.status = 'pending';
    const backoff = jobBackoffMs(job.attempts);
    job.nextAttemptAt = this.nowFn() + backoff;
    this._persist(job);
    this.log(`job ${job.jobId} attempt ${job.attempts} failed: ${job.lastError}; next retry in ${Math.round(backoff / 1000)}s`);
  }

  /** Release the destination-path lock, but only if this job still owns it. */
  _releaseLock(job) {
    const pathKey = `${job.workspaceId}|${job.workspacePath}`;
    if (this._pathLocks.get(pathKey) === job.jobId) {
      this._pathLocks.delete(pathKey);
    }
  }

  // ─── persistence ─────────────────────────────────────────────────────────

  /** Absolute path of the JSON file backing a job. */
  _jobPath(jobId) {
    return path.join(this.jobDir, `${jobId}.json`);
  }

  /**
   * Write the job to disk via temp-file + rename so readers never observe a
   * half-written file. Also refreshes job.updatedAt.
   */
  _persist(job) {
    job.updatedAt = nowIso();
    const dest = this._jobPath(job.jobId);
    const tmp = `${dest}.tmp`;
    writeFileSync(tmp, JSON.stringify(job, null, 2));
    renameSync(tmp, dest); // atomic on POSIX
  }

  /**
   * Load one job from disk.
   * @returns {Object|null} null if missing, unparsable, or wrong schemaVersion
   */
  _loadById(jobId) {
    try {
      const text = readFileSync(this._jobPath(jobId), 'utf8');
      const job = JSON.parse(text);
      if (job.schemaVersion !== SCHEMA_VERSION) {
        this.log(`job ${jobId}: unsupported schemaVersion ${job.schemaVersion}, ignored`);
        return null;
      }
      return job;
    } catch { return null; }
  }

  /**
   * Load every readable job in jobDir. In-progress `*.json.tmp` files are
   * skipped because they do not end in `.json`.
   */
  _listJobs() {
    let names;
    try { names = readdirSync(this.jobDir); }
    catch { return []; }
    const out = [];
    for (const name of names) {
      if (!name.endsWith('.json') || name.endsWith('.tmp.json')) continue;
      const jobId = name.slice(0, -5);
      const job = this._loadById(jobId);
      if (job) out.push(job);
    }
    return out;
  }

  /** Project a persisted job onto the subset of fields exposed by getStatus(). */
  _publicState(job) {
    return {
      jobId: job.jobId,
      status: job.status,
      mode: job.uploadMode,
      size: job.size,
      progress: job.uploadMode === 'multipart'
        ? { donePartCount: job.doneParts.length, totalChunks: job.totalChunks }
        : { donePartCount: job.status === 'done' ? 1 : 0, totalChunks: 1 },
      attempts: job.attempts,
      lastError: job.lastError,
      lastErrorAt: job.lastErrorAt,
      nextAttemptAt: job.nextAttemptAt,
      objectKey: job.objectKey,
    };
  }
}
@@ -0,0 +1,80 @@
1
+ // Thin daemon-side wrapper around the server's /storage/* endpoints.
2
+ //
3
+ // Implements the `serverApi` interface consumed by UploadJobManager. Single
4
+ // place that translates the abstract { presignSingle / confirmSingle /
5
+ // presignMultipart / presignPart / completeMultipart / abortMultipart } calls
6
+ // into HTTP requests against the lightcone server's internal/agent/.../storage/*
7
+ // endpoints. The actual HTTP plumbing — auth headers, governance, retries —
8
+ // is delegated to the `api` function passed in (typically chat-bridge's `api`).
9
+ //
10
+ // Why split this out: keeps UploadJobManager pure (no HTTP knowledge) and lets
11
+ // us write the manager's tests entirely against an injected mock.
12
+
13
/**
 * Build the `serverApi` object: one async method per storage operation, each
 * a single POST to the matching `/storage/*` route through the injected `api`.
 *
 * @param {Object} deps
 * @param {(method: string, path: string, body?: unknown) => Promise<unknown>} deps.api
 *   HTTP helper invoked as `api(method, path, body)`; every call made here
 *   uses method 'POST'. It is expected to resolve with the parsed JSON body
 *   and to throw on non-2xx (presumably it prefixes the path with
 *   `/internal/agent/<agentId>` — confirm against the caller's `api`).
 * @returns {{
 *   presignSingle: Function, confirmSingle: Function, presignMultipart: Function,
 *   presignPart: Function, completeMultipart: Function, abortMultipart: Function
 * }}
 * @throws {Error} when `api` is not a function.
 */
export function createUploadServerApi({ api }) {
  if (typeof api !== 'function') {
    throw new Error('createUploadServerApi: api function is required');
  }

  // POST and insist on a response body. The presign* methods read fields off
  // the response; without this check an empty body surfaces as an opaque
  // "Cannot read properties of undefined" with no hint of which call failed.
  const postForBody = async (endpoint, payload) => {
    const data = await api('POST', endpoint, payload);
    if (data == null) {
      throw new TypeError(`createUploadServerApi: ${endpoint} returned an empty response body`);
    }
    return data;
  };

  return {
    /**
     * Request a presigned URL for a single-request upload.
     * @returns {Promise<{objectKey: *, uploadUrl: *, method: string, headers: Object, alreadyExists: boolean}>}
     */
    async presignSingle({ workspaceId, path, size, mime, sha256 }) {
      const data = await postForBody('/storage/presign', {
        workspaceId, path, size, mime, sha256,
      });
      // Normalise to the shape the manager expects: method defaults to PUT,
      // headers to an empty object, alreadyExists is coerced to a boolean.
      return {
        objectKey: data.objectKey,
        uploadUrl: data.uploadUrl,
        method: data.method ?? 'PUT',
        headers: data.headers ?? {},
        alreadyExists: !!data.alreadyExists,
      };
    },

    /** Confirm a finished single-request upload; resolves with the raw response. */
    async confirmSingle({ workspaceId, path, objectKey }) {
      return api('POST', '/storage/confirm', { workspaceId, path, objectKey });
    },

    /**
     * Open a multipart upload.
     * @returns {Promise<{objectKey: *, cosUploadId: *, alreadyExists: boolean}>}
     */
    async presignMultipart({ workspaceId, path, size, mime, sha256 }) {
      const data = await postForBody('/storage/presign-multipart', {
        workspaceId, path, size, mime, sha256,
      });
      return {
        objectKey: data.objectKey,
        cosUploadId: data.cosUploadId,
        alreadyExists: !!data.alreadyExists,
      };
    },

    /**
     * Request a presigned URL for one part of a multipart upload.
     * @returns {Promise<{url: *, method: string, headers: Object}>}
     */
    async presignPart({ workspaceId, objectKey, cosUploadId, partNumber }) {
      const data = await postForBody('/storage/presign-part', {
        workspaceId, objectKey, cosUploadId, partNumber,
      });
      return {
        url: data.url,
        method: data.method ?? 'PUT',
        headers: data.headers ?? {},
      };
    },

    /** Finish a multipart upload; resolves with the raw response. */
    async completeMultipart({ workspaceId, path, objectKey, cosUploadId, parts, size, sha256 }) {
      return api('POST', '/storage/complete-multipart', {
        workspaceId, path, objectKey, cosUploadId, parts, size, sha256,
      });
    },

    /** Abort a multipart upload; resolves with the raw response. */
    async abortMultipart({ workspaceId, path, objectKey, cosUploadId }) {
      return api('POST', '/storage/abort-multipart', {
        workspaceId, path, objectKey, cosUploadId,
      });
    },
  };
}
@@ -1,7 +1,31 @@
1
1
  import path, { extname } from 'node:path';
2
- import { readFileSync, createReadStream } from 'node:fs';
2
+ import { readFileSync, createReadStream, statSync } from 'node:fs';
3
3
  import { createHash } from 'node:crypto';
4
4
 
5
// Wait for a file to stop growing before we read it. compose_video_v2 / ffmpeg
// occasionally finishes the wrapping tool-call promise a fraction earlier than
// the kernel finishes the last writeback for the output mp4 (observed: codex
// reports item.completed at T, file mtime is T+60s, and write_workspace_file
// reads a half-finished file in between). We stat → sleep → stat; if size or
// mtime moved, the file is still being written and we retry. Cheap: a stable
// file costs one interval (two stats); an actively-growing one is bounded by
// `attempts` stats with `attempts - 1` sleeps in between.
/**
 * @param {string} localPath - file to watch; a missing file makes statSync throw.
 * @param {Object} [options]
 * @param {number} [options.intervalMs=300] - pause between two samples.
 * @param {number} [options.attempts=10] - maximum number of stat samples.
 * @returns {Promise<{ size: number, stable: boolean }>} last observed size
 *   (-1 when no sample was taken), and whether two consecutive samples agreed
 *   on both size and mtime.
 */
async function waitForFileStable(localPath, { intervalMs = 300, attempts = 10 } = {}) {
  let lastSize = -1;
  let lastMtime = -1;
  for (let i = 0; i < attempts; i += 1) {
    const st = statSync(localPath);
    if (st.size === lastSize && st.mtimeMs === lastMtime) {
      return { size: st.size, stable: true };
    }
    lastSize = st.size;
    lastMtime = st.mtimeMs;
    // Only sleep when another sample will follow; pausing after the final
    // sample would just delay the best-effort return below.
    if (i < attempts - 1) {
      await new Promise((resolve) => setTimeout(resolve, intervalMs));
    }
  }
  // Returned best-effort: caller can still proceed, but mismatch may surface
  // at /storage/confirm size check. `stable: false` lets callers tell this
  // apart from a confirmed-stable file.
  return { size: lastSize, stable: false };
}
28
+
5
29
  export const WORKSPACE_BINARY_MIME = {
6
30
  '.mp4': 'video/mp4',
7
31
  '.png': 'image/png',
@@ -48,6 +72,11 @@ async function uploadBinaryFile({
48
72
  presign, // async fn({ workspaceId, path, size, mime, sha256 }) → { uploadUrl, objectKey, alreadyExists }
49
73
  confirmUpload, // async fn({ workspaceId, path, objectKey }) → void
50
74
  }) {
75
+ // Wait until the source file is no longer being written. Without this we
76
+ // sometimes read a partial mp4 from a still-running ffmpeg, upload that
77
+ // partial buffer, and end up serving a truncated video downstream.
78
+ await waitForFileStable(localPath);
79
+
51
80
  const buf = readFileSync(localPath);
52
81
  const size = buf.length;
53
82
  const sha256 = sha256ofBuffer(buf);