vidpipe 1.3.3 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -291,10 +291,12 @@ function initConfig(cli = {}) {
291
291
  SKIP_MEDIUM_CLIPS: cli.mediumClips === false,
292
292
  SKIP_SOCIAL: cli.social === false,
293
293
  SKIP_CAPTIONS: cli.captions === false,
294
+ SKIP_VISUAL_ENHANCEMENT: cli.visualEnhancement === false,
294
295
  LATE_API_KEY: cli.lateApiKey || process.env.LATE_API_KEY || "",
295
296
  LATE_PROFILE_ID: cli.lateProfileId || process.env.LATE_PROFILE_ID || "",
296
297
  SKIP_SOCIAL_PUBLISH: cli.socialPublish === false,
297
- GEMINI_API_KEY: process.env.GEMINI_API_KEY || ""
298
+ GEMINI_API_KEY: process.env.GEMINI_API_KEY || "",
299
+ GEMINI_MODEL: process.env.GEMINI_MODEL || "gemini-2.5-pro"
298
300
  };
299
301
  return config;
300
302
  }
@@ -724,10 +726,12 @@ async function getVideoResolution(videoPath) {
724
726
  }
725
727
  async function extractSampleFrames(videoPath, tempDir) {
726
728
  const duration = await getVideoDuration(videoPath);
727
- const interval = Math.max(1, Math.floor(duration / (SAMPLE_FRAMES + 1)));
729
+ const effectiveSamples = Math.min(SAMPLE_FRAMES, Math.max(1, Math.floor(duration) - 1));
730
+ const interval = Math.max(1, Math.floor(duration / (effectiveSamples + 1)));
728
731
  const timestamps = [];
729
- for (let i = 1; i <= SAMPLE_FRAMES; i++) {
730
- timestamps.push(i * interval);
732
+ for (let i = 1; i <= effectiveSamples; i++) {
733
+ const ts = i * interval;
734
+ if (ts < duration) timestamps.push(ts);
731
735
  }
732
736
  const framePaths = [];
733
737
  for (let i = 0; i < timestamps.length; i++) {
@@ -871,7 +875,7 @@ function findPeakDiff(means, searchFrom, searchTo, minDiff) {
871
875
  }
872
876
  return maxDiff >= minDiff ? { index: maxIdx, magnitude: maxDiff } : { index: -1, magnitude: maxDiff };
873
877
  }
874
- async function refineBoundingBox(framePaths, position) {
878
+ async function refineBoundingBox(framePaths, position, minEdgeDiff = REFINE_MIN_EDGE_DIFF) {
875
879
  if (framePaths.length === 0) return null;
876
880
  const isRight = position.includes("right");
877
881
  const isBottom = position.includes("bottom");
@@ -893,10 +897,10 @@ async function refineBoundingBox(framePaths, position) {
893
897
  const avgRows = averageFloat64Arrays(rowMeansAll);
894
898
  const xFrom = isRight ? Math.floor(fw * 0.35) : Math.floor(fw * 0.05);
895
899
  const xTo = isRight ? Math.floor(fw * 0.95) : Math.floor(fw * 0.65);
896
- const xEdge = findPeakDiff(avgCols, xFrom, xTo, REFINE_MIN_EDGE_DIFF);
900
+ const xEdge = findPeakDiff(avgCols, xFrom, xTo, minEdgeDiff);
897
901
  const yFrom = isBottom ? Math.floor(fh * 0.35) : Math.floor(fh * 0.05);
898
902
  const yTo = isBottom ? Math.floor(fh * 0.95) : Math.floor(fh * 0.65);
899
- const yEdge = findPeakDiff(avgRows, yFrom, yTo, REFINE_MIN_EDGE_DIFF);
903
+ const yEdge = findPeakDiff(avgRows, yFrom, yTo, minEdgeDiff);
900
904
  if (xEdge.index < 0 || yEdge.index < 0) {
901
905
  logger_default.info(
902
906
  `[FaceDetection] Edge refinement: no strong edges (xDiff=${xEdge.magnitude.toFixed(1)}, yDiff=${yEdge.magnitude.toFixed(1)})`
@@ -986,25 +990,43 @@ async function detectWebcamRegion(videoPath) {
986
990
  y2: boxes.reduce((s, b) => s + b.y2, 0) / boxes.length,
987
991
  confidence: bestConfidence
988
992
  };
989
- const refined = await refineBoundingBox(framePaths, bestPosition);
993
+ let refined = null;
994
+ refined = await refineBoundingBox(framePaths, bestPosition, REFINE_MIN_EDGE_DIFF);
995
+ if (!refined) {
996
+ for (const threshold of REFINE_RETRY_THRESHOLDS) {
997
+ logger_default.info(`[FaceDetection] Retrying edge refinement with threshold=${threshold}`);
998
+ refined = await refineBoundingBox(framePaths, bestPosition, threshold);
999
+ if (refined) break;
1000
+ }
1001
+ }
990
1002
  const scaleX = resolution.width / MODEL_WIDTH;
991
1003
  const scaleY = resolution.height / MODEL_HEIGHT;
992
- let origX, origY, origW, origH;
1004
+ let origX = 0, origY = 0, origW = 0, origH = 0;
993
1005
  if (refined) {
994
1006
  origX = Math.round(refined.x * scaleX);
995
1007
  origY = Math.round(refined.y * scaleY);
996
1008
  origW = Math.round(refined.width * scaleX);
997
1009
  origH = Math.round(refined.height * scaleY);
998
- } else {
999
- const expandFactor = 1.4;
1000
- const faceCx = (avgBox.x1 + avgBox.x2) / 2;
1001
- const faceCy = (avgBox.y1 + avgBox.y2) / 2;
1002
- const faceW = (avgBox.x2 - avgBox.x1) * expandFactor;
1003
- const faceH = (avgBox.y2 - avgBox.y1) * expandFactor;
1004
- origX = Math.max(0, Math.round((faceCx - faceW / 2) * resolution.width));
1005
- origY = Math.max(0, Math.round((faceCy - faceH / 2) * resolution.height));
1006
- origW = Math.min(resolution.width - origX, Math.round(faceW * resolution.width));
1007
- origH = Math.min(resolution.height - origY, Math.round(faceH * resolution.height));
1010
+ const refinedAR = origW / origH;
1011
+ if (origW < MIN_WEBCAM_WIDTH_PX || origH < MIN_WEBCAM_HEIGHT_PX || refinedAR > MAX_WEBCAM_ASPECT_RATIO) {
1012
+ logger_default.info(
1013
+ `[FaceDetection] Refined region implausible (${origW}x${origH}px, AR=${refinedAR.toFixed(1)}), using proportional fallback`
1014
+ );
1015
+ refined = null;
1016
+ }
1017
+ }
1018
+ if (!refined) {
1019
+ const webcamWidthFrac = 0.33;
1020
+ const webcamHeightFrac = 0.28;
1021
+ origW = Math.round(resolution.width * webcamWidthFrac);
1022
+ origH = Math.round(resolution.height * webcamHeightFrac);
1023
+ const isRight = bestPosition.includes("right");
1024
+ const isBottom = bestPosition.includes("bottom");
1025
+ origX = isRight ? resolution.width - origW : 0;
1026
+ origY = isBottom ? resolution.height - origH : 0;
1027
+ logger_default.info(
1028
+ `[FaceDetection] Using proportional fallback: (${origX},${origY}) ${origW}x${origH}`
1029
+ );
1008
1030
  }
1009
1031
  const region = {
1010
1032
  x: origX,
@@ -1028,7 +1050,7 @@ async function detectWebcamRegion(videoPath) {
1028
1050
  });
1029
1051
  }
1030
1052
  }
1031
- var ffmpegPath, ffprobePath, MODEL_PATH, cachedSession, SAMPLE_FRAMES, MODEL_WIDTH, MODEL_HEIGHT, MIN_FACE_CONFIDENCE, MIN_DETECTION_CONFIDENCE, REFINE_MIN_EDGE_DIFF, REFINE_MIN_SIZE_FRAC, REFINE_MAX_SIZE_FRAC;
1053
+ var ffmpegPath, ffprobePath, MODEL_PATH, cachedSession, SAMPLE_FRAMES, MODEL_WIDTH, MODEL_HEIGHT, MIN_FACE_CONFIDENCE, MIN_DETECTION_CONFIDENCE, REFINE_MIN_EDGE_DIFF, REFINE_RETRY_THRESHOLDS, REFINE_MIN_SIZE_FRAC, REFINE_MAX_SIZE_FRAC, MIN_WEBCAM_WIDTH_PX, MIN_WEBCAM_HEIGHT_PX, MAX_WEBCAM_ASPECT_RATIO;
1032
1054
  var init_faceDetection = __esm({
1033
1055
  "src/tools/ffmpeg/faceDetection.ts"() {
1034
1056
  "use strict";
@@ -1042,14 +1064,18 @@ var init_faceDetection = __esm({
1042
1064
  ffprobePath = getFFprobePath();
1043
1065
  MODEL_PATH = join(modelsDir(), "ultraface-320.onnx");
1044
1066
  cachedSession = null;
1045
- SAMPLE_FRAMES = 5;
1067
+ SAMPLE_FRAMES = 15;
1046
1068
  MODEL_WIDTH = 320;
1047
1069
  MODEL_HEIGHT = 240;
1048
1070
  MIN_FACE_CONFIDENCE = 0.5;
1049
1071
  MIN_DETECTION_CONFIDENCE = 0.3;
1050
1072
  REFINE_MIN_EDGE_DIFF = 3;
1073
+ REFINE_RETRY_THRESHOLDS = [2, 1];
1051
1074
  REFINE_MIN_SIZE_FRAC = 0.05;
1052
1075
  REFINE_MAX_SIZE_FRAC = 0.55;
1076
+ MIN_WEBCAM_WIDTH_PX = 300;
1077
+ MIN_WEBCAM_HEIGHT_PX = 200;
1078
+ MAX_WEBCAM_ASPECT_RATIO = 3;
1053
1079
  }
1054
1080
  });
1055
1081
 
@@ -1462,16 +1488,31 @@ async function transcribeAudio(audioPath) {
1462
1488
  const openai = new default4({ apiKey: config2.OPENAI_API_KEY });
1463
1489
  try {
1464
1490
  const prompt = getWhisperPrompt();
1465
- const response = await openai.audio.transcriptions.create({
1466
- model: "whisper-1",
1467
- file: openReadStream(audioPath),
1468
- response_format: "verbose_json",
1469
- timestamp_granularities: ["word", "segment"],
1470
- ...prompt && { prompt }
1471
- });
1491
+ let response;
1492
+ for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
1493
+ try {
1494
+ response = await openai.audio.transcriptions.create({
1495
+ model: "whisper-1",
1496
+ file: openReadStream(audioPath),
1497
+ response_format: "verbose_json",
1498
+ timestamp_granularities: ["word", "segment"],
1499
+ ...prompt && { prompt }
1500
+ });
1501
+ break;
1502
+ } catch (retryError) {
1503
+ const status = typeof retryError === "object" && retryError !== null && "status" in retryError ? retryError.status : void 0;
1504
+ if (status === 401 || status === 400 || status === 429) throw retryError;
1505
+ if (attempt === MAX_RETRIES) throw retryError;
1506
+ const msg = retryError instanceof Error ? retryError.message : String(retryError);
1507
+ logger_default.warn(`Whisper attempt ${attempt}/${MAX_RETRIES} failed: ${msg} \u2014 retrying in ${RETRY_DELAY_MS / 1e3}s`);
1508
+ await new Promise((resolve3) => setTimeout(resolve3, RETRY_DELAY_MS));
1509
+ }
1510
+ }
1511
+ if (!response) throw new Error("Whisper transcription failed after all retries");
1472
1512
  const verboseResponse = response;
1473
1513
  const rawSegments = verboseResponse.segments ?? [];
1474
1514
  const rawWords = verboseResponse.words ?? [];
1515
+ const typedResponse = response;
1475
1516
  const words = rawWords.map((w) => ({
1476
1517
  word: w.word,
1477
1518
  start: w.start,
@@ -1485,20 +1526,20 @@ async function transcribeAudio(audioPath) {
1485
1526
  words: rawWords.filter((w) => w.start >= s.start && w.end <= s.end).map((w) => ({ word: w.word, start: w.start, end: w.end }))
1486
1527
  }));
1487
1528
  logger_default.info(
1488
- `Transcription complete \u2014 ${segments.length} segments, ${words.length} words, language=${response.language}`
1529
+ `Transcription complete \u2014 ${segments.length} segments, ${words.length} words, language=${typedResponse.language}`
1489
1530
  );
1490
- const durationMinutes = (response.duration ?? 0) / 60;
1531
+ const durationMinutes = (typedResponse.duration ?? 0) / 60;
1491
1532
  costTracker.recordServiceUsage("whisper", durationMinutes * WHISPER_COST_PER_MINUTE, {
1492
1533
  model: "whisper-1",
1493
- durationSeconds: response.duration ?? 0,
1534
+ durationSeconds: typedResponse.duration ?? 0,
1494
1535
  audioFile: audioPath
1495
1536
  });
1496
1537
  return {
1497
- text: response.text,
1538
+ text: typedResponse.text,
1498
1539
  segments,
1499
1540
  words,
1500
- language: response.language ?? "unknown",
1501
- duration: response.duration ?? 0
1541
+ language: typedResponse.language ?? "unknown",
1542
+ duration: typedResponse.duration ?? 0
1502
1543
  };
1503
1544
  } catch (error) {
1504
1545
  const message = error instanceof Error ? error.message : String(error);
@@ -1513,7 +1554,7 @@ async function transcribeAudio(audioPath) {
1513
1554
  throw new Error(`Whisper transcription failed: ${message}`);
1514
1555
  }
1515
1556
  }
1516
- var MAX_FILE_SIZE_MB, WHISPER_COST_PER_MINUTE, WARN_FILE_SIZE_MB;
1557
+ var MAX_FILE_SIZE_MB, WHISPER_COST_PER_MINUTE, WARN_FILE_SIZE_MB, MAX_RETRIES, RETRY_DELAY_MS;
1517
1558
  var init_whisperClient = __esm({
1518
1559
  "src/tools/whisper/whisperClient.ts"() {
1519
1560
  "use strict";
@@ -1526,6 +1567,8 @@ var init_whisperClient = __esm({
1526
1567
  MAX_FILE_SIZE_MB = 25;
1527
1568
  WHISPER_COST_PER_MINUTE = 6e-3;
1528
1569
  WARN_FILE_SIZE_MB = 20;
1570
+ MAX_RETRIES = 3;
1571
+ RETRY_DELAY_MS = 5e3;
1529
1572
  }
1530
1573
  });
1531
1574
 
@@ -2989,6 +3032,8 @@ async function extractCompositeClipWithTransitions(videoPath, segments, outputPa
2989
3032
  "[aout]",
2990
3033
  "-c:v",
2991
3034
  "libx264",
3035
+ "-pix_fmt",
3036
+ "yuv420p",
2992
3037
  "-preset",
2993
3038
  "ultrafast",
2994
3039
  "-crf",
@@ -3087,23 +3132,24 @@ async function convertAspectRatio(inputPath, outputPath, targetRatio, options =
3087
3132
  });
3088
3133
  });
3089
3134
  }
3090
- async function convertWithSmartLayout(inputPath, outputPath, config2) {
3135
+ async function convertWithSmartLayout(inputPath, outputPath, config2, webcamOverride) {
3091
3136
  const { label, targetW, screenH, camH, fallbackRatio } = config2;
3092
3137
  const outputDir = dirname(outputPath);
3093
3138
  await ensureDirectory(outputDir);
3094
- const webcam = await detectWebcamRegion(inputPath);
3139
+ const webcam = webcamOverride !== void 0 ? webcamOverride : await detectWebcamRegion(inputPath);
3095
3140
  if (!webcam) {
3096
3141
  logger_default.info(`[${label}] No webcam found, falling back to center-crop`);
3097
3142
  return convertAspectRatio(inputPath, outputPath, fallbackRatio);
3098
3143
  }
3099
3144
  const resolution = await getVideoResolution(inputPath);
3145
+ const margin = Math.round(resolution.width * 0.02);
3100
3146
  let screenCropX;
3101
3147
  let screenCropW;
3102
3148
  if (webcam.position === "top-right" || webcam.position === "bottom-right") {
3103
3149
  screenCropX = 0;
3104
- screenCropW = webcam.x;
3150
+ screenCropW = Math.max(0, webcam.x - margin);
3105
3151
  } else {
3106
- screenCropX = webcam.x + webcam.width;
3152
+ screenCropX = webcam.x + webcam.width + margin;
3107
3153
  screenCropW = Math.max(0, resolution.width - screenCropX);
3108
3154
  }
3109
3155
  const targetAR = targetW / camH;
@@ -3162,32 +3208,32 @@ async function convertWithSmartLayout(inputPath, outputPath, config2) {
3162
3208
  });
3163
3209
  });
3164
3210
  }
3165
- async function convertToPortraitSmart(inputPath, outputPath) {
3211
+ async function convertToPortraitSmart(inputPath, outputPath, webcamOverride) {
3166
3212
  return convertWithSmartLayout(inputPath, outputPath, {
3167
3213
  label: "SmartPortrait",
3168
3214
  targetW: 1080,
3169
3215
  screenH: 1248,
3170
3216
  camH: 672,
3171
3217
  fallbackRatio: "9:16"
3172
- });
3218
+ }, webcamOverride);
3173
3219
  }
3174
- async function convertToSquareSmart(inputPath, outputPath) {
3220
+ async function convertToSquareSmart(inputPath, outputPath, webcamOverride) {
3175
3221
  return convertWithSmartLayout(inputPath, outputPath, {
3176
3222
  label: "SmartSquare",
3177
3223
  targetW: 1080,
3178
3224
  screenH: 700,
3179
3225
  camH: 380,
3180
3226
  fallbackRatio: "1:1"
3181
- });
3227
+ }, webcamOverride);
3182
3228
  }
3183
- async function convertToFeedSmart(inputPath, outputPath) {
3229
+ async function convertToFeedSmart(inputPath, outputPath, webcamOverride) {
3184
3230
  return convertWithSmartLayout(inputPath, outputPath, {
3185
3231
  label: "SmartFeed",
3186
3232
  targetW: 1080,
3187
3233
  screenH: 878,
3188
3234
  camH: 472,
3189
3235
  fallbackRatio: "4:5"
3190
- });
3236
+ }, webcamOverride);
3191
3237
  }
3192
3238
  async function generatePlatformVariants(inputPath, outputDir, slug, platforms = ["tiktok", "linkedin"], options = {}) {
3193
3239
  await ensureDirectory(outputDir);
@@ -3208,11 +3254,11 @@ async function generatePlatformVariants(inputPath, outputDir, slug, platforms =
3208
3254
  if (options.useAgent) {
3209
3255
  logger_default.warn(`[generatePlatformVariants] LayoutAgent is disabled, falling back to ONNX pipeline`);
3210
3256
  }
3211
- await convertToPortraitSmart(inputPath, outPath);
3257
+ await convertToPortraitSmart(inputPath, outPath, options.webcamOverride);
3212
3258
  } else if (ratio === "1:1") {
3213
- await convertToSquareSmart(inputPath, outPath);
3259
+ await convertToSquareSmart(inputPath, outPath, options.webcamOverride);
3214
3260
  } else if (ratio === "4:5") {
3215
- await convertToFeedSmart(inputPath, outPath);
3261
+ await convertToFeedSmart(inputPath, outPath, options.webcamOverride);
3216
3262
  } else {
3217
3263
  await convertAspectRatio(inputPath, outPath, ratio);
3218
3264
  }
@@ -3276,7 +3322,7 @@ var ShortsAgent_exports = {};
3276
3322
  __export(ShortsAgent_exports, {
3277
3323
  generateShorts: () => generateShorts
3278
3324
  });
3279
- async function generateShorts(video, transcript, model, clipDirection) {
3325
+ async function generateShorts(video, transcript, model, clipDirection, webcamOverride) {
3280
3326
  const agent = new ShortsAgent(model);
3281
3327
  const transcriptLines = transcript.segments.map((seg) => {
3282
3328
  const words = seg.words.map((w) => `[${w.start.toFixed(2)}-${w.end.toFixed(2)}] ${w.word}`).join(" ");
@@ -3287,7 +3333,8 @@ Words: ${words}`;
3287
3333
  `Analyze the following transcript (${transcript.duration.toFixed(0)}s total) and plan shorts.
3288
3334
  `,
3289
3335
  `Video: ${video.filename}`,
3290
- `Duration: ${transcript.duration.toFixed(1)}s
3336
+ `Duration: ${transcript.duration.toFixed(1)}s`,
3337
+ `Target: ~${Math.max(3, Math.round(transcript.duration / 150))}\u2013${Math.max(5, Math.round(transcript.duration / 120))} shorts (scale by content richness)
3291
3338
  `,
3292
3339
  "--- TRANSCRIPT ---\n",
3293
3340
  transcriptLines.join("\n\n"),
@@ -3329,7 +3376,7 @@ Words: ${words}`;
3329
3376
  let variants;
3330
3377
  try {
3331
3378
  const defaultPlatforms = ["tiktok", "youtube-shorts", "instagram-reels", "instagram-feed", "linkedin"];
3332
- const results = await generatePlatformVariants(outputPath, shortsDir, shortSlug, defaultPlatforms);
3379
+ const results = await generatePlatformVariants(outputPath, shortsDir, shortSlug, defaultPlatforms, { webcamOverride });
3333
3380
  if (results.length > 0) {
3334
3381
  variants = results.map((v) => ({
3335
3382
  path: v.path,
@@ -3428,7 +3475,7 @@ Words: ${words}`;
3428
3475
  await agent.destroy();
3429
3476
  }
3430
3477
  }
3431
- var SYSTEM_PROMPT2, PLAN_SHORTS_SCHEMA, ShortsAgent;
3478
+ var SYSTEM_PROMPT2, ADD_SHORTS_SCHEMA, ShortsAgent;
3432
3479
  var init_ShortsAgent = __esm({
3433
3480
  "src/agents/ShortsAgent.ts"() {
3434
3481
  "use strict";
@@ -3442,7 +3489,23 @@ var init_ShortsAgent = __esm({
3442
3489
  init_fileSystem();
3443
3490
  init_paths();
3444
3491
  init_logger2();
3445
- SYSTEM_PROMPT2 = `You are a short-form video content strategist. Your job is to analyze a video transcript with word-level timestamps and identify the most compelling moments to extract as shorts (15\u201360 seconds each).
3492
+ SYSTEM_PROMPT2 = `You are a short-form video content strategist. Your job is to **exhaustively** analyze a video transcript with word-level timestamps and extract every compelling moment as a short (15\u201360 seconds each).
3493
+
3494
+ ## Your workflow
3495
+ 1. Read the transcript and note the total duration.
3496
+ 2. Work through the transcript **section by section** (roughly 3\u20135 minute chunks). For each chunk, identify every possible short.
3497
+ 3. Call **add_shorts** for each batch of shorts you find. You can call it as many times as needed.
3498
+ 4. After your first pass, call **review_shorts** to see everything you've planned so far.
3499
+ 5. Review for gaps: are there sections of the transcript with no shorts? Could any moments be combined into composites? Did you miss any humor, insights, or quotable moments?
3500
+ 6. Add any additional shorts you find.
3501
+ 7. When you are confident you've exhausted all opportunities, call **finalize_shorts**.
3502
+
3503
+ ## Target quantity
3504
+ Scale your output by video duration:
3505
+ - **~1 short per 2\u20133 minutes** of video content.
3506
+ - A 10-minute video \u2192 4\u20136 shorts. A 30-minute video \u2192 12\u201318 shorts. A 60-minute video \u2192 20\u201330 shorts.
3507
+ - These are guidelines, not hard caps \u2014 if the content is rich, find more. If it's sparse, find fewer.
3508
+ - **Never stop at 3\u20138 shorts for a long video.** Your job is to be thorough.
3446
3509
 
3447
3510
  ## What to look for
3448
3511
  - **Key insights** \u2014 concise, quotable takeaways
@@ -3450,34 +3513,34 @@ var init_ShortsAgent = __esm({
3450
3513
  - **Controversial takes** \u2014 bold opinions that spark discussion
3451
3514
  - **Educational nuggets** \u2014 clear explanations of complex topics
3452
3515
  - **Emotional peaks** \u2014 passion, vulnerability, excitement
3453
- - **Topic compilations** \u2014 multiple brief mentions of one theme that can be stitched together
3516
+ - **Audience hooks** \u2014 moments that would make someone stop scrolling
3517
+ - **Before/after reveals** \u2014 showing a transformation or result
3518
+ - **Mistakes & corrections** \u2014 relatable "oops" moments that humanize the speaker
3454
3519
 
3455
3520
  ## Short types
3456
3521
  - **Single segment** \u2014 one contiguous section of the video
3457
- - **Composite** \u2014 multiple non-contiguous segments combined into one short (great for topic compilations or building a narrative arc)
3522
+ - **Composite** \u2014 multiple non-contiguous segments combined into one short (great for topic compilations, building narrative arcs, or "every time X happens" montages). **Actively look for composite opportunities** \u2014 they often make the best shorts.
3458
3523
 
3459
3524
  ## Rules
3460
3525
  1. Each short must be 15\u201360 seconds total duration.
3461
3526
  2. Timestamps must align to word boundaries from the transcript.
3462
3527
  3. Prefer natural sentence boundaries for clean cuts.
3463
- 4. Aim for 3\u20138 shorts per video, depending on length and richness.
3464
- 5. Every short needs a catchy, descriptive title (5\u201310 words).
3465
- 6. Tags should be lowercase, no hashes, 3\u20136 per short.
3466
- 7. A 1-second buffer is automatically added before and after each segment boundary during extraction, so plan segments based on content timestamps without worrying about clipping words at the edges.
3467
-
3468
- When you have identified the shorts, call the **plan_shorts** tool with your complete plan.
3528
+ 4. Every short needs a catchy, descriptive title (5\u201310 words).
3529
+ 5. Tags should be lowercase, no hashes, 3\u20136 per short.
3530
+ 6. A 1-second buffer is automatically added before and after each segment boundary during extraction, so plan segments based on content timestamps without worrying about clipping words at the edges.
3531
+ 7. Avoid significant timestamp overlap between shorts \u2014 each short should bring unique content. Small overlaps (a few seconds of shared context) are OK.
3469
3532
 
3470
3533
  ## Using Clip Direction
3471
3534
  You may receive AI-generated clip direction with suggested shorts. Use these as a starting point but make your own decisions:
3472
3535
  - The suggestions are based on visual + audio analysis and may identify moments you'd miss from transcript alone
3473
3536
  - Feel free to adjust timestamps, combine suggestions, or ignore ones that don't work
3474
3537
  - You may also find good shorts NOT in the suggestions \u2014 always analyze the full transcript`;
3475
- PLAN_SHORTS_SCHEMA = {
3538
+ ADD_SHORTS_SCHEMA = {
3476
3539
  type: "object",
3477
3540
  properties: {
3478
3541
  shorts: {
3479
3542
  type: "array",
3480
- description: "Array of planned short clips",
3543
+ description: "Array of short clips to add to the plan",
3481
3544
  items: {
3482
3545
  type: "object",
3483
3546
  properties: {
@@ -3510,32 +3573,77 @@ You may receive AI-generated clip direction with suggested shorts. Use these as
3510
3573
  };
3511
3574
  ShortsAgent = class extends BaseAgent {
3512
3575
  plannedShorts = [];
3576
+ isFinalized = false;
3513
3577
  constructor(model) {
3514
3578
  super("ShortsAgent", SYSTEM_PROMPT2, void 0, model);
3515
3579
  }
3516
3580
  getTools() {
3517
3581
  return [
3518
3582
  {
3519
- name: "plan_shorts",
3520
- description: "Submit the planned shorts as a structured JSON array. Call this once with all planned shorts.",
3521
- parameters: PLAN_SHORTS_SCHEMA,
3583
+ name: "add_shorts",
3584
+ description: "Add one or more shorts to your plan. You can call this multiple times to build your list incrementally as you analyze each section of the transcript.",
3585
+ parameters: ADD_SHORTS_SCHEMA,
3522
3586
  handler: async (args) => {
3523
- return this.handleToolCall("plan_shorts", args);
3587
+ return this.handleToolCall("add_shorts", args);
3588
+ }
3589
+ },
3590
+ {
3591
+ name: "review_shorts",
3592
+ description: "Review all shorts planned so far. Returns a summary of every short in your current plan. Use this to check for gaps, overlaps, or missed opportunities before finalizing.",
3593
+ parameters: { type: "object", properties: {} },
3594
+ handler: async () => {
3595
+ return this.handleToolCall("review_shorts", {});
3596
+ }
3597
+ },
3598
+ {
3599
+ name: "finalize_shorts",
3600
+ description: "Finalize your short clip plan and trigger extraction. Call this ONCE after you have added all shorts and reviewed them for completeness.",
3601
+ parameters: { type: "object", properties: {} },
3602
+ handler: async () => {
3603
+ return this.handleToolCall("finalize_shorts", {});
3524
3604
  }
3525
3605
  }
3526
3606
  ];
3527
3607
  }
3528
3608
  async handleToolCall(toolName, args) {
3529
- if (toolName === "plan_shorts") {
3530
- this.plannedShorts = args.shorts;
3531
- logger_default.info(`[ShortsAgent] Planned ${this.plannedShorts.length} shorts`);
3532
- return { success: true, count: this.plannedShorts.length };
3609
+ switch (toolName) {
3610
+ case "add_shorts": {
3611
+ const newShorts = args.shorts;
3612
+ this.plannedShorts.push(...newShorts);
3613
+ logger_default.info(`[ShortsAgent] Added ${newShorts.length} shorts (total: ${this.plannedShorts.length})`);
3614
+ return `Added ${newShorts.length} shorts. Total planned: ${this.plannedShorts.length}. Call add_shorts for more, review_shorts to check your plan, or finalize_shorts when done.`;
3615
+ }
3616
+ case "review_shorts": {
3617
+ if (this.plannedShorts.length === 0) {
3618
+ return "No shorts planned yet. Analyze the transcript and call add_shorts to start planning.";
3619
+ }
3620
+ const summary = this.plannedShorts.map((s, i) => {
3621
+ const totalDur = s.segments.reduce((sum, seg) => sum + (seg.end - seg.start), 0);
3622
+ const timeRanges = s.segments.map((seg) => `${seg.start.toFixed(1)}s\u2013${seg.end.toFixed(1)}s`).join(", ");
3623
+ const type = s.segments.length > 1 ? "composite" : "single";
3624
+ return `${i + 1}. "${s.title}" (${totalDur.toFixed(1)}s, ${type}) [${timeRanges}] \u2014 ${s.description}`;
3625
+ }).join("\n");
3626
+ return `## Planned shorts (${this.plannedShorts.length} total)
3627
+
3628
+ ${summary}
3629
+
3630
+ Look for gaps in transcript coverage, missed composite opportunities, and any additional compelling moments.`;
3631
+ }
3632
+ case "finalize_shorts": {
3633
+ this.isFinalized = true;
3634
+ logger_default.info(`[ShortsAgent] Finalized ${this.plannedShorts.length} shorts`);
3635
+ return `Finalized ${this.plannedShorts.length} shorts. Extraction will begin.`;
3636
+ }
3637
+ default:
3638
+ throw new Error(`Unknown tool: ${toolName}`);
3533
3639
  }
3534
- throw new Error(`Unknown tool: ${toolName}`);
3535
3640
  }
3536
3641
  getPlannedShorts() {
3537
3642
  return this.plannedShorts;
3538
3643
  }
3644
+ getIsFinalized() {
3645
+ return this.isFinalized;
3646
+ }
3539
3647
  };
3540
3648
  }
3541
3649
  });
@@ -3556,7 +3664,8 @@ Words: ${words}`;
3556
3664
  `Analyze the following transcript (${transcript.duration.toFixed(0)}s total) and plan medium-length clips (1\u20133 minutes each).
3557
3665
  `,
3558
3666
  `Video: ${video.filename}`,
3559
- `Duration: ${transcript.duration.toFixed(1)}s
3667
+ `Duration: ${transcript.duration.toFixed(1)}s`,
3668
+ `Target: ~${Math.max(1, Math.round(transcript.duration / 480))}\u2013${Math.max(2, Math.round(transcript.duration / 300))} medium clips (scale by content richness)
3560
3669
  `,
3561
3670
  "--- TRANSCRIPT ---\n",
3562
3671
  transcriptLines.join("\n\n"),
@@ -3649,7 +3758,7 @@ Words: ${words}`;
3649
3758
  await agent.destroy();
3650
3759
  }
3651
3760
  }
3652
- var SYSTEM_PROMPT3, PLAN_MEDIUM_CLIPS_SCHEMA, MediumVideoAgent;
3761
+ var SYSTEM_PROMPT3, ADD_MEDIUM_CLIPS_SCHEMA, MediumVideoAgent;
3653
3762
  var init_MediumVideoAgent = __esm({
3654
3763
  "src/agents/MediumVideoAgent.ts"() {
3655
3764
  "use strict";
@@ -3662,7 +3771,23 @@ var init_MediumVideoAgent = __esm({
3662
3771
  init_fileSystem();
3663
3772
  init_paths();
3664
3773
  init_logger2();
3665
- SYSTEM_PROMPT3 = `You are a medium-form video content strategist. Your job is to analyze a video transcript with word-level timestamps and identify the best 1\u20133 minute segments to extract as standalone medium-form clips.
3774
+ SYSTEM_PROMPT3 = `You are a medium-form video content strategist. Your job is to **exhaustively** analyze a video transcript with word-level timestamps and extract every viable 1\u20133 minute segment as a standalone medium-form clip.
3775
+
3776
+ ## Your workflow
3777
+ 1. Read the transcript and note the total duration.
3778
+ 2. Work through the transcript **section by section** (roughly 5\u20138 minute chunks). For each chunk, identify every complete topic or narrative arc.
3779
+ 3. Call **add_medium_clips** for each batch of clips you find. You can call it as many times as needed.
3780
+ 4. After your first pass, call **review_medium_clips** to see everything you've planned so far.
3781
+ 5. Review for gaps: are there complete topics you missed? Could non-contiguous mentions of the same theme be compiled? Is there a tutorial segment that stands alone?
3782
+ 6. Add any additional clips you find.
3783
+ 7. When you are confident you've exhausted all opportunities, call **finalize_medium_clips**.
3784
+
3785
+ ## Target quantity
3786
+ Scale your output by video duration:
3787
+ - **~1 medium clip per 5\u20138 minutes** of video content.
3788
+ - A 10-minute video \u2192 1\u20132 clips. A 30-minute video \u2192 4\u20136 clips. A 60-minute video \u2192 8\u201312 clips.
3789
+ - These are guidelines, not hard caps \u2014 if the content is rich, find more.
3790
+ - **Never stop at 2\u20134 clips for a long video.** Your job is to be thorough.
3666
3791
 
3667
3792
  ## What to look for
3668
3793
 
@@ -3671,7 +3796,7 @@ var init_MediumVideoAgent = __esm({
3671
3796
  - **Educational deep dives** \u2014 clear, thorough explanations of complex topics
3672
3797
  - **Compelling stories** \u2014 anecdotes with setup, tension, and resolution
3673
3798
  - **Strong arguments** \u2014 claim \u2192 evidence \u2192 implication sequences
3674
- - **Topic compilations** \u2014 multiple brief mentions of one theme across the video that can be compiled into a cohesive 1\u20133 minute segment
3799
+ - **Topic compilations** \u2014 multiple brief mentions of one theme across the video that can be compiled into a cohesive 1\u20133 minute segment. **Actively look for these** \u2014 they often make excellent content.
3675
3800
 
3676
3801
  ## Clip types
3677
3802
 
@@ -3684,12 +3809,12 @@ var init_MediumVideoAgent = __esm({
3684
3809
  2. Timestamps must align to word boundaries from the transcript.
3685
3810
  3. Prefer natural sentence and paragraph boundaries for clean entry/exit points.
3686
3811
  4. Each clip must be self-contained \u2014 a viewer with no other context should understand and get value from the clip.
3687
- 5. Aim for 2\u20134 medium clips per video, depending on length and richness.
3688
- 6. Every clip needs a descriptive title (5\u201312 words) and a topic label.
3689
- 7. For compilations, specify segments in the order they should appear in the final clip (which may differ from chronological order).
3690
- 8. Tags should be lowercase, no hashes, 3\u20136 per clip.
3691
- 9. A 1-second buffer is automatically added around each segment boundary.
3692
- 10. Each clip needs a hook \u2014 the opening line or concept that draws viewers in.
3812
+ 5. Every clip needs a descriptive title (5\u201312 words) and a topic label.
3813
+ 6. For compilations, specify segments in the order they should appear in the final clip (which may differ from chronological order).
3814
+ 7. Tags should be lowercase, no hashes, 3\u20136 per clip.
3815
+ 8. A 1-second buffer is automatically added around each segment boundary.
3816
+ 9. Each clip needs a hook \u2014 the opening line or concept that draws viewers in.
3817
+ 10. Avoid significant overlap with content that would work better as a short (punchy, viral, single-moment).
3693
3818
 
3694
3819
  ## Differences from shorts
3695
3820
 
@@ -3697,9 +3822,6 @@ var init_MediumVideoAgent = __esm({
3697
3822
  - Don't just find the most exciting 60 seconds \u2014 find where a topic starts and where it naturally concludes.
3698
3823
  - It's OK if a medium clip has slower pacing \u2014 depth and coherence matter more than constant high energy.
3699
3824
  - Look for segments that work as standalone mini-tutorials or explanations.
3700
- - Avoid overlap with content that would work better as a short (punchy, viral, single-moment).
3701
-
3702
- When you have identified the clips, call the **plan_medium_clips** tool with your complete plan.
3703
3825
 
3704
3826
  ## Using Clip Direction
3705
3827
  You may receive AI-generated clip direction with suggested medium clips. Use these as a starting point but make your own decisions:
@@ -3707,12 +3829,12 @@ You may receive AI-generated clip direction with suggested medium clips. Use the
3707
3829
  - Feel free to adjust timestamps, combine suggestions, or ignore ones that don't work
3708
3830
  - You may also find good clips NOT in the suggestions \u2014 always analyze the full transcript
3709
3831
  - Pay special attention to suggested hooks and topic arcs \u2014 they come from multimodal analysis`;
3710
- PLAN_MEDIUM_CLIPS_SCHEMA = {
3832
+ ADD_MEDIUM_CLIPS_SCHEMA = {
3711
3833
  type: "object",
3712
3834
  properties: {
3713
3835
  clips: {
3714
3836
  type: "array",
3715
- description: "Array of planned medium-length clips",
3837
+ description: "Array of medium-length clips to add to the plan",
3716
3838
  items: {
3717
3839
  type: "object",
3718
3840
  properties: {
@@ -3748,32 +3870,79 @@ You may receive AI-generated clip direction with suggested medium clips. Use the
3748
3870
  };
3749
3871
  MediumVideoAgent = class extends BaseAgent {
3750
3872
  plannedClips = [];
3873
+ isFinalized = false;
3751
3874
  constructor(model) {
3752
3875
  super("MediumVideoAgent", SYSTEM_PROMPT3, void 0, model);
3753
3876
  }
3754
3877
  getTools() {
3755
3878
  return [
3756
3879
  {
3757
- name: "plan_medium_clips",
3758
- description: "Submit the planned medium-length clips as a structured JSON array. Call this once with all planned clips.",
3759
- parameters: PLAN_MEDIUM_CLIPS_SCHEMA,
3880
+ name: "add_medium_clips",
3881
+ description: "Add one or more medium clips to your plan. You can call this multiple times to build your list incrementally as you analyze each section of the transcript.",
3882
+ parameters: ADD_MEDIUM_CLIPS_SCHEMA,
3760
3883
  handler: async (args) => {
3761
- return this.handleToolCall("plan_medium_clips", args);
3884
+ return this.handleToolCall("add_medium_clips", args);
3885
+ }
3886
+ },
3887
+ {
3888
+ name: "review_medium_clips",
3889
+ description: "Review all medium clips planned so far. Returns a summary of every clip in your current plan. Use this to check for gaps, overlaps, or missed opportunities before finalizing.",
3890
+ parameters: { type: "object", properties: {} },
3891
+ handler: async () => {
3892
+ return this.handleToolCall("review_medium_clips", {});
3893
+ }
3894
+ },
3895
+ {
3896
+ name: "finalize_medium_clips",
3897
+ description: "Finalize your medium clip plan and trigger extraction. Call this ONCE after you have added all clips and reviewed them for completeness.",
3898
+ parameters: { type: "object", properties: {} },
3899
+ handler: async () => {
3900
+ return this.handleToolCall("finalize_medium_clips", {});
3762
3901
  }
3763
3902
  }
3764
3903
  ];
3765
3904
  }
3766
3905
  async handleToolCall(toolName, args) {
3767
- if (toolName === "plan_medium_clips") {
3768
- this.plannedClips = args.clips;
3769
- logger_default.info(`[MediumVideoAgent] Planned ${this.plannedClips.length} medium clips`);
3770
- return { success: true, count: this.plannedClips.length };
3906
+ switch (toolName) {
3907
+ case "add_medium_clips": {
3908
+ const newClips = args.clips;
3909
+ this.plannedClips.push(...newClips);
3910
+ logger_default.info(`[MediumVideoAgent] Added ${newClips.length} clips (total: ${this.plannedClips.length})`);
3911
+ return `Added ${newClips.length} clips. Total planned: ${this.plannedClips.length}. Call add_medium_clips for more, review_medium_clips to check your plan, or finalize_medium_clips when done.`;
3912
+ }
3913
+ case "review_medium_clips": {
3914
+ if (this.plannedClips.length === 0) {
3915
+ return "No medium clips planned yet. Analyze the transcript and call add_medium_clips to start planning.";
3916
+ }
3917
+ const summary = this.plannedClips.map((c, i) => {
3918
+ const totalDur = c.segments.reduce((sum, seg) => sum + (seg.end - seg.start), 0);
3919
+ const timeRanges = c.segments.map((seg) => `${seg.start.toFixed(1)}s\u2013${seg.end.toFixed(1)}s`).join(", ");
3920
+ const type = c.segments.length > 1 ? "compilation" : "deep dive";
3921
+ return `${i + 1}. "${c.title}" (${totalDur.toFixed(1)}s, ${type}) [${timeRanges}]
3922
+ Topic: ${c.topic} | Hook: ${c.hook}
3923
+ ${c.description}`;
3924
+ }).join("\n");
3925
+ return `## Planned medium clips (${this.plannedClips.length} total)
3926
+
3927
+ ${summary}
3928
+
3929
+ Look for gaps in transcript coverage, missed compilation opportunities, and complete topic arcs you may have overlooked.`;
3930
+ }
3931
+ case "finalize_medium_clips": {
3932
+ this.isFinalized = true;
3933
+ logger_default.info(`[MediumVideoAgent] Finalized ${this.plannedClips.length} medium clips`);
3934
+ return `Finalized ${this.plannedClips.length} medium clips. Extraction will begin.`;
3935
+ }
3936
+ default:
3937
+ throw new Error(`Unknown tool: ${toolName}`);
3771
3938
  }
3772
- throw new Error(`Unknown tool: ${toolName}`);
3773
3939
  }
3774
3940
  getPlannedClips() {
3775
3941
  return this.plannedClips;
3776
3942
  }
3943
+ getIsFinalized() {
3944
+ return this.isFinalized;
3945
+ }
3777
3946
  };
3778
3947
  }
3779
3948
  });
@@ -4307,57 +4476,70 @@ var ProducerAgent_exports = {};
4307
4476
  __export(ProducerAgent_exports, {
4308
4477
  ProducerAgent: () => ProducerAgent
4309
4478
  });
4310
- var SYSTEM_PROMPT4, PLAN_CUTS_SCHEMA, ProducerAgent;
4479
+ function mergeRemovals(removals) {
4480
+ if (removals.length <= 1) return removals;
4481
+ const sorted = [...removals].sort((a, b) => a.start - b.start);
4482
+ const merged = [{ ...sorted[0] }];
4483
+ for (let i = 1; i < sorted.length; i++) {
4484
+ const prev = merged[merged.length - 1];
4485
+ const curr = sorted[i];
4486
+ if (curr.start <= prev.end + 2) {
4487
+ prev.end = Math.max(prev.end, curr.end);
4488
+ prev.reason = `${prev.reason}; ${curr.reason}`;
4489
+ } else {
4490
+ merged.push({ ...curr });
4491
+ }
4492
+ }
4493
+ return merged;
4494
+ }
4495
+ var SYSTEM_PROMPT4, ADD_CUTS_SCHEMA, ProducerAgent;
4311
4496
  var init_ProducerAgent = __esm({
4312
4497
  "src/agents/ProducerAgent.ts"() {
4313
4498
  "use strict";
4314
4499
  init_BaseAgent();
4315
4500
  init_singlePassEdit();
4316
4501
  init_logger2();
4317
- SYSTEM_PROMPT4 = `You are a professional video cleaner. Your job is to analyze videos and identify regions that should be removed for a tighter, cleaner edit.
4318
-
4319
- ## CONTEXT TOOLS (use these first to understand the video)
4320
- - **get_video_info**: Get video dimensions, duration, and frame rate
4321
- - **get_transcript**: Read what's being said (with optional time range filtering)
4322
- - **get_editorial_direction**: Get AI-generated editorial guidance (cut points, pacing notes) from Gemini video analysis. Use this to inform your cleaning decisions.
4323
-
4324
- ## WHAT TO REMOVE
4325
- - **Dead air**: Long silences with no meaningful content
4326
- - **Filler words**: Excessive "um", "uh", "like", "you know" clusters
4327
- - **Bad takes**: False starts, stumbles, repeated sentences where the speaker restarts
4328
- - **Long pauses**: Extended gaps between sentences (>3 seconds) that don't serve a purpose
4329
- - **Redundant content**: Sections where the same point is repeated without adding value
4330
-
4331
- ## WHAT TO PRESERVE
4332
- - **Intentional pauses**: Dramatic pauses, thinking pauses before important points
4333
- - **Demonstrations**: Silence during live coding, UI interaction, or waiting for results
4334
- - **Meaningful silence**: Pauses that give the viewer time to absorb information
4335
- - **All substantive content**: When in doubt, keep it
4336
-
4337
- ## WORKFLOW
4338
-
4339
- 1. Call get_video_info to know the video duration
4340
- 2. Call get_editorial_direction to get AI-powered editorial guidance (cut points, pacing issues)
4341
- 3. Call get_transcript (in sections if long) to understand what's being said and find removable regions
4342
- 4. When ready, call **plan_cuts** with your list of regions to remove
4343
-
4344
- ## GUIDELINES
4345
- - Be conservative: aim for 10-20% removal at most
4346
- - Each removal should have a clear reason
4347
- - Don't remove short pauses (<1 second) \u2014 they sound natural
4348
- - Focus on making the video tighter, not shorter for its own sake
4349
- - Use editorial direction from Gemini to identify problematic regions`;
4350
- PLAN_CUTS_SCHEMA = {
4502
+ SYSTEM_PROMPT4 = `You are a professional video editor preparing raw footage for visual enhancement. Your goal is to produce a clean, tight edit that's ready for graphics overlays, captions, and social media distribution.
4503
+
4504
+ ## INFORMATION HIERARCHY
4505
+
4506
+ You have three sources of information:
4507
+ 1. **Editorial direction** (from Gemini video AI) \u2014 provides editorial judgment: what to cut, pacing issues, hook advice. It watched the actual video and can see visual cues the transcript cannot.
4508
+ 2. **Transcript** \u2014 the ground truth for **what was said and when**. Timestamps in the transcript are accurate. Use it to verify that editorial direction timestamps actually match the spoken content.
4509
+ 3. **Your own judgment** \u2014 use this to resolve conflicts and make final decisions.
4510
+
4511
+ ## CONFLICT RESOLUTION
4512
+
4513
+ - **Timestamps**: The transcript's timestamps are authoritative. Gemini's timestamps can drift. Always cross-reference the editorial direction's timestamps against the transcript before cutting. If Gemini says "cut 85-108 because it's dead air" but the transcript shows substantive speech at 92-105, trust the transcript.
4514
+ - **Pacing vs Cleaning**: If the Pacing Analysis recommends removing an entire range but Cleaning Recommendations only flags pieces, favor pacing \u2014 it reflects the broader viewing experience.
4515
+ - **Hook & Retention**: If this section recommends starting at a later point, that overrides granular cleaning cuts in the opening.
4516
+ - **Valuable content**: Never cut substantive content that the viewer needs to understand the video's message. Filler and dead air around valuable content should be trimmed, but the content itself must be preserved.
4517
+
4518
+ ## WHAT YOU'RE OPTIMIZING FOR
4519
+
4520
+ The video you produce will be further processed by a graphics agent that adds AI-generated image overlays, then captioned, then cut into shorts and medium clips. Your edit needs to:
4521
+ - Start with the strongest content \u2014 no dead air, no "I'm going to make a quick video" preambles
4522
+ - Flow naturally so captions and overlays land on clean, well-paced segments
4523
+ - Remove anything that isn't for the viewer (meta-commentary, editor instructions, false starts)
4524
+
4525
+ ## TOOLS
4526
+
4527
+ - **get_video_info** \u2014 video duration, dimensions, frame rate
4528
+ - **get_editorial_direction** \u2014 Gemini's full editorial report (cut points, pacing, hook advice, cleaning recommendations)
4529
+ - **get_transcript** \u2014 timestamped transcript (supports start/end filtering)
4530
+ - **add_cuts** \u2014 queue regions for removal (call as many times as needed, use decimal-second precision)
4531
+ - **finalize_cuts** \u2014 merge adjacent cuts and trigger the render (call once at the end)`;
4532
+ ADD_CUTS_SCHEMA = {
4351
4533
  type: "object",
4352
4534
  properties: {
4353
4535
  removals: {
4354
4536
  type: "array",
4355
- description: "Array of regions to remove from the video",
4537
+ description: "One or more regions to remove from the video",
4356
4538
  items: {
4357
4539
  type: "object",
4358
4540
  properties: {
4359
- start: { type: "number", description: "Start time in seconds" },
4360
- end: { type: "number", description: "End time in seconds" },
4541
+ start: { type: "number", description: "Start time in seconds (decimal precision, e.g. 14.3)" },
4542
+ end: { type: "number", description: "End time in seconds (decimal precision, e.g. 37.0)" },
4361
4543
  reason: { type: "string", description: "Why this region should be removed" }
4362
4544
  },
4363
4545
  required: ["start", "end", "reason"]
@@ -4370,6 +4552,8 @@ var init_ProducerAgent = __esm({
4370
4552
  video;
4371
4553
  videoDuration = 0;
4372
4554
  removals = [];
4555
+ renderPromise = null;
4556
+ outputPath = "";
4373
4557
  constructor(video, model) {
4374
4558
  super("ProducerAgent", SYSTEM_PROMPT4, void 0, model);
4375
4559
  this.video = video;
@@ -4401,10 +4585,16 @@ var init_ProducerAgent = __esm({
4401
4585
  handler: async () => this.handleToolCall("get_editorial_direction", {})
4402
4586
  },
4403
4587
  {
4404
- name: "plan_cuts",
4405
- description: "Submit your list of regions to remove from the video. Call this ONCE with ALL planned removals.",
4406
- parameters: PLAN_CUTS_SCHEMA,
4407
- handler: async (rawArgs) => this.handleToolCall("plan_cuts", rawArgs)
4588
+ name: "add_cuts",
4589
+ description: "Add one or more regions to remove from the video. You can call this multiple times to build your edit list incrementally as you analyze each section.",
4590
+ parameters: ADD_CUTS_SCHEMA,
4591
+ handler: async (rawArgs) => this.handleToolCall("add_cuts", rawArgs)
4592
+ },
4593
+ {
4594
+ name: "finalize_cuts",
4595
+ description: "Finalize your edit list and trigger video rendering. Call this ONCE after you have added all cuts with add_cuts. Adjacent/overlapping cuts will be merged automatically.",
4596
+ parameters: { type: "object", properties: {} },
4597
+ handler: async () => this.handleToolCall("finalize_cuts", {})
4408
4598
  }
4409
4599
  ];
4410
4600
  }
@@ -4456,11 +4646,33 @@ var init_ProducerAgent = __esm({
4456
4646
  editorialDirection: direction
4457
4647
  };
4458
4648
  }
4459
- case "plan_cuts": {
4649
+ case "add_cuts": {
4460
4650
  const { removals } = args;
4461
- logger_default.info(`[ProducerAgent] Received plan with ${removals.length} removals`);
4462
- this.removals = removals;
4463
- return `Plan received with ${removals.length} removals. Video will be rendered automatically.`;
4651
+ this.removals.push(...removals);
4652
+ logger_default.info(`[ProducerAgent] Added ${removals.length} cuts (total: ${this.removals.length})`);
4653
+ return `Added ${removals.length} cuts. Total queued: ${this.removals.length}. Call add_cuts again for more, or finalize_cuts when done.`;
4654
+ }
4655
+ case "finalize_cuts": {
4656
+ this.removals = mergeRemovals(this.removals);
4657
+ logger_default.info(`[ProducerAgent] Finalized ${this.removals.length} cuts (after merging), starting render`);
4658
+ const sortedRemovals = [...this.removals].sort((a, b) => a.start - b.start);
4659
+ const keepSegments = [];
4660
+ let cursor = 0;
4661
+ for (const removal of sortedRemovals) {
4662
+ if (removal.start > cursor) {
4663
+ keepSegments.push({ start: cursor, end: removal.start });
4664
+ }
4665
+ cursor = Math.max(cursor, removal.end);
4666
+ }
4667
+ if (cursor < this.videoDuration) {
4668
+ keepSegments.push({ start: cursor, end: this.videoDuration });
4669
+ }
4670
+ const totalRemoval = this.removals.reduce((sum, r) => sum + (r.end - r.start), 0);
4671
+ logger_default.info(
4672
+ `[ProducerAgent] ${this.removals.length} removals \u2192 ${keepSegments.length} keep segments, removing ${totalRemoval.toFixed(1)}s`
4673
+ );
4674
+ this.renderPromise = singlePassEdit(this.video.videoPath, keepSegments, this.outputPath);
4675
+ return `Rendering started with ${this.removals.length} cuts. The video is being processed in the background.`;
4464
4676
  }
4465
4677
  default:
4466
4678
  throw new Error(`Unknown tool: ${toolName}`);
@@ -4473,73 +4685,47 @@ var init_ProducerAgent = __esm({
4473
4685
  */
4474
4686
  async produce(outputPath) {
4475
4687
  this.removals = [];
4476
- const prompt = `Analyze this video and decide which segments should be removed for a cleaner edit.
4688
+ this.renderPromise = null;
4689
+ this.outputPath = outputPath;
4690
+ const prompt = `Clean this video by removing unwanted segments.
4477
4691
 
4478
4692
  **Video:** ${this.video.videoPath}
4479
4693
 
4480
- ## Instructions
4481
-
4482
- 1. Call get_video_info to know the video duration.
4483
- 2. Call get_editorial_direction to get AI-powered editorial guidance (cut points, pacing issues).
4484
- 3. Call get_transcript to understand what's being said and identify removable regions.
4485
- 4. Call **plan_cuts** with your list of regions to remove.
4486
-
4487
- Focus on removing dead air, filler words, bad takes, and redundant content. Be conservative \u2014 aim for 10-20% removal at most.`;
4694
+ Get the video info, editorial direction, and transcript. Analyze them together, then add your cuts and finalize.`;
4488
4695
  try {
4489
4696
  const response = await this.run(prompt);
4490
- logger_default.info(`[ProducerAgent] Agent planning complete for ${this.video.videoPath}`);
4491
- if (this.removals.length === 0) {
4492
- logger_default.info(`[ProducerAgent] No removals planned \u2014 video is clean`);
4697
+ logger_default.info(`[ProducerAgent] Agent conversation complete for ${this.video.videoPath}`);
4698
+ if (this.renderPromise) {
4699
+ await this.renderPromise;
4700
+ logger_default.info(`[ProducerAgent] Render complete: ${outputPath}`);
4701
+ const sortedRemovals = [...this.removals].sort((a, b) => a.start - b.start);
4702
+ const keepSegments = [];
4703
+ let cursor = 0;
4704
+ for (const removal of sortedRemovals) {
4705
+ if (removal.start > cursor) {
4706
+ keepSegments.push({ start: cursor, end: removal.start });
4707
+ }
4708
+ cursor = Math.max(cursor, removal.end);
4709
+ }
4710
+ if (cursor < this.videoDuration) {
4711
+ keepSegments.push({ start: cursor, end: this.videoDuration });
4712
+ }
4493
4713
  return {
4494
4714
  summary: response,
4715
+ outputPath,
4495
4716
  success: true,
4496
- editCount: 0,
4497
- removals: [],
4498
- keepSegments: [{ start: 0, end: this.videoDuration }]
4717
+ editCount: this.removals.length,
4718
+ removals: sortedRemovals.map((r) => ({ start: r.start, end: r.end })),
4719
+ keepSegments
4499
4720
  };
4500
4721
  }
4501
- const maxRemoval = this.videoDuration * 0.2;
4502
- let totalRemoval = 0;
4503
- const sortedByDuration = [...this.removals].sort(
4504
- (a, b) => b.end - b.start - (a.end - a.start)
4505
- );
4506
- const cappedRemovals = [];
4507
- for (const r of sortedByDuration) {
4508
- const dur = r.end - r.start;
4509
- if (totalRemoval + dur <= maxRemoval) {
4510
- cappedRemovals.push(r);
4511
- totalRemoval += dur;
4512
- }
4513
- }
4514
- if (cappedRemovals.length < this.removals.length) {
4515
- logger_default.warn(
4516
- `[ProducerAgent] Safety cap: reduced ${this.removals.length} removals to ${cappedRemovals.length} (max 20% of ${this.videoDuration}s = ${maxRemoval.toFixed(1)}s)`
4517
- );
4518
- }
4519
- const sortedRemovals = [...cappedRemovals].sort((a, b) => a.start - b.start);
4520
- const keepSegments = [];
4521
- let cursor = 0;
4522
- for (const removal of sortedRemovals) {
4523
- if (removal.start > cursor) {
4524
- keepSegments.push({ start: cursor, end: removal.start });
4525
- }
4526
- cursor = Math.max(cursor, removal.end);
4527
- }
4528
- if (cursor < this.videoDuration) {
4529
- keepSegments.push({ start: cursor, end: this.videoDuration });
4530
- }
4531
- logger_default.info(
4532
- `[ProducerAgent] ${cappedRemovals.length} removals \u2192 ${keepSegments.length} keep segments, removing ${totalRemoval.toFixed(1)}s`
4533
- );
4534
- await singlePassEdit(this.video.videoPath, keepSegments, outputPath);
4535
- logger_default.info(`[ProducerAgent] Render complete: ${outputPath}`);
4722
+ logger_default.info(`[ProducerAgent] No cuts finalized \u2014 video is clean`);
4536
4723
  return {
4537
4724
  summary: response,
4538
- outputPath,
4539
4725
  success: true,
4540
- editCount: cappedRemovals.length,
4541
- removals: sortedRemovals.map((r) => ({ start: r.start, end: r.end })),
4542
- keepSegments
4726
+ editCount: 0,
4727
+ removals: [],
4728
+ keepSegments: [{ start: 0, end: this.videoDuration }]
4543
4729
  };
4544
4730
  } catch (err) {
4545
4731
  const message = err instanceof Error ? err.message : String(err);
@@ -4563,12 +4749,14 @@ Focus on removing dead air, filler words, bad takes, and redundant content. Be c
4563
4749
  var geminiClient_exports = {};
4564
4750
  __export(geminiClient_exports, {
4565
4751
  analyzeVideoClipDirection: () => analyzeVideoClipDirection,
4566
- analyzeVideoEditorial: () => analyzeVideoEditorial
4752
+ analyzeVideoEditorial: () => analyzeVideoEditorial,
4753
+ analyzeVideoForEnhancements: () => analyzeVideoForEnhancements
4567
4754
  });
4568
4755
  import { GoogleGenAI, createUserContent, createPartFromUri } from "@google/genai";
4569
- async function analyzeVideoEditorial(videoPath, durationSeconds, model = "gemini-2.5-flash") {
4756
+ async function analyzeVideoEditorial(videoPath, durationSeconds, model) {
4570
4757
  const config2 = getConfig();
4571
4758
  const apiKey = config2.GEMINI_API_KEY;
4759
+ const resolvedModel = model ?? config2.GEMINI_MODEL;
4572
4760
  if (!apiKey) {
4573
4761
  throw new Error(
4574
4762
  "GEMINI_API_KEY is required for video editorial analysis. Get a key at https://aistudio.google.com/apikey"
@@ -4594,9 +4782,9 @@ async function analyzeVideoEditorial(videoPath, durationSeconds, model = "gemini
4594
4782
  if (fileState !== "ACTIVE") {
4595
4783
  throw new Error(`Gemini file processing failed \u2014 state: ${fileState}`);
4596
4784
  }
4597
- logger_default.info(`[Gemini] Video ready, requesting editorial analysis (model: ${model})`);
4785
+ logger_default.info(`[Gemini] Video ready, requesting editorial analysis (model: ${resolvedModel})`);
4598
4786
  const response = await ai.models.generateContent({
4599
- model,
4787
+ model: resolvedModel,
4600
4788
  contents: createUserContent([
4601
4789
  createPartFromUri(file.uri, file.mimeType),
4602
4790
  EDITORIAL_PROMPT
@@ -4609,7 +4797,7 @@ async function analyzeVideoEditorial(videoPath, durationSeconds, model = "gemini
4609
4797
  const estimatedInputTokens = Math.ceil(durationSeconds * VIDEO_TOKENS_PER_SECOND);
4610
4798
  const estimatedOutputTokens = Math.ceil(text.length / 4);
4611
4799
  costTracker.recordServiceUsage("gemini", 0, {
4612
- model,
4800
+ model: resolvedModel,
4613
4801
  durationSeconds,
4614
4802
  estimatedInputTokens,
4615
4803
  estimatedOutputTokens,
@@ -4618,9 +4806,10 @@ async function analyzeVideoEditorial(videoPath, durationSeconds, model = "gemini
4618
4806
  logger_default.info(`[Gemini] Editorial analysis complete (${text.length} chars)`);
4619
4807
  return text;
4620
4808
  }
4621
- async function analyzeVideoClipDirection(videoPath, durationSeconds, model = "gemini-2.5-flash") {
4809
+ async function analyzeVideoClipDirection(videoPath, durationSeconds, model) {
4622
4810
  const config2 = getConfig();
4623
4811
  const apiKey = config2.GEMINI_API_KEY;
4812
+ const resolvedModel = model ?? config2.GEMINI_MODEL;
4624
4813
  if (!apiKey) {
4625
4814
  throw new Error(
4626
4815
  "GEMINI_API_KEY is required for video clip direction analysis. Get a key at https://aistudio.google.com/apikey"
@@ -4646,9 +4835,9 @@ async function analyzeVideoClipDirection(videoPath, durationSeconds, model = "ge
4646
4835
  if (fileState !== "ACTIVE") {
4647
4836
  throw new Error(`Gemini file processing failed \u2014 state: ${fileState}`);
4648
4837
  }
4649
- logger_default.info(`[Gemini] Video ready, requesting clip direction analysis (model: ${model})`);
4838
+ logger_default.info(`[Gemini] Video ready, requesting clip direction analysis (model: ${resolvedModel})`);
4650
4839
  const response = await ai.models.generateContent({
4651
- model,
4840
+ model: resolvedModel,
4652
4841
  contents: createUserContent([
4653
4842
  createPartFromUri(file.uri, file.mimeType),
4654
4843
  CLIP_DIRECTION_PROMPT
@@ -4661,7 +4850,7 @@ async function analyzeVideoClipDirection(videoPath, durationSeconds, model = "ge
4661
4850
  const estimatedInputTokens = Math.ceil(durationSeconds * VIDEO_TOKENS_PER_SECOND);
4662
4851
  const estimatedOutputTokens = Math.ceil(text.length / 4);
4663
4852
  costTracker.recordServiceUsage("gemini", 0, {
4664
- model,
4853
+ model: resolvedModel,
4665
4854
  durationSeconds,
4666
4855
  estimatedInputTokens,
4667
4856
  estimatedOutputTokens,
@@ -4670,7 +4859,60 @@ async function analyzeVideoClipDirection(videoPath, durationSeconds, model = "ge
4670
4859
  logger_default.info(`[Gemini] Clip direction analysis complete (${text.length} chars)`);
4671
4860
  return text;
4672
4861
  }
4673
- var VIDEO_TOKENS_PER_SECOND, EDITORIAL_PROMPT, CLIP_DIRECTION_PROMPT;
4862
+ async function analyzeVideoForEnhancements(videoPath, durationSeconds, transcript, model) {
4863
+ const config2 = getConfig();
4864
+ const apiKey = config2.GEMINI_API_KEY;
4865
+ const resolvedModel = model ?? config2.GEMINI_MODEL;
4866
+ if (!apiKey) {
4867
+ throw new Error(
4868
+ "GEMINI_API_KEY is required for video enhancement analysis. Get a key at https://aistudio.google.com/apikey"
4869
+ );
4870
+ }
4871
+ const ai = new GoogleGenAI({ apiKey });
4872
+ logger_default.info(`[Gemini] Uploading video for enhancement analysis: ${videoPath}`);
4873
+ const file = await ai.files.upload({
4874
+ file: videoPath,
4875
+ config: { mimeType: "video/mp4" }
4876
+ });
4877
+ if (!file.uri || !file.mimeType || !file.name) {
4878
+ throw new Error("Gemini file upload failed \u2014 no URI returned");
4879
+ }
4880
+ logger_default.info(`[Gemini] Waiting for file processing to complete...`);
4881
+ let fileState = file.state;
4882
+ while (fileState === "PROCESSING") {
4883
+ await new Promise((resolve3) => setTimeout(resolve3, 2e3));
4884
+ const updated = await ai.files.get({ name: file.name });
4885
+ fileState = updated.state;
4886
+ logger_default.debug(`[Gemini] File state: ${fileState}`);
4887
+ }
4888
+ if (fileState !== "ACTIVE") {
4889
+ throw new Error(`Gemini file processing failed \u2014 state: ${fileState}`);
4890
+ }
4891
+ logger_default.info(`[Gemini] Video ready, requesting enhancement analysis (model: ${resolvedModel})`);
4892
+ const response = await ai.models.generateContent({
4893
+ model: resolvedModel,
4894
+ contents: createUserContent([
4895
+ createPartFromUri(file.uri, file.mimeType),
4896
+ ENHANCEMENT_ANALYSIS_PROMPT + transcript
4897
+ ])
4898
+ });
4899
+ const text = response.text ?? "";
4900
+ if (!text) {
4901
+ throw new Error("Gemini returned empty response");
4902
+ }
4903
+ const estimatedInputTokens = Math.ceil(durationSeconds * VIDEO_TOKENS_PER_SECOND);
4904
+ const estimatedOutputTokens = Math.ceil(text.length / 4);
4905
+ costTracker.recordServiceUsage("gemini", 0, {
4906
+ model: resolvedModel,
4907
+ durationSeconds,
4908
+ estimatedInputTokens,
4909
+ estimatedOutputTokens,
4910
+ videoFile: videoPath
4911
+ });
4912
+ logger_default.info(`[Gemini] Enhancement analysis complete (${text.length} chars)`);
4913
+ return text;
4914
+ }
4915
+ var VIDEO_TOKENS_PER_SECOND, EDITORIAL_PROMPT, CLIP_DIRECTION_PROMPT, ENHANCEMENT_ANALYSIS_PROMPT;
4674
4916
  var init_geminiClient = __esm({
4675
4917
  "src/tools/gemini/geminiClient.ts"() {
4676
4918
  "use strict";
@@ -4692,7 +4934,7 @@ Flag sections that are too slow, too fast, or have dead air. Give start/end time
4692
4934
  Identify moments where text overlays, graphics, zoom-ins, or visual emphasis would improve engagement.
4693
4935
 
4694
4936
  ## Hook & Retention
4695
- Rate the first 3 seconds (1-10) and suggest specific improvements for viewer retention.
4937
+ Rate the first 3 seconds (1-10) and suggest specific improvements for viewer retention. If the video has a weak opening (meta-commentary, dead air, false starts), recommend where the actual content begins so an editor can start the video there.
4696
4938
 
4697
4939
  ## Content Structure
4698
4940
  Break the video into intro/body sections/outro with timestamps and topic for each section.
@@ -4702,10 +4944,21 @@ Highlight the most engaging, surprising, or important moments that should be emp
4702
4944
 
4703
4945
  ## Cleaning Recommendations
4704
4946
  Identify sections that should be trimmed or removed entirely to produce a tighter edit. For each:
4705
- - Give start/end timestamps (MM:SS format)
4947
+ - Give start/end timestamps (MM:SS.s format with decimal precision, e.g. 00:14.3 - 00:37.0)
4706
4948
  - Explain why it should be removed (dead air, filler words, false starts, repeated explanations, off-topic tangents, excessive pauses)
4707
4949
  - Rate the confidence (high/medium/low) \u2014 high means definitely remove, low means optional
4708
4950
 
4951
+ After listing the recommendations in markdown, also provide a machine-readable JSON block summarizing all suggested cuts:
4952
+
4953
+ \`\`\`json:cuts
4954
+ [
4955
+ { "start": 0.0, "end": 15.2, "reason": "Opening too slow - dead air and filler", "confidence": "high" },
4956
+ { "start": 26.5, "end": 37.0, "reason": "Meta-commentary for editor", "confidence": "high" }
4957
+ ]
4958
+ \`\`\`
4959
+
4960
+ Times in the JSON block should be in seconds with decimal precision. Place cut boundaries at word boundaries.
4961
+
4709
4962
  ## Hook Snippets for Short Videos
4710
4963
  Identify the 3-5 best moments (3-8 seconds each) that could serve as attention-grabbing hooks for the beginning of short-form videos. For each:
4711
4964
  - Give start/end timestamps
@@ -4760,6 +5013,465 @@ For each recommended medium clip, provide:
4760
5013
  Identify 2-4 medium clips. Prioritize: complete explanations, tutorial segments, deep dives, and compelling narrative arcs.
4761
5014
 
4762
5015
  Be precise with timestamps. Be opinionated about what works and what doesn't. Think about what would make someone stop scrolling.`;
5016
+ ENHANCEMENT_ANALYSIS_PROMPT = `You are a visual content strategist reviewing raw video footage. Write an editorial report identifying moments where an AI-generated image overlay would genuinely enhance viewer comprehension.
5017
+
5018
+ Watch the video carefully and read the transcript below. Write a natural editorial report covering:
5019
+
5020
+ 1. **Video layout observations** \u2014 What is on screen? Is there a webcam overlay? Where is the main content area (code editor, terminal, browser)? What areas of the screen have less visual activity and could safely hold an overlay without hiding important content?
5021
+
5022
+ 2. **Enhancement opportunities** \u2014 For each moment you identify, describe:
5023
+ - The approximate timestamp range (in seconds) where the speaker is discussing the topic
5024
+ - What the speaker is explaining and what is currently visible on screen
5025
+ - The dominant background colors and brightness level at that moment (e.g., dark IDE, white browser, terminal with dark background). This helps the image designer choose contrasting colors so the overlay stands out
5026
+ - What kind of image would help (diagram, flowchart, illustration, infographic, etc.)
5027
+ - A detailed description of the image to generate
5028
+ - Why showing this image at this moment helps the viewer understand
5029
+ - Where on screen the image should go to avoid blocking important content
5030
+
5031
+ 3. **Timing guidance** \u2014 For each opportunity, note the natural start and end of the speaker's explanation. The image should appear when the topic begins and disappear when the speaker moves on. Typically 5-12 seconds is ideal \u2014 long enough to register, short enough to not overstay.
5032
+
5033
+ Important guidelines:
5034
+ - Do NOT force opportunities \u2014 if the video doesn't need visual aids, say so
5035
+ - Do NOT suggest images when the screen already shows relevant visuals (diagrams, UI demos, live coding that needs to be seen)
5036
+ - Do NOT suggest images for trivial topics that don't need visual explanation
5037
+ - Do NOT suggest images during live demonstrations where the viewer needs to see the screen clearly
5038
+ - Moments shorter than 5 seconds are too brief for an overlay to register
5039
+ - It's perfectly fine to identify 0 opportunities, 1, or several \u2014 quality over quantity
5040
+
5041
+ Write your report in natural language with clear section headers. This report will be read by a graphics agent that will make final decisions about what to generate.
5042
+
5043
+ TRANSCRIPT:
5044
+ `;
5045
+ }
5046
+ });
5047
+
5048
+ // src/tools/imageGeneration.ts
5049
+ import { writeFile } from "fs/promises";
5050
+ import { dirname as dirname3 } from "path";
5051
+ import sharp from "sharp";
5052
+ async function generateImage(prompt, outputPath, options) {
5053
+ const config2 = getConfig();
5054
+ if (!config2.OPENAI_API_KEY) {
5055
+ throw new Error("[ImageGen] OPENAI_API_KEY is required for image generation");
5056
+ }
5057
+ const size = options?.size ?? "auto";
5058
+ const quality = options?.quality ?? "high";
5059
+ const fullPrompt = (options?.style ? `${prompt}
5060
+
5061
+ Style: ${options.style}` : prompt) + IMAGE_BASE_PROMPT;
5062
+ logger_default.info(`[ImageGen] Generating image: ${prompt.substring(0, 100)}...`);
5063
+ logger_default.debug(`[ImageGen] Size: ${size}, Quality: ${quality}`);
5064
+ const response = await fetch("https://api.openai.com/v1/images/generations", {
5065
+ method: "POST",
5066
+ headers: {
5067
+ "Content-Type": "application/json",
5068
+ Authorization: `Bearer ${config2.OPENAI_API_KEY}`
5069
+ },
5070
+ body: JSON.stringify({
5071
+ model: "gpt-image-1.5",
5072
+ prompt: fullPrompt,
5073
+ n: 1,
5074
+ size,
5075
+ quality
5076
+ })
5077
+ });
5078
+ if (!response.ok) {
5079
+ const errorText = await response.text();
5080
+ logger_default.error(`[ImageGen] API error (${response.status}): ${errorText}`);
5081
+ throw new Error(`[ImageGen] OpenAI API returned ${response.status}: ${errorText}`);
5082
+ }
5083
+ const result = await response.json();
5084
+ const b64 = result.data?.[0]?.b64_json;
5085
+ if (!b64) {
5086
+ logger_default.error("[ImageGen] No b64_json in API response");
5087
+ throw new Error("[ImageGen] API response missing b64_json image data");
5088
+ }
5089
+ const rawBuffer = Buffer.from(b64, "base64");
5090
+ let validatedBuffer;
5091
+ try {
5092
+ validatedBuffer = await sharp(rawBuffer).png().toBuffer();
5093
+ } catch (error) {
5094
+ logger_default.error("[ImageGen] Failed to validate image data from API", { error });
5095
+ throw new Error("[ImageGen] Invalid image data received from API - not a valid image format");
5096
+ }
5097
+ await ensureDirectory(dirname3(outputPath));
5098
+ await writeFile(outputPath, validatedBuffer);
5099
+ const estimatedCost = COST_BY_QUALITY[quality];
5100
+ costTracker.recordServiceUsage("openai-image", estimatedCost, {
5101
+ model: "gpt-image-1.5",
5102
+ size,
5103
+ quality,
5104
+ prompt: prompt.substring(0, 200)
5105
+ });
5106
+ logger_default.info(`[ImageGen] Image saved to ${outputPath} (${validatedBuffer.length} bytes)`);
5107
+ return outputPath;
5108
+ }
5109
+ var COST_BY_QUALITY, IMAGE_BASE_PROMPT;
5110
+ var init_imageGeneration = __esm({
5111
+ "src/tools/imageGeneration.ts"() {
5112
+ "use strict";
5113
+ init_logger2();
5114
+ init_environment();
5115
+ init_costTracker();
5116
+ init_fileSystem();
5117
+ COST_BY_QUALITY = {
5118
+ low: 0.04,
5119
+ medium: 0.07,
5120
+ high: 0.07
5121
+ };
5122
+ IMAGE_BASE_PROMPT = `
5123
+
5124
+ Rendering requirements: The image MUST have a solid opaque background (not transparent). Include a thin border or subtle drop shadow around the entire image. Use a clean, flat design style suitable for overlaying on top of video content. The image should look like a polished infographic card that clearly separates from whatever is behind it.`;
5125
+ }
5126
+ });
5127
+
5128
+ // src/agents/GraphicsAgent.ts
5129
+ import sharp2 from "sharp";
5130
+ async function generateEnhancementImages(enhancementReport, enhancementsDir, videoDuration, model) {
5131
+ await ensureDirectory(enhancementsDir);
5132
+ const agent = new GraphicsAgent(model);
5133
+ agent.setContext(enhancementsDir);
5134
+ try {
5135
+ const userMessage = `Here is the editorial report from our video analyst. The video is ${videoDuration.toFixed(1)} seconds long.
5136
+
5137
+ Review each opportunity and make your editorial decision \u2014 generate an image or skip it.
5138
+
5139
+ ---
5140
+
5141
+ ${enhancementReport}`;
5142
+ await agent.run(userMessage);
5143
+ return agent.getOverlays();
5144
+ } finally {
5145
+ await agent.destroy();
5146
+ }
5147
+ }
5148
+ var SYSTEM_PROMPT5, GENERATE_ENHANCEMENT_SCHEMA, SKIP_OPPORTUNITY_SCHEMA, GraphicsAgent;
5149
+ var init_GraphicsAgent = __esm({
5150
+ "src/agents/GraphicsAgent.ts"() {
5151
+ "use strict";
5152
+ init_BaseAgent();
5153
+ init_imageGeneration();
5154
+ init_text();
5155
+ init_paths();
5156
+ init_fileSystem();
5157
+ init_logger2();
5158
+ SYSTEM_PROMPT5 = `You are a visual content designer and editorial director for educational video content. You are given an editorial report from a video analyst describing moments in a video where AI-generated image overlays could enhance viewer comprehension.
5159
+
5160
+ Your job is to make the FINAL editorial decision for each opportunity:
5161
+ 1. Decide whether to generate an image or skip the opportunity
5162
+ 2. Determine the exact timing \u2014 when the image should appear and disappear
5163
+ 3. Choose the optimal screen placement to avoid blocking important content
5164
+ 4. Write a refined, high-quality image generation prompt
5165
+
5166
+ Guidelines for editorial decisions:
5167
+ - Only generate images that genuinely add value \u2014 quality over quantity
5168
+ - Timing should match the speaker's explanation: appear when the topic starts, disappear when they move on
5169
+ - Keep display duration between 5-12 seconds \u2014 long enough to register, short enough to not overstay
5170
+ - Ensure at least 10 seconds gap between consecutive overlays to avoid visual clutter
5171
+ - Choose placement regions that avoid the webcam, main content area, and any important UI elements
5172
+ - Size should be 15-30% of video width \u2014 large enough to see, small enough to not dominate
5173
+
5174
+ Guidelines for image prompts:
5175
+ - Create clean, professional diagrams and illustrations
5176
+ - Use flat design / modern infographic style
5177
+ - Include labels and annotations when helpful
5178
+ - Avoid photorealistic imagery \u2014 prefer stylized educational graphics
5179
+ - Keep the image simple and immediately understandable at a glance
5180
+ - The image will be shown as a small overlay, so avoid tiny details
5181
+ - Use high contrast colors for visibility when overlaid on video
5182
+ - No text-heavy images \u2014 a few key labels at most
5183
+ - Let the image content dictate its natural aspect ratio \u2014 don't force square if the content is better as landscape or portrait
5184
+ - IMPORTANT: Every image MUST have a solid, opaque background (e.g., white, light gray, dark navy) \u2014 never transparent or borderless. The image will be overlaid on top of a video so it needs to stand out with clear visual separation. If the report mentions a dark video background, use a light image background (and vice versa). Add a subtle border or shadow effect in the prompt to ensure the image pops against the video content.
5185
+
5186
+ Process the report and call generate_enhancement for each image worth creating, or call skip_opportunity for those not worth generating.`;
5187
+ GENERATE_ENHANCEMENT_SCHEMA = {
5188
+ type: "object",
5189
+ properties: {
5190
+ prompt: {
5191
+ type: "string",
5192
+ description: "A refined, high-quality image generation prompt describing the visual to create"
5193
+ },
5194
+ timestampStart: {
5195
+ type: "number",
5196
+ description: "When to start showing the image (seconds from video start)"
5197
+ },
5198
+ timestampEnd: {
5199
+ type: "number",
5200
+ description: "When to stop showing the image (seconds from video start). Should be 5-12 seconds after timestampStart."
5201
+ },
5202
+ region: {
5203
+ type: "string",
5204
+ enum: ["top-left", "top-right", "bottom-left", "bottom-right", "center-right", "center-left"],
5205
+ description: "Screen region for placement, chosen to avoid blocking important content"
5206
+ },
5207
+ sizePercent: {
5208
+ type: "number",
5209
+ description: "Image width as percentage of video width (15-30)"
5210
+ },
5211
+ topic: {
5212
+ type: "string",
5213
+ description: "Brief label for what this image illustrates"
5214
+ },
5215
+ reason: {
5216
+ type: "string",
5217
+ description: "Why this visual enhancement helps the viewer"
5218
+ }
5219
+ },
5220
+ required: ["prompt", "timestampStart", "timestampEnd", "region", "sizePercent", "topic", "reason"]
5221
+ };
5222
+ SKIP_OPPORTUNITY_SCHEMA = {
5223
+ type: "object",
5224
+ properties: {
5225
+ topic: {
5226
+ type: "string",
5227
+ description: "The topic from the report that is being skipped"
5228
+ },
5229
+ reason: {
5230
+ type: "string",
5231
+ description: "Why this opportunity should be skipped"
5232
+ }
5233
+ },
5234
+ required: ["topic", "reason"]
5235
+ };
5236
+ GraphicsAgent = class extends BaseAgent {
5237
+ overlays = [];
5238
+ enhancementsDir = "";
5239
+ imageIndex = 0;
5240
+ constructor(model) {
5241
+ super("GraphicsAgent", SYSTEM_PROMPT5, void 0, model);
5242
+ }
5243
+ setContext(enhancementsDir) {
5244
+ this.enhancementsDir = enhancementsDir;
5245
+ }
5246
+ getTools() {
5247
+ return [
5248
+ {
5249
+ name: "generate_enhancement",
5250
+ description: "Generate an AI image overlay for a specific moment in the video. You decide the timing, placement, and prompt.",
5251
+ parameters: GENERATE_ENHANCEMENT_SCHEMA,
5252
+ handler: async (args) => this.handleToolCall("generate_enhancement", args)
5253
+ },
5254
+ {
5255
+ name: "skip_opportunity",
5256
+ description: "Skip an enhancement opportunity from the report that is not worth generating.",
5257
+ parameters: SKIP_OPPORTUNITY_SCHEMA,
5258
+ handler: async (args) => this.handleToolCall("skip_opportunity", args)
5259
+ }
5260
+ ];
5261
+ }
5262
+ async handleToolCall(toolName, args) {
5263
+ if (toolName === "generate_enhancement") {
5264
+ const prompt = args.prompt;
5265
+ const timestampStart = args.timestampStart;
5266
+ const timestampEnd = args.timestampEnd;
5267
+ const region = args.region;
5268
+ const sizePercent = Math.min(30, Math.max(15, args.sizePercent));
5269
+ const topic = args.topic;
5270
+ const reason = args.reason;
5271
+ const slug = slugify(topic, { lower: true, strict: true });
5272
+ const filename = `${this.imageIndex}-${slug}.png`;
5273
+ const outputPath = join(this.enhancementsDir, filename);
5274
+ try {
5275
+ await generateImage(prompt, outputPath, { size: "auto" });
5276
+ const metadata = await sharp2(outputPath).metadata();
5277
+ const width = metadata.width ?? 1024;
5278
+ const height = metadata.height ?? 1024;
5279
+ const opportunity = {
5280
+ timestampStart,
5281
+ timestampEnd,
5282
+ topic,
5283
+ imagePrompt: prompt,
5284
+ reason,
5285
+ placement: { region, avoidAreas: [], sizePercent },
5286
+ confidence: 1
5287
+ };
5288
+ const overlay = {
5289
+ opportunity,
5290
+ imagePath: outputPath,
5291
+ width,
5292
+ height
5293
+ };
5294
+ this.overlays.push(overlay);
5295
+ this.imageIndex++;
5296
+ logger_default.info(`Generated enhancement image: ${filename} (${width}x${height})`);
5297
+ return { success: true, imagePath: outputPath, dimensions: `${width}x${height}` };
5298
+ } catch (err) {
5299
+ const message = err instanceof Error ? err.message : String(err);
5300
+ logger_default.error(`Failed to generate image for "${topic}": ${message}`);
5301
+ return { error: message };
5302
+ }
5303
+ }
5304
+ if (toolName === "skip_opportunity") {
5305
+ const topic = args.topic;
5306
+ const reason = args.reason;
5307
+ logger_default.info(`Skipped enhancement opportunity "${topic}": ${reason}`);
5308
+ return { success: true, skipped: true };
5309
+ }
5310
+ throw new Error(`Unknown tool: ${toolName}`);
5311
+ }
5312
+ getOverlays() {
5313
+ return this.overlays;
5314
+ }
5315
+ };
5316
+ }
5317
+ });
5318
+
5319
+ // src/tools/ffmpeg/overlayCompositing.ts
5320
+ function getOverlayPosition(region, margin) {
5321
+ const m = String(margin);
5322
+ switch (region) {
5323
+ case "top-left":
5324
+ return { x: m, y: m };
5325
+ case "top-right":
5326
+ return { x: `(main_w-overlay_w-${m})`, y: m };
5327
+ case "bottom-left":
5328
+ return { x: m, y: `(main_h-overlay_h-${m})` };
5329
+ case "bottom-right":
5330
+ return { x: `(main_w-overlay_w-${m})`, y: `(main_h-overlay_h-${m})` };
5331
+ case "center-right":
5332
+ return { x: `(main_w-overlay_w-${m})`, y: `((main_h-overlay_h)/2)` };
5333
+ case "center-left":
5334
+ return { x: m, y: `((main_h-overlay_h)/2)` };
5335
+ }
5336
+ }
5337
+ function buildOverlayFilterComplex(overlays, videoWidth, videoHeight) {
5338
+ const margin = Math.round(videoWidth * 0.05);
5339
+ const filters = [];
5340
+ for (let i = 0; i < overlays.length; i++) {
5341
+ const overlay = overlays[i];
5342
+ const inputIdx = i + 1;
5343
+ const overlayWidth = Math.round(videoWidth * overlay.opportunity.placement.sizePercent / 100);
5344
+ const start = overlay.opportunity.timestampStart;
5345
+ const end = overlay.opportunity.timestampEnd;
5346
+ filters.push(`[${inputIdx}:v]scale=${overlayWidth}:-1,format=rgba[img_${i}]`);
5347
+ const prev = i === 0 ? "[0:v]" : `[out_${i - 1}]`;
5348
+ const isLast = i === overlays.length - 1;
5349
+ const out = isLast ? "[overlaid]" : `[out_${i}]`;
5350
+ const pos = getOverlayPosition(overlay.opportunity.placement.region, margin);
5351
+ filters.push(
5352
+ `${prev}[img_${i}]overlay=x=${pos.x}:y=${pos.y}:enable='between(t,${start},${end})':format=auto${out}`
5353
+ );
5354
+ }
5355
+ filters.push("[overlaid]format=yuv420p[outv]");
5356
+ return filters.join(";");
5357
+ }
5358
+ async function compositeOverlays(videoPath, overlays, outputPath, videoWidth, videoHeight) {
5359
+ if (overlays.length === 0) {
5360
+ throw new Error("[OverlayCompositing] No overlays provided");
5361
+ }
5362
+ const ffmpegPath6 = getFFmpegPath();
5363
+ const filterComplex = buildOverlayFilterComplex(overlays, videoWidth, videoHeight);
5364
+ const args = ["-y", "-i", videoPath];
5365
+ for (const overlay of overlays) {
5366
+ args.push("-loop", "1", "-i", overlay.imagePath);
5367
+ }
5368
+ args.push(
5369
+ "-filter_complex",
5370
+ filterComplex,
5371
+ "-map",
5372
+ "[outv]",
5373
+ "-map",
5374
+ "0:a",
5375
+ "-c:v",
5376
+ "libx264",
5377
+ "-preset",
5378
+ "ultrafast",
5379
+ "-crf",
5380
+ "23",
5381
+ "-threads",
5382
+ "4",
5383
+ "-c:a",
5384
+ "copy",
5385
+ "-shortest",
5386
+ outputPath
5387
+ );
5388
+ logger_default.info(`[OverlayCompositing] Compositing ${overlays.length} overlays \u2192 ${outputPath}`);
5389
+ return new Promise((resolve3, reject) => {
5390
+ execFileRaw(ffmpegPath6, args, { maxBuffer: 50 * 1024 * 1024 }, (error, _stdout, stderr) => {
5391
+ if (error) {
5392
+ logger_default.error(`[OverlayCompositing] FFmpeg failed: ${stderr}`);
5393
+ reject(new Error(`[OverlayCompositing] FFmpeg overlay compositing failed: ${error.message}`));
5394
+ return;
5395
+ }
5396
+ logger_default.info(`[OverlayCompositing] Complete: ${outputPath}`);
5397
+ resolve3(outputPath);
5398
+ });
5399
+ });
5400
+ }
5401
+ var init_overlayCompositing = __esm({
5402
+ "src/tools/ffmpeg/overlayCompositing.ts"() {
5403
+ "use strict";
5404
+ init_process();
5405
+ init_ffmpeg();
5406
+ init_logger2();
5407
+ }
5408
+ });
5409
+
5410
+ // src/stages/visualEnhancement.ts
5411
+ var visualEnhancement_exports = {};
5412
+ __export(visualEnhancement_exports, {
5413
+ enhanceVideo: () => enhanceVideo
5414
+ });
5415
+ async function enhanceVideo(videoPath, transcript, video) {
5416
+ const enhancementsDir = join(video.videoDir, "enhancements");
5417
+ await ensureDirectory(enhancementsDir);
5418
+ logger_default.info("[VisualEnhancement] Step 1: Analyzing video for enhancement opportunities...");
5419
+ const enhancementReport = await analyzeVideoForEnhancements(
5420
+ videoPath,
5421
+ video.duration,
5422
+ transcript.text
5423
+ );
5424
+ if (!enhancementReport || enhancementReport.trim().length === 0) {
5425
+ logger_default.info("[VisualEnhancement] No enhancement report generated \u2014 skipping");
5426
+ return void 0;
5427
+ }
5428
+ logger_default.info(`[VisualEnhancement] Received editorial report (${enhancementReport.length} chars)`);
5429
+ logger_default.info("[VisualEnhancement] Step 2: GraphicsAgent making editorial decisions and generating images...");
5430
+ const overlays = await generateEnhancementImages(
5431
+ enhancementReport,
5432
+ enhancementsDir,
5433
+ video.duration,
5434
+ getModelForAgent("GraphicsAgent")
5435
+ );
5436
+ if (overlays.length === 0) {
5437
+ logger_default.info("[VisualEnhancement] GraphicsAgent generated no images \u2014 skipping compositing");
5438
+ return void 0;
5439
+ }
5440
+ logger_default.info(`[VisualEnhancement] Generated ${overlays.length} enhancement images`);
5441
+ logger_default.info("[VisualEnhancement] Step 3: Compositing overlays onto video...");
5442
+ const outputPath = join(video.videoDir, `${video.slug}-enhanced.mp4`);
5443
+ const videoWidth = video.layout?.width ?? 1920;
5444
+ const videoHeight = video.layout?.height ?? 1080;
5445
+ const enhancedVideoPath = await compositeOverlays(
5446
+ videoPath,
5447
+ overlays,
5448
+ outputPath,
5449
+ videoWidth,
5450
+ videoHeight
5451
+ );
5452
+ logger_default.info(`[VisualEnhancement] Enhanced video created: ${enhancedVideoPath}`);
5453
+ let totalImageCost = 0;
5454
+ for (const overlay of overlays) {
5455
+ totalImageCost += 0.07;
5456
+ }
5457
+ return {
5458
+ enhancedVideoPath,
5459
+ overlays,
5460
+ analysisTokens: 0,
5461
+ // tracked by costTracker internally
5462
+ imageGenCost: totalImageCost
5463
+ };
5464
+ }
5465
+ var init_visualEnhancement = __esm({
5466
+ "src/stages/visualEnhancement.ts"() {
5467
+ "use strict";
5468
+ init_geminiClient();
5469
+ init_GraphicsAgent();
5470
+ init_overlayCompositing();
5471
+ init_modelConfig();
5472
+ init_fileSystem();
5473
+ init_paths();
5474
+ init_logger2();
4763
5475
  }
4764
5476
  });
4765
5477
 
@@ -4980,7 +5692,7 @@ async function generateSocialPosts(video, transcript, summary, outputDir, model)
4980
5692
  await agent.destroy();
4981
5693
  }
4982
5694
  }
4983
- var SYSTEM_PROMPT5, SocialMediaAgent;
5695
+ var SYSTEM_PROMPT6, SocialMediaAgent;
4984
5696
  var init_SocialMediaAgent = __esm({
4985
5697
  "src/agents/SocialMediaAgent.ts"() {
4986
5698
  "use strict";
@@ -4990,7 +5702,7 @@ var init_SocialMediaAgent = __esm({
4990
5702
  init_logger2();
4991
5703
  init_environment();
4992
5704
  init_types();
4993
- SYSTEM_PROMPT5 = `You are a viral social-media content strategist.
5705
+ SYSTEM_PROMPT6 = `You are a viral social-media content strategist.
4994
5706
  Given a video transcript and summary you MUST generate one post for each of the 5 platforms listed below.
4995
5707
  Each post must match the platform's tone, format, and constraints exactly.
4996
5708
 
@@ -5014,7 +5726,7 @@ Always call "create_posts" exactly once with all 5 platform posts.`;
5014
5726
  SocialMediaAgent = class extends BaseAgent {
5015
5727
  collectedPosts = [];
5016
5728
  constructor(model) {
5017
- super("SocialMediaAgent", SYSTEM_PROMPT5, void 0, model);
5729
+ super("SocialMediaAgent", SYSTEM_PROMPT6, void 0, model);
5018
5730
  }
5019
5731
  getMcpServers() {
5020
5732
  const config2 = getConfig();
@@ -5449,6 +6161,7 @@ var loadChapterAgent = async () => Promise.resolve().then(() => (init_ChapterAge
5449
6161
  var loadSummaryAgent = async () => Promise.resolve().then(() => (init_SummaryAgent(), SummaryAgent_exports));
5450
6162
  var loadProducerAgent = async () => Promise.resolve().then(() => (init_ProducerAgent(), ProducerAgent_exports));
5451
6163
  var loadGeminiClient = async () => Promise.resolve().then(() => (init_geminiClient(), geminiClient_exports));
6164
+ var loadVisualEnhancement = async () => Promise.resolve().then(() => (init_visualEnhancement(), visualEnhancement_exports));
5452
6165
 
5453
6166
  // src/assets/VideoAsset.ts
5454
6167
  var VideoAsset = class extends Asset {
@@ -5908,7 +6621,8 @@ var ShortVideoAsset = class extends VideoAsset {
5908
6621
  return this.videoPath;
5909
6622
  }
5910
6623
  await ensureDirectory(this.videoDir);
5911
- const parentVideo = await this.parent.getResult();
6624
+ const mainParent = this.parent;
6625
+ const parentVideo = await mainParent.getEditedVideo();
5912
6626
  await extractCompositeClip(parentVideo, this.clip.segments, this.videoPath);
5913
6627
  return this.videoPath;
5914
6628
  }
@@ -5951,6 +6665,7 @@ var ShortVideoAsset = class extends VideoAsset {
5951
6665
  init_paths();
5952
6666
  init_fileSystem();
5953
6667
  init_types();
6668
+ init_clipExtraction();
5954
6669
  var MediumClipAsset = class extends VideoAsset {
5955
6670
  /** Parent video this clip was extracted from */
5956
6671
  parent;
@@ -6012,18 +6727,20 @@ var MediumClipAsset = class extends VideoAsset {
6012
6727
  return fileExists(this.videoPath);
6013
6728
  }
6014
6729
  /**
6015
- * Get the rendered clip video path.
6730
+ * Get the rendered clip video path, extracting from parent if needed.
6731
+ * Extracts from the enhanced video so AI-generated overlays carry through.
6016
6732
  *
6017
- * @param opts - Asset options (force not used - clip must be pre-rendered)
6733
+ * @param opts - Asset options (force regeneration, etc.)
6018
6734
  * @returns Path to the rendered video file
6019
- * @throws Error if clip hasn't been rendered yet
6020
6735
  */
6021
6736
  async getResult(opts) {
6022
- if (!await this.exists()) {
6023
- throw new Error(
6024
- `Medium clip "${this.slug}" not found at ${this.videoPath}. Run the medium-clips stage first.`
6025
- );
6737
+ if (!opts?.force && await this.exists()) {
6738
+ return this.videoPath;
6026
6739
  }
6740
+ await ensureDirectory(this.videoDir);
6741
+ const mainParent = this.parent;
6742
+ const parentVideo = await mainParent.getEnhancedVideo();
6743
+ await extractCompositeClip(parentVideo, this.clip.segments, this.videoPath);
6027
6744
  return this.videoPath;
6028
6745
  }
6029
6746
  };
@@ -6222,6 +6939,10 @@ var MainVideoAsset = class _MainVideoAsset extends VideoAsset {
6222
6939
  get editedVideoPath() {
6223
6940
  return join(this.videoDir, `${this.slug}-edited.mp4`);
6224
6941
  }
6942
+ /** Path to the enhanced (visual overlays) video: videoDir/{slug}-enhanced.mp4 */
6943
+ get enhancedVideoPath() {
6944
+ return join(this.videoDir, `${this.slug}-enhanced.mp4`);
6945
+ }
6225
6946
  /** Path to the captioned video: videoDir/{slug}-captioned.mp4 */
6226
6947
  get captionedVideoPath() {
6227
6948
  return join(this.videoDir, `${this.slug}-captioned.mp4`);
@@ -6275,7 +6996,13 @@ var MainVideoAsset = class _MainVideoAsset extends VideoAsset {
6275
6996
  logger_default.info(`Ingesting video: ${sourcePath} \u2192 ${slug}`);
6276
6997
  if (await fileExists(videoDir)) {
6277
6998
  logger_default.warn(`Output folder already exists, cleaning previous artifacts: ${videoDir}`);
6278
- const subDirs = ["thumbnails", "shorts", "social-posts", "chapters", "medium-clips", "captions"];
6999
+ const subDirs = ["thumbnails", "shorts", "social-posts", "chapters", "medium-clips", "captions", "enhancements"];
7000
+ const allEntries = await listDirectory(videoDir);
7001
+ for (const entry of allEntries) {
7002
+ if (entry.endsWith("-enhance-test")) {
7003
+ await removeDirectory(join(videoDir, entry), { recursive: true, force: true });
7004
+ }
7005
+ }
6279
7006
  for (const sub of subDirs) {
6280
7007
  await removeDirectory(join(videoDir, sub), { recursive: true, force: true });
6281
7008
  }
@@ -6287,14 +7014,18 @@ var MainVideoAsset = class _MainVideoAsset extends VideoAsset {
6287
7014
  "captions.ass",
6288
7015
  "summary.md",
6289
7016
  "blog-post.md",
6290
- "README.md"
7017
+ "README.md",
7018
+ "clip-direction.md",
7019
+ "editorial-direction.md",
7020
+ "cost-report.md",
7021
+ "layout.json"
6291
7022
  ];
6292
7023
  for (const pattern of stalePatterns) {
6293
7024
  await removeFile(join(videoDir, pattern));
6294
7025
  }
6295
7026
  const files = await listDirectory(videoDir);
6296
7027
  for (const file of files) {
6297
- if (file.endsWith("-edited.mp4") || file.endsWith("-captioned.mp4") || file.endsWith("-produced.mp4")) {
7028
+ if (file.endsWith("-edited.mp4") || file.endsWith("-enhanced.mp4") || file.endsWith("-captioned.mp4") || file.endsWith("-produced.mp4")) {
6298
7029
  await removeFile(join(videoDir, file));
6299
7030
  }
6300
7031
  }
@@ -6416,9 +7147,37 @@ var MainVideoAsset = class _MainVideoAsset extends VideoAsset {
6416
7147
  logger_default.info("No silence removed, using original video");
6417
7148
  return this.videoPath;
6418
7149
  }
7150
+ /**
7151
+ * Get the enhanced (visual overlays) video.
7152
+ * If not already generated, runs the visual enhancement stage.
7153
+ * Falls back to the edited video if enhancement is skipped or finds no opportunities.
7154
+ *
7155
+ * @param opts - Options controlling generation
7156
+ * @returns Path to the enhanced or edited video
7157
+ */
7158
+ async getEnhancedVideo(opts) {
7159
+ if (!opts?.force && await fileExists(this.enhancedVideoPath)) {
7160
+ return this.enhancedVideoPath;
7161
+ }
7162
+ const config2 = getConfig();
7163
+ if (config2.SKIP_VISUAL_ENHANCEMENT) {
7164
+ return this.getEditedVideo(opts);
7165
+ }
7166
+ const editedPath = await this.getEditedVideo(opts);
7167
+ const transcript = await this.getTranscript();
7168
+ const videoFile = await this.toVideoFile();
7169
+ const { enhanceVideo: enhanceVideo2 } = await loadVisualEnhancement();
7170
+ const result = await enhanceVideo2(editedPath, transcript, videoFile);
7171
+ if (result) {
7172
+ logger_default.info(`Visual enhancement completed: ${result.overlays.length} overlays composited`);
7173
+ return result.enhancedVideoPath;
7174
+ }
7175
+ logger_default.info("No visual enhancements generated, using edited video");
7176
+ return editedPath;
7177
+ }
6419
7178
  /**
6420
7179
  * Get the captioned video.
6421
- * If not already generated, burns captions into the edited video.
7180
+ * If not already generated, burns captions into the enhanced video.
6422
7181
  *
6423
7182
  * @param opts - Options controlling generation
6424
7183
  * @returns Path to the captioned video
@@ -6427,10 +7186,10 @@ var MainVideoAsset = class _MainVideoAsset extends VideoAsset {
6427
7186
  if (!opts?.force && await fileExists(this.captionedVideoPath)) {
6428
7187
  return this.captionedVideoPath;
6429
7188
  }
6430
- const editedPath = await this.getEditedVideo(opts);
7189
+ const enhancedPath = await this.getEnhancedVideo(opts);
6431
7190
  const captions = await this.getCaptions();
6432
7191
  const { burnCaptions: burnCaptions2 } = await loadCaptionBurning();
6433
- await burnCaptions2(editedPath, captions.ass, this.captionedVideoPath);
7192
+ await burnCaptions2(enhancedPath, captions.ass, this.captionedVideoPath);
6434
7193
  logger_default.info(`Captions burned into video: ${this.captionedVideoPath}`);
6435
7194
  return this.captionedVideoPath;
6436
7195
  }
@@ -6717,6 +7476,7 @@ var CONTENT_MATRIX = {
6717
7476
  "medium-clip": { captions: true, variantKey: null }
6718
7477
  },
6719
7478
  ["linkedin" /* LinkedIn */]: {
7479
+ video: { captions: true, variantKey: null },
6720
7480
  "medium-clip": { captions: true, variantKey: null }
6721
7481
  },
6722
7482
  ["tiktok" /* TikTok */]: {
@@ -7196,9 +7956,107 @@ async function buildPublishQueue(video, shorts, mediumClips, socialPosts, captio
7196
7956
  init_ProducerAgent();
7197
7957
  init_captionBurning();
7198
7958
  init_singlePassEdit();
7959
+ init_visualEnhancement();
7199
7960
  init_modelConfig();
7200
7961
  init_costTracker();
7201
7962
  init_types();
7963
+
7964
+ // src/services/processingState.ts
7965
+ init_fileSystem();
7966
+ init_paths();
7967
+ init_environment();
7968
+ init_logger2();
7969
+ function getStatePath() {
7970
+ const config2 = getConfig();
7971
+ return join(config2.OUTPUT_DIR, "processing-state.json");
7972
+ }
7973
+ async function readState() {
7974
+ const statePath = getStatePath();
7975
+ if (!fileExistsSync(statePath)) {
7976
+ return { videos: {} };
7977
+ }
7978
+ return readJsonFile(statePath, { videos: {} });
7979
+ }
7980
+ async function writeState(state) {
7981
+ const statePath = getStatePath();
7982
+ await writeJsonFile(statePath, state);
7983
+ }
7984
+ async function getVideoStatus(slug) {
7985
+ const state = await readState();
7986
+ return state.videos[slug];
7987
+ }
7988
+ async function getUnprocessed() {
7989
+ const state = await readState();
7990
+ const result = {};
7991
+ for (const [slug, video] of Object.entries(state.videos)) {
7992
+ if (video.status === "pending" || video.status === "failed") {
7993
+ result[slug] = video;
7994
+ }
7995
+ }
7996
+ return result;
7997
+ }
7998
+ async function isCompleted(slug) {
7999
+ const status = await getVideoStatus(slug);
8000
+ return status?.status === "completed";
8001
+ }
8002
+ async function markPending(slug, sourcePath) {
8003
+ const state = await readState();
8004
+ state.videos[slug] = {
8005
+ status: "pending",
8006
+ sourcePath
8007
+ };
8008
+ await writeState(state);
8009
+ logger_default.info(`[ProcessingState] Marked pending: ${slug}`);
8010
+ }
8011
+ async function markProcessing(slug) {
8012
+ const state = await readState();
8013
+ const existing = state.videos[slug];
8014
+ if (!existing) {
8015
+ logger_default.warn(`[ProcessingState] Cannot mark processing \u2014 unknown slug: ${slug}`);
8016
+ return;
8017
+ }
8018
+ state.videos[slug] = {
8019
+ ...existing,
8020
+ status: "processing",
8021
+ startedAt: (/* @__PURE__ */ new Date()).toISOString()
8022
+ };
8023
+ await writeState(state);
8024
+ logger_default.info(`[ProcessingState] Marked processing: ${slug}`);
8025
+ }
8026
+ async function markCompleted(slug) {
8027
+ const state = await readState();
8028
+ const existing = state.videos[slug];
8029
+ if (!existing) {
8030
+ logger_default.warn(`[ProcessingState] Cannot mark completed \u2014 unknown slug: ${slug}`);
8031
+ return;
8032
+ }
8033
+ state.videos[slug] = {
8034
+ ...existing,
8035
+ status: "completed",
8036
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
8037
+ error: void 0
8038
+ };
8039
+ await writeState(state);
8040
+ logger_default.info(`[ProcessingState] Marked completed: ${slug}`);
8041
+ }
8042
+ async function markFailed(slug, error) {
8043
+ const state = await readState();
8044
+ const existing = state.videos[slug];
8045
+ if (!existing) {
8046
+ logger_default.warn(`[ProcessingState] Cannot mark failed \u2014 unknown slug: ${slug}`);
8047
+ return;
8048
+ }
8049
+ state.videos[slug] = {
8050
+ ...existing,
8051
+ status: "failed",
8052
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
8053
+ error
8054
+ };
8055
+ await writeState(state);
8056
+ logger_default.info(`[ProcessingState] Marked failed: ${slug} \u2014 ${error}`);
8057
+ }
8058
+
8059
+ // src/pipeline.ts
7202
8060
  async function runStage(stageName, fn, stageResults) {
7203
8061
  costTracker.setStage(stageName);
7204
8062
  const start = Date.now();
@@ -7307,6 +8165,22 @@ async function processVideo(videoPath) {
7307
8165
  }
7308
8166
  }
7309
8167
  const captionTranscript = adjustedTranscript ?? transcript;
8168
+ let enhancedVideoPath;
8169
+ if (!cfg.SKIP_VISUAL_ENHANCEMENT && captionTranscript) {
8170
+ const videoToEnhance = editedVideoPath ?? video.repoPath;
8171
+ const enhancementResult = await runStage(
8172
+ "visual-enhancement" /* VisualEnhancement */,
8173
+ async () => {
8174
+ const result = await enhanceVideo(videoToEnhance, captionTranscript, video);
8175
+ if (!result) return void 0;
8176
+ return result;
8177
+ },
8178
+ stageResults
8179
+ );
8180
+ if (enhancementResult) {
8181
+ enhancedVideoPath = enhancementResult.enhancedVideoPath;
8182
+ }
8183
+ }
7310
8184
  let captions;
7311
8185
  if (captionTranscript && !cfg.SKIP_CAPTIONS) {
7312
8186
  captions = await runStage("captions" /* Captions */, () => generateCaptions(video, captionTranscript), stageResults);
@@ -7314,7 +8188,7 @@ async function processVideo(videoPath) {
7314
8188
  let captionedVideoPath;
7315
8189
  if (captions && !cfg.SKIP_CAPTIONS) {
7316
8190
  const assFile = captions.find((p) => p.endsWith(".ass"));
7317
- if (assFile && cleaningKeepSegments) {
8191
+ if (assFile && cleaningKeepSegments && !enhancedVideoPath) {
7318
8192
  const captionedOutput = join(video.videoDir, `${video.slug}-captioned.mp4`);
7319
8193
  captionedVideoPath = await runStage(
7320
8194
  "caption-burn" /* CaptionBurn */,
@@ -7322,7 +8196,7 @@ async function processVideo(videoPath) {
7322
8196
  stageResults
7323
8197
  );
7324
8198
  } else if (assFile) {
7325
- const videoToBurn = editedVideoPath ?? video.repoPath;
8199
+ const videoToBurn = enhancedVideoPath ?? editedVideoPath ?? video.repoPath;
7326
8200
  const captionedOutput = join(video.videoDir, `${video.slug}-captioned.mp4`);
7327
8201
  captionedVideoPath = await runStage(
7328
8202
  "caption-burn" /* CaptionBurn */,
@@ -7343,13 +8217,23 @@ async function processVideo(videoPath) {
7343
8217
  }
7344
8218
  } catch {
7345
8219
  }
7346
- const result = await runStage("shorts" /* Shorts */, () => generateShorts(shortsVideo, shortsTranscript, getModelForAgent("ShortsAgent"), clipDirection), stageResults);
8220
+ let webcamRegion;
8221
+ try {
8222
+ const layoutPath = join(video.videoDir, "layout.json");
8223
+ if (await fileExists(layoutPath)) {
8224
+ const layout = await readJsonFile(layoutPath);
8225
+ webcamRegion = layout.webcam;
8226
+ }
8227
+ } catch {
8228
+ }
8229
+ const result = await runStage("shorts" /* Shorts */, () => generateShorts(shortsVideo, shortsTranscript, getModelForAgent("ShortsAgent"), clipDirection, webcamRegion), stageResults);
7347
8230
  if (result) shorts = result;
7348
8231
  }
7349
8232
  let mediumClips = [];
7350
8233
  if (transcript && !cfg.SKIP_MEDIUM_CLIPS) {
7351
8234
  const mediumTranscript = adjustedTranscript ?? transcript;
7352
- const mediumVideo = editedVideoPath ? { ...video, repoPath: editedVideoPath } : video;
8235
+ const mediumVideoPath = enhancedVideoPath ?? editedVideoPath;
8236
+ const mediumVideo = mediumVideoPath ? { ...video, repoPath: mediumVideoPath } : video;
7353
8237
  let mediumClipDirection;
7354
8238
  try {
7355
8239
  const clipDirPath = join(video.videoDir, "clip-direction.md");
@@ -7455,6 +8339,7 @@ async function processVideo(videoPath) {
7455
8339
  video,
7456
8340
  transcript,
7457
8341
  editedVideoPath,
8342
+ enhancedVideoPath,
7458
8343
  captions,
7459
8344
  captionedVideoPath,
7460
8345
  summary,
@@ -7512,11 +8397,18 @@ function generateCostMarkdown(report) {
7512
8397
  return md;
7513
8398
  }
7514
8399
/**
 * Run the full pipeline for a single video, recording progress in the
 * processing-state file and never letting errors escape to the caller.
 *
 * Fix: the state-tracking calls (markPending/markProcessing/markCompleted/
 * markFailed) were unguarded, so a state-file I/O error could escape this
 * "safe" wrapper — or, worse, discard the result of a *successful*
 * processVideo run. State bookkeeping is now best-effort: failures are
 * logged as warnings and never mask the pipeline outcome.
 *
 * @param videoPath - Path to the source video file.
 * @returns The pipeline result, or null when processing failed.
 */
async function processVideoSafe(videoPath) {
  const filename = basename(videoPath);
  // Slug = filename without its video extension; keys the state file.
  const slug = filename.replace(/\.(mp4|mov|webm|avi|mkv)$/i, "");
  // Best-effort state update: log-and-continue on state-file errors.
  const trackState = async (fn) => {
    try {
      await fn();
    } catch (stateErr) {
      const reason = stateErr instanceof Error ? stateErr.message : String(stateErr);
      logger_default.warn(`Processing-state update failed for ${slug}: ${reason}`);
    }
  };
  await trackState(() => markPending(slug, videoPath));
  await trackState(() => markProcessing(slug));
  try {
    const result = await processVideo(videoPath);
    await trackState(() => markCompleted(slug));
    return result;
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    logger_default.error(`Pipeline failed with uncaught error: ${message}`);
    await trackState(() => markFailed(slug, message));
    return null;
  }
}
@@ -8918,7 +9810,7 @@ program.command("schedule").description("View the current posting schedule acros
8918
9810
  program.command("doctor").description("Check all prerequisites and dependencies").action(async () => {
8919
9811
  await runDoctor();
8920
9812
  });
8921
- var defaultCmd = program.command("process", { isDefault: true }).argument("[video-path]", "Path to a video file to process (implies --once)").option("--watch-dir <path>", "Folder to watch for new recordings (default: env WATCH_FOLDER)").option("--output-dir <path>", "Output directory for processed videos (default: ./recordings)").option("--openai-key <key>", "OpenAI API key (default: env OPENAI_API_KEY)").option("--exa-key <key>", "Exa AI API key for web search (default: env EXA_API_KEY)").option("--once", "Process a single video and exit (no watching)").option("--brand <path>", "Path to brand.json config (default: ./brand.json)").option("--no-git", "Skip git commit/push stage").option("--no-silence-removal", "Skip silence removal stage").option("--no-shorts", "Skip shorts generation").option("--no-medium-clips", "Skip medium clip generation").option("--no-social", "Skip social media post generation").option("--no-captions", "Skip caption generation/burning").option("--no-social-publish", "Skip social media publishing/queue-build stage").option("--late-api-key <key>", "Late API key (default: env LATE_API_KEY)").option("--late-profile-id <id>", "Late profile ID (default: env LATE_PROFILE_ID)").option("-v, --verbose", "Verbose logging").option("--doctor", "Check all prerequisites and exit").action(async (videoPath) => {
9813
+ var defaultCmd = program.command("process", { isDefault: true }).argument("[video-path]", "Path to a video file to process (implies --once)").option("--watch-dir <path>", "Folder to watch for new recordings (default: env WATCH_FOLDER)").option("--output-dir <path>", "Output directory for processed videos (default: ./recordings)").option("--openai-key <key>", "OpenAI API key (default: env OPENAI_API_KEY)").option("--exa-key <key>", "Exa AI API key for web search (default: env EXA_API_KEY)").option("--once", "Process a single video and exit (no watching)").option("--brand <path>", "Path to brand.json config (default: ./brand.json)").option("--no-git", "Skip git commit/push stage").option("--no-silence-removal", "Skip silence removal stage").option("--no-shorts", "Skip shorts generation").option("--no-medium-clips", "Skip medium clip generation").option("--no-social", "Skip social media post generation").option("--no-captions", "Skip caption generation/burning").option("--no-visual-enhancement", "Skip visual enhancement (AI image overlays)").option("--no-social-publish", "Skip social media publishing/queue-build stage").option("--late-api-key <key>", "Late API key (default: env LATE_API_KEY)").option("--late-profile-id <id>", "Late profile ID (default: env LATE_PROFILE_ID)").option("-v, --verbose", "Verbose logging").option("--doctor", "Check all prerequisites and exit").action(async (videoPath) => {
8922
9814
  const opts = defaultCmd.opts();
8923
9815
  if (opts.doctor) {
8924
9816
  await runDoctor();
@@ -8938,6 +9830,7 @@ var defaultCmd = program.command("process", { isDefault: true }).argument("[vide
8938
9830
  mediumClips: opts.mediumClips,
8939
9831
  social: opts.social,
8940
9832
  captions: opts.captions,
9833
+ visualEnhancement: opts.visualEnhancement,
8941
9834
  socialPublish: opts.socialPublish,
8942
9835
  lateApiKey: opts.lateApiKey,
8943
9836
  lateProfileId: opts.lateProfileId
@@ -8990,12 +9883,47 @@ var defaultCmd = program.command("process", { isDefault: true }).argument("[vide
8990
9883
  }
8991
9884
  process.on("SIGINT", () => shutdown());
8992
9885
  process.on("SIGTERM", () => shutdown());
8993
- watcher.on("new-video", (filePath) => {
9886
+ watcher.on("new-video", async (filePath) => {
9887
+ const filename = filePath.replace(/\\/g, "/").split("/").pop() ?? "";
9888
+ const slug = filename.replace(/\.(mp4|mov|webm|avi|mkv)$/i, "");
9889
+ if (slug && await isCompleted(slug)) {
9890
+ logger_default.info(`Skipping already-processed video: ${filePath}`);
9891
+ return;
9892
+ }
8994
9893
  queue.push(filePath);
8995
9894
  logger_default.info(`Queued video: ${filePath} (queue length: ${queue.length})`);
8996
9895
  processQueue().catch((err) => logger_default.error("Queue processing error:", err));
8997
9896
  });
8998
9897
  watcher.start();
9898
+ try {
9899
+ const watchFiles = listDirectorySync(config2.WATCH_FOLDER);
9900
+ for (const file of watchFiles) {
9901
+ const ext = extname(file).toLowerCase();
9902
+ if (![".mp4", ".mov", ".webm", ".avi", ".mkv"].includes(ext)) continue;
9903
+ const filePath = join(config2.WATCH_FOLDER, file);
9904
+ const slug = file.replace(/\.(mp4|mov|webm|avi|mkv)$/i, "");
9905
+ const status = await getVideoStatus(slug);
9906
+ if (!status || status.status === "failed" || status.status === "pending") {
9907
+ if (!queue.includes(filePath)) {
9908
+ queue.push(filePath);
9909
+ logger_default.info(`Startup scan: queued ${slug}${status ? ` (was ${status.status})` : " (new)"}`);
9910
+ }
9911
+ }
9912
+ }
9913
+ } catch (err) {
9914
+ logger_default.warn(`Could not scan watch folder on startup: ${err instanceof Error ? err.message : String(err)}`);
9915
+ }
9916
+ const unprocessed = await getUnprocessed();
9917
+ for (const [slug, state] of Object.entries(unprocessed)) {
9918
+ if (!queue.includes(state.sourcePath)) {
9919
+ queue.push(state.sourcePath);
9920
+ logger_default.info(`Re-queued from state: ${slug} (${state.status})`);
9921
+ }
9922
+ }
9923
+ if (queue.length > 0) {
9924
+ logger_default.info(`Startup: ${queue.length} video(s) queued for processing`);
9925
+ processQueue().catch((err) => logger_default.error("Queue processing error:", err));
9926
+ }
8999
9927
  if (onceMode) {
9000
9928
  logger_default.info("Running in --once mode. Will exit after processing the next video.");
9001
9929
  } else {