vidpipe 1.3.3 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1160 -232
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -291,10 +291,12 @@ function initConfig(cli = {}) {
|
|
|
291
291
|
SKIP_MEDIUM_CLIPS: cli.mediumClips === false,
|
|
292
292
|
SKIP_SOCIAL: cli.social === false,
|
|
293
293
|
SKIP_CAPTIONS: cli.captions === false,
|
|
294
|
+
SKIP_VISUAL_ENHANCEMENT: cli.visualEnhancement === false,
|
|
294
295
|
LATE_API_KEY: cli.lateApiKey || process.env.LATE_API_KEY || "",
|
|
295
296
|
LATE_PROFILE_ID: cli.lateProfileId || process.env.LATE_PROFILE_ID || "",
|
|
296
297
|
SKIP_SOCIAL_PUBLISH: cli.socialPublish === false,
|
|
297
|
-
GEMINI_API_KEY: process.env.GEMINI_API_KEY || ""
|
|
298
|
+
GEMINI_API_KEY: process.env.GEMINI_API_KEY || "",
|
|
299
|
+
GEMINI_MODEL: process.env.GEMINI_MODEL || "gemini-2.5-pro"
|
|
298
300
|
};
|
|
299
301
|
return config;
|
|
300
302
|
}
|
|
@@ -724,10 +726,12 @@ async function getVideoResolution(videoPath) {
|
|
|
724
726
|
}
|
|
725
727
|
async function extractSampleFrames(videoPath, tempDir) {
|
|
726
728
|
const duration = await getVideoDuration(videoPath);
|
|
727
|
-
const
|
|
729
|
+
const effectiveSamples = Math.min(SAMPLE_FRAMES, Math.max(1, Math.floor(duration) - 1));
|
|
730
|
+
const interval = Math.max(1, Math.floor(duration / (effectiveSamples + 1)));
|
|
728
731
|
const timestamps = [];
|
|
729
|
-
for (let i = 1; i <=
|
|
730
|
-
|
|
732
|
+
for (let i = 1; i <= effectiveSamples; i++) {
|
|
733
|
+
const ts = i * interval;
|
|
734
|
+
if (ts < duration) timestamps.push(ts);
|
|
731
735
|
}
|
|
732
736
|
const framePaths = [];
|
|
733
737
|
for (let i = 0; i < timestamps.length; i++) {
|
|
@@ -871,7 +875,7 @@ function findPeakDiff(means, searchFrom, searchTo, minDiff) {
|
|
|
871
875
|
}
|
|
872
876
|
return maxDiff >= minDiff ? { index: maxIdx, magnitude: maxDiff } : { index: -1, magnitude: maxDiff };
|
|
873
877
|
}
|
|
874
|
-
async function refineBoundingBox(framePaths, position) {
|
|
878
|
+
async function refineBoundingBox(framePaths, position, minEdgeDiff = REFINE_MIN_EDGE_DIFF) {
|
|
875
879
|
if (framePaths.length === 0) return null;
|
|
876
880
|
const isRight = position.includes("right");
|
|
877
881
|
const isBottom = position.includes("bottom");
|
|
@@ -893,10 +897,10 @@ async function refineBoundingBox(framePaths, position) {
|
|
|
893
897
|
const avgRows = averageFloat64Arrays(rowMeansAll);
|
|
894
898
|
const xFrom = isRight ? Math.floor(fw * 0.35) : Math.floor(fw * 0.05);
|
|
895
899
|
const xTo = isRight ? Math.floor(fw * 0.95) : Math.floor(fw * 0.65);
|
|
896
|
-
const xEdge = findPeakDiff(avgCols, xFrom, xTo,
|
|
900
|
+
const xEdge = findPeakDiff(avgCols, xFrom, xTo, minEdgeDiff);
|
|
897
901
|
const yFrom = isBottom ? Math.floor(fh * 0.35) : Math.floor(fh * 0.05);
|
|
898
902
|
const yTo = isBottom ? Math.floor(fh * 0.95) : Math.floor(fh * 0.65);
|
|
899
|
-
const yEdge = findPeakDiff(avgRows, yFrom, yTo,
|
|
903
|
+
const yEdge = findPeakDiff(avgRows, yFrom, yTo, minEdgeDiff);
|
|
900
904
|
if (xEdge.index < 0 || yEdge.index < 0) {
|
|
901
905
|
logger_default.info(
|
|
902
906
|
`[FaceDetection] Edge refinement: no strong edges (xDiff=${xEdge.magnitude.toFixed(1)}, yDiff=${yEdge.magnitude.toFixed(1)})`
|
|
@@ -986,25 +990,43 @@ async function detectWebcamRegion(videoPath) {
|
|
|
986
990
|
y2: boxes.reduce((s, b) => s + b.y2, 0) / boxes.length,
|
|
987
991
|
confidence: bestConfidence
|
|
988
992
|
};
|
|
989
|
-
|
|
993
|
+
let refined = null;
|
|
994
|
+
refined = await refineBoundingBox(framePaths, bestPosition, REFINE_MIN_EDGE_DIFF);
|
|
995
|
+
if (!refined) {
|
|
996
|
+
for (const threshold of REFINE_RETRY_THRESHOLDS) {
|
|
997
|
+
logger_default.info(`[FaceDetection] Retrying edge refinement with threshold=${threshold}`);
|
|
998
|
+
refined = await refineBoundingBox(framePaths, bestPosition, threshold);
|
|
999
|
+
if (refined) break;
|
|
1000
|
+
}
|
|
1001
|
+
}
|
|
990
1002
|
const scaleX = resolution.width / MODEL_WIDTH;
|
|
991
1003
|
const scaleY = resolution.height / MODEL_HEIGHT;
|
|
992
|
-
let origX, origY, origW, origH;
|
|
1004
|
+
let origX = 0, origY = 0, origW = 0, origH = 0;
|
|
993
1005
|
if (refined) {
|
|
994
1006
|
origX = Math.round(refined.x * scaleX);
|
|
995
1007
|
origY = Math.round(refined.y * scaleY);
|
|
996
1008
|
origW = Math.round(refined.width * scaleX);
|
|
997
1009
|
origH = Math.round(refined.height * scaleY);
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1010
|
+
const refinedAR = origW / origH;
|
|
1011
|
+
if (origW < MIN_WEBCAM_WIDTH_PX || origH < MIN_WEBCAM_HEIGHT_PX || refinedAR > MAX_WEBCAM_ASPECT_RATIO) {
|
|
1012
|
+
logger_default.info(
|
|
1013
|
+
`[FaceDetection] Refined region implausible (${origW}x${origH}px, AR=${refinedAR.toFixed(1)}), using proportional fallback`
|
|
1014
|
+
);
|
|
1015
|
+
refined = null;
|
|
1016
|
+
}
|
|
1017
|
+
}
|
|
1018
|
+
if (!refined) {
|
|
1019
|
+
const webcamWidthFrac = 0.33;
|
|
1020
|
+
const webcamHeightFrac = 0.28;
|
|
1021
|
+
origW = Math.round(resolution.width * webcamWidthFrac);
|
|
1022
|
+
origH = Math.round(resolution.height * webcamHeightFrac);
|
|
1023
|
+
const isRight = bestPosition.includes("right");
|
|
1024
|
+
const isBottom = bestPosition.includes("bottom");
|
|
1025
|
+
origX = isRight ? resolution.width - origW : 0;
|
|
1026
|
+
origY = isBottom ? resolution.height - origH : 0;
|
|
1027
|
+
logger_default.info(
|
|
1028
|
+
`[FaceDetection] Using proportional fallback: (${origX},${origY}) ${origW}x${origH}`
|
|
1029
|
+
);
|
|
1008
1030
|
}
|
|
1009
1031
|
const region = {
|
|
1010
1032
|
x: origX,
|
|
@@ -1028,7 +1050,7 @@ async function detectWebcamRegion(videoPath) {
|
|
|
1028
1050
|
});
|
|
1029
1051
|
}
|
|
1030
1052
|
}
|
|
1031
|
-
var ffmpegPath, ffprobePath, MODEL_PATH, cachedSession, SAMPLE_FRAMES, MODEL_WIDTH, MODEL_HEIGHT, MIN_FACE_CONFIDENCE, MIN_DETECTION_CONFIDENCE, REFINE_MIN_EDGE_DIFF, REFINE_MIN_SIZE_FRAC, REFINE_MAX_SIZE_FRAC;
|
|
1053
|
+
var ffmpegPath, ffprobePath, MODEL_PATH, cachedSession, SAMPLE_FRAMES, MODEL_WIDTH, MODEL_HEIGHT, MIN_FACE_CONFIDENCE, MIN_DETECTION_CONFIDENCE, REFINE_MIN_EDGE_DIFF, REFINE_RETRY_THRESHOLDS, REFINE_MIN_SIZE_FRAC, REFINE_MAX_SIZE_FRAC, MIN_WEBCAM_WIDTH_PX, MIN_WEBCAM_HEIGHT_PX, MAX_WEBCAM_ASPECT_RATIO;
|
|
1032
1054
|
var init_faceDetection = __esm({
|
|
1033
1055
|
"src/tools/ffmpeg/faceDetection.ts"() {
|
|
1034
1056
|
"use strict";
|
|
@@ -1042,14 +1064,18 @@ var init_faceDetection = __esm({
|
|
|
1042
1064
|
ffprobePath = getFFprobePath();
|
|
1043
1065
|
MODEL_PATH = join(modelsDir(), "ultraface-320.onnx");
|
|
1044
1066
|
cachedSession = null;
|
|
1045
|
-
SAMPLE_FRAMES =
|
|
1067
|
+
SAMPLE_FRAMES = 15;
|
|
1046
1068
|
MODEL_WIDTH = 320;
|
|
1047
1069
|
MODEL_HEIGHT = 240;
|
|
1048
1070
|
MIN_FACE_CONFIDENCE = 0.5;
|
|
1049
1071
|
MIN_DETECTION_CONFIDENCE = 0.3;
|
|
1050
1072
|
REFINE_MIN_EDGE_DIFF = 3;
|
|
1073
|
+
REFINE_RETRY_THRESHOLDS = [2, 1];
|
|
1051
1074
|
REFINE_MIN_SIZE_FRAC = 0.05;
|
|
1052
1075
|
REFINE_MAX_SIZE_FRAC = 0.55;
|
|
1076
|
+
MIN_WEBCAM_WIDTH_PX = 300;
|
|
1077
|
+
MIN_WEBCAM_HEIGHT_PX = 200;
|
|
1078
|
+
MAX_WEBCAM_ASPECT_RATIO = 3;
|
|
1053
1079
|
}
|
|
1054
1080
|
});
|
|
1055
1081
|
|
|
@@ -1462,16 +1488,31 @@ async function transcribeAudio(audioPath) {
|
|
|
1462
1488
|
const openai = new default4({ apiKey: config2.OPENAI_API_KEY });
|
|
1463
1489
|
try {
|
|
1464
1490
|
const prompt = getWhisperPrompt();
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1491
|
+
let response;
|
|
1492
|
+
for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
|
|
1493
|
+
try {
|
|
1494
|
+
response = await openai.audio.transcriptions.create({
|
|
1495
|
+
model: "whisper-1",
|
|
1496
|
+
file: openReadStream(audioPath),
|
|
1497
|
+
response_format: "verbose_json",
|
|
1498
|
+
timestamp_granularities: ["word", "segment"],
|
|
1499
|
+
...prompt && { prompt }
|
|
1500
|
+
});
|
|
1501
|
+
break;
|
|
1502
|
+
} catch (retryError) {
|
|
1503
|
+
const status = typeof retryError === "object" && retryError !== null && "status" in retryError ? retryError.status : void 0;
|
|
1504
|
+
if (status === 401 || status === 400 || status === 429) throw retryError;
|
|
1505
|
+
if (attempt === MAX_RETRIES) throw retryError;
|
|
1506
|
+
const msg = retryError instanceof Error ? retryError.message : String(retryError);
|
|
1507
|
+
logger_default.warn(`Whisper attempt ${attempt}/${MAX_RETRIES} failed: ${msg} \u2014 retrying in ${RETRY_DELAY_MS / 1e3}s`);
|
|
1508
|
+
await new Promise((resolve3) => setTimeout(resolve3, RETRY_DELAY_MS));
|
|
1509
|
+
}
|
|
1510
|
+
}
|
|
1511
|
+
if (!response) throw new Error("Whisper transcription failed after all retries");
|
|
1472
1512
|
const verboseResponse = response;
|
|
1473
1513
|
const rawSegments = verboseResponse.segments ?? [];
|
|
1474
1514
|
const rawWords = verboseResponse.words ?? [];
|
|
1515
|
+
const typedResponse = response;
|
|
1475
1516
|
const words = rawWords.map((w) => ({
|
|
1476
1517
|
word: w.word,
|
|
1477
1518
|
start: w.start,
|
|
@@ -1485,20 +1526,20 @@ async function transcribeAudio(audioPath) {
|
|
|
1485
1526
|
words: rawWords.filter((w) => w.start >= s.start && w.end <= s.end).map((w) => ({ word: w.word, start: w.start, end: w.end }))
|
|
1486
1527
|
}));
|
|
1487
1528
|
logger_default.info(
|
|
1488
|
-
`Transcription complete \u2014 ${segments.length} segments, ${words.length} words, language=${
|
|
1529
|
+
`Transcription complete \u2014 ${segments.length} segments, ${words.length} words, language=${typedResponse.language}`
|
|
1489
1530
|
);
|
|
1490
|
-
const durationMinutes = (
|
|
1531
|
+
const durationMinutes = (typedResponse.duration ?? 0) / 60;
|
|
1491
1532
|
costTracker.recordServiceUsage("whisper", durationMinutes * WHISPER_COST_PER_MINUTE, {
|
|
1492
1533
|
model: "whisper-1",
|
|
1493
|
-
durationSeconds:
|
|
1534
|
+
durationSeconds: typedResponse.duration ?? 0,
|
|
1494
1535
|
audioFile: audioPath
|
|
1495
1536
|
});
|
|
1496
1537
|
return {
|
|
1497
|
-
text:
|
|
1538
|
+
text: typedResponse.text,
|
|
1498
1539
|
segments,
|
|
1499
1540
|
words,
|
|
1500
|
-
language:
|
|
1501
|
-
duration:
|
|
1541
|
+
language: typedResponse.language ?? "unknown",
|
|
1542
|
+
duration: typedResponse.duration ?? 0
|
|
1502
1543
|
};
|
|
1503
1544
|
} catch (error) {
|
|
1504
1545
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -1513,7 +1554,7 @@ async function transcribeAudio(audioPath) {
|
|
|
1513
1554
|
throw new Error(`Whisper transcription failed: ${message}`);
|
|
1514
1555
|
}
|
|
1515
1556
|
}
|
|
1516
|
-
var MAX_FILE_SIZE_MB, WHISPER_COST_PER_MINUTE, WARN_FILE_SIZE_MB;
|
|
1557
|
+
var MAX_FILE_SIZE_MB, WHISPER_COST_PER_MINUTE, WARN_FILE_SIZE_MB, MAX_RETRIES, RETRY_DELAY_MS;
|
|
1517
1558
|
var init_whisperClient = __esm({
|
|
1518
1559
|
"src/tools/whisper/whisperClient.ts"() {
|
|
1519
1560
|
"use strict";
|
|
@@ -1526,6 +1567,8 @@ var init_whisperClient = __esm({
|
|
|
1526
1567
|
MAX_FILE_SIZE_MB = 25;
|
|
1527
1568
|
WHISPER_COST_PER_MINUTE = 6e-3;
|
|
1528
1569
|
WARN_FILE_SIZE_MB = 20;
|
|
1570
|
+
MAX_RETRIES = 3;
|
|
1571
|
+
RETRY_DELAY_MS = 5e3;
|
|
1529
1572
|
}
|
|
1530
1573
|
});
|
|
1531
1574
|
|
|
@@ -2989,6 +3032,8 @@ async function extractCompositeClipWithTransitions(videoPath, segments, outputPa
|
|
|
2989
3032
|
"[aout]",
|
|
2990
3033
|
"-c:v",
|
|
2991
3034
|
"libx264",
|
|
3035
|
+
"-pix_fmt",
|
|
3036
|
+
"yuv420p",
|
|
2992
3037
|
"-preset",
|
|
2993
3038
|
"ultrafast",
|
|
2994
3039
|
"-crf",
|
|
@@ -3087,23 +3132,24 @@ async function convertAspectRatio(inputPath, outputPath, targetRatio, options =
|
|
|
3087
3132
|
});
|
|
3088
3133
|
});
|
|
3089
3134
|
}
|
|
3090
|
-
async function convertWithSmartLayout(inputPath, outputPath, config2) {
|
|
3135
|
+
async function convertWithSmartLayout(inputPath, outputPath, config2, webcamOverride) {
|
|
3091
3136
|
const { label, targetW, screenH, camH, fallbackRatio } = config2;
|
|
3092
3137
|
const outputDir = dirname(outputPath);
|
|
3093
3138
|
await ensureDirectory(outputDir);
|
|
3094
|
-
const webcam = await detectWebcamRegion(inputPath);
|
|
3139
|
+
const webcam = webcamOverride !== void 0 ? webcamOverride : await detectWebcamRegion(inputPath);
|
|
3095
3140
|
if (!webcam) {
|
|
3096
3141
|
logger_default.info(`[${label}] No webcam found, falling back to center-crop`);
|
|
3097
3142
|
return convertAspectRatio(inputPath, outputPath, fallbackRatio);
|
|
3098
3143
|
}
|
|
3099
3144
|
const resolution = await getVideoResolution(inputPath);
|
|
3145
|
+
const margin = Math.round(resolution.width * 0.02);
|
|
3100
3146
|
let screenCropX;
|
|
3101
3147
|
let screenCropW;
|
|
3102
3148
|
if (webcam.position === "top-right" || webcam.position === "bottom-right") {
|
|
3103
3149
|
screenCropX = 0;
|
|
3104
|
-
screenCropW = webcam.x;
|
|
3150
|
+
screenCropW = Math.max(0, webcam.x - margin);
|
|
3105
3151
|
} else {
|
|
3106
|
-
screenCropX = webcam.x + webcam.width;
|
|
3152
|
+
screenCropX = webcam.x + webcam.width + margin;
|
|
3107
3153
|
screenCropW = Math.max(0, resolution.width - screenCropX);
|
|
3108
3154
|
}
|
|
3109
3155
|
const targetAR = targetW / camH;
|
|
@@ -3162,32 +3208,32 @@ async function convertWithSmartLayout(inputPath, outputPath, config2) {
|
|
|
3162
3208
|
});
|
|
3163
3209
|
});
|
|
3164
3210
|
}
|
|
3165
|
-
async function convertToPortraitSmart(inputPath, outputPath) {
|
|
3211
|
+
async function convertToPortraitSmart(inputPath, outputPath, webcamOverride) {
|
|
3166
3212
|
return convertWithSmartLayout(inputPath, outputPath, {
|
|
3167
3213
|
label: "SmartPortrait",
|
|
3168
3214
|
targetW: 1080,
|
|
3169
3215
|
screenH: 1248,
|
|
3170
3216
|
camH: 672,
|
|
3171
3217
|
fallbackRatio: "9:16"
|
|
3172
|
-
});
|
|
3218
|
+
}, webcamOverride);
|
|
3173
3219
|
}
|
|
3174
|
-
async function convertToSquareSmart(inputPath, outputPath) {
|
|
3220
|
+
async function convertToSquareSmart(inputPath, outputPath, webcamOverride) {
|
|
3175
3221
|
return convertWithSmartLayout(inputPath, outputPath, {
|
|
3176
3222
|
label: "SmartSquare",
|
|
3177
3223
|
targetW: 1080,
|
|
3178
3224
|
screenH: 700,
|
|
3179
3225
|
camH: 380,
|
|
3180
3226
|
fallbackRatio: "1:1"
|
|
3181
|
-
});
|
|
3227
|
+
}, webcamOverride);
|
|
3182
3228
|
}
|
|
3183
|
-
async function convertToFeedSmart(inputPath, outputPath) {
|
|
3229
|
+
async function convertToFeedSmart(inputPath, outputPath, webcamOverride) {
|
|
3184
3230
|
return convertWithSmartLayout(inputPath, outputPath, {
|
|
3185
3231
|
label: "SmartFeed",
|
|
3186
3232
|
targetW: 1080,
|
|
3187
3233
|
screenH: 878,
|
|
3188
3234
|
camH: 472,
|
|
3189
3235
|
fallbackRatio: "4:5"
|
|
3190
|
-
});
|
|
3236
|
+
}, webcamOverride);
|
|
3191
3237
|
}
|
|
3192
3238
|
async function generatePlatformVariants(inputPath, outputDir, slug, platforms = ["tiktok", "linkedin"], options = {}) {
|
|
3193
3239
|
await ensureDirectory(outputDir);
|
|
@@ -3208,11 +3254,11 @@ async function generatePlatformVariants(inputPath, outputDir, slug, platforms =
|
|
|
3208
3254
|
if (options.useAgent) {
|
|
3209
3255
|
logger_default.warn(`[generatePlatformVariants] LayoutAgent is disabled, falling back to ONNX pipeline`);
|
|
3210
3256
|
}
|
|
3211
|
-
await convertToPortraitSmart(inputPath, outPath);
|
|
3257
|
+
await convertToPortraitSmart(inputPath, outPath, options.webcamOverride);
|
|
3212
3258
|
} else if (ratio === "1:1") {
|
|
3213
|
-
await convertToSquareSmart(inputPath, outPath);
|
|
3259
|
+
await convertToSquareSmart(inputPath, outPath, options.webcamOverride);
|
|
3214
3260
|
} else if (ratio === "4:5") {
|
|
3215
|
-
await convertToFeedSmart(inputPath, outPath);
|
|
3261
|
+
await convertToFeedSmart(inputPath, outPath, options.webcamOverride);
|
|
3216
3262
|
} else {
|
|
3217
3263
|
await convertAspectRatio(inputPath, outPath, ratio);
|
|
3218
3264
|
}
|
|
@@ -3276,7 +3322,7 @@ var ShortsAgent_exports = {};
|
|
|
3276
3322
|
__export(ShortsAgent_exports, {
|
|
3277
3323
|
generateShorts: () => generateShorts
|
|
3278
3324
|
});
|
|
3279
|
-
async function generateShorts(video, transcript, model, clipDirection) {
|
|
3325
|
+
async function generateShorts(video, transcript, model, clipDirection, webcamOverride) {
|
|
3280
3326
|
const agent = new ShortsAgent(model);
|
|
3281
3327
|
const transcriptLines = transcript.segments.map((seg) => {
|
|
3282
3328
|
const words = seg.words.map((w) => `[${w.start.toFixed(2)}-${w.end.toFixed(2)}] ${w.word}`).join(" ");
|
|
@@ -3287,7 +3333,8 @@ Words: ${words}`;
|
|
|
3287
3333
|
`Analyze the following transcript (${transcript.duration.toFixed(0)}s total) and plan shorts.
|
|
3288
3334
|
`,
|
|
3289
3335
|
`Video: ${video.filename}`,
|
|
3290
|
-
`Duration: ${transcript.duration.toFixed(1)}s
|
|
3336
|
+
`Duration: ${transcript.duration.toFixed(1)}s`,
|
|
3337
|
+
`Target: ~${Math.max(3, Math.round(transcript.duration / 150))}\u2013${Math.max(5, Math.round(transcript.duration / 120))} shorts (scale by content richness)
|
|
3291
3338
|
`,
|
|
3292
3339
|
"--- TRANSCRIPT ---\n",
|
|
3293
3340
|
transcriptLines.join("\n\n"),
|
|
@@ -3329,7 +3376,7 @@ Words: ${words}`;
|
|
|
3329
3376
|
let variants;
|
|
3330
3377
|
try {
|
|
3331
3378
|
const defaultPlatforms = ["tiktok", "youtube-shorts", "instagram-reels", "instagram-feed", "linkedin"];
|
|
3332
|
-
const results = await generatePlatformVariants(outputPath, shortsDir, shortSlug, defaultPlatforms);
|
|
3379
|
+
const results = await generatePlatformVariants(outputPath, shortsDir, shortSlug, defaultPlatforms, { webcamOverride });
|
|
3333
3380
|
if (results.length > 0) {
|
|
3334
3381
|
variants = results.map((v) => ({
|
|
3335
3382
|
path: v.path,
|
|
@@ -3428,7 +3475,7 @@ Words: ${words}`;
|
|
|
3428
3475
|
await agent.destroy();
|
|
3429
3476
|
}
|
|
3430
3477
|
}
|
|
3431
|
-
var SYSTEM_PROMPT2,
|
|
3478
|
+
var SYSTEM_PROMPT2, ADD_SHORTS_SCHEMA, ShortsAgent;
|
|
3432
3479
|
var init_ShortsAgent = __esm({
|
|
3433
3480
|
"src/agents/ShortsAgent.ts"() {
|
|
3434
3481
|
"use strict";
|
|
@@ -3442,7 +3489,23 @@ var init_ShortsAgent = __esm({
|
|
|
3442
3489
|
init_fileSystem();
|
|
3443
3490
|
init_paths();
|
|
3444
3491
|
init_logger2();
|
|
3445
|
-
SYSTEM_PROMPT2 = `You are a short-form video content strategist. Your job is to analyze a video transcript with word-level timestamps and
|
|
3492
|
+
SYSTEM_PROMPT2 = `You are a short-form video content strategist. Your job is to **exhaustively** analyze a video transcript with word-level timestamps and extract every compelling moment as a short (15\u201360 seconds each).
|
|
3493
|
+
|
|
3494
|
+
## Your workflow
|
|
3495
|
+
1. Read the transcript and note the total duration.
|
|
3496
|
+
2. Work through the transcript **section by section** (roughly 3\u20135 minute chunks). For each chunk, identify every possible short.
|
|
3497
|
+
3. Call **add_shorts** for each batch of shorts you find. You can call it as many times as needed.
|
|
3498
|
+
4. After your first pass, call **review_shorts** to see everything you've planned so far.
|
|
3499
|
+
5. Review for gaps: are there sections of the transcript with no shorts? Could any moments be combined into composites? Did you miss any humor, insights, or quotable moments?
|
|
3500
|
+
6. Add any additional shorts you find.
|
|
3501
|
+
7. When you are confident you've exhausted all opportunities, call **finalize_shorts**.
|
|
3502
|
+
|
|
3503
|
+
## Target quantity
|
|
3504
|
+
Scale your output by video duration:
|
|
3505
|
+
- **~1 short per 2\u20133 minutes** of video content.
|
|
3506
|
+
- A 10-minute video \u2192 4\u20136 shorts. A 30-minute video \u2192 12\u201318 shorts. A 60-minute video \u2192 20\u201330 shorts.
|
|
3507
|
+
- These are guidelines, not hard caps \u2014 if the content is rich, find more. If it's sparse, find fewer.
|
|
3508
|
+
- **Never stop at 3\u20138 shorts for a long video.** Your job is to be thorough.
|
|
3446
3509
|
|
|
3447
3510
|
## What to look for
|
|
3448
3511
|
- **Key insights** \u2014 concise, quotable takeaways
|
|
@@ -3450,34 +3513,34 @@ var init_ShortsAgent = __esm({
|
|
|
3450
3513
|
- **Controversial takes** \u2014 bold opinions that spark discussion
|
|
3451
3514
|
- **Educational nuggets** \u2014 clear explanations of complex topics
|
|
3452
3515
|
- **Emotional peaks** \u2014 passion, vulnerability, excitement
|
|
3453
|
-
- **
|
|
3516
|
+
- **Audience hooks** \u2014 moments that would make someone stop scrolling
|
|
3517
|
+
- **Before/after reveals** \u2014 showing a transformation or result
|
|
3518
|
+
- **Mistakes & corrections** \u2014 relatable "oops" moments that humanize the speaker
|
|
3454
3519
|
|
|
3455
3520
|
## Short types
|
|
3456
3521
|
- **Single segment** \u2014 one contiguous section of the video
|
|
3457
|
-
- **Composite** \u2014 multiple non-contiguous segments combined into one short (great for topic compilations or
|
|
3522
|
+
- **Composite** \u2014 multiple non-contiguous segments combined into one short (great for topic compilations, building narrative arcs, or "every time X happens" montages). **Actively look for composite opportunities** \u2014 they often make the best shorts.
|
|
3458
3523
|
|
|
3459
3524
|
## Rules
|
|
3460
3525
|
1. Each short must be 15\u201360 seconds total duration.
|
|
3461
3526
|
2. Timestamps must align to word boundaries from the transcript.
|
|
3462
3527
|
3. Prefer natural sentence boundaries for clean cuts.
|
|
3463
|
-
4.
|
|
3464
|
-
5.
|
|
3465
|
-
6.
|
|
3466
|
-
7.
|
|
3467
|
-
|
|
3468
|
-
When you have identified the shorts, call the **plan_shorts** tool with your complete plan.
|
|
3528
|
+
4. Every short needs a catchy, descriptive title (5\u201310 words).
|
|
3529
|
+
5. Tags should be lowercase, no hashes, 3\u20136 per short.
|
|
3530
|
+
6. A 1-second buffer is automatically added before and after each segment boundary during extraction, so plan segments based on content timestamps without worrying about clipping words at the edges.
|
|
3531
|
+
7. Avoid significant timestamp overlap between shorts \u2014 each short should bring unique content. Small overlaps (a few seconds of shared context) are OK.
|
|
3469
3532
|
|
|
3470
3533
|
## Using Clip Direction
|
|
3471
3534
|
You may receive AI-generated clip direction with suggested shorts. Use these as a starting point but make your own decisions:
|
|
3472
3535
|
- The suggestions are based on visual + audio analysis and may identify moments you'd miss from transcript alone
|
|
3473
3536
|
- Feel free to adjust timestamps, combine suggestions, or ignore ones that don't work
|
|
3474
3537
|
- You may also find good shorts NOT in the suggestions \u2014 always analyze the full transcript`;
|
|
3475
|
-
|
|
3538
|
+
ADD_SHORTS_SCHEMA = {
|
|
3476
3539
|
type: "object",
|
|
3477
3540
|
properties: {
|
|
3478
3541
|
shorts: {
|
|
3479
3542
|
type: "array",
|
|
3480
|
-
description: "Array of
|
|
3543
|
+
description: "Array of short clips to add to the plan",
|
|
3481
3544
|
items: {
|
|
3482
3545
|
type: "object",
|
|
3483
3546
|
properties: {
|
|
@@ -3510,32 +3573,77 @@ You may receive AI-generated clip direction with suggested shorts. Use these as
|
|
|
3510
3573
|
};
|
|
3511
3574
|
ShortsAgent = class extends BaseAgent {
|
|
3512
3575
|
plannedShorts = [];
|
|
3576
|
+
isFinalized = false;
|
|
3513
3577
|
constructor(model) {
|
|
3514
3578
|
super("ShortsAgent", SYSTEM_PROMPT2, void 0, model);
|
|
3515
3579
|
}
|
|
3516
3580
|
getTools() {
|
|
3517
3581
|
return [
|
|
3518
3582
|
{
|
|
3519
|
-
name: "
|
|
3520
|
-
description: "
|
|
3521
|
-
parameters:
|
|
3583
|
+
name: "add_shorts",
|
|
3584
|
+
description: "Add one or more shorts to your plan. You can call this multiple times to build your list incrementally as you analyze each section of the transcript.",
|
|
3585
|
+
parameters: ADD_SHORTS_SCHEMA,
|
|
3522
3586
|
handler: async (args) => {
|
|
3523
|
-
return this.handleToolCall("
|
|
3587
|
+
return this.handleToolCall("add_shorts", args);
|
|
3588
|
+
}
|
|
3589
|
+
},
|
|
3590
|
+
{
|
|
3591
|
+
name: "review_shorts",
|
|
3592
|
+
description: "Review all shorts planned so far. Returns a summary of every short in your current plan. Use this to check for gaps, overlaps, or missed opportunities before finalizing.",
|
|
3593
|
+
parameters: { type: "object", properties: {} },
|
|
3594
|
+
handler: async () => {
|
|
3595
|
+
return this.handleToolCall("review_shorts", {});
|
|
3596
|
+
}
|
|
3597
|
+
},
|
|
3598
|
+
{
|
|
3599
|
+
name: "finalize_shorts",
|
|
3600
|
+
description: "Finalize your short clip plan and trigger extraction. Call this ONCE after you have added all shorts and reviewed them for completeness.",
|
|
3601
|
+
parameters: { type: "object", properties: {} },
|
|
3602
|
+
handler: async () => {
|
|
3603
|
+
return this.handleToolCall("finalize_shorts", {});
|
|
3524
3604
|
}
|
|
3525
3605
|
}
|
|
3526
3606
|
];
|
|
3527
3607
|
}
|
|
3528
3608
|
async handleToolCall(toolName, args) {
|
|
3529
|
-
|
|
3530
|
-
|
|
3531
|
-
|
|
3532
|
-
|
|
3609
|
+
switch (toolName) {
|
|
3610
|
+
case "add_shorts": {
|
|
3611
|
+
const newShorts = args.shorts;
|
|
3612
|
+
this.plannedShorts.push(...newShorts);
|
|
3613
|
+
logger_default.info(`[ShortsAgent] Added ${newShorts.length} shorts (total: ${this.plannedShorts.length})`);
|
|
3614
|
+
return `Added ${newShorts.length} shorts. Total planned: ${this.plannedShorts.length}. Call add_shorts for more, review_shorts to check your plan, or finalize_shorts when done.`;
|
|
3615
|
+
}
|
|
3616
|
+
case "review_shorts": {
|
|
3617
|
+
if (this.plannedShorts.length === 0) {
|
|
3618
|
+
return "No shorts planned yet. Analyze the transcript and call add_shorts to start planning.";
|
|
3619
|
+
}
|
|
3620
|
+
const summary = this.plannedShorts.map((s, i) => {
|
|
3621
|
+
const totalDur = s.segments.reduce((sum, seg) => sum + (seg.end - seg.start), 0);
|
|
3622
|
+
const timeRanges = s.segments.map((seg) => `${seg.start.toFixed(1)}s\u2013${seg.end.toFixed(1)}s`).join(", ");
|
|
3623
|
+
const type = s.segments.length > 1 ? "composite" : "single";
|
|
3624
|
+
return `${i + 1}. "${s.title}" (${totalDur.toFixed(1)}s, ${type}) [${timeRanges}] \u2014 ${s.description}`;
|
|
3625
|
+
}).join("\n");
|
|
3626
|
+
return `## Planned shorts (${this.plannedShorts.length} total)
|
|
3627
|
+
|
|
3628
|
+
${summary}
|
|
3629
|
+
|
|
3630
|
+
Look for gaps in transcript coverage, missed composite opportunities, and any additional compelling moments.`;
|
|
3631
|
+
}
|
|
3632
|
+
case "finalize_shorts": {
|
|
3633
|
+
this.isFinalized = true;
|
|
3634
|
+
logger_default.info(`[ShortsAgent] Finalized ${this.plannedShorts.length} shorts`);
|
|
3635
|
+
return `Finalized ${this.plannedShorts.length} shorts. Extraction will begin.`;
|
|
3636
|
+
}
|
|
3637
|
+
default:
|
|
3638
|
+
throw new Error(`Unknown tool: ${toolName}`);
|
|
3533
3639
|
}
|
|
3534
|
-
throw new Error(`Unknown tool: ${toolName}`);
|
|
3535
3640
|
}
|
|
3536
3641
|
getPlannedShorts() {
|
|
3537
3642
|
return this.plannedShorts;
|
|
3538
3643
|
}
|
|
3644
|
+
getIsFinalized() {
|
|
3645
|
+
return this.isFinalized;
|
|
3646
|
+
}
|
|
3539
3647
|
};
|
|
3540
3648
|
}
|
|
3541
3649
|
});
|
|
@@ -3556,7 +3664,8 @@ Words: ${words}`;
|
|
|
3556
3664
|
`Analyze the following transcript (${transcript.duration.toFixed(0)}s total) and plan medium-length clips (1\u20133 minutes each).
|
|
3557
3665
|
`,
|
|
3558
3666
|
`Video: ${video.filename}`,
|
|
3559
|
-
`Duration: ${transcript.duration.toFixed(1)}s
|
|
3667
|
+
`Duration: ${transcript.duration.toFixed(1)}s`,
|
|
3668
|
+
`Target: ~${Math.max(1, Math.round(transcript.duration / 480))}\u2013${Math.max(2, Math.round(transcript.duration / 300))} medium clips (scale by content richness)
|
|
3560
3669
|
`,
|
|
3561
3670
|
"--- TRANSCRIPT ---\n",
|
|
3562
3671
|
transcriptLines.join("\n\n"),
|
|
@@ -3649,7 +3758,7 @@ Words: ${words}`;
|
|
|
3649
3758
|
await agent.destroy();
|
|
3650
3759
|
}
|
|
3651
3760
|
}
|
|
3652
|
-
var SYSTEM_PROMPT3,
|
|
3761
|
+
var SYSTEM_PROMPT3, ADD_MEDIUM_CLIPS_SCHEMA, MediumVideoAgent;
|
|
3653
3762
|
var init_MediumVideoAgent = __esm({
|
|
3654
3763
|
"src/agents/MediumVideoAgent.ts"() {
|
|
3655
3764
|
"use strict";
|
|
@@ -3662,7 +3771,23 @@ var init_MediumVideoAgent = __esm({
|
|
|
3662
3771
|
init_fileSystem();
|
|
3663
3772
|
init_paths();
|
|
3664
3773
|
init_logger2();
|
|
3665
|
-
SYSTEM_PROMPT3 = `You are a medium-form video content strategist. Your job is to analyze a video transcript with word-level timestamps and
|
|
3774
|
+
SYSTEM_PROMPT3 = `You are a medium-form video content strategist. Your job is to **exhaustively** analyze a video transcript with word-level timestamps and extract every viable 1\u20133 minute segment as a standalone medium-form clip.
|
|
3775
|
+
|
|
3776
|
+
## Your workflow
|
|
3777
|
+
1. Read the transcript and note the total duration.
|
|
3778
|
+
2. Work through the transcript **section by section** (roughly 5\u20138 minute chunks). For each chunk, identify every complete topic or narrative arc.
|
|
3779
|
+
3. Call **add_medium_clips** for each batch of clips you find. You can call it as many times as needed.
|
|
3780
|
+
4. After your first pass, call **review_medium_clips** to see everything you've planned so far.
|
|
3781
|
+
5. Review for gaps: are there complete topics you missed? Could non-contiguous mentions of the same theme be compiled? Is there a tutorial segment that stands alone?
|
|
3782
|
+
6. Add any additional clips you find.
|
|
3783
|
+
7. When you are confident you've exhausted all opportunities, call **finalize_medium_clips**.
|
|
3784
|
+
|
|
3785
|
+
## Target quantity
|
|
3786
|
+
Scale your output by video duration:
|
|
3787
|
+
- **~1 medium clip per 5\u20138 minutes** of video content.
|
|
3788
|
+
- A 10-minute video \u2192 1\u20132 clips. A 30-minute video \u2192 4\u20136 clips. A 60-minute video \u2192 8\u201312 clips.
|
|
3789
|
+
- These are guidelines, not hard caps \u2014 if the content is rich, find more.
|
|
3790
|
+
- **Never stop at 2\u20134 clips for a long video.** Your job is to be thorough.
|
|
3666
3791
|
|
|
3667
3792
|
## What to look for
|
|
3668
3793
|
|
|
@@ -3671,7 +3796,7 @@ var init_MediumVideoAgent = __esm({
|
|
|
3671
3796
|
- **Educational deep dives** \u2014 clear, thorough explanations of complex topics
|
|
3672
3797
|
- **Compelling stories** \u2014 anecdotes with setup, tension, and resolution
|
|
3673
3798
|
- **Strong arguments** \u2014 claim \u2192 evidence \u2192 implication sequences
|
|
3674
|
-
- **Topic compilations** \u2014 multiple brief mentions of one theme across the video that can be compiled into a cohesive 1\u20133 minute segment
|
|
3799
|
+
- **Topic compilations** \u2014 multiple brief mentions of one theme across the video that can be compiled into a cohesive 1\u20133 minute segment. **Actively look for these** \u2014 they often make excellent content.
|
|
3675
3800
|
|
|
3676
3801
|
## Clip types
|
|
3677
3802
|
|
|
@@ -3684,12 +3809,12 @@ var init_MediumVideoAgent = __esm({
|
|
|
3684
3809
|
2. Timestamps must align to word boundaries from the transcript.
|
|
3685
3810
|
3. Prefer natural sentence and paragraph boundaries for clean entry/exit points.
|
|
3686
3811
|
4. Each clip must be self-contained \u2014 a viewer with no other context should understand and get value from the clip.
|
|
3687
|
-
5.
|
|
3688
|
-
6.
|
|
3689
|
-
7.
|
|
3690
|
-
8.
|
|
3691
|
-
9.
|
|
3692
|
-
10.
|
|
3812
|
+
5. Every clip needs a descriptive title (5\u201312 words) and a topic label.
|
|
3813
|
+
6. For compilations, specify segments in the order they should appear in the final clip (which may differ from chronological order).
|
|
3814
|
+
7. Tags should be lowercase, no hashes, 3\u20136 per clip.
|
|
3815
|
+
8. A 1-second buffer is automatically added around each segment boundary.
|
|
3816
|
+
9. Each clip needs a hook \u2014 the opening line or concept that draws viewers in.
|
|
3817
|
+
10. Avoid significant overlap with content that would work better as a short (punchy, viral, single-moment).
|
|
3693
3818
|
|
|
3694
3819
|
## Differences from shorts
|
|
3695
3820
|
|
|
@@ -3697,9 +3822,6 @@ var init_MediumVideoAgent = __esm({
|
|
|
3697
3822
|
- Don't just find the most exciting 60 seconds \u2014 find where a topic starts and where it naturally concludes.
|
|
3698
3823
|
- It's OK if a medium clip has slower pacing \u2014 depth and coherence matter more than constant high energy.
|
|
3699
3824
|
- Look for segments that work as standalone mini-tutorials or explanations.
|
|
3700
|
-
- Avoid overlap with content that would work better as a short (punchy, viral, single-moment).
|
|
3701
|
-
|
|
3702
|
-
When you have identified the clips, call the **plan_medium_clips** tool with your complete plan.
|
|
3703
3825
|
|
|
3704
3826
|
## Using Clip Direction
|
|
3705
3827
|
You may receive AI-generated clip direction with suggested medium clips. Use these as a starting point but make your own decisions:
|
|
@@ -3707,12 +3829,12 @@ You may receive AI-generated clip direction with suggested medium clips. Use the
|
|
|
3707
3829
|
- Feel free to adjust timestamps, combine suggestions, or ignore ones that don't work
|
|
3708
3830
|
- You may also find good clips NOT in the suggestions \u2014 always analyze the full transcript
|
|
3709
3831
|
- Pay special attention to suggested hooks and topic arcs \u2014 they come from multimodal analysis`;
|
|
3710
|
-
|
|
3832
|
+
ADD_MEDIUM_CLIPS_SCHEMA = {
|
|
3711
3833
|
type: "object",
|
|
3712
3834
|
properties: {
|
|
3713
3835
|
clips: {
|
|
3714
3836
|
type: "array",
|
|
3715
|
-
description: "Array of
|
|
3837
|
+
description: "Array of medium-length clips to add to the plan",
|
|
3716
3838
|
items: {
|
|
3717
3839
|
type: "object",
|
|
3718
3840
|
properties: {
|
|
@@ -3748,32 +3870,79 @@ You may receive AI-generated clip direction with suggested medium clips. Use the
|
|
|
3748
3870
|
};
|
|
3749
3871
|
MediumVideoAgent = class extends BaseAgent {
|
|
3750
3872
|
plannedClips = [];
|
|
3873
|
+
isFinalized = false;
|
|
3751
3874
|
constructor(model) {
|
|
3752
3875
|
super("MediumVideoAgent", SYSTEM_PROMPT3, void 0, model);
|
|
3753
3876
|
}
|
|
3754
3877
|
getTools() {
|
|
3755
3878
|
return [
|
|
3756
3879
|
{
|
|
3757
|
-
name: "
|
|
3758
|
-
description: "
|
|
3759
|
-
parameters:
|
|
3880
|
+
name: "add_medium_clips",
|
|
3881
|
+
description: "Add one or more medium clips to your plan. You can call this multiple times to build your list incrementally as you analyze each section of the transcript.",
|
|
3882
|
+
parameters: ADD_MEDIUM_CLIPS_SCHEMA,
|
|
3760
3883
|
handler: async (args) => {
|
|
3761
|
-
return this.handleToolCall("
|
|
3884
|
+
return this.handleToolCall("add_medium_clips", args);
|
|
3885
|
+
}
|
|
3886
|
+
},
|
|
3887
|
+
{
|
|
3888
|
+
name: "review_medium_clips",
|
|
3889
|
+
description: "Review all medium clips planned so far. Returns a summary of every clip in your current plan. Use this to check for gaps, overlaps, or missed opportunities before finalizing.",
|
|
3890
|
+
parameters: { type: "object", properties: {} },
|
|
3891
|
+
handler: async () => {
|
|
3892
|
+
return this.handleToolCall("review_medium_clips", {});
|
|
3893
|
+
}
|
|
3894
|
+
},
|
|
3895
|
+
{
|
|
3896
|
+
name: "finalize_medium_clips",
|
|
3897
|
+
description: "Finalize your medium clip plan and trigger extraction. Call this ONCE after you have added all clips and reviewed them for completeness.",
|
|
3898
|
+
parameters: { type: "object", properties: {} },
|
|
3899
|
+
handler: async () => {
|
|
3900
|
+
return this.handleToolCall("finalize_medium_clips", {});
|
|
3762
3901
|
}
|
|
3763
3902
|
}
|
|
3764
3903
|
];
|
|
3765
3904
|
}
|
|
3766
3905
|
async handleToolCall(toolName, args) {
|
|
3767
|
-
|
|
3768
|
-
|
|
3769
|
-
|
|
3770
|
-
|
|
3906
|
+
switch (toolName) {
|
|
3907
|
+
case "add_medium_clips": {
|
|
3908
|
+
const newClips = args.clips;
|
|
3909
|
+
this.plannedClips.push(...newClips);
|
|
3910
|
+
logger_default.info(`[MediumVideoAgent] Added ${newClips.length} clips (total: ${this.plannedClips.length})`);
|
|
3911
|
+
return `Added ${newClips.length} clips. Total planned: ${this.plannedClips.length}. Call add_medium_clips for more, review_medium_clips to check your plan, or finalize_medium_clips when done.`;
|
|
3912
|
+
}
|
|
3913
|
+
case "review_medium_clips": {
|
|
3914
|
+
if (this.plannedClips.length === 0) {
|
|
3915
|
+
return "No medium clips planned yet. Analyze the transcript and call add_medium_clips to start planning.";
|
|
3916
|
+
}
|
|
3917
|
+
const summary = this.plannedClips.map((c, i) => {
|
|
3918
|
+
const totalDur = c.segments.reduce((sum, seg) => sum + (seg.end - seg.start), 0);
|
|
3919
|
+
const timeRanges = c.segments.map((seg) => `${seg.start.toFixed(1)}s\u2013${seg.end.toFixed(1)}s`).join(", ");
|
|
3920
|
+
const type = c.segments.length > 1 ? "compilation" : "deep dive";
|
|
3921
|
+
return `${i + 1}. "${c.title}" (${totalDur.toFixed(1)}s, ${type}) [${timeRanges}]
|
|
3922
|
+
Topic: ${c.topic} | Hook: ${c.hook}
|
|
3923
|
+
${c.description}`;
|
|
3924
|
+
}).join("\n");
|
|
3925
|
+
return `## Planned medium clips (${this.plannedClips.length} total)
|
|
3926
|
+
|
|
3927
|
+
${summary}
|
|
3928
|
+
|
|
3929
|
+
Look for gaps in transcript coverage, missed compilation opportunities, and complete topic arcs you may have overlooked.`;
|
|
3930
|
+
}
|
|
3931
|
+
case "finalize_medium_clips": {
|
|
3932
|
+
this.isFinalized = true;
|
|
3933
|
+
logger_default.info(`[MediumVideoAgent] Finalized ${this.plannedClips.length} medium clips`);
|
|
3934
|
+
return `Finalized ${this.plannedClips.length} medium clips. Extraction will begin.`;
|
|
3935
|
+
}
|
|
3936
|
+
default:
|
|
3937
|
+
throw new Error(`Unknown tool: ${toolName}`);
|
|
3771
3938
|
}
|
|
3772
|
-
throw new Error(`Unknown tool: ${toolName}`);
|
|
3773
3939
|
}
|
|
3774
3940
|
getPlannedClips() {
|
|
3775
3941
|
return this.plannedClips;
|
|
3776
3942
|
}
|
|
3943
|
+
getIsFinalized() {
|
|
3944
|
+
return this.isFinalized;
|
|
3945
|
+
}
|
|
3777
3946
|
};
|
|
3778
3947
|
}
|
|
3779
3948
|
});
|
|
@@ -4307,57 +4476,70 @@ var ProducerAgent_exports = {};
|
|
|
4307
4476
|
__export(ProducerAgent_exports, {
|
|
4308
4477
|
ProducerAgent: () => ProducerAgent
|
|
4309
4478
|
});
|
|
4310
|
-
|
|
4479
|
+
function mergeRemovals(removals) {
|
|
4480
|
+
if (removals.length <= 1) return removals;
|
|
4481
|
+
const sorted = [...removals].sort((a, b) => a.start - b.start);
|
|
4482
|
+
const merged = [{ ...sorted[0] }];
|
|
4483
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
4484
|
+
const prev = merged[merged.length - 1];
|
|
4485
|
+
const curr = sorted[i];
|
|
4486
|
+
if (curr.start <= prev.end + 2) {
|
|
4487
|
+
prev.end = Math.max(prev.end, curr.end);
|
|
4488
|
+
prev.reason = `${prev.reason}; ${curr.reason}`;
|
|
4489
|
+
} else {
|
|
4490
|
+
merged.push({ ...curr });
|
|
4491
|
+
}
|
|
4492
|
+
}
|
|
4493
|
+
return merged;
|
|
4494
|
+
}
|
|
4495
|
+
var SYSTEM_PROMPT4, ADD_CUTS_SCHEMA, ProducerAgent;
|
|
4311
4496
|
var init_ProducerAgent = __esm({
|
|
4312
4497
|
"src/agents/ProducerAgent.ts"() {
|
|
4313
4498
|
"use strict";
|
|
4314
4499
|
init_BaseAgent();
|
|
4315
4500
|
init_singlePassEdit();
|
|
4316
4501
|
init_logger2();
|
|
4317
|
-
SYSTEM_PROMPT4 = `You are a professional video
|
|
4318
|
-
|
|
4319
|
-
##
|
|
4320
|
-
|
|
4321
|
-
|
|
4322
|
-
|
|
4323
|
-
|
|
4324
|
-
|
|
4325
|
-
|
|
4326
|
-
|
|
4327
|
-
|
|
4328
|
-
- **
|
|
4329
|
-
- **
|
|
4330
|
-
|
|
4331
|
-
|
|
4332
|
-
|
|
4333
|
-
|
|
4334
|
-
|
|
4335
|
-
|
|
4336
|
-
|
|
4337
|
-
|
|
4338
|
-
|
|
4339
|
-
|
|
4340
|
-
|
|
4341
|
-
|
|
4342
|
-
|
|
4343
|
-
|
|
4344
|
-
|
|
4345
|
-
-
|
|
4346
|
-
-
|
|
4347
|
-
|
|
4348
|
-
- Focus on making the video tighter, not shorter for its own sake
|
|
4349
|
-
- Use editorial direction from Gemini to identify problematic regions`;
|
|
4350
|
-
PLAN_CUTS_SCHEMA = {
|
|
4502
|
+
SYSTEM_PROMPT4 = `You are a professional video editor preparing raw footage for visual enhancement. Your goal is to produce a clean, tight edit that's ready for graphics overlays, captions, and social media distribution.
|
|
4503
|
+
|
|
4504
|
+
## INFORMATION HIERARCHY
|
|
4505
|
+
|
|
4506
|
+
You have three sources of information:
|
|
4507
|
+
1. **Editorial direction** (from Gemini video AI) \u2014 provides editorial judgment: what to cut, pacing issues, hook advice. It watched the actual video and can see visual cues the transcript cannot.
|
|
4508
|
+
2. **Transcript** \u2014 the ground truth for **what was said and when**. Timestamps in the transcript are accurate. Use it to verify that editorial direction timestamps actually match the spoken content.
|
|
4509
|
+
3. **Your own judgment** \u2014 use this to resolve conflicts and make final decisions.
|
|
4510
|
+
|
|
4511
|
+
## CONFLICT RESOLUTION
|
|
4512
|
+
|
|
4513
|
+
- **Timestamps**: The transcript's timestamps are authoritative. Gemini's timestamps can drift. Always cross-reference the editorial direction's timestamps against the transcript before cutting. If Gemini says "cut 85-108 because it's dead air" but the transcript shows substantive speech at 92-105, trust the transcript.
|
|
4514
|
+
- **Pacing vs Cleaning**: If the Pacing Analysis recommends removing an entire range but Cleaning Recommendations only flags pieces, favor pacing \u2014 it reflects the broader viewing experience.
|
|
4515
|
+
- **Hook & Retention**: If this section recommends starting at a later point, that overrides granular cleaning cuts in the opening.
|
|
4516
|
+
- **Valuable content**: Never cut substantive content that the viewer needs to understand the video's message. Filler and dead air around valuable content should be trimmed, but the content itself must be preserved.
|
|
4517
|
+
|
|
4518
|
+
## WHAT YOU'RE OPTIMIZING FOR
|
|
4519
|
+
|
|
4520
|
+
The video you produce will be further processed by a graphics agent that adds AI-generated image overlays, then captioned, then cut into shorts and medium clips. Your edit needs to:
|
|
4521
|
+
- Start with the strongest content \u2014 no dead air, no "I'm going to make a quick video" preambles
|
|
4522
|
+
- Flow naturally so captions and overlays land on clean, well-paced segments
|
|
4523
|
+
- Remove anything that isn't for the viewer (meta-commentary, editor instructions, false starts)
|
|
4524
|
+
|
|
4525
|
+
## TOOLS
|
|
4526
|
+
|
|
4527
|
+
- **get_video_info** \u2014 video duration, dimensions, frame rate
|
|
4528
|
+
- **get_editorial_direction** \u2014 Gemini's full editorial report (cut points, pacing, hook advice, cleaning recommendations)
|
|
4529
|
+
- **get_transcript** \u2014 timestamped transcript (supports start/end filtering)
|
|
4530
|
+
- **add_cuts** \u2014 queue regions for removal (call as many times as needed, use decimal-second precision)
|
|
4531
|
+
- **finalize_cuts** \u2014 merge adjacent cuts and trigger the render (call once at the end)`;
|
|
4532
|
+
ADD_CUTS_SCHEMA = {
|
|
4351
4533
|
type: "object",
|
|
4352
4534
|
properties: {
|
|
4353
4535
|
removals: {
|
|
4354
4536
|
type: "array",
|
|
4355
|
-
description: "
|
|
4537
|
+
description: "One or more regions to remove from the video",
|
|
4356
4538
|
items: {
|
|
4357
4539
|
type: "object",
|
|
4358
4540
|
properties: {
|
|
4359
|
-
start: { type: "number", description: "Start time in seconds" },
|
|
4360
|
-
end: { type: "number", description: "End time in seconds" },
|
|
4541
|
+
start: { type: "number", description: "Start time in seconds (decimal precision, e.g. 14.3)" },
|
|
4542
|
+
end: { type: "number", description: "End time in seconds (decimal precision, e.g. 37.0)" },
|
|
4361
4543
|
reason: { type: "string", description: "Why this region should be removed" }
|
|
4362
4544
|
},
|
|
4363
4545
|
required: ["start", "end", "reason"]
|
|
@@ -4370,6 +4552,8 @@ var init_ProducerAgent = __esm({
|
|
|
4370
4552
|
video;
|
|
4371
4553
|
videoDuration = 0;
|
|
4372
4554
|
removals = [];
|
|
4555
|
+
renderPromise = null;
|
|
4556
|
+
outputPath = "";
|
|
4373
4557
|
constructor(video, model) {
|
|
4374
4558
|
super("ProducerAgent", SYSTEM_PROMPT4, void 0, model);
|
|
4375
4559
|
this.video = video;
|
|
@@ -4401,10 +4585,16 @@ var init_ProducerAgent = __esm({
|
|
|
4401
4585
|
handler: async () => this.handleToolCall("get_editorial_direction", {})
|
|
4402
4586
|
},
|
|
4403
4587
|
{
|
|
4404
|
-
name: "
|
|
4405
|
-
description: "
|
|
4406
|
-
parameters:
|
|
4407
|
-
handler: async (rawArgs) => this.handleToolCall("
|
|
4588
|
+
name: "add_cuts",
|
|
4589
|
+
description: "Add one or more regions to remove from the video. You can call this multiple times to build your edit list incrementally as you analyze each section.",
|
|
4590
|
+
parameters: ADD_CUTS_SCHEMA,
|
|
4591
|
+
handler: async (rawArgs) => this.handleToolCall("add_cuts", rawArgs)
|
|
4592
|
+
},
|
|
4593
|
+
{
|
|
4594
|
+
name: "finalize_cuts",
|
|
4595
|
+
description: "Finalize your edit list and trigger video rendering. Call this ONCE after you have added all cuts with add_cuts. Adjacent/overlapping cuts will be merged automatically.",
|
|
4596
|
+
parameters: { type: "object", properties: {} },
|
|
4597
|
+
handler: async () => this.handleToolCall("finalize_cuts", {})
|
|
4408
4598
|
}
|
|
4409
4599
|
];
|
|
4410
4600
|
}
|
|
@@ -4456,11 +4646,33 @@ var init_ProducerAgent = __esm({
|
|
|
4456
4646
|
editorialDirection: direction
|
|
4457
4647
|
};
|
|
4458
4648
|
}
|
|
4459
|
-
case "
|
|
4649
|
+
case "add_cuts": {
|
|
4460
4650
|
const { removals } = args;
|
|
4461
|
-
|
|
4462
|
-
|
|
4463
|
-
return `
|
|
4651
|
+
this.removals.push(...removals);
|
|
4652
|
+
logger_default.info(`[ProducerAgent] Added ${removals.length} cuts (total: ${this.removals.length})`);
|
|
4653
|
+
return `Added ${removals.length} cuts. Total queued: ${this.removals.length}. Call add_cuts again for more, or finalize_cuts when done.`;
|
|
4654
|
+
}
|
|
4655
|
+
case "finalize_cuts": {
|
|
4656
|
+
this.removals = mergeRemovals(this.removals);
|
|
4657
|
+
logger_default.info(`[ProducerAgent] Finalized ${this.removals.length} cuts (after merging), starting render`);
|
|
4658
|
+
const sortedRemovals = [...this.removals].sort((a, b) => a.start - b.start);
|
|
4659
|
+
const keepSegments = [];
|
|
4660
|
+
let cursor = 0;
|
|
4661
|
+
for (const removal of sortedRemovals) {
|
|
4662
|
+
if (removal.start > cursor) {
|
|
4663
|
+
keepSegments.push({ start: cursor, end: removal.start });
|
|
4664
|
+
}
|
|
4665
|
+
cursor = Math.max(cursor, removal.end);
|
|
4666
|
+
}
|
|
4667
|
+
if (cursor < this.videoDuration) {
|
|
4668
|
+
keepSegments.push({ start: cursor, end: this.videoDuration });
|
|
4669
|
+
}
|
|
4670
|
+
const totalRemoval = this.removals.reduce((sum, r) => sum + (r.end - r.start), 0);
|
|
4671
|
+
logger_default.info(
|
|
4672
|
+
`[ProducerAgent] ${this.removals.length} removals \u2192 ${keepSegments.length} keep segments, removing ${totalRemoval.toFixed(1)}s`
|
|
4673
|
+
);
|
|
4674
|
+
this.renderPromise = singlePassEdit(this.video.videoPath, keepSegments, this.outputPath);
|
|
4675
|
+
return `Rendering started with ${this.removals.length} cuts. The video is being processed in the background.`;
|
|
4464
4676
|
}
|
|
4465
4677
|
default:
|
|
4466
4678
|
throw new Error(`Unknown tool: ${toolName}`);
|
|
@@ -4473,73 +4685,47 @@ var init_ProducerAgent = __esm({
|
|
|
4473
4685
|
*/
|
|
4474
4686
|
async produce(outputPath) {
|
|
4475
4687
|
this.removals = [];
|
|
4476
|
-
|
|
4688
|
+
this.renderPromise = null;
|
|
4689
|
+
this.outputPath = outputPath;
|
|
4690
|
+
const prompt = `Clean this video by removing unwanted segments.
|
|
4477
4691
|
|
|
4478
4692
|
**Video:** ${this.video.videoPath}
|
|
4479
4693
|
|
|
4480
|
-
|
|
4481
|
-
|
|
4482
|
-
1. Call get_video_info to know the video duration.
|
|
4483
|
-
2. Call get_editorial_direction to get AI-powered editorial guidance (cut points, pacing issues).
|
|
4484
|
-
3. Call get_transcript to understand what's being said and identify removable regions.
|
|
4485
|
-
4. Call **plan_cuts** with your list of regions to remove.
|
|
4486
|
-
|
|
4487
|
-
Focus on removing dead air, filler words, bad takes, and redundant content. Be conservative \u2014 aim for 10-20% removal at most.`;
|
|
4694
|
+
Get the video info, editorial direction, and transcript. Analyze them together, then add your cuts and finalize.`;
|
|
4488
4695
|
try {
|
|
4489
4696
|
const response = await this.run(prompt);
|
|
4490
|
-
logger_default.info(`[ProducerAgent] Agent
|
|
4491
|
-
if (this.
|
|
4492
|
-
|
|
4697
|
+
logger_default.info(`[ProducerAgent] Agent conversation complete for ${this.video.videoPath}`);
|
|
4698
|
+
if (this.renderPromise) {
|
|
4699
|
+
await this.renderPromise;
|
|
4700
|
+
logger_default.info(`[ProducerAgent] Render complete: ${outputPath}`);
|
|
4701
|
+
const sortedRemovals = [...this.removals].sort((a, b) => a.start - b.start);
|
|
4702
|
+
const keepSegments = [];
|
|
4703
|
+
let cursor = 0;
|
|
4704
|
+
for (const removal of sortedRemovals) {
|
|
4705
|
+
if (removal.start > cursor) {
|
|
4706
|
+
keepSegments.push({ start: cursor, end: removal.start });
|
|
4707
|
+
}
|
|
4708
|
+
cursor = Math.max(cursor, removal.end);
|
|
4709
|
+
}
|
|
4710
|
+
if (cursor < this.videoDuration) {
|
|
4711
|
+
keepSegments.push({ start: cursor, end: this.videoDuration });
|
|
4712
|
+
}
|
|
4493
4713
|
return {
|
|
4494
4714
|
summary: response,
|
|
4715
|
+
outputPath,
|
|
4495
4716
|
success: true,
|
|
4496
|
-
editCount:
|
|
4497
|
-
removals:
|
|
4498
|
-
keepSegments
|
|
4717
|
+
editCount: this.removals.length,
|
|
4718
|
+
removals: sortedRemovals.map((r) => ({ start: r.start, end: r.end })),
|
|
4719
|
+
keepSegments
|
|
4499
4720
|
};
|
|
4500
4721
|
}
|
|
4501
|
-
|
|
4502
|
-
let totalRemoval = 0;
|
|
4503
|
-
const sortedByDuration = [...this.removals].sort(
|
|
4504
|
-
(a, b) => b.end - b.start - (a.end - a.start)
|
|
4505
|
-
);
|
|
4506
|
-
const cappedRemovals = [];
|
|
4507
|
-
for (const r of sortedByDuration) {
|
|
4508
|
-
const dur = r.end - r.start;
|
|
4509
|
-
if (totalRemoval + dur <= maxRemoval) {
|
|
4510
|
-
cappedRemovals.push(r);
|
|
4511
|
-
totalRemoval += dur;
|
|
4512
|
-
}
|
|
4513
|
-
}
|
|
4514
|
-
if (cappedRemovals.length < this.removals.length) {
|
|
4515
|
-
logger_default.warn(
|
|
4516
|
-
`[ProducerAgent] Safety cap: reduced ${this.removals.length} removals to ${cappedRemovals.length} (max 20% of ${this.videoDuration}s = ${maxRemoval.toFixed(1)}s)`
|
|
4517
|
-
);
|
|
4518
|
-
}
|
|
4519
|
-
const sortedRemovals = [...cappedRemovals].sort((a, b) => a.start - b.start);
|
|
4520
|
-
const keepSegments = [];
|
|
4521
|
-
let cursor = 0;
|
|
4522
|
-
for (const removal of sortedRemovals) {
|
|
4523
|
-
if (removal.start > cursor) {
|
|
4524
|
-
keepSegments.push({ start: cursor, end: removal.start });
|
|
4525
|
-
}
|
|
4526
|
-
cursor = Math.max(cursor, removal.end);
|
|
4527
|
-
}
|
|
4528
|
-
if (cursor < this.videoDuration) {
|
|
4529
|
-
keepSegments.push({ start: cursor, end: this.videoDuration });
|
|
4530
|
-
}
|
|
4531
|
-
logger_default.info(
|
|
4532
|
-
`[ProducerAgent] ${cappedRemovals.length} removals \u2192 ${keepSegments.length} keep segments, removing ${totalRemoval.toFixed(1)}s`
|
|
4533
|
-
);
|
|
4534
|
-
await singlePassEdit(this.video.videoPath, keepSegments, outputPath);
|
|
4535
|
-
logger_default.info(`[ProducerAgent] Render complete: ${outputPath}`);
|
|
4722
|
+
logger_default.info(`[ProducerAgent] No cuts finalized \u2014 video is clean`);
|
|
4536
4723
|
return {
|
|
4537
4724
|
summary: response,
|
|
4538
|
-
outputPath,
|
|
4539
4725
|
success: true,
|
|
4540
|
-
editCount:
|
|
4541
|
-
removals:
|
|
4542
|
-
keepSegments
|
|
4726
|
+
editCount: 0,
|
|
4727
|
+
removals: [],
|
|
4728
|
+
keepSegments: [{ start: 0, end: this.videoDuration }]
|
|
4543
4729
|
};
|
|
4544
4730
|
} catch (err) {
|
|
4545
4731
|
const message = err instanceof Error ? err.message : String(err);
|
|
@@ -4563,12 +4749,14 @@ Focus on removing dead air, filler words, bad takes, and redundant content. Be c
|
|
|
4563
4749
|
var geminiClient_exports = {};
|
|
4564
4750
|
__export(geminiClient_exports, {
|
|
4565
4751
|
analyzeVideoClipDirection: () => analyzeVideoClipDirection,
|
|
4566
|
-
analyzeVideoEditorial: () => analyzeVideoEditorial
|
|
4752
|
+
analyzeVideoEditorial: () => analyzeVideoEditorial,
|
|
4753
|
+
analyzeVideoForEnhancements: () => analyzeVideoForEnhancements
|
|
4567
4754
|
});
|
|
4568
4755
|
import { GoogleGenAI, createUserContent, createPartFromUri } from "@google/genai";
|
|
4569
|
-
async function analyzeVideoEditorial(videoPath, durationSeconds, model
|
|
4756
|
+
async function analyzeVideoEditorial(videoPath, durationSeconds, model) {
|
|
4570
4757
|
const config2 = getConfig();
|
|
4571
4758
|
const apiKey = config2.GEMINI_API_KEY;
|
|
4759
|
+
const resolvedModel = model ?? config2.GEMINI_MODEL;
|
|
4572
4760
|
if (!apiKey) {
|
|
4573
4761
|
throw new Error(
|
|
4574
4762
|
"GEMINI_API_KEY is required for video editorial analysis. Get a key at https://aistudio.google.com/apikey"
|
|
@@ -4594,9 +4782,9 @@ async function analyzeVideoEditorial(videoPath, durationSeconds, model = "gemini
|
|
|
4594
4782
|
if (fileState !== "ACTIVE") {
|
|
4595
4783
|
throw new Error(`Gemini file processing failed \u2014 state: ${fileState}`);
|
|
4596
4784
|
}
|
|
4597
|
-
logger_default.info(`[Gemini] Video ready, requesting editorial analysis (model: ${
|
|
4785
|
+
logger_default.info(`[Gemini] Video ready, requesting editorial analysis (model: ${resolvedModel})`);
|
|
4598
4786
|
const response = await ai.models.generateContent({
|
|
4599
|
-
model,
|
|
4787
|
+
model: resolvedModel,
|
|
4600
4788
|
contents: createUserContent([
|
|
4601
4789
|
createPartFromUri(file.uri, file.mimeType),
|
|
4602
4790
|
EDITORIAL_PROMPT
|
|
@@ -4609,7 +4797,7 @@ async function analyzeVideoEditorial(videoPath, durationSeconds, model = "gemini
|
|
|
4609
4797
|
const estimatedInputTokens = Math.ceil(durationSeconds * VIDEO_TOKENS_PER_SECOND);
|
|
4610
4798
|
const estimatedOutputTokens = Math.ceil(text.length / 4);
|
|
4611
4799
|
costTracker.recordServiceUsage("gemini", 0, {
|
|
4612
|
-
model,
|
|
4800
|
+
model: resolvedModel,
|
|
4613
4801
|
durationSeconds,
|
|
4614
4802
|
estimatedInputTokens,
|
|
4615
4803
|
estimatedOutputTokens,
|
|
@@ -4618,9 +4806,10 @@ async function analyzeVideoEditorial(videoPath, durationSeconds, model = "gemini
|
|
|
4618
4806
|
logger_default.info(`[Gemini] Editorial analysis complete (${text.length} chars)`);
|
|
4619
4807
|
return text;
|
|
4620
4808
|
}
|
|
4621
|
-
async function analyzeVideoClipDirection(videoPath, durationSeconds, model
|
|
4809
|
+
async function analyzeVideoClipDirection(videoPath, durationSeconds, model) {
|
|
4622
4810
|
const config2 = getConfig();
|
|
4623
4811
|
const apiKey = config2.GEMINI_API_KEY;
|
|
4812
|
+
const resolvedModel = model ?? config2.GEMINI_MODEL;
|
|
4624
4813
|
if (!apiKey) {
|
|
4625
4814
|
throw new Error(
|
|
4626
4815
|
"GEMINI_API_KEY is required for video clip direction analysis. Get a key at https://aistudio.google.com/apikey"
|
|
@@ -4646,9 +4835,9 @@ async function analyzeVideoClipDirection(videoPath, durationSeconds, model = "ge
|
|
|
4646
4835
|
if (fileState !== "ACTIVE") {
|
|
4647
4836
|
throw new Error(`Gemini file processing failed \u2014 state: ${fileState}`);
|
|
4648
4837
|
}
|
|
4649
|
-
logger_default.info(`[Gemini] Video ready, requesting clip direction analysis (model: ${
|
|
4838
|
+
logger_default.info(`[Gemini] Video ready, requesting clip direction analysis (model: ${resolvedModel})`);
|
|
4650
4839
|
const response = await ai.models.generateContent({
|
|
4651
|
-
model,
|
|
4840
|
+
model: resolvedModel,
|
|
4652
4841
|
contents: createUserContent([
|
|
4653
4842
|
createPartFromUri(file.uri, file.mimeType),
|
|
4654
4843
|
CLIP_DIRECTION_PROMPT
|
|
@@ -4661,7 +4850,7 @@ async function analyzeVideoClipDirection(videoPath, durationSeconds, model = "ge
|
|
|
4661
4850
|
const estimatedInputTokens = Math.ceil(durationSeconds * VIDEO_TOKENS_PER_SECOND);
|
|
4662
4851
|
const estimatedOutputTokens = Math.ceil(text.length / 4);
|
|
4663
4852
|
costTracker.recordServiceUsage("gemini", 0, {
|
|
4664
|
-
model,
|
|
4853
|
+
model: resolvedModel,
|
|
4665
4854
|
durationSeconds,
|
|
4666
4855
|
estimatedInputTokens,
|
|
4667
4856
|
estimatedOutputTokens,
|
|
@@ -4670,7 +4859,60 @@ async function analyzeVideoClipDirection(videoPath, durationSeconds, model = "ge
|
|
|
4670
4859
|
logger_default.info(`[Gemini] Clip direction analysis complete (${text.length} chars)`);
|
|
4671
4860
|
return text;
|
|
4672
4861
|
}
|
|
4673
|
-
|
|
4862
|
+
async function analyzeVideoForEnhancements(videoPath, durationSeconds, transcript, model) {
|
|
4863
|
+
const config2 = getConfig();
|
|
4864
|
+
const apiKey = config2.GEMINI_API_KEY;
|
|
4865
|
+
const resolvedModel = model ?? config2.GEMINI_MODEL;
|
|
4866
|
+
if (!apiKey) {
|
|
4867
|
+
throw new Error(
|
|
4868
|
+
"GEMINI_API_KEY is required for video enhancement analysis. Get a key at https://aistudio.google.com/apikey"
|
|
4869
|
+
);
|
|
4870
|
+
}
|
|
4871
|
+
const ai = new GoogleGenAI({ apiKey });
|
|
4872
|
+
logger_default.info(`[Gemini] Uploading video for enhancement analysis: ${videoPath}`);
|
|
4873
|
+
const file = await ai.files.upload({
|
|
4874
|
+
file: videoPath,
|
|
4875
|
+
config: { mimeType: "video/mp4" }
|
|
4876
|
+
});
|
|
4877
|
+
if (!file.uri || !file.mimeType || !file.name) {
|
|
4878
|
+
throw new Error("Gemini file upload failed \u2014 no URI returned");
|
|
4879
|
+
}
|
|
4880
|
+
logger_default.info(`[Gemini] Waiting for file processing to complete...`);
|
|
4881
|
+
let fileState = file.state;
|
|
4882
|
+
while (fileState === "PROCESSING") {
|
|
4883
|
+
await new Promise((resolve3) => setTimeout(resolve3, 2e3));
|
|
4884
|
+
const updated = await ai.files.get({ name: file.name });
|
|
4885
|
+
fileState = updated.state;
|
|
4886
|
+
logger_default.debug(`[Gemini] File state: ${fileState}`);
|
|
4887
|
+
}
|
|
4888
|
+
if (fileState !== "ACTIVE") {
|
|
4889
|
+
throw new Error(`Gemini file processing failed \u2014 state: ${fileState}`);
|
|
4890
|
+
}
|
|
4891
|
+
logger_default.info(`[Gemini] Video ready, requesting enhancement analysis (model: ${resolvedModel})`);
|
|
4892
|
+
const response = await ai.models.generateContent({
|
|
4893
|
+
model: resolvedModel,
|
|
4894
|
+
contents: createUserContent([
|
|
4895
|
+
createPartFromUri(file.uri, file.mimeType),
|
|
4896
|
+
ENHANCEMENT_ANALYSIS_PROMPT + transcript
|
|
4897
|
+
])
|
|
4898
|
+
});
|
|
4899
|
+
const text = response.text ?? "";
|
|
4900
|
+
if (!text) {
|
|
4901
|
+
throw new Error("Gemini returned empty response");
|
|
4902
|
+
}
|
|
4903
|
+
const estimatedInputTokens = Math.ceil(durationSeconds * VIDEO_TOKENS_PER_SECOND);
|
|
4904
|
+
const estimatedOutputTokens = Math.ceil(text.length / 4);
|
|
4905
|
+
costTracker.recordServiceUsage("gemini", 0, {
|
|
4906
|
+
model: resolvedModel,
|
|
4907
|
+
durationSeconds,
|
|
4908
|
+
estimatedInputTokens,
|
|
4909
|
+
estimatedOutputTokens,
|
|
4910
|
+
videoFile: videoPath
|
|
4911
|
+
});
|
|
4912
|
+
logger_default.info(`[Gemini] Enhancement analysis complete (${text.length} chars)`);
|
|
4913
|
+
return text;
|
|
4914
|
+
}
|
|
4915
|
+
var VIDEO_TOKENS_PER_SECOND, EDITORIAL_PROMPT, CLIP_DIRECTION_PROMPT, ENHANCEMENT_ANALYSIS_PROMPT;
|
|
4674
4916
|
var init_geminiClient = __esm({
|
|
4675
4917
|
"src/tools/gemini/geminiClient.ts"() {
|
|
4676
4918
|
"use strict";
|
|
@@ -4692,7 +4934,7 @@ Flag sections that are too slow, too fast, or have dead air. Give start/end time
|
|
|
4692
4934
|
Identify moments where text overlays, graphics, zoom-ins, or visual emphasis would improve engagement.
|
|
4693
4935
|
|
|
4694
4936
|
## Hook & Retention
|
|
4695
|
-
Rate the first 3 seconds (1-10) and suggest specific improvements for viewer retention.
|
|
4937
|
+
Rate the first 3 seconds (1-10) and suggest specific improvements for viewer retention. If the video has a weak opening (meta-commentary, dead air, false starts), recommend where the actual content begins so an editor can start the video there.
|
|
4696
4938
|
|
|
4697
4939
|
## Content Structure
|
|
4698
4940
|
Break the video into intro/body sections/outro with timestamps and topic for each section.
|
|
@@ -4702,10 +4944,21 @@ Highlight the most engaging, surprising, or important moments that should be emp
|
|
|
4702
4944
|
|
|
4703
4945
|
## Cleaning Recommendations
|
|
4704
4946
|
Identify sections that should be trimmed or removed entirely to produce a tighter edit. For each:
|
|
4705
|
-
- Give start/end timestamps (MM:SS format)
|
|
4947
|
+
- Give start/end timestamps (MM:SS.s format with decimal precision, e.g. 00:14.3 - 00:37.0)
|
|
4706
4948
|
- Explain why it should be removed (dead air, filler words, false starts, repeated explanations, off-topic tangents, excessive pauses)
|
|
4707
4949
|
- Rate the confidence (high/medium/low) \u2014 high means definitely remove, low means optional
|
|
4708
4950
|
|
|
4951
|
+
After listing the recommendations in markdown, also provide a machine-readable JSON block summarizing all suggested cuts:
|
|
4952
|
+
|
|
4953
|
+
\`\`\`json:cuts
|
|
4954
|
+
[
|
|
4955
|
+
{ "start": 0.0, "end": 15.2, "reason": "Opening too slow - dead air and filler", "confidence": "high" },
|
|
4956
|
+
{ "start": 26.5, "end": 37.0, "reason": "Meta-commentary for editor", "confidence": "high" }
|
|
4957
|
+
]
|
|
4958
|
+
\`\`\`
|
|
4959
|
+
|
|
4960
|
+
Times in the JSON block should be in seconds with decimal precision. Place cut boundaries at word boundaries.
|
|
4961
|
+
|
|
4709
4962
|
## Hook Snippets for Short Videos
|
|
4710
4963
|
Identify the 3-5 best moments (3-8 seconds each) that could serve as attention-grabbing hooks for the beginning of short-form videos. For each:
|
|
4711
4964
|
- Give start/end timestamps
|
|
@@ -4760,6 +5013,465 @@ For each recommended medium clip, provide:
|
|
|
4760
5013
|
Identify 2-4 medium clips. Prioritize: complete explanations, tutorial segments, deep dives, and compelling narrative arcs.
|
|
4761
5014
|
|
|
4762
5015
|
Be precise with timestamps. Be opinionated about what works and what doesn't. Think about what would make someone stop scrolling.`;
|
|
5016
|
+
ENHANCEMENT_ANALYSIS_PROMPT = `You are a visual content strategist reviewing raw video footage. Write an editorial report identifying moments where an AI-generated image overlay would genuinely enhance viewer comprehension.
|
|
5017
|
+
|
|
5018
|
+
Watch the video carefully and read the transcript below. Write a natural editorial report covering:
|
|
5019
|
+
|
|
5020
|
+
1. **Video layout observations** \u2014 What is on screen? Is there a webcam overlay? Where is the main content area (code editor, terminal, browser)? What areas of the screen have less visual activity and could safely hold an overlay without hiding important content?
|
|
5021
|
+
|
|
5022
|
+
2. **Enhancement opportunities** \u2014 For each moment you identify, describe:
|
|
5023
|
+
- The approximate timestamp range (in seconds) where the speaker is discussing the topic
|
|
5024
|
+
- What the speaker is explaining and what is currently visible on screen
|
|
5025
|
+
- The dominant background colors and brightness level at that moment (e.g., dark IDE, white browser, terminal with dark background). This helps the image designer choose contrasting colors so the overlay stands out
|
|
5026
|
+
- What kind of image would help (diagram, flowchart, illustration, infographic, etc.)
|
|
5027
|
+
- A detailed description of the image to generate
|
|
5028
|
+
- Why showing this image at this moment helps the viewer understand
|
|
5029
|
+
- Where on screen the image should go to avoid blocking important content
|
|
5030
|
+
|
|
5031
|
+
3. **Timing guidance** \u2014 For each opportunity, note the natural start and end of the speaker's explanation. The image should appear when the topic begins and disappear when the speaker moves on. Typically 5-12 seconds is ideal \u2014 long enough to register, short enough to not overstay.
|
|
5032
|
+
|
|
5033
|
+
Important guidelines:
|
|
5034
|
+
- Do NOT force opportunities \u2014 if the video doesn't need visual aids, say so
|
|
5035
|
+
- Do NOT suggest images when the screen already shows relevant visuals (diagrams, UI demos, live coding that needs to be seen)
|
|
5036
|
+
- Do NOT suggest images for trivial topics that don't need visual explanation
|
|
5037
|
+
- Do NOT suggest images during live demonstrations where the viewer needs to see the screen clearly
|
|
5038
|
+
- Moments shorter than 5 seconds are too brief for an overlay to register
|
|
5039
|
+
- It's perfectly fine to identify 0 opportunities, 1, or several \u2014 quality over quantity
|
|
5040
|
+
|
|
5041
|
+
Write your report in natural language with clear section headers. This report will be read by a graphics agent that will make final decisions about what to generate.
|
|
5042
|
+
|
|
5043
|
+
TRANSCRIPT:
|
|
5044
|
+
`;
|
|
5045
|
+
}
|
|
5046
|
+
});
|
|
5047
|
+
|
|
5048
|
+
// src/tools/imageGeneration.ts
|
|
5049
|
+
import { writeFile } from "fs/promises";
|
|
5050
|
+
import { dirname as dirname3 } from "path";
|
|
5051
|
+
import sharp from "sharp";
|
|
5052
|
+
/**
 * Generate a single image via the OpenAI Images API and save it as a validated PNG.
 *
 * @param prompt - Image description; IMAGE_BASE_PROMPT rendering rules are always appended.
 * @param outputPath - Destination path for the PNG (parent directory is created if needed).
 * @param options - Optional { size, quality, style }; size defaults to "auto", quality to "high".
 * @returns The outputPath once the validated image has been written.
 * @throws If OPENAI_API_KEY is missing, the API call fails, or the payload is not a valid image.
 */
async function generateImage(prompt, outputPath, options) {
  const config2 = getConfig();
  if (!config2.OPENAI_API_KEY) {
    throw new Error("[ImageGen] OPENAI_API_KEY is required for image generation");
  }
  const size = options?.size ?? "auto";
  const quality = options?.quality ?? "high";
  // Optional style hint goes on its own paragraph; base rendering rules are appended last.
  const fullPrompt = (options?.style ? `${prompt}

Style: ${options.style}` : prompt) + IMAGE_BASE_PROMPT;
  logger_default.info(`[ImageGen] Generating image: ${prompt.substring(0, 100)}...`);
  logger_default.debug(`[ImageGen] Size: ${size}, Quality: ${quality}`);
  const response = await fetch("https://api.openai.com/v1/images/generations", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${config2.OPENAI_API_KEY}`
    },
    body: JSON.stringify({
      model: "gpt-image-1.5",
      prompt: fullPrompt,
      n: 1,
      size,
      quality
    })
  });
  if (!response.ok) {
    const errorText = await response.text();
    logger_default.error(`[ImageGen] API error (${response.status}): ${errorText}`);
    throw new Error(`[ImageGen] OpenAI API returned ${response.status}: ${errorText}`);
  }
  const result = await response.json();
  const b64 = result.data?.[0]?.b64_json;
  if (!b64) {
    logger_default.error("[ImageGen] No b64_json in API response");
    throw new Error("[ImageGen] API response missing b64_json image data");
  }
  const rawBuffer = Buffer.from(b64, "base64");
  // Re-encode through sharp to both validate the payload and normalize it to PNG.
  let validatedBuffer;
  try {
    validatedBuffer = await sharp(rawBuffer).png().toBuffer();
  } catch (error) {
    logger_default.error("[ImageGen] Failed to validate image data from API", { error });
    throw new Error("[ImageGen] Invalid image data received from API - not a valid image format");
  }
  await ensureDirectory(dirname3(outputPath));
  await writeFile(outputPath, validatedBuffer);
  // FIX: fall back to the "high" rate if an unrecognized quality string slips
  // through, so cost tracking never records `undefined`.
  const estimatedCost = COST_BY_QUALITY[quality] ?? COST_BY_QUALITY.high;
  costTracker.recordServiceUsage("openai-image", estimatedCost, {
    model: "gpt-image-1.5",
    size,
    quality,
    prompt: prompt.substring(0, 200)
  });
  logger_default.info(`[ImageGen] Image saved to ${outputPath} (${validatedBuffer.length} bytes)`);
  return outputPath;
}
|
|
5109
|
+
// Module-level bindings populated lazily by init_imageGeneration below.
var COST_BY_QUALITY, IMAGE_BASE_PROMPT;
// Bundler-generated lazy initializer for src/tools/imageGeneration.ts;
// pulls in the logger/env/cost-tracker/fs modules this tool depends on.
var init_imageGeneration = __esm({
  "src/tools/imageGeneration.ts"() {
    "use strict";
    init_logger2();
    init_environment();
    init_costTracker();
    init_fileSystem();
    // Estimated USD cost per generated image, keyed by quality tier
    // (used by generateImage for cost tracking).
    COST_BY_QUALITY = {
      low: 0.04,
      medium: 0.07,
      high: 0.07
    };
    // Always appended to image prompts: forces an opaque, bordered, flat-design
    // card so the overlay visually separates from the video underneath.
    IMAGE_BASE_PROMPT = `

Rendering requirements: The image MUST have a solid opaque background (not transparent). Include a thin border or subtle drop shadow around the entire image. Use a clean, flat design style suitable for overlaying on top of video content. The image should look like a polished infographic card that clearly separates from whatever is behind it.`;
  }
});
|
|
5127
|
+
|
|
5128
|
+
// src/agents/GraphicsAgent.ts
|
|
5129
|
+
import sharp2 from "sharp";
|
|
5130
|
+
/**
 * Run the GraphicsAgent over the analyst's editorial report and collect the
 * overlay images it decided to generate. The agent is always destroyed,
 * whether the run succeeds or throws.
 */
async function generateEnhancementImages(enhancementReport, enhancementsDir, videoDuration, model) {
  await ensureDirectory(enhancementsDir);
  const agent = new GraphicsAgent(model);
  agent.setContext(enhancementsDir);
  try {
    // Brief the agent with the video length and the full report verbatim.
    await agent.run(
      `Here is the editorial report from our video analyst. The video is ${videoDuration.toFixed(1)} seconds long.

Review each opportunity and make your editorial decision \u2014 generate an image or skip it.

---

${enhancementReport}`
    );
    return agent.getOverlays();
  } finally {
    await agent.destroy();
  }
}
|
|
5148
|
+
// Module-level bindings populated lazily by init_GraphicsAgent below.
var SYSTEM_PROMPT5, GENERATE_ENHANCEMENT_SCHEMA, SKIP_OPPORTUNITY_SCHEMA, GraphicsAgent;
// Bundler-generated lazy initializer for src/agents/GraphicsAgent.ts.
var init_GraphicsAgent = __esm({
  "src/agents/GraphicsAgent.ts"() {
    "use strict";
    init_BaseAgent();
    init_imageGeneration();
    init_text();
    init_paths();
    init_fileSystem();
    init_logger2();
    // System prompt: the agent is the final editorial gate over the analyst's report.
    SYSTEM_PROMPT5 = `You are a visual content designer and editorial director for educational video content. You are given an editorial report from a video analyst describing moments in a video where AI-generated image overlays could enhance viewer comprehension.

Your job is to make the FINAL editorial decision for each opportunity:
1. Decide whether to generate an image or skip the opportunity
2. Determine the exact timing \u2014 when the image should appear and disappear
3. Choose the optimal screen placement to avoid blocking important content
4. Write a refined, high-quality image generation prompt

Guidelines for editorial decisions:
- Only generate images that genuinely add value \u2014 quality over quantity
- Timing should match the speaker's explanation: appear when the topic starts, disappear when they move on
- Keep display duration between 5-12 seconds \u2014 long enough to register, short enough to not overstay
- Ensure at least 10 seconds gap between consecutive overlays to avoid visual clutter
- Choose placement regions that avoid the webcam, main content area, and any important UI elements
- Size should be 15-30% of video width \u2014 large enough to see, small enough to not dominate

Guidelines for image prompts:
- Create clean, professional diagrams and illustrations
- Use flat design / modern infographic style
- Include labels and annotations when helpful
- Avoid photorealistic imagery \u2014 prefer stylized educational graphics
- Keep the image simple and immediately understandable at a glance
- The image will be shown as a small overlay, so avoid tiny details
- Use high contrast colors for visibility when overlaid on video
- No text-heavy images \u2014 a few key labels at most
- Let the image content dictate its natural aspect ratio \u2014 don't force square if the content is better as landscape or portrait
- IMPORTANT: Every image MUST have a solid, opaque background (e.g., white, light gray, dark navy) \u2014 never transparent or borderless. The image will be overlaid on top of a video so it needs to stand out with clear visual separation. If the report mentions a dark video background, use a light image background (and vice versa). Add a subtle border or shadow effect in the prompt to ensure the image pops against the video content.

Process the report and call generate_enhancement for each image worth creating, or call skip_opportunity for those not worth generating.`;
    // Tool schema: parameters the model supplies when it commits to generating an image.
    GENERATE_ENHANCEMENT_SCHEMA = {
      type: "object",
      properties: {
        prompt: {
          type: "string",
          description: "A refined, high-quality image generation prompt describing the visual to create"
        },
        timestampStart: {
          type: "number",
          description: "When to start showing the image (seconds from video start)"
        },
        timestampEnd: {
          type: "number",
          description: "When to stop showing the image (seconds from video start). Should be 5-12 seconds after timestampStart."
        },
        region: {
          type: "string",
          enum: ["top-left", "top-right", "bottom-left", "bottom-right", "center-right", "center-left"],
          description: "Screen region for placement, chosen to avoid blocking important content"
        },
        sizePercent: {
          type: "number",
          description: "Image width as percentage of video width (15-30)"
        },
        topic: {
          type: "string",
          description: "Brief label for what this image illustrates"
        },
        reason: {
          type: "string",
          description: "Why this visual enhancement helps the viewer"
        }
      },
      required: ["prompt", "timestampStart", "timestampEnd", "region", "sizePercent", "topic", "reason"]
    };
    // Tool schema: parameters the model supplies when it declines an opportunity.
    SKIP_OPPORTUNITY_SCHEMA = {
      type: "object",
      properties: {
        topic: {
          type: "string",
          description: "The topic from the report that is being skipped"
        },
        reason: {
          type: "string",
          description: "Why this opportunity should be skipped"
        }
      },
      required: ["topic", "reason"]
    };
    // Agent that turns the analyst's report into generated overlay images via tool calls.
    GraphicsAgent = class extends BaseAgent {
      // Overlays generated so far, in tool-call order.
      overlays = [];
      // Destination directory for generated PNGs; set via setContext() before run().
      enhancementsDir = "";
      // Monotonically increasing filename prefix so slug collisions can't overwrite files.
      imageIndex = 0;
      constructor(model) {
        super("GraphicsAgent", SYSTEM_PROMPT5, void 0, model);
      }
      // Must be called before run(): tells the agent where to write generated images.
      setContext(enhancementsDir) {
        this.enhancementsDir = enhancementsDir;
      }
      getTools() {
        return [
          {
            name: "generate_enhancement",
            description: "Generate an AI image overlay for a specific moment in the video. You decide the timing, placement, and prompt.",
            parameters: GENERATE_ENHANCEMENT_SCHEMA,
            handler: async (args) => this.handleToolCall("generate_enhancement", args)
          },
          {
            name: "skip_opportunity",
            description: "Skip an enhancement opportunity from the report that is not worth generating.",
            parameters: SKIP_OPPORTUNITY_SCHEMA,
            handler: async (args) => this.handleToolCall("skip_opportunity", args)
          }
        ];
      }
      // Dispatch for both tools. Generation failures are returned to the agent
      // as { error } rather than thrown, so one bad image doesn't abort the run.
      async handleToolCall(toolName, args) {
        if (toolName === "generate_enhancement") {
          const prompt = args.prompt;
          const timestampStart = args.timestampStart;
          const timestampEnd = args.timestampEnd;
          const region = args.region;
          // Clamp to the 15-30% range the system prompt promises.
          const sizePercent = Math.min(30, Math.max(15, args.sizePercent));
          const topic = args.topic;
          const reason = args.reason;
          const slug = slugify(topic, { lower: true, strict: true });
          const filename = `${this.imageIndex}-${slug}.png`;
          const outputPath = join(this.enhancementsDir, filename);
          try {
            await generateImage(prompt, outputPath, { size: "auto" });
            const metadata = await sharp2(outputPath).metadata();
            const width = metadata.width ?? 1024;
            const height = metadata.height ?? 1024;
            const opportunity = {
              timestampStart,
              timestampEnd,
              topic,
              imagePrompt: prompt,
              reason,
              placement: { region, avoidAreas: [], sizePercent },
              confidence: 1
            };
            const overlay = {
              opportunity,
              imagePath: outputPath,
              width,
              height
            };
            this.overlays.push(overlay);
            this.imageIndex++;
            // FIX: interpolate the actual filename (the template previously
            // rendered a literal placeholder instead of the generated name).
            logger_default.info(`Generated enhancement image: ${filename} (${width}x${height})`);
            return { success: true, imagePath: outputPath, dimensions: `${width}x${height}` };
          } catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            logger_default.error(`Failed to generate image for "${topic}": ${message}`);
            return { error: message };
          }
        }
        if (toolName === "skip_opportunity") {
          const topic = args.topic;
          const reason = args.reason;
          logger_default.info(`Skipped enhancement opportunity "${topic}": ${reason}`);
          return { success: true, skipped: true };
        }
        throw new Error(`Unknown tool: ${toolName}`);
      }
      // Overlays collected across all successful generate_enhancement calls.
      getOverlays() {
        return this.overlays;
      }
    };
  }
});
|
|
5318
|
+
|
|
5319
|
+
// src/tools/ffmpeg/overlayCompositing.ts
|
|
5320
|
+
/**
 * Map a named screen region to FFmpeg overlay-filter x/y expressions.
 *
 * @param region - One of the placement region names (e.g. "top-left", "center-right").
 * @param margin - Margin in pixels from the video edge (stringified into the expression).
 * @returns { x, y } strings usable directly in an FFmpeg `overlay=` filter.
 */
function getOverlayPosition(region, margin) {
  const m = String(margin);
  switch (region) {
    case "top-left":
      return { x: m, y: m };
    case "top-right":
      return { x: `(main_w-overlay_w-${m})`, y: m };
    case "bottom-left":
      return { x: m, y: `(main_h-overlay_h-${m})` };
    case "center-right":
      return { x: `(main_w-overlay_w-${m})`, y: `((main_h-overlay_h)/2)` };
    case "center-left":
      return { x: m, y: `((main_h-overlay_h)/2)` };
    case "bottom-right":
    default:
      // FIX: an unknown region previously fell through to `undefined`, which
      // would produce an invalid `overlay=x=undefined` filter downstream.
      // Default to the least intrusive corner instead.
      return { x: `(main_w-overlay_w-${m})`, y: `(main_h-overlay_h-${m})` };
  }
}
|
|
5337
|
+
/**
 * Build the FFmpeg filter_complex string that scales each overlay image and
 * chains it onto the video, enabled only during its [start, end] window.
 * Input 0 is the video; overlay image inputs start at index 1.
 * (videoHeight is accepted for interface symmetry but not used here.)
 */
function buildOverlayFilterComplex(overlays, videoWidth, videoHeight) {
  const margin = Math.round(videoWidth * 0.05);
  const lastIdx = overlays.length - 1;
  const filters = overlays.flatMap((item, idx) => {
    const { placement, timestampStart, timestampEnd } = item.opportunity;
    // Scale width by the requested percentage; -1 preserves aspect ratio.
    const scaledWidth = Math.round(videoWidth * placement.sizePercent / 100);
    // Chain: [0:v] -> [out_0] -> [out_1] -> ... -> [overlaid]
    const src = idx === 0 ? "[0:v]" : `[out_${idx - 1}]`;
    const dst = idx === lastIdx ? "[overlaid]" : `[out_${idx}]`;
    const pos = getOverlayPosition(placement.region, margin);
    return [
      `[${idx + 1}:v]scale=${scaledWidth}:-1,format=rgba[img_${idx}]`,
      `${src}[img_${idx}]overlay=x=${pos.x}:y=${pos.y}:enable='between(t,${timestampStart},${timestampEnd})':format=auto${dst}`
    ];
  });
  filters.push("[overlaid]format=yuv420p[outv]");
  return filters.join(";");
}
|
|
5358
|
+
/**
 * Composite generated overlay images onto a video with FFmpeg.
 * Re-encodes video (libx264, ultrafast/crf 23) and stream-copies audio.
 *
 * @throws If no overlays are provided or FFmpeg exits with an error.
 * @returns The outputPath on success.
 */
async function compositeOverlays(videoPath, overlays, outputPath, videoWidth, videoHeight) {
  if (overlays.length === 0) {
    throw new Error("[OverlayCompositing] No overlays provided");
  }
  const ffmpegPath6 = getFFmpegPath();
  const filterComplex = buildOverlayFilterComplex(overlays, videoWidth, videoHeight);
  // One looping still-image input per overlay, after the main video input.
  const imageInputs = overlays.flatMap((o) => ["-loop", "1", "-i", o.imagePath]);
  const args = [
    "-y",
    "-i", videoPath,
    ...imageInputs,
    "-filter_complex", filterComplex,
    "-map", "[outv]",
    "-map", "0:a",
    "-c:v", "libx264",
    "-preset", "ultrafast",
    "-crf", "23",
    "-threads", "4",
    "-c:a", "copy",
    "-shortest",
    outputPath
  ];
  logger_default.info(`[OverlayCompositing] Compositing ${overlays.length} overlays \u2192 ${outputPath}`);
  // Adapt the callback-style execFileRaw into a Promise.
  return new Promise((resolve3, reject) => {
    execFileRaw(ffmpegPath6, args, { maxBuffer: 50 * 1024 * 1024 }, (error, _stdout, stderr) => {
      if (error) {
        logger_default.error(`[OverlayCompositing] FFmpeg failed: ${stderr}`);
        reject(new Error(`[OverlayCompositing] FFmpeg overlay compositing failed: ${error.message}`));
        return;
      }
      logger_default.info(`[OverlayCompositing] Complete: ${outputPath}`);
      resolve3(outputPath);
    });
  });
}
|
|
5401
|
+
// Bundler-generated lazy initializer for src/tools/ffmpeg/overlayCompositing.ts;
// pulls in the process/ffmpeg/logger modules used by compositeOverlays above.
var init_overlayCompositing = __esm({
  "src/tools/ffmpeg/overlayCompositing.ts"() {
    "use strict";
    init_process();
    init_ffmpeg();
    init_logger2();
  }
});
|
|
5409
|
+
|
|
5410
|
+
// src/stages/visualEnhancement.ts
|
|
5411
|
+
// src/stages/visualEnhancement.ts
// Module namespace object; __export (bundler helper) registers enhanceVideo
// so it resolves lazily after init_visualEnhancement() has run.
var visualEnhancement_exports = {};
__export(visualEnhancement_exports, {
  enhanceVideo: () => enhanceVideo
});
|
|
5415
|
+
/**
 * Visual-enhancement stage entry point.
 * Step 1: analyze the video + transcript for overlay opportunities.
 * Step 2: let the GraphicsAgent make editorial decisions and generate images.
 * Step 3: composite the chosen overlays onto the video with FFmpeg.
 *
 * @param videoPath - Path to the (edited) input video.
 * @param transcript - Transcript object; only `.text` is read here.
 * @param video - Video asset providing videoDir, slug, duration, and optional layout.
 * @returns Enhancement result { enhancedVideoPath, overlays, analysisTokens, imageGenCost },
 *   or undefined when no report was produced or no images were generated.
 */
async function enhanceVideo(videoPath, transcript, video) {
  const enhancementsDir = join(video.videoDir, "enhancements");
  await ensureDirectory(enhancementsDir);
  logger_default.info("[VisualEnhancement] Step 1: Analyzing video for enhancement opportunities...");
  const enhancementReport = await analyzeVideoForEnhancements(
    videoPath,
    video.duration,
    transcript.text
  );
  if (!enhancementReport || enhancementReport.trim().length === 0) {
    logger_default.info("[VisualEnhancement] No enhancement report generated \u2014 skipping");
    return void 0;
  }
  logger_default.info(`[VisualEnhancement] Received editorial report (${enhancementReport.length} chars)`);
  logger_default.info("[VisualEnhancement] Step 2: GraphicsAgent making editorial decisions and generating images...");
  const overlays = await generateEnhancementImages(
    enhancementReport,
    enhancementsDir,
    video.duration,
    getModelForAgent("GraphicsAgent")
  );
  if (overlays.length === 0) {
    logger_default.info("[VisualEnhancement] GraphicsAgent generated no images \u2014 skipping compositing");
    return void 0;
  }
  logger_default.info(`[VisualEnhancement] Generated ${overlays.length} enhancement images`);
  logger_default.info("[VisualEnhancement] Step 3: Compositing overlays onto video...");
  const outputPath = join(video.videoDir, `${video.slug}-enhanced.mp4`);
  // Fall back to 1080p dimensions when no layout metadata is available.
  const videoWidth = video.layout?.width ?? 1920;
  const videoHeight = video.layout?.height ?? 1080;
  const enhancedVideoPath = await compositeOverlays(
    videoPath,
    overlays,
    outputPath,
    videoWidth,
    videoHeight
  );
  logger_default.info(`[VisualEnhancement] Enhanced video created: ${enhancedVideoPath}`);
  // FIX: replaced a loop with an unused iteration variable by direct arithmetic.
  // Flat $0.07 estimate per image matches the "high" tier in COST_BY_QUALITY.
  const totalImageCost = overlays.length * 0.07;
  return {
    enhancedVideoPath,
    overlays,
    analysisTokens: 0,
    // tracked by costTracker internally
    imageGenCost: totalImageCost
  };
}
|
|
5465
|
+
// Bundler-generated lazy initializer for src/stages/visualEnhancement.ts;
// wires up every module enhanceVideo depends on (Gemini analysis, the
// GraphicsAgent, FFmpeg compositing, model config, fs/path helpers, logging).
var init_visualEnhancement = __esm({
  "src/stages/visualEnhancement.ts"() {
    "use strict";
    init_geminiClient();
    init_GraphicsAgent();
    init_overlayCompositing();
    init_modelConfig();
    init_fileSystem();
    init_paths();
    init_logger2();
  }
});
|
|
4765
5477
|
|
|
@@ -4980,7 +5692,7 @@ async function generateSocialPosts(video, transcript, summary, outputDir, model)
|
|
|
4980
5692
|
await agent.destroy();
|
|
4981
5693
|
}
|
|
4982
5694
|
}
|
|
4983
|
-
var
|
|
5695
|
+
var SYSTEM_PROMPT6, SocialMediaAgent;
|
|
4984
5696
|
var init_SocialMediaAgent = __esm({
|
|
4985
5697
|
"src/agents/SocialMediaAgent.ts"() {
|
|
4986
5698
|
"use strict";
|
|
@@ -4990,7 +5702,7 @@ var init_SocialMediaAgent = __esm({
|
|
|
4990
5702
|
init_logger2();
|
|
4991
5703
|
init_environment();
|
|
4992
5704
|
init_types();
|
|
4993
|
-
|
|
5705
|
+
SYSTEM_PROMPT6 = `You are a viral social-media content strategist.
|
|
4994
5706
|
Given a video transcript and summary you MUST generate one post for each of the 5 platforms listed below.
|
|
4995
5707
|
Each post must match the platform's tone, format, and constraints exactly.
|
|
4996
5708
|
|
|
@@ -5014,7 +5726,7 @@ Always call "create_posts" exactly once with all 5 platform posts.`;
|
|
|
5014
5726
|
SocialMediaAgent = class extends BaseAgent {
|
|
5015
5727
|
collectedPosts = [];
|
|
5016
5728
|
constructor(model) {
|
|
5017
|
-
super("SocialMediaAgent",
|
|
5729
|
+
super("SocialMediaAgent", SYSTEM_PROMPT6, void 0, model);
|
|
5018
5730
|
}
|
|
5019
5731
|
getMcpServers() {
|
|
5020
5732
|
const config2 = getConfig();
|
|
@@ -5449,6 +6161,7 @@ var loadChapterAgent = async () => Promise.resolve().then(() => (init_ChapterAge
|
|
|
5449
6161
|
var loadSummaryAgent = async () => Promise.resolve().then(() => (init_SummaryAgent(), SummaryAgent_exports));
|
|
5450
6162
|
var loadProducerAgent = async () => Promise.resolve().then(() => (init_ProducerAgent(), ProducerAgent_exports));
|
|
5451
6163
|
var loadGeminiClient = async () => Promise.resolve().then(() => (init_geminiClient(), geminiClient_exports));
|
|
6164
|
+
var loadVisualEnhancement = async () => Promise.resolve().then(() => (init_visualEnhancement(), visualEnhancement_exports));
|
|
5452
6165
|
|
|
5453
6166
|
// src/assets/VideoAsset.ts
|
|
5454
6167
|
var VideoAsset = class extends Asset {
|
|
@@ -5908,7 +6621,8 @@ var ShortVideoAsset = class extends VideoAsset {
|
|
|
5908
6621
|
return this.videoPath;
|
|
5909
6622
|
}
|
|
5910
6623
|
await ensureDirectory(this.videoDir);
|
|
5911
|
-
const
|
|
6624
|
+
const mainParent = this.parent;
|
|
6625
|
+
const parentVideo = await mainParent.getEditedVideo();
|
|
5912
6626
|
await extractCompositeClip(parentVideo, this.clip.segments, this.videoPath);
|
|
5913
6627
|
return this.videoPath;
|
|
5914
6628
|
}
|
|
@@ -5951,6 +6665,7 @@ var ShortVideoAsset = class extends VideoAsset {
|
|
|
5951
6665
|
init_paths();
|
|
5952
6666
|
init_fileSystem();
|
|
5953
6667
|
init_types();
|
|
6668
|
+
init_clipExtraction();
|
|
5954
6669
|
var MediumClipAsset = class extends VideoAsset {
|
|
5955
6670
|
/** Parent video this clip was extracted from */
|
|
5956
6671
|
parent;
|
|
@@ -6012,18 +6727,20 @@ var MediumClipAsset = class extends VideoAsset {
|
|
|
6012
6727
|
return fileExists(this.videoPath);
|
|
6013
6728
|
}
|
|
6014
6729
|
/**
|
|
6015
|
-
* Get the rendered clip video path.
|
|
6730
|
+
* Get the rendered clip video path, extracting from parent if needed.
|
|
6731
|
+
* Extracts from the enhanced video so AI-generated overlays carry through.
|
|
6016
6732
|
*
|
|
6017
|
-
* @param opts - Asset options (force
|
|
6733
|
+
* @param opts - Asset options (force regeneration, etc.)
|
|
6018
6734
|
* @returns Path to the rendered video file
|
|
6019
|
-
* @throws Error if clip hasn't been rendered yet
|
|
6020
6735
|
*/
|
|
6021
6736
|
async getResult(opts) {
|
|
6022
|
-
if (!await this.exists()) {
|
|
6023
|
-
|
|
6024
|
-
`Medium clip "${this.slug}" not found at ${this.videoPath}. Run the medium-clips stage first.`
|
|
6025
|
-
);
|
|
6737
|
+
if (!opts?.force && await this.exists()) {
|
|
6738
|
+
return this.videoPath;
|
|
6026
6739
|
}
|
|
6740
|
+
await ensureDirectory(this.videoDir);
|
|
6741
|
+
const mainParent = this.parent;
|
|
6742
|
+
const parentVideo = await mainParent.getEnhancedVideo();
|
|
6743
|
+
await extractCompositeClip(parentVideo, this.clip.segments, this.videoPath);
|
|
6027
6744
|
return this.videoPath;
|
|
6028
6745
|
}
|
|
6029
6746
|
};
|
|
@@ -6222,6 +6939,10 @@ var MainVideoAsset = class _MainVideoAsset extends VideoAsset {
|
|
|
6222
6939
|
get editedVideoPath() {
|
|
6223
6940
|
return join(this.videoDir, `${this.slug}-edited.mp4`);
|
|
6224
6941
|
}
|
|
6942
|
+
/** Path to the enhanced (visual overlays) video: videoDir/{slug}-enhanced.mp4 */
|
|
6943
|
+
get enhancedVideoPath() {
|
|
6944
|
+
return join(this.videoDir, `${this.slug}-enhanced.mp4`);
|
|
6945
|
+
}
|
|
6225
6946
|
/** Path to the captioned video: videoDir/{slug}-captioned.mp4 */
|
|
6226
6947
|
get captionedVideoPath() {
|
|
6227
6948
|
return join(this.videoDir, `${this.slug}-captioned.mp4`);
|
|
@@ -6275,7 +6996,13 @@ var MainVideoAsset = class _MainVideoAsset extends VideoAsset {
|
|
|
6275
6996
|
logger_default.info(`Ingesting video: ${sourcePath} \u2192 ${slug}`);
|
|
6276
6997
|
if (await fileExists(videoDir)) {
|
|
6277
6998
|
logger_default.warn(`Output folder already exists, cleaning previous artifacts: ${videoDir}`);
|
|
6278
|
-
const subDirs = ["thumbnails", "shorts", "social-posts", "chapters", "medium-clips", "captions"];
|
|
6999
|
+
const subDirs = ["thumbnails", "shorts", "social-posts", "chapters", "medium-clips", "captions", "enhancements"];
|
|
7000
|
+
const allEntries = await listDirectory(videoDir);
|
|
7001
|
+
for (const entry of allEntries) {
|
|
7002
|
+
if (entry.endsWith("-enhance-test")) {
|
|
7003
|
+
await removeDirectory(join(videoDir, entry), { recursive: true, force: true });
|
|
7004
|
+
}
|
|
7005
|
+
}
|
|
6279
7006
|
for (const sub of subDirs) {
|
|
6280
7007
|
await removeDirectory(join(videoDir, sub), { recursive: true, force: true });
|
|
6281
7008
|
}
|
|
@@ -6287,14 +7014,18 @@ var MainVideoAsset = class _MainVideoAsset extends VideoAsset {
|
|
|
6287
7014
|
"captions.ass",
|
|
6288
7015
|
"summary.md",
|
|
6289
7016
|
"blog-post.md",
|
|
6290
|
-
"README.md"
|
|
7017
|
+
"README.md",
|
|
7018
|
+
"clip-direction.md",
|
|
7019
|
+
"editorial-direction.md",
|
|
7020
|
+
"cost-report.md",
|
|
7021
|
+
"layout.json"
|
|
6291
7022
|
];
|
|
6292
7023
|
for (const pattern of stalePatterns) {
|
|
6293
7024
|
await removeFile(join(videoDir, pattern));
|
|
6294
7025
|
}
|
|
6295
7026
|
const files = await listDirectory(videoDir);
|
|
6296
7027
|
for (const file of files) {
|
|
6297
|
-
if (file.endsWith("-edited.mp4") || file.endsWith("-captioned.mp4") || file.endsWith("-produced.mp4")) {
|
|
7028
|
+
if (file.endsWith("-edited.mp4") || file.endsWith("-enhanced.mp4") || file.endsWith("-captioned.mp4") || file.endsWith("-produced.mp4")) {
|
|
6298
7029
|
await removeFile(join(videoDir, file));
|
|
6299
7030
|
}
|
|
6300
7031
|
}
|
|
@@ -6416,9 +7147,37 @@ var MainVideoAsset = class _MainVideoAsset extends VideoAsset {
|
|
|
6416
7147
|
logger_default.info("No silence removed, using original video");
|
|
6417
7148
|
return this.videoPath;
|
|
6418
7149
|
}
|
|
7150
|
+
/**
|
|
7151
|
+
* Get the enhanced (visual overlays) video.
|
|
7152
|
+
* If not already generated, runs the visual enhancement stage.
|
|
7153
|
+
* Falls back to the edited video if enhancement is skipped or finds no opportunities.
|
|
7154
|
+
*
|
|
7155
|
+
* @param opts - Options controlling generation
|
|
7156
|
+
* @returns Path to the enhanced or edited video
|
|
7157
|
+
*/
|
|
7158
|
+
async getEnhancedVideo(opts) {
|
|
7159
|
+
if (!opts?.force && await fileExists(this.enhancedVideoPath)) {
|
|
7160
|
+
return this.enhancedVideoPath;
|
|
7161
|
+
}
|
|
7162
|
+
const config2 = getConfig();
|
|
7163
|
+
if (config2.SKIP_VISUAL_ENHANCEMENT) {
|
|
7164
|
+
return this.getEditedVideo(opts);
|
|
7165
|
+
}
|
|
7166
|
+
const editedPath = await this.getEditedVideo(opts);
|
|
7167
|
+
const transcript = await this.getTranscript();
|
|
7168
|
+
const videoFile = await this.toVideoFile();
|
|
7169
|
+
const { enhanceVideo: enhanceVideo2 } = await loadVisualEnhancement();
|
|
7170
|
+
const result = await enhanceVideo2(editedPath, transcript, videoFile);
|
|
7171
|
+
if (result) {
|
|
7172
|
+
logger_default.info(`Visual enhancement completed: ${result.overlays.length} overlays composited`);
|
|
7173
|
+
return result.enhancedVideoPath;
|
|
7174
|
+
}
|
|
7175
|
+
logger_default.info("No visual enhancements generated, using edited video");
|
|
7176
|
+
return editedPath;
|
|
7177
|
+
}
|
|
6419
7178
|
/**
|
|
6420
7179
|
* Get the captioned video.
|
|
6421
|
-
* If not already generated, burns captions into the
|
|
7180
|
+
* If not already generated, burns captions into the enhanced video.
|
|
6422
7181
|
*
|
|
6423
7182
|
* @param opts - Options controlling generation
|
|
6424
7183
|
* @returns Path to the captioned video
|
|
@@ -6427,10 +7186,10 @@ var MainVideoAsset = class _MainVideoAsset extends VideoAsset {
|
|
|
6427
7186
|
if (!opts?.force && await fileExists(this.captionedVideoPath)) {
|
|
6428
7187
|
return this.captionedVideoPath;
|
|
6429
7188
|
}
|
|
6430
|
-
const
|
|
7189
|
+
const enhancedPath = await this.getEnhancedVideo(opts);
|
|
6431
7190
|
const captions = await this.getCaptions();
|
|
6432
7191
|
const { burnCaptions: burnCaptions2 } = await loadCaptionBurning();
|
|
6433
|
-
await burnCaptions2(
|
|
7192
|
+
await burnCaptions2(enhancedPath, captions.ass, this.captionedVideoPath);
|
|
6434
7193
|
logger_default.info(`Captions burned into video: ${this.captionedVideoPath}`);
|
|
6435
7194
|
return this.captionedVideoPath;
|
|
6436
7195
|
}
|
|
@@ -6717,6 +7476,7 @@ var CONTENT_MATRIX = {
|
|
|
6717
7476
|
"medium-clip": { captions: true, variantKey: null }
|
|
6718
7477
|
},
|
|
6719
7478
|
["linkedin" /* LinkedIn */]: {
|
|
7479
|
+
video: { captions: true, variantKey: null },
|
|
6720
7480
|
"medium-clip": { captions: true, variantKey: null }
|
|
6721
7481
|
},
|
|
6722
7482
|
["tiktok" /* TikTok */]: {
|
|
@@ -7196,9 +7956,107 @@ async function buildPublishQueue(video, shorts, mediumClips, socialPosts, captio
|
|
|
7196
7956
|
init_ProducerAgent();
|
|
7197
7957
|
init_captionBurning();
|
|
7198
7958
|
init_singlePassEdit();
|
|
7959
|
+
init_visualEnhancement();
|
|
7199
7960
|
init_modelConfig();
|
|
7200
7961
|
init_costTracker();
|
|
7201
7962
|
init_types();
|
|
7963
|
+
|
|
7964
|
+
// src/services/processingState.ts
|
|
7965
|
+
init_fileSystem();
|
|
7966
|
+
init_paths();
|
|
7967
|
+
init_environment();
|
|
7968
|
+
init_logger2();
|
|
7969
|
+
// Absolute path of the processing-state JSON file inside the configured output dir.
function getStatePath() {
  const { OUTPUT_DIR } = getConfig();
  return join(OUTPUT_DIR, "processing-state.json");
}
|
|
7973
|
+
// Load the processing state from disk; a missing file yields an empty state.
async function readState() {
  const statePath = getStatePath();
  return fileExistsSync(statePath)
    ? readJsonFile(statePath, { videos: {} })
    : { videos: {} };
}
|
|
7980
|
+
// Persist the processing state back to its JSON file.
async function writeState(state) {
  await writeJsonFile(getStatePath(), state);
}
|
|
7984
|
+
// Look up one video's state record by slug (undefined when unknown).
async function getVideoStatus(slug) {
  const { videos } = await readState();
  return videos[slug];
}
|
|
7988
|
+
// Return the subset of videos still needing work: status "pending" or "failed".
async function getUnprocessed() {
  const { videos } = await readState();
  return Object.fromEntries(
    Object.entries(videos).filter(
      ([, record]) => record.status === "pending" || record.status === "failed"
    )
  );
}
|
|
7998
|
+
async function isCompleted(slug) {
|
|
7999
|
+
const status = await getVideoStatus(slug);
|
|
8000
|
+
return status?.status === "completed";
|
|
8001
|
+
}
|
|
8002
|
+
async function markPending(slug, sourcePath) {
|
|
8003
|
+
const state = await readState();
|
|
8004
|
+
state.videos[slug] = {
|
|
8005
|
+
status: "pending",
|
|
8006
|
+
sourcePath
|
|
8007
|
+
};
|
|
8008
|
+
await writeState(state);
|
|
8009
|
+
logger_default.info(`[ProcessingState] Marked pending: ${slug}`);
|
|
8010
|
+
}
|
|
8011
|
+
async function markProcessing(slug) {
|
|
8012
|
+
const state = await readState();
|
|
8013
|
+
const existing = state.videos[slug];
|
|
8014
|
+
if (!existing) {
|
|
8015
|
+
logger_default.warn(`[ProcessingState] Cannot mark processing \u2014 unknown slug: ${slug}`);
|
|
8016
|
+
return;
|
|
8017
|
+
}
|
|
8018
|
+
state.videos[slug] = {
|
|
8019
|
+
...existing,
|
|
8020
|
+
status: "processing",
|
|
8021
|
+
startedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
8022
|
+
};
|
|
8023
|
+
await writeState(state);
|
|
8024
|
+
logger_default.info(`[ProcessingState] Marked processing: ${slug}`);
|
|
8025
|
+
}
|
|
8026
|
+
async function markCompleted(slug) {
|
|
8027
|
+
const state = await readState();
|
|
8028
|
+
const existing = state.videos[slug];
|
|
8029
|
+
if (!existing) {
|
|
8030
|
+
logger_default.warn(`[ProcessingState] Cannot mark completed \u2014 unknown slug: ${slug}`);
|
|
8031
|
+
return;
|
|
8032
|
+
}
|
|
8033
|
+
state.videos[slug] = {
|
|
8034
|
+
...existing,
|
|
8035
|
+
status: "completed",
|
|
8036
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8037
|
+
error: void 0
|
|
8038
|
+
};
|
|
8039
|
+
await writeState(state);
|
|
8040
|
+
logger_default.info(`[ProcessingState] Marked completed: ${slug}`);
|
|
8041
|
+
}
|
|
8042
|
+
async function markFailed(slug, error) {
|
|
8043
|
+
const state = await readState();
|
|
8044
|
+
const existing = state.videos[slug];
|
|
8045
|
+
if (!existing) {
|
|
8046
|
+
logger_default.warn(`[ProcessingState] Cannot mark failed \u2014 unknown slug: ${slug}`);
|
|
8047
|
+
return;
|
|
8048
|
+
}
|
|
8049
|
+
state.videos[slug] = {
|
|
8050
|
+
...existing,
|
|
8051
|
+
status: "failed",
|
|
8052
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8053
|
+
error
|
|
8054
|
+
};
|
|
8055
|
+
await writeState(state);
|
|
8056
|
+
logger_default.info(`[ProcessingState] Marked failed: ${slug} \u2014 ${error}`);
|
|
8057
|
+
}
|
|
8058
|
+
|
|
8059
|
+
// src/pipeline.ts
|
|
7202
8060
|
async function runStage(stageName, fn, stageResults) {
|
|
7203
8061
|
costTracker.setStage(stageName);
|
|
7204
8062
|
const start = Date.now();
|
|
@@ -7307,6 +8165,22 @@ async function processVideo(videoPath) {
|
|
|
7307
8165
|
}
|
|
7308
8166
|
}
|
|
7309
8167
|
const captionTranscript = adjustedTranscript ?? transcript;
|
|
8168
|
+
let enhancedVideoPath;
|
|
8169
|
+
if (!cfg.SKIP_VISUAL_ENHANCEMENT && captionTranscript) {
|
|
8170
|
+
const videoToEnhance = editedVideoPath ?? video.repoPath;
|
|
8171
|
+
const enhancementResult = await runStage(
|
|
8172
|
+
"visual-enhancement" /* VisualEnhancement */,
|
|
8173
|
+
async () => {
|
|
8174
|
+
const result = await enhanceVideo(videoToEnhance, captionTranscript, video);
|
|
8175
|
+
if (!result) return void 0;
|
|
8176
|
+
return result;
|
|
8177
|
+
},
|
|
8178
|
+
stageResults
|
|
8179
|
+
);
|
|
8180
|
+
if (enhancementResult) {
|
|
8181
|
+
enhancedVideoPath = enhancementResult.enhancedVideoPath;
|
|
8182
|
+
}
|
|
8183
|
+
}
|
|
7310
8184
|
let captions;
|
|
7311
8185
|
if (captionTranscript && !cfg.SKIP_CAPTIONS) {
|
|
7312
8186
|
captions = await runStage("captions" /* Captions */, () => generateCaptions(video, captionTranscript), stageResults);
|
|
@@ -7314,7 +8188,7 @@ async function processVideo(videoPath) {
|
|
|
7314
8188
|
let captionedVideoPath;
|
|
7315
8189
|
if (captions && !cfg.SKIP_CAPTIONS) {
|
|
7316
8190
|
const assFile = captions.find((p) => p.endsWith(".ass"));
|
|
7317
|
-
if (assFile && cleaningKeepSegments) {
|
|
8191
|
+
if (assFile && cleaningKeepSegments && !enhancedVideoPath) {
|
|
7318
8192
|
const captionedOutput = join(video.videoDir, `${video.slug}-captioned.mp4`);
|
|
7319
8193
|
captionedVideoPath = await runStage(
|
|
7320
8194
|
"caption-burn" /* CaptionBurn */,
|
|
@@ -7322,7 +8196,7 @@ async function processVideo(videoPath) {
|
|
|
7322
8196
|
stageResults
|
|
7323
8197
|
);
|
|
7324
8198
|
} else if (assFile) {
|
|
7325
|
-
const videoToBurn = editedVideoPath ?? video.repoPath;
|
|
8199
|
+
const videoToBurn = enhancedVideoPath ?? editedVideoPath ?? video.repoPath;
|
|
7326
8200
|
const captionedOutput = join(video.videoDir, `${video.slug}-captioned.mp4`);
|
|
7327
8201
|
captionedVideoPath = await runStage(
|
|
7328
8202
|
"caption-burn" /* CaptionBurn */,
|
|
@@ -7343,13 +8217,23 @@ async function processVideo(videoPath) {
|
|
|
7343
8217
|
}
|
|
7344
8218
|
} catch {
|
|
7345
8219
|
}
|
|
7346
|
-
|
|
8220
|
+
let webcamRegion;
|
|
8221
|
+
try {
|
|
8222
|
+
const layoutPath = join(video.videoDir, "layout.json");
|
|
8223
|
+
if (await fileExists(layoutPath)) {
|
|
8224
|
+
const layout = await readJsonFile(layoutPath);
|
|
8225
|
+
webcamRegion = layout.webcam;
|
|
8226
|
+
}
|
|
8227
|
+
} catch {
|
|
8228
|
+
}
|
|
8229
|
+
const result = await runStage("shorts" /* Shorts */, () => generateShorts(shortsVideo, shortsTranscript, getModelForAgent("ShortsAgent"), clipDirection, webcamRegion), stageResults);
|
|
7347
8230
|
if (result) shorts = result;
|
|
7348
8231
|
}
|
|
7349
8232
|
let mediumClips = [];
|
|
7350
8233
|
if (transcript && !cfg.SKIP_MEDIUM_CLIPS) {
|
|
7351
8234
|
const mediumTranscript = adjustedTranscript ?? transcript;
|
|
7352
|
-
const
|
|
8235
|
+
const mediumVideoPath = enhancedVideoPath ?? editedVideoPath;
|
|
8236
|
+
const mediumVideo = mediumVideoPath ? { ...video, repoPath: mediumVideoPath } : video;
|
|
7353
8237
|
let mediumClipDirection;
|
|
7354
8238
|
try {
|
|
7355
8239
|
const clipDirPath = join(video.videoDir, "clip-direction.md");
|
|
@@ -7455,6 +8339,7 @@ async function processVideo(videoPath) {
|
|
|
7455
8339
|
video,
|
|
7456
8340
|
transcript,
|
|
7457
8341
|
editedVideoPath,
|
|
8342
|
+
enhancedVideoPath,
|
|
7458
8343
|
captions,
|
|
7459
8344
|
captionedVideoPath,
|
|
7460
8345
|
summary,
|
|
@@ -7512,11 +8397,18 @@ function generateCostMarkdown(report) {
|
|
|
7512
8397
|
return md;
|
|
7513
8398
|
}
|
|
7514
8399
|
async function processVideoSafe(videoPath) {
|
|
8400
|
+
const filename = basename(videoPath);
|
|
8401
|
+
const slug = filename.replace(/\.(mp4|mov|webm|avi|mkv)$/i, "");
|
|
8402
|
+
await markPending(slug, videoPath);
|
|
8403
|
+
await markProcessing(slug);
|
|
7515
8404
|
try {
|
|
7516
|
-
|
|
8405
|
+
const result = await processVideo(videoPath);
|
|
8406
|
+
await markCompleted(slug);
|
|
8407
|
+
return result;
|
|
7517
8408
|
} catch (err) {
|
|
7518
8409
|
const message = err instanceof Error ? err.message : String(err);
|
|
7519
8410
|
logger_default.error(`Pipeline failed with uncaught error: ${message}`);
|
|
8411
|
+
await markFailed(slug, message);
|
|
7520
8412
|
return null;
|
|
7521
8413
|
}
|
|
7522
8414
|
}
|
|
@@ -8918,7 +9810,7 @@ program.command("schedule").description("View the current posting schedule acros
|
|
|
8918
9810
|
program.command("doctor").description("Check all prerequisites and dependencies").action(async () => {
|
|
8919
9811
|
await runDoctor();
|
|
8920
9812
|
});
|
|
8921
|
-
var defaultCmd = program.command("process", { isDefault: true }).argument("[video-path]", "Path to a video file to process (implies --once)").option("--watch-dir <path>", "Folder to watch for new recordings (default: env WATCH_FOLDER)").option("--output-dir <path>", "Output directory for processed videos (default: ./recordings)").option("--openai-key <key>", "OpenAI API key (default: env OPENAI_API_KEY)").option("--exa-key <key>", "Exa AI API key for web search (default: env EXA_API_KEY)").option("--once", "Process a single video and exit (no watching)").option("--brand <path>", "Path to brand.json config (default: ./brand.json)").option("--no-git", "Skip git commit/push stage").option("--no-silence-removal", "Skip silence removal stage").option("--no-shorts", "Skip shorts generation").option("--no-medium-clips", "Skip medium clip generation").option("--no-social", "Skip social media post generation").option("--no-captions", "Skip caption generation/burning").option("--no-social-publish", "Skip social media publishing/queue-build stage").option("--late-api-key <key>", "Late API key (default: env LATE_API_KEY)").option("--late-profile-id <id>", "Late profile ID (default: env LATE_PROFILE_ID)").option("-v, --verbose", "Verbose logging").option("--doctor", "Check all prerequisites and exit").action(async (videoPath) => {
|
|
9813
|
+
var defaultCmd = program.command("process", { isDefault: true }).argument("[video-path]", "Path to a video file to process (implies --once)").option("--watch-dir <path>", "Folder to watch for new recordings (default: env WATCH_FOLDER)").option("--output-dir <path>", "Output directory for processed videos (default: ./recordings)").option("--openai-key <key>", "OpenAI API key (default: env OPENAI_API_KEY)").option("--exa-key <key>", "Exa AI API key for web search (default: env EXA_API_KEY)").option("--once", "Process a single video and exit (no watching)").option("--brand <path>", "Path to brand.json config (default: ./brand.json)").option("--no-git", "Skip git commit/push stage").option("--no-silence-removal", "Skip silence removal stage").option("--no-shorts", "Skip shorts generation").option("--no-medium-clips", "Skip medium clip generation").option("--no-social", "Skip social media post generation").option("--no-captions", "Skip caption generation/burning").option("--no-visual-enhancement", "Skip visual enhancement (AI image overlays)").option("--no-social-publish", "Skip social media publishing/queue-build stage").option("--late-api-key <key>", "Late API key (default: env LATE_API_KEY)").option("--late-profile-id <id>", "Late profile ID (default: env LATE_PROFILE_ID)").option("-v, --verbose", "Verbose logging").option("--doctor", "Check all prerequisites and exit").action(async (videoPath) => {
|
|
8922
9814
|
const opts = defaultCmd.opts();
|
|
8923
9815
|
if (opts.doctor) {
|
|
8924
9816
|
await runDoctor();
|
|
@@ -8938,6 +9830,7 @@ var defaultCmd = program.command("process", { isDefault: true }).argument("[vide
|
|
|
8938
9830
|
mediumClips: opts.mediumClips,
|
|
8939
9831
|
social: opts.social,
|
|
8940
9832
|
captions: opts.captions,
|
|
9833
|
+
visualEnhancement: opts.visualEnhancement,
|
|
8941
9834
|
socialPublish: opts.socialPublish,
|
|
8942
9835
|
lateApiKey: opts.lateApiKey,
|
|
8943
9836
|
lateProfileId: opts.lateProfileId
|
|
@@ -8990,12 +9883,47 @@ var defaultCmd = program.command("process", { isDefault: true }).argument("[vide
|
|
|
8990
9883
|
}
|
|
8991
9884
|
process.on("SIGINT", () => shutdown());
|
|
8992
9885
|
process.on("SIGTERM", () => shutdown());
|
|
8993
|
-
watcher.on("new-video", (filePath) => {
|
|
9886
|
+
watcher.on("new-video", async (filePath) => {
|
|
9887
|
+
const filename = filePath.replace(/\\/g, "/").split("/").pop() ?? "";
|
|
9888
|
+
const slug = filename.replace(/\.(mp4|mov|webm|avi|mkv)$/i, "");
|
|
9889
|
+
if (slug && await isCompleted(slug)) {
|
|
9890
|
+
logger_default.info(`Skipping already-processed video: ${filePath}`);
|
|
9891
|
+
return;
|
|
9892
|
+
}
|
|
8994
9893
|
queue.push(filePath);
|
|
8995
9894
|
logger_default.info(`Queued video: ${filePath} (queue length: ${queue.length})`);
|
|
8996
9895
|
processQueue().catch((err) => logger_default.error("Queue processing error:", err));
|
|
8997
9896
|
});
|
|
8998
9897
|
watcher.start();
|
|
9898
|
+
try {
|
|
9899
|
+
const watchFiles = listDirectorySync(config2.WATCH_FOLDER);
|
|
9900
|
+
for (const file of watchFiles) {
|
|
9901
|
+
const ext = extname(file).toLowerCase();
|
|
9902
|
+
if (![".mp4", ".mov", ".webm", ".avi", ".mkv"].includes(ext)) continue;
|
|
9903
|
+
const filePath = join(config2.WATCH_FOLDER, file);
|
|
9904
|
+
const slug = file.replace(/\.(mp4|mov|webm|avi|mkv)$/i, "");
|
|
9905
|
+
const status = await getVideoStatus(slug);
|
|
9906
|
+
if (!status || status.status === "failed" || status.status === "pending") {
|
|
9907
|
+
if (!queue.includes(filePath)) {
|
|
9908
|
+
queue.push(filePath);
|
|
9909
|
+
logger_default.info(`Startup scan: queued ${slug}${status ? ` (was ${status.status})` : " (new)"}`);
|
|
9910
|
+
}
|
|
9911
|
+
}
|
|
9912
|
+
}
|
|
9913
|
+
} catch (err) {
|
|
9914
|
+
logger_default.warn(`Could not scan watch folder on startup: ${err instanceof Error ? err.message : String(err)}`);
|
|
9915
|
+
}
|
|
9916
|
+
const unprocessed = await getUnprocessed();
|
|
9917
|
+
for (const [slug, state] of Object.entries(unprocessed)) {
|
|
9918
|
+
if (!queue.includes(state.sourcePath)) {
|
|
9919
|
+
queue.push(state.sourcePath);
|
|
9920
|
+
logger_default.info(`Re-queued from state: ${slug} (${state.status})`);
|
|
9921
|
+
}
|
|
9922
|
+
}
|
|
9923
|
+
if (queue.length > 0) {
|
|
9924
|
+
logger_default.info(`Startup: ${queue.length} video(s) queued for processing`);
|
|
9925
|
+
processQueue().catch((err) => logger_default.error("Queue processing error:", err));
|
|
9926
|
+
}
|
|
8999
9927
|
if (onceMode) {
|
|
9000
9928
|
logger_default.info("Running in --once mode. Will exit after processing the next video.");
|
|
9001
9929
|
} else {
|