visual-ai-assertions 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -56,6 +56,7 @@ __export(index_exports, {
56
56
  VisualAIRateLimitError: () => VisualAIRateLimitError,
57
57
  VisualAIResponseParseError: () => VisualAIResponseParseError,
58
58
  VisualAITruncationError: () => VisualAITruncationError,
59
+ VisualAIVideoError: () => VisualAIVideoError,
59
60
  assertVisualCompareResult: () => assertVisualCompareResult,
60
61
  assertVisualResult: () => assertVisualResult,
61
62
  formatCheckResult: () => formatCheckResult,
@@ -182,6 +183,12 @@ var VisualAIImageError = class extends VisualAIError {
182
183
  this.name = "VisualAIImageError";
183
184
  }
184
185
  };
186
/**
 * Error type thrown by the video-handling code in this bundle.
 * It always passes "VIDEO_INVALID" as the second constructor argument of
 * VisualAIError and reports its own class name through `name`.
 */
var VisualAIVideoError = class extends VisualAIError {
  /**
   * @param {string} message Human-readable description of the failure.
   */
  constructor(message) {
    super(message, "VIDEO_INVALID");
    this.name = "VisualAIVideoError";
  }
};
185
192
  var VisualAIResponseParseError = class extends VisualAIError {
186
193
  rawResponse;
187
194
  constructor(message, rawResponse) {
@@ -215,7 +222,7 @@ var VisualAIAssertionError = class extends VisualAIError {
215
222
  }
216
223
  };
217
224
  function isVisualAIKnownError(error) {
218
- return error instanceof VisualAIAuthError || error instanceof VisualAIRateLimitError || error instanceof VisualAIProviderError || error instanceof VisualAIImageError || error instanceof VisualAIResponseParseError || error instanceof VisualAITruncationError || error instanceof VisualAIConfigError || error instanceof VisualAIAssertionError;
225
+ return error instanceof VisualAIAuthError || error instanceof VisualAIRateLimitError || error instanceof VisualAIProviderError || error instanceof VisualAIImageError || error instanceof VisualAIVideoError || error instanceof VisualAIResponseParseError || error instanceof VisualAITruncationError || error instanceof VisualAIConfigError || error instanceof VisualAIAssertionError;
219
226
  }
220
227
 
221
228
  // src/core/prompt.ts
@@ -229,7 +236,7 @@ Each issue must have:
229
236
  - "description": what the issue is
230
237
  - "suggestion": how to fix or improve it
231
238
  `;
232
- var CHECK_OUTPUT_SCHEMA = `IMPORTANT: Follow this evaluation order:
239
+ var CHECK_OUTPUT_SCHEMA_IMAGE = `IMPORTANT: Follow this evaluation order:
233
240
  1. First, evaluate EACH statement independently and populate the "statements" array
234
241
  2. Then, set "pass" to true ONLY if every statement passed (logical AND of all statement results)
235
242
  3. Write "reasoning" as a brief overall summary of the evaluation
@@ -269,7 +276,46 @@ Example for a failing check:
269
276
  ]
270
277
  }
271
278
  ${JSON_INSTRUCTIONS}`;
272
- var ASK_OUTPUT_SCHEMA = `Respond with a JSON object matching this exact structure:
279
+ var CHECK_OUTPUT_SCHEMA_VIDEO = `IMPORTANT: Follow this evaluation order:
280
+ 1. First, evaluate EACH statement independently across the entire timeline and populate the "statements" array
281
+ 2. A statement passes if it is true at ANY frame of the timeline, unless the wording explicitly says otherwise (e.g. "throughout", "at all times")
282
+ 3. For each statement that passes, set "timestampSeconds" to the timestamp of the frame that most clearly demonstrates it (or where it first becomes true). Use null when the statement fails or applies across the whole clip.
283
+ 4. Then, set "pass" to true ONLY if every statement passed (logical AND of all statement results)
284
+ 5. Write "reasoning" as a brief overall summary of the evaluation
285
+ 6. Include "issues" only for statements that failed
286
+
287
+ Respond with a JSON object matching this exact structure:
288
+ {
289
+ "pass": boolean, // true ONLY if ALL statements passed \u2014 derive from statements array
290
+ "reasoning": string, // brief overall summary of the evaluation
291
+ "issues": [...], // one issue per failing statement (empty if all pass)
292
+ "statements": [ // one entry per statement, in order \u2014 evaluate these FIRST
293
+ {
294
+ "statement": string, // the original statement text
295
+ "pass": boolean, // whether this statement is true at any point in the timeline
296
+ "reasoning": string, // explanation for this statement, citing frame timestamps where relevant
297
+ "confidence": "high" | "medium" | "low",
298
+ "timestampSeconds": number | null
299
+ // seconds from the start of the clip where the statement is most clearly true,
300
+ // or null if it failed / applies across the whole clip
301
+ }
302
+ ]
303
+ }
304
+ ${ISSUE_SCHEMA_INSTRUCTIONS}
305
+
306
+ Only include issues for statements that fail. If all statements pass, issues should be an empty array.
307
+
308
+ Example for a passing video check:
309
+ {
310
+ "pass": true,
311
+ "reasoning": "The success toast appeared briefly around 3.5s.",
312
+ "issues": [],
313
+ "statements": [
314
+ { "statement": "A success toast with text 'Saved' appears", "pass": true, "reasoning": "A green toast labeled 'Saved' is visible in the bottom-right at the 3.5s frame", "confidence": "high", "timestampSeconds": 3.5 }
315
+ ]
316
+ }
317
+ ${JSON_INSTRUCTIONS}`;
318
+ var ASK_OUTPUT_SCHEMA_IMAGE = `Respond with a JSON object matching this exact structure:
273
319
  {
274
320
  "summary": string, // high-level analysis summary
275
321
  "issues": [...] // list of issues/findings, can be empty
@@ -290,6 +336,17 @@ Example:
290
336
  ]
291
337
  }
292
338
  ${JSON_INSTRUCTIONS}`;
339
+ var ASK_OUTPUT_SCHEMA_VIDEO = `Respond with a JSON object matching this exact structure:
340
+ {
341
+ "summary": string, // high-level summary of what happens across the timeline
342
+ "issues": [...], // list of issues/findings, can be empty
343
+ "frameReferences": number[] // 0-based indices of frames the answer relies on (in order)
344
+ }
345
+ ${ISSUE_SCHEMA_INSTRUCTIONS}
346
+
347
+ Prioritize issues by severity (critical / major / minor) as for image input.
348
+ Cite frame indices in "frameReferences" so the user can locate the moments you describe.
349
+ ${JSON_INSTRUCTIONS}`;
293
350
  var COMPARE_OUTPUT_SCHEMA = `Respond with a JSON object matching this exact structure:
294
351
  {
295
352
  "pass": boolean, // true if no critical or major changes found
@@ -308,7 +365,19 @@ var COMPARE_OUTPUT_SCHEMA = `Respond with a JSON object matching this exact stru
308
365
  If the images appear identical, set pass to true, explain in reasoning, and return an empty changes array.
309
366
  ${JSON_INSTRUCTIONS}`;
310
367
  var DEFAULT_CHECK_ROLE = "You are a visual QA assistant. Evaluate the provided image precisely and objectively.";
368
+ var DEFAULT_CHECK_ROLE_VIDEO = "You are a visual QA assistant. Evaluate the provided sequence of video frames precisely and objectively, treating them as a chronological timeline.";
311
369
  var DEFAULT_ASK_ROLE = "You are a visual QA assistant. Analyze the provided image based on the user's request.";
370
+ var DEFAULT_ASK_ROLE_VIDEO = "You are a visual QA assistant. Analyze the provided sequence of video frames as a chronological timeline based on the user's request.";
371
+ function buildVideoTimelineSection(frameTimestamps, durationSeconds) {
372
+ const formatted = frameTimestamps.map((t, i) => ` ${i}: ${t.toFixed(2)}s`).join("\n");
373
+ return `Video timeline:
374
+ - Total duration: ${durationSeconds.toFixed(2)}s
375
+ - ${frameTimestamps.length} frames sampled (in chronological order)
376
+ - Frame index \u2192 timestamp:
377
+ ${formatted}
378
+
379
+ Treat the attached images as a chronological timeline. The first image is the earliest frame, the last is the latest. Refer to frames by timestamp where helpful.`;
380
+ }
312
381
  var COMPARE_ROLE = "You are performing a visual regression test. Compare the BEFORE image (baseline) to the AFTER image (current) and identify all visual differences. Flag changes that appear unintentional or problematic.";
313
382
  var COMPARE_EDGE_RULES = [
314
383
  "The BEFORE image is the baseline/expected state.",
@@ -321,22 +390,31 @@ function buildInstructionsSection(instructions) {
321
390
  function buildCheckPrompt(statements, options) {
322
391
  const stmts = Array.isArray(statements) ? statements : [statements];
323
392
  const statementsBlock = stmts.map((s, i) => `${i + 1}. "${s}"`).join("\n");
324
- const sections = [options?.role ?? DEFAULT_CHECK_ROLE];
393
+ const media = options?.media;
394
+ const defaultRole = media?.kind === "video" ? DEFAULT_CHECK_ROLE_VIDEO : DEFAULT_CHECK_ROLE;
395
+ const sections = [options?.role ?? defaultRole];
396
+ if (media?.kind === "video") {
397
+ sections.push(buildVideoTimelineSection(media.frameTimestamps, media.durationSeconds));
398
+ }
325
399
  if (options?.instructions && options.instructions.length > 0) {
326
400
  sections.push(buildInstructionsSection(options.instructions));
327
401
  }
328
402
  sections.push(`Statements to evaluate:
329
403
  ${statementsBlock}`);
330
- sections.push(CHECK_OUTPUT_SCHEMA);
404
+ sections.push(media?.kind === "video" ? CHECK_OUTPUT_SCHEMA_VIDEO : CHECK_OUTPUT_SCHEMA_IMAGE);
331
405
  return sections.join("\n\n");
332
406
  }
333
407
  function buildAskPrompt(userPrompt, options) {
334
- const sections = [DEFAULT_ASK_ROLE];
408
+ const media = options?.media;
409
+ const sections = [media?.kind === "video" ? DEFAULT_ASK_ROLE_VIDEO : DEFAULT_ASK_ROLE];
410
+ if (media?.kind === "video") {
411
+ sections.push(buildVideoTimelineSection(media.frameTimestamps, media.durationSeconds));
412
+ }
335
413
  if (options?.instructions && options.instructions.length > 0) {
336
414
  sections.push(buildInstructionsSection(options.instructions));
337
415
  }
338
416
  sections.push(`User request: ${userPrompt}`);
339
- sections.push(ASK_OUTPUT_SCHEMA);
417
+ sections.push(media?.kind === "video" ? ASK_OUTPUT_SCHEMA_VIDEO : ASK_OUTPUT_SCHEMA_IMAGE);
340
418
  return sections.join("\n\n");
341
419
  }
342
420
  function buildAiDiffPrompt() {
@@ -1031,6 +1109,51 @@ async function generateAiDiff(imgA, imgB, model, driver) {
1031
1109
  var import_promises = require("fs/promises");
1032
1110
  var import_node_path = require("path");
1033
1111
  var import_sharp2 = __toESM(require("sharp"), 1);
1112
+
1113
+ // src/core/input-detect.ts
1114
/**
 * Heuristic check for "this string is a filesystem path".
 *
 * @param {string} input Candidate string.
 * @returns {boolean} True when the string contains a backslash or starts
 *   with "/", "./" or "../".
 */
function isFilePath(input) {
  const pathPrefixes = ["/", "./", "../"];
  if (pathPrefixes.some((prefix) => input.startsWith(prefix))) {
    return true;
  }
  // Any backslash is treated as a Windows-style separator.
  return input.includes("\\");
}
1117
/**
 * Reports whether the string starts with an HTTP or HTTPS scheme.
 *
 * @param {string} input Candidate string.
 * @returns {boolean} True for strings beginning with "http://" or "https://".
 */
function isUrl(input) {
  const schemes = ["http://", "https://"];
  return schemes.some((scheme) => input.startsWith(scheme));
}
1120
/**
 * Reports whether the string starts with the "data:" scheme.
 * Only the prefix is checked; the rest of the URL is not validated here.
 *
 * @param {string} input Candidate string.
 * @returns {boolean} True when the string begins with "data:".
 */
function isDataUrl(input) {
  const scheme = "data:";
  return input.startsWith(scheme);
}
1123
/**
 * Splits a base64 data URL into its media type and base64 payload.
 *
 * Accepts the RFC 2397 shape `data:<mediatype>[;param=value...];base64,<data>`.
 * Media-type parameters (for example `;codecs=avc1` or `;charset=utf-8`) are
 * skipped rather than causing the whole URL to be rejected, and the payload
 * may contain line breaks, which decodeBase64 tolerates.
 *
 * @param {string} input Candidate data URL.
 * @returns {{ mimeType: string, base64Payload: string } | null} The parsed
 *   parts, or null when the input is not a base64 data URL with a non-empty
 *   media type and payload.
 */
function parseDataUrl(input) {
  // [\s\S]+ (rather than .+) so wrapped base64 with embedded newlines matches.
  const match = /^data:([^;,]+)(?:;[^;,]*)*;base64,([\s\S]+)$/.exec(input);
  if (!match?.[1] || !match[2]) return null;
  return { mimeType: match[1], base64Payload: match[2] };
}
1128
/**
 * Validates and decodes a standard-alphabet base64 string.
 *
 * CR/LF characters are removed before validation, so wrapped payloads and
 * payloads with a trailing newline after the `=` padding are accepted.
 * Input that is empty once line breaks are removed is rejected.
 *
 * @param {string} payload Base64 text (alphabet A-Z, a-z, 0-9, "+", "/").
 * @returns {Buffer} The decoded bytes.
 * @throws {Error} "Invalid base64 string" when the payload contains any
 *   other character or is empty.
 */
function decodeBase64(payload) {
  const compact = payload.replace(/[\r\n]+/g, "");
  if (!/^[A-Za-z0-9+/]+=*$/.test(compact)) {
    throw new Error("Invalid base64 string");
  }
  return Buffer.from(compact, "base64");
}
1134
/**
 * Cheap prefix sniff for raw (non data-URL) base64 image strings.
 *
 * @param {string} input Candidate base64 text.
 * @returns {boolean} True when the text starts with the base64 encoding of a
 *   known image signature.
 */
function looksLikeImageBase64(input) {
  const signaturePrefixes = [
    "iVBOR", // PNG (0x89 0x50 0x4E 0x47)
    "/9j/", // JPEG (0xFF 0xD8 0xFF)
    "R0lGOD", // GIF (0x47 0x49 0x46)
    "UklGR", // bytes "RIF…", i.e. a RIFF header — presumably WebP
  ];
  return signaturePrefixes.some((prefix) => input.startsWith(prefix));
}
1140
/**
 * Cheap prefix sniff for raw base64 video strings.
 *
 * "GkXf" is the base64 form of bytes 0x1A 0x45 0xDF, and "AAAA" decodes to
 * three zero bytes. NOTE(review): these presumably correspond to the EBML and
 * `ftyp`-box signatures checked by detectVideoMimeType — confirm.
 *
 * @param {string} input Candidate base64 text.
 * @returns {boolean} True when the text starts with "GkXf" or "AAAA".
 */
function looksLikeVideoBase64(input) {
  for (const prefix of ["GkXf", "AAAA"]) {
    if (input.startsWith(prefix)) return true;
  }
  return false;
}
1143
/**
 * Downloads a URL into memory, aborting after `timeoutMs` milliseconds.
 *
 * @param {string} url Resource to fetch.
 * @param {number} timeoutMs Timeout handed to AbortSignal.timeout.
 * @returns {Promise<{ data: Buffer, contentType: string | null }>} The body
 *   bytes and the Content-Type header with any parameters (the part after
 *   ";") removed and trimmed, or null when the header is absent.
 * @throws {Error} `HTTP <status>` when the response is not ok; fetch/abort
 *   errors propagate unchanged.
 */
async function fetchToBuffer(url, timeoutMs) {
  const signal = AbortSignal.timeout(timeoutMs);
  const response = await fetch(url, { signal });
  if (!response.ok) {
    throw new Error(`HTTP ${response.status}`);
  }
  const data = Buffer.from(await response.arrayBuffer());
  const headerValue = response.headers.get("content-type");
  const contentType = headerValue?.split(";")[0]?.trim() ?? null;
  return { data, contentType };
}
1155
+
1156
+ // src/core/image.ts
1034
1157
  var SUPPORTED_FORMATS = /* @__PURE__ */ new Set([
1035
1158
  "image/jpeg",
1036
1159
  "image/png",
@@ -1053,18 +1176,6 @@ function getMimeFromExtension(filePath) {
1053
1176
  const ext = (0, import_node_path.extname)(filePath).toLowerCase();
1054
1177
  return EXTENSION_TO_MIME[ext];
1055
1178
  }
1056
- function isFilePath(input) {
1057
- return input.startsWith("/") || input.startsWith("./") || input.startsWith("../") || input.includes("\\");
1058
- }
1059
- function isUrl(input) {
1060
- return input.startsWith("http://") || input.startsWith("https://");
1061
- }
1062
- function isBase64Image(input) {
1063
- return input.startsWith("iVBOR") || // PNG (0x89 0x50 0x4E 0x47)
1064
- input.startsWith("/9j/") || // JPEG (0xFF 0xD8 0xFF)
1065
- input.startsWith("R0lGOD") || // GIF (0x47 0x49 0x46)
1066
- input.startsWith("UklGR");
1067
- }
1068
1179
  function detectMimeType(data) {
1069
1180
  if (data[0] === 255 && data[1] === 216 && data[2] === 255) {
1070
1181
  return "image/jpeg";
@@ -1118,45 +1229,38 @@ async function loadFromFilePath(filePath) {
1118
1229
  return { data: fileData, mimeType };
1119
1230
  }
1120
1231
  async function loadFromUrl(url) {
1121
- let response;
1232
+ let result;
1122
1233
  try {
1123
- response = await fetch(url, {
1124
- signal: AbortSignal.timeout(URL_FETCH_TIMEOUT_MS)
1125
- });
1234
+ result = await fetchToBuffer(url, URL_FETCH_TIMEOUT_MS);
1126
1235
  } catch (err) {
1127
1236
  throw new VisualAIImageError(
1128
1237
  `Failed to fetch image from URL: ${url} \u2014 ${err instanceof Error ? err.message : String(err)}`
1129
1238
  );
1130
1239
  }
1131
- if (!response.ok) {
1132
- throw new VisualAIImageError(
1133
- `Failed to fetch image from URL: ${url} \u2014 HTTP ${response.status}`
1134
- );
1135
- }
1136
- const arrayBuffer = await response.arrayBuffer();
1137
- const data = Buffer.from(arrayBuffer);
1138
- const contentType = response.headers.get("content-type")?.split(";")[0]?.trim() ?? null;
1240
+ const { data, contentType } = result;
1139
1241
  const mimeType = contentType && isSupportedMimeType(contentType) ? contentType : detectMimeType(data);
1140
1242
  return { data, mimeType };
1141
1243
  }
1142
1244
  function loadFromBase64(input) {
1143
1245
  let base64Data = input;
1144
1246
  let mimeType;
1145
- if (input.startsWith("data:")) {
1146
- const match = /^data:(image\/[^;]+);base64,(.+)$/.exec(input);
1147
- if (!match?.[1] || !match[2]) {
1247
+ if (isDataUrl(input)) {
1248
+ const parsed = parseDataUrl(input);
1249
+ if (!parsed) {
1148
1250
  throw new VisualAIImageError("Invalid data URL format");
1149
1251
  }
1150
- if (!isSupportedMimeType(match[1])) {
1151
- throw new VisualAIImageError(`Unsupported image format: ${match[1]}`);
1252
+ if (!isSupportedMimeType(parsed.mimeType)) {
1253
+ throw new VisualAIImageError(`Unsupported image format: ${parsed.mimeType}`);
1152
1254
  }
1153
- mimeType = match[1];
1154
- base64Data = match[2];
1255
+ mimeType = parsed.mimeType;
1256
+ base64Data = parsed.base64Payload;
1155
1257
  }
1156
- if (!/^[A-Za-z0-9+/\n\r]+=*$/.test(base64Data)) {
1258
+ let data;
1259
+ try {
1260
+ data = decodeBase64(base64Data);
1261
+ } catch {
1157
1262
  throw new VisualAIImageError("Invalid base64 string");
1158
1263
  }
1159
- const data = Buffer.from(base64Data, "base64");
1160
1264
  if (data.length === 0) {
1161
1265
  throw new VisualAIImageError("Empty image data after base64 decode");
1162
1266
  }
@@ -1175,9 +1279,9 @@ async function normalizeImage(input) {
1175
1279
  } else if (typeof input === "string") {
1176
1280
  if (isUrl(input)) {
1177
1281
  ({ data, mimeType } = await loadFromUrl(input));
1178
- } else if (input.startsWith("data:")) {
1282
+ } else if (isDataUrl(input)) {
1179
1283
  ({ data, mimeType } = loadFromBase64(input));
1180
- } else if (isBase64Image(input)) {
1284
+ } else if (looksLikeImageBase64(input)) {
1181
1285
  ({ data, mimeType } = loadFromBase64(input));
1182
1286
  } else if (isFilePath(input)) {
1183
1287
  ({ data, mimeType } = await loadFromFilePath(input));
@@ -1205,6 +1309,435 @@ async function normalizeImage(input) {
1205
1309
  };
1206
1310
  }
1207
1311
 
1312
+ // src/core/debug-frames.ts
1313
+ var import_node_crypto = require("crypto");
1314
+ var import_promises2 = require("fs/promises");
1315
+ var import_node_path2 = require("path");
1316
// Name of the environment variable that isEnabled() reads to decide whether
// debug frames are written.
var DEBUG_FRAMES_ENV = "VISUAL_AI_DEBUG_FRAMES";
// Name of the environment variable that overrides the base output directory
// used by saveDebugFrames().
var DEBUG_FRAMES_DIR_ENV = "VISUAL_AI_DEBUG_FRAMES_DIR";
// Base directory used when DEBUG_FRAMES_DIR_ENV is unset or blank.
var DEFAULT_DIR_NAME = "visual-ai-debug-frames";
1319
/**
 * Decides whether debug-frame dumping is switched on.
 *
 * @param {Record<string, string | undefined>} env Environment-like object.
 * @returns {boolean} True only when env[DEBUG_FRAMES_ENV] equals "true" or
 *   "1", compared case-insensitively; unset or empty means false.
 */
function isEnabled(env) {
  const flag = env[DEBUG_FRAMES_ENV];
  if (flag === undefined || flag === "") {
    return false;
  }
  return ["true", "1"].includes(flag.toLowerCase());
}
1325
/**
 * Formats a date as its ISO-8601 string with every ":" and "." replaced by
 * "-".
 *
 * @param {Date} date Date to format.
 * @returns {string} For example "2024-01-02T03-04-05-678Z".
 */
function timestampSlug(date) {
  const iso = date.toISOString();
  return iso.split(/[:.]/).join("-");
}
1328
/**
 * Zero-pads an index so that all indices below `total` have the same width.
 *
 * @param {number} value Index to format.
 * @param {number} total Number of items; the width is the digit count of
 *   `total - 1`, with a minimum of 2.
 * @returns {string} The padded index, e.g. paddedIndex(3, 10) === "03".
 */
function paddedIndex(value, total) {
  const digitsInLastIndex = String(total - 1).length;
  const width = digitsInLastIndex > 2 ? digitsInLastIndex : 2;
  const text = String(value);
  return text.padStart(width, "0");
}
1332
/**
 * Picks the file extension used for a saved debug frame.
 *
 * @param {string} mimeType Image MIME type of the frame.
 * @returns {string} ".png" or ".webp" for those types, ".jpg" for anything
 *   else.
 */
function extensionFromMimeType(mimeType) {
  switch (mimeType) {
    case "image/png":
      return ".png";
    case "image/webp":
      return ".webp";
    default:
      return ".jpg";
  }
}
1337
/**
 * Best-effort dump of extracted frames to disk for debugging.
 *
 * Does nothing unless isEnabled(env) is true and there is at least one frame.
 * Frames go into a fresh sub-directory (ISO timestamp slug plus three random
 * bytes as hex) of env[DEBUG_FRAMES_DIR_ENV], falling back to
 * DEFAULT_DIR_NAME. Failures are reported on stderr and never thrown.
 *
 * @param {Array<{ index: number, timestampSeconds: number, mimeType: string, data: Uint8Array }>} frames
 *   Frames to write.
 * @param {Record<string, string | undefined>} [env=process.env] Environment
 *   used for the enable flag and directory override.
 * @returns {Promise<string | undefined>} Absolute path of the directory
 *   written, or undefined when disabled, empty, or on failure.
 */
async function saveDebugFrames(frames, env = process.env) {
  if (!isEnabled(env) || frames.length === 0) {
    return undefined;
  }
  const { mkdir, writeFile } = import_promises2;
  const { join, resolve } = import_node_path2;

  const baseDir = env[DEBUG_FRAMES_DIR_ENV]?.trim() || DEFAULT_DIR_NAME;
  const stamp = timestampSlug(/* @__PURE__ */ new Date());
  const suffix = import_node_crypto.randomBytes(3).toString("hex");
  const runDir = resolve(baseDir, `${stamp}-${suffix}`);

  try {
    await mkdir(runDir, { recursive: true });
    // File names are computed inside the try block so that a malformed frame
    // is reported as a warning rather than thrown.
    const pendingWrites = frames.map((frame) => {
      const position = paddedIndex(frame.index, frames.length);
      const seconds = frame.timestampSeconds.toFixed(2);
      const extension = extensionFromMimeType(frame.mimeType);
      const target = join(runDir, `frame-${position}-t${seconds}s${extension}`);
      return writeFile(target, frame.data);
    });
    await Promise.all(pendingWrites);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    process.stderr.write(
      `[visual-ai-assertions] warning: failed to save debug frames to ${runDir}: ${reason}\n`
    );
    return undefined;
  }

  process.stderr.write(
    `[visual-ai-assertions] Saved ${frames.length} debug frame(s) to ${runDir}\n`
  );
  return runDir;
}
1366
+
1367
+ // src/core/video.ts
1368
+ var import_promises3 = require("fs/promises");
1369
+ var import_node_os = require("os");
1370
+ var import_node_path3 = require("path");
1371
// Upper bound, in pixels, that extractFrames' scale filter applies to the
// longer side of each frame.
var FRAME_MAX_DIMENSION = 1568;
// Defaults used by extractFrames when the matching option is omitted.
var DEFAULT_FPS = 1;
var DEFAULT_MAX_FRAMES = 10;
var DEFAULT_MAX_DURATION_SECONDS = 10;
// extractFrames rejects any maxFrames above this value.
var MAX_FRAMES_HARD_CAP = 60;
// Time limits, in milliseconds, for the ffprobe call and the ffmpeg run.
var FFPROBE_TIMEOUT_MS = 15e3;
var FFMPEG_RUN_TIMEOUT_MS = 6e4;
// Lower-case file extension → video MIME type.
var VIDEO_EXTENSIONS = {
  ".mp4": "video/mp4",
  ".m4v": "video/mp4",
  ".webm": "video/webm",
  ".mov": "video/quicktime",
  ".qt": "video/quicktime",
  ".mkv": "video/x-matroska"
};
// MIME types accepted by isSupportedVideoMimeType.
var VIDEO_MIME_TYPES = /* @__PURE__ */ new Set([
  "video/mp4",
  "video/webm",
  "video/quicktime",
  "video/x-matroska"
]);
1392
/**
 * Membership test against VIDEO_MIME_TYPES.
 *
 * @param {string} value MIME type to check (compared exactly, no case folding).
 * @returns {boolean} True when the type is in the supported set.
 */
function isSupportedVideoMimeType(value) {
  const supported = VIDEO_MIME_TYPES.has(value);
  return supported;
}
1395
/**
 * Maps a path's extension to a video MIME type via VIDEO_EXTENSIONS.
 *
 * @param {string} filePath Path whose extension is inspected; the extension
 *   is lower-cased before the lookup.
 * @returns {string | undefined} The MIME type, or undefined for an unknown
 *   or missing extension.
 */
function getVideoMimeFromExtension(filePath) {
  const { extname } = import_node_path3;
  const normalizedExt = extname(filePath).toLowerCase();
  return VIDEO_EXTENSIONS[normalizedExt];
}
1399
/**
 * Sniffs a video container type from the first bytes of a file.
 *
 * @param {ArrayLike<number>} data File bytes; anything shorter than 12 bytes
 *   is reported as unknown.
 * @returns {"video/quicktime" | "video/mp4" | "video/webm" | null}
 *   "video/quicktime" when bytes 4-7 are "ftyp" and bytes 8-11 are "qt  ",
 *   "video/mp4" for any other "ftyp" header, "video/webm" when the data
 *   starts with 1A 45 DF A3, otherwise null.
 */
function detectVideoMimeType(data) {
  if (data.length < 12) return null;

  const hasBytesAt = (offset, expected) =>
    expected.every((byte, i) => data[offset + i] === byte);

  const FTYP = [0x66, 0x74, 0x79, 0x70]; // "ftyp"
  const QT_BRAND = [0x71, 0x74, 0x20, 0x20]; // "qt  "
  const EBML = [0x1a, 0x45, 0xdf, 0xa3];

  if (hasBytesAt(4, FTYP)) {
    return hasBytesAt(8, QT_BRAND) ? "video/quicktime" : "video/mp4";
  }
  return hasBytesAt(0, EBML) ? "video/webm" : null;
}
1412
/**
 * Turns any supported video input into a file on disk that ffmpeg can read.
 *
 * Inputs are handled in this order: byte buffers (format sniffed from the
 * contents), data URLs (MIME type taken from the URL), file paths (MIME type
 * taken from the extension, file used in place), and finally raw base64
 * (format sniffed after decoding). Everything except a file path is written
 * to a temporary file via writeBufferToTemp.
 *
 * @param {Buffer | Uint8Array | string} input Video bytes, data URL, file
 *   path, or base64 string.
 * @returns {Promise<{ path: string, mimeType: string, cleanup: () => Promise<void> }>}
 *   Location of the video plus a cleanup callback (a no-op for file paths).
 * @throws {VisualAIVideoError} When the input type, format, data URL or
 *   base64 payload is not usable.
 */
async function resolveVideoToPath(input) {
  const isBinary = Buffer.isBuffer(input) || input instanceof Uint8Array;
  if (isBinary) {
    const bytes = Buffer.isBuffer(input) ? input : Buffer.from(input);
    const sniffed = detectVideoMimeType(bytes);
    if (!sniffed) {
      throw new VisualAIVideoError("Unable to detect video format from buffer contents");
    }
    return writeBufferToTemp(bytes, sniffed);
  }

  if (typeof input !== "string") {
    throw new VisualAIVideoError(
      "Invalid video input: expected Buffer, Uint8Array, file path, data URL, or base64 string"
    );
  }

  if (isDataUrl(input)) {
    const parsed = parseDataUrl(input);
    if (!parsed) {
      throw new VisualAIVideoError("Invalid data URL format");
    }
    const { mimeType: declaredType, base64Payload } = parsed;
    if (!isSupportedVideoMimeType(declaredType)) {
      throw new VisualAIVideoError(`Unsupported video format: ${declaredType}`);
    }
    let payloadBytes;
    try {
      payloadBytes = decodeBase64(base64Payload);
    } catch {
      throw new VisualAIVideoError("Invalid base64 payload in data URL");
    }
    return writeBufferToTemp(payloadBytes, declaredType);
  }

  if (isFilePath(input)) {
    const typeFromExtension = getVideoMimeFromExtension(input);
    if (!typeFromExtension) {
      throw new VisualAIVideoError(
        `Unsupported video file extension: ${input}. Supported: .mp4, .webm, .mov, .mkv`
      );
    }
    // The caller's file is used directly, so there is nothing to clean up.
    const noopCleanup = async () => {
    };
    return { path: input, mimeType: typeFromExtension, cleanup: noopCleanup };
  }

  // Last resort: treat the string as raw base64. Both failure modes below
  // produce the same message, quoting at most the first 80 characters.
  const unrecognized = () => new VisualAIVideoError(
    `Unrecognized video input: "${input.slice(0, 80)}". Expected a file path, data URL, or base64-encoded video string.`
  );
  let decoded;
  try {
    decoded = decodeBase64(input);
  } catch {
    throw unrecognized();
  }
  const decodedType = detectVideoMimeType(decoded);
  if (!decodedType) {
    throw unrecognized();
  }
  return writeBufferToTemp(decoded, decodedType);
}
1468
/**
 * Writes video bytes to `input<ext>` inside a new temporary directory.
 *
 * @param {Buffer | Uint8Array} data Bytes to write.
 * @param {string} mimeType Video MIME type; selects the extension through
 *   extensionFor and is echoed back in the result.
 * @returns {Promise<{ path: string, mimeType: string, cleanup: () => Promise<void> }>}
 *   The file path plus a cleanup callback that removes the temporary
 *   directory and ignores removal errors.
 * @throws Re-throws any error from the write, after a best-effort removal of
 *   the temporary directory.
 */
async function writeBufferToTemp(data, mimeType) {
  const { mkdtemp, rm, writeFile } = import_promises3;
  const { join } = import_node_path3;

  const dir = await mkdtemp(join(import_node_os.tmpdir(), "visual-ai-video-"));
  // Used both as the returned cleanup and on the failure path below.
  const removeDir = async () => {
    try {
      await rm(dir, { recursive: true, force: true });
    } catch {
    }
  };

  try {
    const path = join(dir, `input${extensionFor(mimeType)}`);
    await writeFile(path, data);
    return { path, mimeType, cleanup: removeDir };
  } catch (err) {
    await removeDir();
    throw err;
  }
}
1492
/**
 * Maps a supported video MIME type to the extension used for temp files.
 *
 * @param {string} mimeType One of the four supported video MIME types.
 * @returns {string | undefined} ".mp4", ".webm", ".mov" or ".mkv"; undefined
 *   for any other value.
 */
function extensionFor(mimeType) {
  const extensionByMimeType = new Map([
    ["video/mp4", ".mp4"],
    ["video/webm", ".webm"],
    ["video/quicktime", ".mov"],
    ["video/x-matroska", ".mkv"],
  ]);
  return extensionByMimeType.get(mimeType);
}
1504
// In-flight or completed load of the fluent-ffmpeg factory; reset to
// undefined when a load fails so that a later call can retry.
var cachedFactoryPromise;
/**
 * Lazily imports fluent-ffmpeg and points it at the ffmpeg/ffprobe binaries
 * exposed by the optional installer packages, when those can be imported.
 *
 * The load is shared through cachedFactoryPromise. Installer packages that
 * are simply missing are skipped silently; any other installer failure is
 * written to stderr as a warning and does not fail the load.
 *
 * @returns {Promise<Function>} The fluent-ffmpeg factory.
 * @throws {VisualAIVideoError} When fluent-ffmpeg itself cannot be imported.
 */
async function loadFfmpegFactory() {
  if (cachedFactoryPromise) return cachedFactoryPromise;

  const describeError = (err) => err instanceof Error ? err.message : String(err);
  const isModuleNotFound = (err) => {
    const code = err?.code;
    return code === "ERR_MODULE_NOT_FOUND" || code === "MODULE_NOT_FOUND";
  };
  // Imports one installer package and hands its `path` to the given setter.
  // The import stays a literal `import("...")` inside the thunk.
  const useInstallerBinary = async (packageName, loadInstaller, applyPath) => {
    try {
      const installer = await loadInstaller();
      const binaryPath = (installer.default ?? installer).path;
      if (binaryPath) applyPath(binaryPath);
    } catch (err) {
      if (isModuleNotFound(err)) return;
      process.stderr.write(
        `[visual-ai-assertions] warning: ${packageName} failed to load: ${describeError(err)}\n`
      );
    }
  };

  const initialise = async () => {
    let ffmpegModule;
    try {
      ffmpegModule = await import("fluent-ffmpeg");
    } catch (err) {
      if (isModuleNotFound(err)) {
        throw new VisualAIVideoError(
          "Could not load fluent-ffmpeg. It ships as a dependency of visual-ai-assertions, so this usually means the install was pruned or the platform-specific binary is unavailable. Reinstall the package or run: pnpm add fluent-ffmpeg @ffmpeg-installer/ffmpeg @ffprobe-installer/ffprobe"
        );
      }
      throw new VisualAIVideoError(`Failed to load fluent-ffmpeg: ${describeError(err)}`);
    }
    const factory = ffmpegModule.default ?? ffmpegModule;
    await useInstallerBinary(
      "@ffmpeg-installer/ffmpeg",
      () => import("@ffmpeg-installer/ffmpeg"),
      (binaryPath) => factory.setFfmpegPath(binaryPath)
    );
    await useInstallerBinary(
      "@ffprobe-installer/ffprobe",
      () => import("@ffprobe-installer/ffprobe"),
      (binaryPath) => factory.setFfprobePath(binaryPath)
    );
    return factory;
  };

  cachedFactoryPromise = initialise();
  try {
    return await cachedFactoryPromise;
  } catch (err) {
    cachedFactoryPromise = undefined;
    throw err;
  }
}
1558
/**
 * Reads a video's duration through ffprobe, with a FFPROBE_TIMEOUT_MS limit.
 *
 * @param {string} videoPath Path of the video file to probe.
 * @returns {Promise<number>} Duration in seconds (finite and > 0).
 * @throws {VisualAIVideoError} On timeout, on an ffprobe error, or when the
 *   reported duration is missing, non-finite or not positive.
 */
async function probeDurationSeconds(videoPath) {
  const ffmpeg = await loadFfmpegFactory();
  return new Promise((resolve2, reject) => {
    let done = false;
    let timeoutHandle;
    // Whichever of the timeout or the ffprobe callback arrives first wins;
    // later arrivals are ignored.
    const settle = (action) => {
      if (done) return;
      done = true;
      clearTimeout(timeoutHandle);
      action();
    };
    const fail = (message) => settle(() => reject(new VisualAIVideoError(message)));

    timeoutHandle = setTimeout(() => {
      fail(`ffprobe timed out after ${FFPROBE_TIMEOUT_MS}ms while probing ${videoPath}`);
    }, FFPROBE_TIMEOUT_MS);

    ffmpeg.ffprobe(videoPath, (err, data) => {
      if (err) {
        fail(
          `Failed to probe video metadata: ${err.message}. Ensure ffprobe is installed (e.g. via @ffprobe-installer/ffprobe).`
        );
        return;
      }
      // A string duration is converted to a number before validation.
      const raw = data.format?.duration;
      const duration = typeof raw === "string" ? Number(raw) : raw;
      const usable = Boolean(duration) && Number.isFinite(duration) && duration > 0;
      if (!usable) {
        fail("Video duration could not be determined");
        return;
      }
      settle(() => resolve2(duration));
    });
  });
}
1602
/**
 * Samples JPEG frames from a video with ffmpeg.
 *
 * Frames are taken at `fps` frames per second, limited to `maxFrames`, and
 * scaled so the longer side does not exceed FRAME_MAX_DIMENSION. Extraction
 * happens in a temporary directory that is removed before returning.
 *
 * @param {string} videoPath Path of the video file.
 * @param {{ fps?: number, maxFrames?: number, maxDurationSeconds?: number }} [options]
 *   Sampling options; defaults are DEFAULT_FPS, DEFAULT_MAX_FRAMES and
 *   DEFAULT_MAX_DURATION_SECONDS. `maxFrames` must be a positive integer no
 *   larger than MAX_FRAMES_HARD_CAP.
 * @returns {Promise<{ frames: Array<{ data: Buffer, mimeType: string, base64: string, timestampSeconds: number, index: number }>, durationSeconds: number }>}
 *   The frames in file-name order plus the probed duration.
 * @throws {VisualAIVideoError} On invalid options, a video longer than
 *   `maxDurationSeconds`, an ffmpeg failure or timeout, or when no frame is
 *   produced.
 */
async function extractFrames(videoPath, options = {}) {
  const fps = options.fps ?? DEFAULT_FPS;
  const maxFrames = options.maxFrames ?? DEFAULT_MAX_FRAMES;
  const maxDurationSeconds = options.maxDurationSeconds ?? DEFAULT_MAX_DURATION_SECONDS;
  if (!Number.isFinite(fps) || fps <= 0) {
    throw new VisualAIVideoError(`Invalid fps: ${fps}. Must be a finite number > 0.`);
  }
  if (!Number.isFinite(maxFrames) || maxFrames <= 0) {
    throw new VisualAIVideoError(`Invalid maxFrames: ${maxFrames}. Must be a finite number > 0.`);
  }
  // maxFrames is passed to ffmpeg as `-vframes`, which only accepts whole
  // numbers. Reject fractions here instead of failing later inside ffmpeg.
  if (!Number.isInteger(maxFrames)) {
    throw new VisualAIVideoError(`Invalid maxFrames: ${maxFrames}. Must be an integer.`);
  }
  if (maxFrames > MAX_FRAMES_HARD_CAP) {
    throw new VisualAIVideoError(
      `maxFrames ${maxFrames} exceeds the hard cap of ${MAX_FRAMES_HARD_CAP}. Lower maxFrames or open an issue if you need a larger limit.`
    );
  }
  if (!Number.isFinite(maxDurationSeconds) || maxDurationSeconds <= 0) {
    throw new VisualAIVideoError(
      `Invalid maxDurationSeconds: ${maxDurationSeconds}. Must be a finite number > 0.`
    );
  }
  const ffmpeg = await loadFfmpegFactory();
  const durationSeconds = await probeDurationSeconds(videoPath);
  if (durationSeconds > maxDurationSeconds) {
    throw new VisualAIVideoError(
      `Video duration ${durationSeconds.toFixed(2)}s exceeds limit of ${maxDurationSeconds}s. Pass { maxDurationSeconds: N } to override, or trim the source video.`
    );
  }
  const outputDir = await (0, import_promises3.mkdtemp)((0, import_node_path3.join)((0, import_node_os.tmpdir)(), "visual-ai-frames-"));
  try {
    // Landscape input: cap the width and derive the height (-2 keeps it even).
    // Otherwise: cap the height and derive the width.
    const filter = `fps=${fps},scale='if(gt(iw,ih),min(${FRAME_MAX_DIMENSION},iw),-2)':'if(gt(iw,ih),-2,min(${FRAME_MAX_DIMENSION},ih))':flags=area`;
    await new Promise((resolve2, reject) => {
      let settled = false;
      const cmd = ffmpeg(videoPath);
      // First of timeout / "end" / "error" wins; later events are ignored.
      const finish = (fn) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        fn();
      };
      const timer = setTimeout(() => {
        try {
          cmd.kill("SIGKILL");
        } catch {
        }
        finish(() => {
          reject(
            new VisualAIVideoError(
              `ffmpeg frame extraction timed out after ${FFMPEG_RUN_TIMEOUT_MS}ms`
            )
          );
        });
      }, FFMPEG_RUN_TIMEOUT_MS);
      cmd.outputOptions(["-vf", filter, "-vframes", String(maxFrames), "-q:v", "3"]).output((0, import_node_path3.join)(outputDir, "frame-%04d.jpg")).on("end", () => {
        finish(() => {
          resolve2();
        });
      }).on("error", (err) => {
        finish(() => {
          reject(new VisualAIVideoError(`ffmpeg frame extraction failed: ${err.message}`));
        });
      }).run();
    });
    // Zero-padded names make a plain lexical sort chronological.
    const files = (await (0, import_promises3.readdir)(outputDir)).filter((name) => name.endsWith(".jpg")).sort();
    if (files.length === 0) {
      throw new VisualAIVideoError(
        "No frames could be extracted from the video. The source may be corrupt or empty."
      );
    }
    const frames = await Promise.all(
      files.map(async (name, index) => {
        const data = await (0, import_promises3.readFile)((0, import_node_path3.join)(outputDir, name));
        // Estimated timestamp: midpoint of the index-th 1/fps window, clamped
        // to the probed duration.
        const timestampSeconds = Math.min(durationSeconds, (index + 0.5) / fps);
        let cachedBase64;
        return {
          data,
          mimeType: "image/jpeg",
          // Encoded on first access and cached afterwards.
          get base64() {
            if (cachedBase64 === void 0) {
              cachedBase64 = data.toString("base64");
            }
            return cachedBase64;
          },
          timestampSeconds,
          index
        };
      })
    );
    return { frames, durationSeconds };
  } finally {
    try {
      await (0, import_promises3.rm)(outputDir, { recursive: true, force: true });
    } catch {
    }
  }
}
1697
+
1698
+ // src/core/media.ts
1699
// Number of leading base64 characters decoded when sniffing a raw base64
// string: 16 characters decode to 12 bytes, the minimum that
// detectVideoMimeType inspects.
var VIDEO_MAGIC_BYTE_PREFIX_LEN = 16;
/**
 * Decides whether an input should be handled as video rather than an image.
 *
 * @param {unknown} input Buffer, Uint8Array, or string (data URL, file path
 *   or raw base64).
 * @returns {boolean} True for byte input with a recognised video signature,
 *   data URLs whose media type starts with "video/", file paths with a known
 *   video extension, and raw base64 whose first bytes carry a video
 *   signature. False for everything else, including non-string, non-byte
 *   input.
 */
function isVideoInput(input) {
  if (Buffer.isBuffer(input)) {
    return detectVideoMimeType(input) !== null;
  }
  if (input instanceof Uint8Array) {
    // Wrap the existing memory instead of copying the whole (possibly large)
    // video just to look at its header bytes.
    const view = Buffer.from(input.buffer, input.byteOffset, input.byteLength);
    return detectVideoMimeType(view) !== null;
  }
  if (typeof input !== "string") return false;
  if (isDataUrl(input)) {
    const parsed = parseDataUrl(input);
    return parsed?.mimeType.startsWith("video/") ?? false;
  }
  if (isFilePath(input)) {
    return getVideoMimeFromExtension(input) !== void 0;
  }
  if (looksLikeVideoBase64(input)) {
    try {
      // Only the prefix is decoded; a malformed prefix means "not a video".
      const header = decodeBase64(input.slice(0, VIDEO_MAGIC_BYTE_PREFIX_LEN));
      return detectVideoMimeType(header) !== null;
    } catch {
      return false;
    }
  }
  return false;
}
1723
/**
 * Normalizes an input into either a single image or a list of video frames.
 *
 * Video input is resolved to a file, sampled with extractFrames, optionally
 * dumped through saveDebugFrames, and its temporary file is cleaned up
 * whether or not extraction succeeds. Anything else goes to normalizeImage.
 *
 * @param {Buffer | Uint8Array | string} input Image or video input.
 * @param {object} [videoOptions] Options forwarded to extractFrames.
 * @returns {Promise<{ kind: "video", frames: Array<object>, durationSeconds: number } | { kind: "image", image: object }>}
 *   The normalized media, tagged by kind.
 */
async function normalizeMedia(input, videoOptions) {
  if (!isVideoInput(input)) {
    return { kind: "image", image: await normalizeImage(input) };
  }

  const { path, cleanup } = await resolveVideoToPath(input);
  try {
    const extraction = await extractFrames(path, videoOptions);
    await saveDebugFrames(extraction.frames);
    return {
      kind: "video",
      frames: extraction.frames,
      durationSeconds: extraction.durationSeconds,
    };
  } finally {
    // Cleanup is best-effort and must not mask the result or the original error.
    try {
      await cleanup();
    } catch {
    }
  }
}
1740
+
1208
1741
  // src/types.ts
1209
1742
  var import_zod = require("zod");
1210
1743
  var IssuePrioritySchema = import_zod.z.enum(["critical", "major", "minor"]);
@@ -1229,7 +1762,13 @@ var StatementResultSchema = import_zod.z.object({
1229
1762
  statement: import_zod.z.string(),
1230
1763
  pass: import_zod.z.boolean(),
1231
1764
  reasoning: import_zod.z.string(),
1232
- confidence: ConfidenceSchema.optional()
1765
+ confidence: ConfidenceSchema.optional(),
1766
+ /**
1767
+ * For video inputs, the approximate timestamp (in seconds, from the start of the clip)
1768
+ * of the frame that most clearly demonstrates the statement. `null` when the statement
1769
+ * fails or applies across the whole clip. Always omitted for image inputs.
1770
+ */
1771
+ timestampSeconds: import_zod.z.number().nonnegative().nullable().optional()
1233
1772
  });
1234
1773
  var UsageInfoSchema = import_zod.z.object({
1235
1774
  inputTokens: import_zod.z.number(),
@@ -1258,6 +1797,11 @@ var CompareResultSchema = BaseResultSchema.extend({
1258
1797
  var AskResultSchema = import_zod.z.object({
1259
1798
  summary: import_zod.z.string(),
1260
1799
  issues: import_zod.z.array(IssueSchema),
1800
+ /**
1801
+ * For video inputs, the indices of frames the model relied on to answer.
1802
+ * Indices are 0-based and refer to entries in `frames.timestampsSeconds`
+ * (`frames` being the frame metadata that `ask()` attaches to its result
+ * when the input was a video).
+ * The field is optional; when present, every entry must be a non-negative integer.
1803
+ */
1804
+ frameReferences: import_zod.z.array(import_zod.z.number().int().nonnegative()).optional(),
1261
1805
  usage: UsageInfoSchema.optional()
1262
1806
  });
1263
1807
 
@@ -1333,6 +1877,29 @@ function createDriver(provider, config) {
1333
1877
  var checkSchemaOptions = toSchemaOptions(CheckResponseSchema);
1334
1878
  var askSchemaOptions = toSchemaOptions(AskResponseSchema);
1335
1879
  var compareSchemaOptions = toSchemaOptions(CompareResponseSchema);
1880
/**
 * Converts normalized media into the pieces used to call the provider.
 *
 * @param {object} media - Either `{ kind: "image", image }` or a video-shaped
 *   object carrying `frames` (each with `timestampSeconds`) and `durationSeconds`.
 * @returns {{ images: Array, mediaContext: object, framesMetadata: (object|undefined) }}
 *   - `images`: a one-element array for an image, or the frame array itself for video.
 *   - `mediaContext`: `{ kind: "image" }`, or the video kind plus per-frame
 *     timestamps and the clip duration.
 *   - `framesMetadata`: `undefined` for an image; for video, the frame count,
 *     timestamps and duration.
 */
function mediaToProviderInputs(media) {
  const isStillImage = media.kind === "image";

  if (!isStillImage) {
    const { frames, durationSeconds } = media;
    // One timestamp array is shared by the prompt context and the result metadata.
    const frameTimes = frames.map(({ timestampSeconds }) => timestampSeconds);

    const mediaContext = {
      kind: "video",
      frameTimestamps: frameTimes,
      durationSeconds
    };
    const framesMetadata = {
      count: frames.length,
      timestampsSeconds: frameTimes,
      durationSeconds
    };
    return { images: frames, mediaContext, framesMetadata };
  }

  return {
    images: [media.image],
    mediaContext: { kind: "image" },
    framesMetadata: undefined
  };
}
1336
1903
  function visualAI(config = {}) {
1337
1904
  const resolvedConfig = resolveConfig(config);
1338
1905
  const driverConfig = {
@@ -1361,34 +1928,44 @@ function visualAI(config = {}) {
1361
1928
  });
1362
1929
  }
1363
1930
  return {
1364
- async check(image, statements, options) {
1931
+ async check(input, statements, options) {
1365
1932
  const stmts = Array.isArray(statements) ? statements : [statements];
1366
1933
  if (stmts.length === 0) {
1367
1934
  throw new VisualAIConfigError("At least one statement is required for check()");
1368
1935
  }
1369
1936
  return withErrorDebug(resolvedConfig, "check", async () => {
1370
- const img = await normalizeImage(image);
1371
- const prompt = buildCheckPrompt(stmts, { instructions: options?.instructions });
1937
+ const media = await normalizeMedia(input, options?.video);
1938
+ const { images, mediaContext, framesMetadata } = mediaToProviderInputs(media);
1939
+ const prompt = buildCheckPrompt(stmts, {
1940
+ instructions: options?.instructions,
1941
+ media: mediaContext
1942
+ });
1372
1943
  debugLog(resolvedConfig, "check prompt", prompt, "prompt");
1373
- const response = await timedSendMessage(driver, [img], prompt, checkSchemaOptions);
1944
+ const response = await timedSendMessage(driver, images, prompt, checkSchemaOptions);
1374
1945
  debugLog(resolvedConfig, "check response", response.text, "response");
1375
1946
  const result = parseCheckResponse(response.text);
1376
1947
  return {
1377
1948
  ...result,
1949
+ ...framesMetadata ? { frames: framesMetadata } : {},
1378
1950
  usage: processUsage("check", response.usage, response.durationSeconds, resolvedConfig)
1379
1951
  };
1380
1952
  });
1381
1953
  },
1382
- async ask(image, userPrompt, options) {
1954
+ async ask(input, userPrompt, options) {
1383
1955
  return withErrorDebug(resolvedConfig, "ask", async () => {
1384
- const img = await normalizeImage(image);
1385
- const prompt = buildAskPrompt(userPrompt, { instructions: options?.instructions });
1956
+ const media = await normalizeMedia(input, options?.video);
1957
+ const { images, mediaContext, framesMetadata } = mediaToProviderInputs(media);
1958
+ const prompt = buildAskPrompt(userPrompt, {
1959
+ instructions: options?.instructions,
1960
+ media: mediaContext
1961
+ });
1386
1962
  debugLog(resolvedConfig, "ask prompt", prompt, "prompt");
1387
- const response = await timedSendMessage(driver, [img], prompt, askSchemaOptions);
1963
+ const response = await timedSendMessage(driver, images, prompt, askSchemaOptions);
1388
1964
  debugLog(resolvedConfig, "ask response", response.text, "response");
1389
1965
  const result = parseAskResponse(response.text);
1390
1966
  return {
1391
1967
  ...result,
1968
+ ...framesMetadata ? { frames: framesMetadata } : {},
1392
1969
  usage: processUsage("ask", response.usage, response.durationSeconds, resolvedConfig)
1393
1970
  };
1394
1971
  });
@@ -1577,6 +2154,7 @@ function assertVisualCompareResult(result, label) {
1577
2154
  VisualAIRateLimitError,
1578
2155
  VisualAIResponseParseError,
1579
2156
  VisualAITruncationError,
2157
+ VisualAIVideoError,
1580
2158
  assertVisualCompareResult,
1581
2159
  assertVisualResult,
1582
2160
  formatCheckResult,