visual-ai-assertions 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -115,6 +115,12 @@ var VisualAIImageError = class extends VisualAIError {
115
115
  this.name = "VisualAIImageError";
116
116
  }
117
117
  };
118
// Error raised when a video input cannot be recognized, decoded, or sampled.
// Carries the machine-readable code "VIDEO_INVALID" via the VisualAIError base.
var VisualAIVideoError = class extends VisualAIError {
  constructor(message) {
    super(message, "VIDEO_INVALID");
    // Explicit name so logs and serialized errors identify the subclass.
    this.name = "VisualAIVideoError";
  }
};
118
124
  var VisualAIResponseParseError = class extends VisualAIError {
119
125
  rawResponse;
120
126
  constructor(message, rawResponse) {
@@ -148,7 +154,7 @@ var VisualAIAssertionError = class extends VisualAIError {
148
154
  }
149
155
  };
150
156
/**
 * Returns true when `error` is one of this library's own error classes,
 * letting callers separate expected failures from unknown exceptions.
 */
function isVisualAIKnownError(error) {
  const knownErrorClasses = [
    VisualAIAuthError,
    VisualAIRateLimitError,
    VisualAIProviderError,
    VisualAIImageError,
    VisualAIVideoError,
    VisualAIResponseParseError,
    VisualAITruncationError,
    VisualAIConfigError,
    VisualAIAssertionError
  ];
  return knownErrorClasses.some((errorClass) => error instanceof errorClass);
}
153
159
 
154
160
  // src/core/prompt.ts
@@ -162,7 +168,7 @@ Each issue must have:
162
168
  - "description": what the issue is
163
169
  - "suggestion": how to fix or improve it
164
170
  `;
165
- var CHECK_OUTPUT_SCHEMA = `IMPORTANT: Follow this evaluation order:
171
+ var CHECK_OUTPUT_SCHEMA_IMAGE = `IMPORTANT: Follow this evaluation order:
166
172
  1. First, evaluate EACH statement independently and populate the "statements" array
167
173
  2. Then, set "pass" to true ONLY if every statement passed (logical AND of all statement results)
168
174
  3. Write "reasoning" as a brief overall summary of the evaluation
@@ -202,7 +208,46 @@ Example for a failing check:
202
208
  ]
203
209
  }
204
210
  ${JSON_INSTRUCTIONS}`;
205
- var ASK_OUTPUT_SCHEMA = `Respond with a JSON object matching this exact structure:
211
+ var CHECK_OUTPUT_SCHEMA_VIDEO = `IMPORTANT: Follow this evaluation order:
212
+ 1. First, evaluate EACH statement independently across the entire timeline and populate the "statements" array
213
+ 2. A statement passes if it is true at ANY frame of the timeline, unless the wording explicitly says otherwise (e.g. "throughout", "at all times")
214
+ 3. For each statement that passes, set "timestampSeconds" to the timestamp of the frame that most clearly demonstrates it (or where it first becomes true). Use null when the statement fails or applies across the whole clip.
215
+ 4. Then, set "pass" to true ONLY if every statement passed (logical AND of all statement results)
216
+ 5. Write "reasoning" as a brief overall summary of the evaluation
217
+ 6. Include "issues" only for statements that failed
218
+
219
+ Respond with a JSON object matching this exact structure:
220
+ {
221
+ "pass": boolean, // true ONLY if ALL statements passed \u2014 derive from statements array
222
+ "reasoning": string, // brief overall summary of the evaluation
223
+ "issues": [...], // one issue per failing statement (empty if all pass)
224
+ "statements": [ // one entry per statement, in order \u2014 evaluate these FIRST
225
+ {
226
+ "statement": string, // the original statement text
227
+ "pass": boolean, // whether this statement is true at any point in the timeline
228
+ "reasoning": string, // explanation for this statement, citing frame timestamps where relevant
229
+ "confidence": "high" | "medium" | "low",
230
+ "timestampSeconds": number | null
231
+ // seconds from the start of the clip where the statement is most clearly true,
232
+ // or null if it failed / applies across the whole clip
233
+ }
234
+ ]
235
+ }
236
+ ${ISSUE_SCHEMA_INSTRUCTIONS}
237
+
238
+ Only include issues for statements that fail. If all statements pass, issues should be an empty array.
239
+
240
+ Example for a passing video check:
241
+ {
242
+ "pass": true,
243
+ "reasoning": "The success toast appeared briefly around 3.5s.",
244
+ "issues": [],
245
+ "statements": [
246
+ { "statement": "A success toast with text 'Saved' appears", "pass": true, "reasoning": "A green toast labeled 'Saved' is visible in the bottom-right at the 3.5s frame", "confidence": "high", "timestampSeconds": 3.5 }
247
+ ]
248
+ }
249
+ ${JSON_INSTRUCTIONS}`;
250
+ var ASK_OUTPUT_SCHEMA_IMAGE = `Respond with a JSON object matching this exact structure:
206
251
  {
207
252
  "summary": string, // high-level analysis summary
208
253
  "issues": [...] // list of issues/findings, can be empty
@@ -223,6 +268,17 @@ Example:
223
268
  ]
224
269
  }
225
270
  ${JSON_INSTRUCTIONS}`;
271
+ var ASK_OUTPUT_SCHEMA_VIDEO = `Respond with a JSON object matching this exact structure:
272
+ {
273
+ "summary": string, // high-level summary of what happens across the timeline
274
+ "issues": [...], // list of issues/findings, can be empty
275
+ "frameReferences": number[] // 0-based indices of frames the answer relies on (in order)
276
+ }
277
+ ${ISSUE_SCHEMA_INSTRUCTIONS}
278
+
279
+ Prioritize issues by severity (critical / major / minor) as for image input.
280
+ Cite frame indices in "frameReferences" so the user can locate the moments you describe.
281
+ ${JSON_INSTRUCTIONS}`;
226
282
  var COMPARE_OUTPUT_SCHEMA = `Respond with a JSON object matching this exact structure:
227
283
  {
228
284
  "pass": boolean, // true if no critical or major changes found
@@ -241,7 +297,19 @@ var COMPARE_OUTPUT_SCHEMA = `Respond with a JSON object matching this exact stru
241
297
  If the images appear identical, set pass to true, explain in reasoning, and return an empty changes array.
242
298
  ${JSON_INSTRUCTIONS}`;
243
299
  var DEFAULT_CHECK_ROLE = "You are a visual QA assistant. Evaluate the provided image precisely and objectively.";
300
+ var DEFAULT_CHECK_ROLE_VIDEO = "You are a visual QA assistant. Evaluate the provided sequence of video frames precisely and objectively, treating them as a chronological timeline.";
244
301
  var DEFAULT_ASK_ROLE = "You are a visual QA assistant. Analyze the provided image based on the user's request.";
302
+ var DEFAULT_ASK_ROLE_VIDEO = "You are a visual QA assistant. Analyze the provided sequence of video frames as a chronological timeline based on the user's request.";
303
// Builds the prompt section that explains the sampled-frame timeline to the
// model: total duration, frame count, and a frame-index -> timestamp table.
function buildVideoTimelineSection(frameTimestamps, durationSeconds) {
  // One "<index>: <seconds>s" row per sampled frame, in chronological order.
  const formatted = frameTimestamps.map((t, i) => ` ${i}: ${t.toFixed(2)}s`).join("\n");
  return `Video timeline:
- Total duration: ${durationSeconds.toFixed(2)}s
- ${frameTimestamps.length} frames sampled (in chronological order)
- Frame index \u2192 timestamp:
${formatted}

Treat the attached images as a chronological timeline. The first image is the earliest frame, the last is the latest. Refer to frames by timestamp where helpful.`;
}
245
313
  var COMPARE_ROLE = "You are performing a visual regression test. Compare the BEFORE image (baseline) to the AFTER image (current) and identify all visual differences. Flag changes that appear unintentional or problematic.";
246
314
  var COMPARE_EDGE_RULES = [
247
315
  "The BEFORE image is the baseline/expected state.",
@@ -254,22 +322,31 @@ function buildInstructionsSection(instructions) {
254
322
// Assembles the full check() prompt: role, optional video timeline, optional
// caller instructions, the numbered statements, and the JSON output schema.
// `options.media` selects between the image and video prompt variants.
function buildCheckPrompt(statements, options) {
  // Accept a single statement or an array of statements.
  const stmts = Array.isArray(statements) ? statements : [statements];
  // Numbered, quoted list: `1. "..."` per statement.
  const statementsBlock = stmts.map((s, i) => `${i + 1}. "${s}"`).join("\n");
  const media = options?.media;
  const defaultRole = media?.kind === "video" ? DEFAULT_CHECK_ROLE_VIDEO : DEFAULT_CHECK_ROLE;
  // A caller-supplied role overrides the default for either media kind.
  const sections = [options?.role ?? defaultRole];
  if (media?.kind === "video") {
    sections.push(buildVideoTimelineSection(media.frameTimestamps, media.durationSeconds));
  }
  if (options?.instructions && options.instructions.length > 0) {
    sections.push(buildInstructionsSection(options.instructions));
  }
  sections.push(`Statements to evaluate:
${statementsBlock}`);
  sections.push(media?.kind === "video" ? CHECK_OUTPUT_SCHEMA_VIDEO : CHECK_OUTPUT_SCHEMA_IMAGE);
  return sections.join("\n\n");
}
266
339
// Assembles the ask() prompt: role, optional video timeline, optional caller
// instructions, the user's request, and the JSON output schema. `options.media`
// selects between the image and video prompt variants.
function buildAskPrompt(userPrompt, options) {
  const media = options?.media;
  const sections = [media?.kind === "video" ? DEFAULT_ASK_ROLE_VIDEO : DEFAULT_ASK_ROLE];
  if (media?.kind === "video") {
    sections.push(buildVideoTimelineSection(media.frameTimestamps, media.durationSeconds));
  }
  if (options?.instructions && options.instructions.length > 0) {
    sections.push(buildInstructionsSection(options.instructions));
  }
  sections.push(`User request: ${userPrompt}`);
  sections.push(media?.kind === "video" ? ASK_OUTPUT_SCHEMA_VIDEO : ASK_OUTPUT_SCHEMA_IMAGE);
  return sections.join("\n\n");
}
275
352
  function buildAiDiffPrompt() {
@@ -964,6 +1041,51 @@ async function generateAiDiff(imgA, imgB, model, driver) {
964
1041
  import { readFile } from "fs/promises";
965
1042
  import { extname } from "path";
966
1043
  import sharp2 from "sharp";
1044
+
1045
// src/core/input-detect.ts
// Shared heuristics for classifying raw string inputs (path vs URL vs data URL
// vs bare base64) before any I/O happens.

/** True when the string looks like a filesystem path (absolute, explicit relative, or Windows-style). */
function isFilePath(input) {
  const pathPrefixes = ["/", "./", "../"];
  return pathPrefixes.some((prefix) => input.startsWith(prefix)) || input.includes("\\");
}
/** True for http:// and https:// URLs only. */
function isUrl(input) {
  return /^https?:\/\//.test(input);
}
/** True for RFC 2397 `data:` URLs. */
function isDataUrl(input) {
  return input.startsWith("data:");
}
/**
 * Splits a base64 data URL into its MIME type and payload.
 * Returns null for non-base64 or otherwise malformed data URLs.
 */
function parseDataUrl(input) {
  const parts = /^data:([^;]+);base64,(.+)$/.exec(input);
  if (parts?.[1] && parts[2]) {
    return { mimeType: parts[1], base64Payload: parts[2] };
  }
  return null;
}
/**
 * Decodes a base64 payload after a shape check (standard alphabet, optional
 * trailing padding, embedded newlines allowed). Throws on anything else,
 * because Buffer.from would silently skip invalid characters.
 */
function decodeBase64(payload) {
  const base64Shape = /^[A-Za-z0-9+/\n\r]+=*$/;
  if (!base64Shape.test(payload)) {
    throw new Error("Invalid base64 string");
  }
  return Buffer.from(payload, "base64");
}
/** True when the base64 text starts like a known image format's magic bytes. */
function looksLikeImageBase64(input) {
  const imagePrefixes = [
    "iVBOR", // PNG (0x89 0x50 0x4E 0x47)
    "/9j/", // JPEG (0xFF 0xD8 0xFF)
    "R0lGOD", // GIF (0x47 0x49 0x46)
    "UklGR" // RIFF container ("RIFF"), e.g. WebP
  ];
  return imagePrefixes.some((prefix) => input.startsWith(prefix));
}
/** True when the base64 text starts like a video container: EBML magic ("GkXf") or the zero bytes of an MP4/MOV box size ("AAAA"). */
function looksLikeVideoBase64(input) {
  return input.startsWith("GkXf") || input.startsWith("AAAA");
}
1075
/**
 * Downloads `url` into a Buffer with an AbortSignal-based timeout.
 * Returns `{ data, contentType }` where contentType is the response's media
 * type without parameters (or null when the header is absent).
 * Throws a plain Error on non-2xx responses; callers wrap it per domain.
 */
async function fetchToBuffer(url, timeoutMs) {
  const response = await fetch(url, { signal: AbortSignal.timeout(timeoutMs) });
  if (!response.ok) {
    throw new Error(`HTTP ${response.status}`);
  }
  const data = Buffer.from(await response.arrayBuffer());
  // Strip parameters such as "; charset=utf-8" from the content-type header.
  const rawType = response.headers.get("content-type");
  const contentType = rawType?.split(";")[0]?.trim() ?? null;
  return { data, contentType };
}
1087
+
1088
+ // src/core/image.ts
967
1089
  var SUPPORTED_FORMATS = /* @__PURE__ */ new Set([
968
1090
  "image/jpeg",
969
1091
  "image/png",
@@ -986,18 +1108,6 @@ function getMimeFromExtension(filePath) {
986
1108
  const ext = extname(filePath).toLowerCase();
987
1109
  return EXTENSION_TO_MIME[ext];
988
1110
  }
989
- function isFilePath(input) {
990
- return input.startsWith("/") || input.startsWith("./") || input.startsWith("../") || input.includes("\\");
991
- }
992
- function isUrl(input) {
993
- return input.startsWith("http://") || input.startsWith("https://");
994
- }
995
- function isBase64Image(input) {
996
- return input.startsWith("iVBOR") || // PNG (0x89 0x50 0x4E 0x47)
997
- input.startsWith("/9j/") || // JPEG (0xFF 0xD8 0xFF)
998
- input.startsWith("R0lGOD") || // GIF (0x47 0x49 0x46)
999
- input.startsWith("UklGR");
1000
- }
1001
1111
  function detectMimeType(data) {
1002
1112
  if (data[0] === 255 && data[1] === 216 && data[2] === 255) {
1003
1113
  return "image/jpeg";
@@ -1051,45 +1161,38 @@ async function loadFromFilePath(filePath) {
1051
1161
  return { data: fileData, mimeType };
1052
1162
  }
1053
1163
/**
 * Fetches an image over HTTP(S) and returns `{ data, mimeType }`.
 * Any fetch/timeout/HTTP failure is wrapped in VisualAIImageError; when the
 * declared content-type is missing or unsupported, the bytes are sniffed.
 */
async function loadFromUrl(url) {
  let fetched;
  try {
    fetched = await fetchToBuffer(url, URL_FETCH_TIMEOUT_MS);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new VisualAIImageError(`Failed to fetch image from URL: ${url} \u2014 ${reason}`);
  }
  const { data, contentType } = fetched;
  // Trust the server's content-type only when it is a supported image type;
  // otherwise fall back to magic-byte detection.
  const mimeType = contentType && isSupportedMimeType(contentType) ? contentType : detectMimeType(data);
  return { data, mimeType };
}
1075
1176
  function loadFromBase64(input) {
1076
1177
  let base64Data = input;
1077
1178
  let mimeType;
1078
- if (input.startsWith("data:")) {
1079
- const match = /^data:(image\/[^;]+);base64,(.+)$/.exec(input);
1080
- if (!match?.[1] || !match[2]) {
1179
+ if (isDataUrl(input)) {
1180
+ const parsed = parseDataUrl(input);
1181
+ if (!parsed) {
1081
1182
  throw new VisualAIImageError("Invalid data URL format");
1082
1183
  }
1083
- if (!isSupportedMimeType(match[1])) {
1084
- throw new VisualAIImageError(`Unsupported image format: ${match[1]}`);
1184
+ if (!isSupportedMimeType(parsed.mimeType)) {
1185
+ throw new VisualAIImageError(`Unsupported image format: ${parsed.mimeType}`);
1085
1186
  }
1086
- mimeType = match[1];
1087
- base64Data = match[2];
1187
+ mimeType = parsed.mimeType;
1188
+ base64Data = parsed.base64Payload;
1088
1189
  }
1089
- if (!/^[A-Za-z0-9+/\n\r]+=*$/.test(base64Data)) {
1190
+ let data;
1191
+ try {
1192
+ data = decodeBase64(base64Data);
1193
+ } catch {
1090
1194
  throw new VisualAIImageError("Invalid base64 string");
1091
1195
  }
1092
- const data = Buffer.from(base64Data, "base64");
1093
1196
  if (data.length === 0) {
1094
1197
  throw new VisualAIImageError("Empty image data after base64 decode");
1095
1198
  }
@@ -1108,9 +1211,9 @@ async function normalizeImage(input) {
1108
1211
  } else if (typeof input === "string") {
1109
1212
  if (isUrl(input)) {
1110
1213
  ({ data, mimeType } = await loadFromUrl(input));
1111
- } else if (input.startsWith("data:")) {
1214
+ } else if (isDataUrl(input)) {
1112
1215
  ({ data, mimeType } = loadFromBase64(input));
1113
- } else if (isBase64Image(input)) {
1216
+ } else if (looksLikeImageBase64(input)) {
1114
1217
  ({ data, mimeType } = loadFromBase64(input));
1115
1218
  } else if (isFilePath(input)) {
1116
1219
  ({ data, mimeType } = await loadFromFilePath(input));
@@ -1138,6 +1241,379 @@ async function normalizeImage(input) {
1138
1241
  };
1139
1242
  }
1140
1243
 
1244
+ // src/core/video.ts
1245
+ import { mkdtemp, readFile as readFile2, readdir, rm, writeFile } from "fs/promises";
1246
+ import { tmpdir } from "os";
1247
+ import { extname as extname2, join } from "path";
1248
// Frame-sampling limits and defaults for video inputs.
var FRAME_MAX_DIMENSION = 1568;
var DEFAULT_FPS = 1;
var DEFAULT_MAX_FRAMES = 10;
var DEFAULT_MAX_DURATION_SECONDS = 10;
var MAX_FRAMES_HARD_CAP = 60;
var FFPROBE_TIMEOUT_MS = 15e3;
var FFMPEG_RUN_TIMEOUT_MS = 6e4;
// File-extension -> MIME map for the container formats we accept.
var VIDEO_EXTENSIONS = {
  ".mp4": "video/mp4",
  ".m4v": "video/mp4",
  ".webm": "video/webm",
  ".mov": "video/quicktime",
  ".qt": "video/quicktime",
  ".mkv": "video/x-matroska"
};
var VIDEO_MIME_TYPES = /* @__PURE__ */ new Set([
  "video/mp4",
  "video/webm",
  "video/quicktime",
  "video/x-matroska"
]);
/** True when `value` is one of the supported video MIME types. */
function isSupportedVideoMimeType(value) {
  return VIDEO_MIME_TYPES.has(value);
}
/** Maps a file path's extension (case-insensitive) to a video MIME type, or undefined. */
function getVideoMimeFromExtension(filePath) {
  return VIDEO_EXTENSIONS[extname2(filePath).toLowerCase()];
}
/**
 * Sniffs a video container from its magic bytes.
 * Recognizes ISO BMFF ("ftyp" at offset 4; brand "qt  " -> QuickTime, anything
 * else -> MP4) and EBML (0x1A45DFA3, reported as WebM; Matroska shares this
 * magic). Returns null when neither signature matches or fewer than 12 bytes
 * are available.
 */
function detectVideoMimeType(data) {
  if (data.length < 12) return null;
  const hasFtypBox = data[4] === 0x66 && data[5] === 0x74 && data[6] === 0x79 && data[7] === 0x70;
  if (hasFtypBox) {
    const isQuickTimeBrand = data[8] === 0x71 && data[9] === 0x74 && data[10] === 0x20 && data[11] === 0x20;
    return isQuickTimeBrand ? "video/quicktime" : "video/mp4";
  }
  const hasEbmlMagic = data[0] === 0x1a && data[1] === 0x45 && data[2] === 0xdf && data[3] === 0xa3;
  return hasEbmlMagic ? "video/webm" : null;
}
1289
/**
 * Resolves any accepted video input (Buffer/Uint8Array, file path, data URL,
 * or bare base64 string) to a playable file on disk.
 * Returns { path, mimeType, cleanup }; cleanup() removes any temp file created
 * here and is a no-op for caller-owned file paths.
 * Throws VisualAIVideoError for unrecognized or unsupported inputs.
 */
async function resolveVideoToPath(input) {
  // Raw bytes: sniff the container from magic bytes, then spill to a temp file.
  if (Buffer.isBuffer(input) || input instanceof Uint8Array) {
    const buf2 = Buffer.isBuffer(input) ? input : Buffer.from(input);
    const mimeType2 = detectVideoMimeType(buf2);
    if (!mimeType2) {
      throw new VisualAIVideoError("Unable to detect video format from buffer contents");
    }
    return writeBufferToTemp(buf2, mimeType2);
  }
  if (typeof input !== "string") {
    throw new VisualAIVideoError(
      "Invalid video input: expected Buffer, Uint8Array, file path, data URL, or base64 string"
    );
  }
  // data: URL — the declared MIME type must be a supported video type.
  if (isDataUrl(input)) {
    const parsed = parseDataUrl(input);
    if (!parsed) {
      throw new VisualAIVideoError("Invalid data URL format");
    }
    if (!isSupportedVideoMimeType(parsed.mimeType)) {
      throw new VisualAIVideoError(`Unsupported video format: ${parsed.mimeType}`);
    }
    let buf2;
    try {
      buf2 = decodeBase64(parsed.base64Payload);
    } catch {
      throw new VisualAIVideoError("Invalid base64 payload in data URL");
    }
    return writeBufferToTemp(buf2, parsed.mimeType);
  }
  // Existing file on disk: type comes from the extension; nothing to clean up.
  if (isFilePath(input)) {
    const mimeType2 = getVideoMimeFromExtension(input);
    if (!mimeType2) {
      throw new VisualAIVideoError(
        `Unsupported video file extension: ${input}. Supported: .mp4, .webm, .mov, .mkv`
      );
    }
    return { path: input, mimeType: mimeType2, cleanup: async () => {
    } };
  }
  // Last resort: treat the string as bare base64 and sniff the decoded bytes.
  let buf;
  try {
    buf = decodeBase64(input);
  } catch {
    throw new VisualAIVideoError(
      `Unrecognized video input: "${input.slice(0, 80)}". Expected a file path, data URL, or base64-encoded video string.`
    );
  }
  const mimeType = detectVideoMimeType(buf);
  if (!mimeType) {
    throw new VisualAIVideoError(
      `Unrecognized video input: "${input.slice(0, 80)}". Expected a file path, data URL, or base64-encoded video string.`
    );
  }
  return writeBufferToTemp(buf, mimeType);
}
1345
/**
 * Persists a video buffer to a fresh temp directory and returns its path,
 * MIME type, and an async cleanup() that removes the directory (best-effort).
 * The directory is removed eagerly when the write itself fails.
 */
async function writeBufferToTemp(data, mimeType) {
  const dir = await mkdtemp(join(tmpdir(), "visual-ai-video-"));
  try {
    const ext = extensionFor(mimeType);
    const path = join(dir, `input${ext}`);
    await writeFile(path, data);
    return {
      path,
      mimeType,
      cleanup: async () => {
        // Best-effort removal; never surface cleanup errors to callers.
        try {
          await rm(dir, { recursive: true, force: true });
        } catch {
        }
      }
    };
  } catch (err) {
    // Don't leak the temp directory when writing fails.
    try {
      await rm(dir, { recursive: true, force: true });
    } catch {
    }
    throw err;
  }
}
/**
 * Maps a supported video MIME type to the file extension ffmpeg should see.
 * Fix: the original switch had no default and returned `undefined` for an
 * unsupported type, which would silently yield a temp file named
 * "inputundefined" and an obscure ffmpeg failure later; now an unknown type
 * fails fast with a clear internal-invariant error.
 */
function extensionFor(mimeType) {
  switch (mimeType) {
    case "video/mp4":
      return ".mp4";
    case "video/webm":
      return ".webm";
    case "video/quicktime":
      return ".mov";
    case "video/x-matroska":
      return ".mkv";
    default:
      throw new Error(`Unsupported video mime type: ${mimeType}`);
  }
}
1381
// Memoized promise for the fluent-ffmpeg factory; cleared on failure so a
// later call can retry (e.g. after the user installs the missing package).
var cachedFactoryPromise;
/**
 * Lazily loads fluent-ffmpeg and points it at the bundled ffmpeg/ffprobe
 * binaries from the optional installer packages when those are present.
 * Throws VisualAIVideoError when fluent-ffmpeg itself cannot be loaded.
 */
async function loadFfmpegFactory() {
  if (cachedFactoryPromise) return cachedFactoryPromise;
  cachedFactoryPromise = (async () => {
    let ffmpegModule;
    try {
      ffmpegModule = await import("fluent-ffmpeg");
    } catch (err) {
      const code = err?.code;
      // A missing dependency gets an actionable install hint; any other load
      // failure is surfaced with its original message.
      if (code === "ERR_MODULE_NOT_FOUND" || code === "MODULE_NOT_FOUND") {
        throw new VisualAIVideoError(
          "Video support requires fluent-ffmpeg. Install it with: pnpm add -D fluent-ffmpeg @ffmpeg-installer/ffmpeg @ffprobe-installer/ffprobe @types/fluent-ffmpeg"
        );
      }
      throw new VisualAIVideoError(
        `Failed to load fluent-ffmpeg: ${err instanceof Error ? err.message : String(err)}`
      );
    }
    // Handle both ESM default export and CJS module shapes.
    const factory = ffmpegModule.default ?? ffmpegModule;
    // Installer packages are optional: a missing module is ignored silently
    // (presumably system binaries can still be found on PATH — not verified
    // here), but any other load failure is worth a warning.
    try {
      const installer = await import("@ffmpeg-installer/ffmpeg");
      const path = (installer.default ?? installer).path;
      if (path) factory.setFfmpegPath(path);
    } catch (err) {
      const code = err?.code;
      if (code !== "ERR_MODULE_NOT_FOUND" && code !== "MODULE_NOT_FOUND") {
        process.stderr.write(
          `[visual-ai-assertions] warning: @ffmpeg-installer/ffmpeg failed to load: ${err instanceof Error ? err.message : String(err)}
`
        );
      }
    }
    try {
      const installer = await import("@ffprobe-installer/ffprobe");
      const path = (installer.default ?? installer).path;
      if (path) factory.setFfprobePath(path);
    } catch (err) {
      const code = err?.code;
      if (code !== "ERR_MODULE_NOT_FOUND" && code !== "MODULE_NOT_FOUND") {
        process.stderr.write(
          `[visual-ai-assertions] warning: @ffprobe-installer/ffprobe failed to load: ${err instanceof Error ? err.message : String(err)}
`
        );
      }
    }
    return factory;
  })();
  try {
    return await cachedFactoryPromise;
  } catch (err) {
    // Drop the failed promise so the next call retries instead of replaying a stale rejection.
    cachedFactoryPromise = void 0;
    throw err;
  }
}
1435
/**
 * Reads the container-reported duration (in seconds) of the video at
 * `videoPath` via ffprobe, guarded by FFPROBE_TIMEOUT_MS.
 * Throws VisualAIVideoError on probe failure, timeout, or a missing/invalid
 * duration.
 */
async function probeDurationSeconds(videoPath) {
  const ffmpeg = await loadFfmpegFactory();
  return new Promise((resolve, reject) => {
    // `finish` makes settlement single-shot: whichever of the timeout or the
    // ffprobe callback fires first wins, and the timer is always cleared.
    let settled = false;
    const finish = (fn) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      fn();
    };
    const timer = setTimeout(() => {
      finish(() => {
        reject(
          new VisualAIVideoError(
            `ffprobe timed out after ${FFPROBE_TIMEOUT_MS}ms while probing ${videoPath}`
          )
        );
      });
    }, FFPROBE_TIMEOUT_MS);
    ffmpeg.ffprobe(videoPath, (err, data) => {
      if (err) {
        finish(() => {
          reject(
            new VisualAIVideoError(
              `Failed to probe video metadata: ${err.message}. Ensure ffprobe is installed (e.g. via @ffprobe-installer/ffprobe).`
            )
          );
        });
        return;
      }
      // ffprobe may report duration as a string; normalize to a number and
      // reject zero/NaN/negative values as undeterminable.
      const raw = data.format?.duration;
      const duration = typeof raw === "string" ? Number(raw) : raw;
      if (!duration || !Number.isFinite(duration) || duration <= 0) {
        finish(() => {
          reject(new VisualAIVideoError("Video duration could not be determined"));
        });
        return;
      }
      finish(() => {
        resolve(duration);
      });
    });
  });
}
1479
/**
 * Samples JPEG frames from the video at `videoPath` using ffmpeg.
 *
 * Options (all optional): `fps` (sampling rate, default 1), `maxFrames`
 * (default 10, hard-capped at MAX_FRAMES_HARD_CAP), `maxDurationSeconds`
 * (default 10 — longer videos are rejected, not truncated).
 *
 * Returns { frames, durationSeconds }; each frame carries its bytes, a lazily
 * computed base64 view, an approximate timestamp, and its 0-based index.
 * Throws VisualAIVideoError on invalid options, over-long input, ffmpeg
 * failure/timeout, or when no frames could be extracted.
 */
async function extractFrames(videoPath, options = {}) {
  const fps = options.fps ?? DEFAULT_FPS;
  const maxFrames = options.maxFrames ?? DEFAULT_MAX_FRAMES;
  const maxDurationSeconds = options.maxDurationSeconds ?? DEFAULT_MAX_DURATION_SECONDS;
  if (!Number.isFinite(fps) || fps <= 0) {
    throw new VisualAIVideoError(`Invalid fps: ${fps}. Must be a finite number > 0.`);
  }
  if (!Number.isFinite(maxFrames) || maxFrames <= 0) {
    throw new VisualAIVideoError(`Invalid maxFrames: ${maxFrames}. Must be a finite number > 0.`);
  }
  if (maxFrames > MAX_FRAMES_HARD_CAP) {
    throw new VisualAIVideoError(
      `maxFrames ${maxFrames} exceeds the hard cap of ${MAX_FRAMES_HARD_CAP}. Lower maxFrames or open an issue if you need a larger limit.`
    );
  }
  if (!Number.isFinite(maxDurationSeconds) || maxDurationSeconds <= 0) {
    throw new VisualAIVideoError(
      `Invalid maxDurationSeconds: ${maxDurationSeconds}. Must be a finite number > 0.`
    );
  }
  const ffmpeg = await loadFfmpegFactory();
  const durationSeconds = await probeDurationSeconds(videoPath);
  if (durationSeconds > maxDurationSeconds) {
    throw new VisualAIVideoError(
      `Video duration ${durationSeconds.toFixed(2)}s exceeds limit of ${maxDurationSeconds}s. Pass { maxDurationSeconds: N } to override, or trim the source video.`
    );
  }
  const outputDir = await mkdtemp(join(tmpdir(), "visual-ai-frames-"));
  try {
    // The fps filter samples at the requested rate; the scale expression caps
    // the longer side at FRAME_MAX_DIMENSION and lets ffmpeg pick an even
    // value (-2) for the other side. flags=area selects area-average scaling.
    const filter = `fps=${fps},scale='if(gt(iw,ih),min(${FRAME_MAX_DIMENSION},iw),-2)':'if(gt(iw,ih),-2,min(${FRAME_MAX_DIMENSION},ih))':flags=area`;
    await new Promise((resolve, reject) => {
      // Single-shot settlement guard shared by the timeout and ffmpeg events.
      let settled = false;
      const cmd = ffmpeg(videoPath);
      const finish = (fn) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        fn();
      };
      const timer = setTimeout(() => {
        // Kill the (possibly wedged) ffmpeg process before rejecting.
        try {
          cmd.kill("SIGKILL");
        } catch {
        }
        finish(() => {
          reject(
            new VisualAIVideoError(
              `ffmpeg frame extraction timed out after ${FFMPEG_RUN_TIMEOUT_MS}ms`
            )
          );
        });
      }, FFMPEG_RUN_TIMEOUT_MS);
      // -vframes caps the number of emitted frames; -q:v 3 is a high-quality JPEG setting.
      cmd.outputOptions(["-vf", filter, "-vframes", String(maxFrames), "-q:v", "3"]).output(join(outputDir, "frame-%04d.jpg")).on("end", () => {
        finish(() => {
          resolve();
        });
      }).on("error", (err) => {
        finish(() => {
          reject(new VisualAIVideoError(`ffmpeg frame extraction failed: ${err.message}`));
        });
      }).run();
    });
    // The %04d zero padding keeps the lexicographic sort chronological.
    const files = (await readdir(outputDir)).filter((name) => name.endsWith(".jpg")).sort();
    if (files.length === 0) {
      throw new VisualAIVideoError(
        "No frames could be extracted from the video. The source may be corrupt or empty."
      );
    }
    const frames = await Promise.all(
      files.map(async (name, index) => {
        const data = await readFile2(join(outputDir, name));
        // Approximate timestamp: midpoint of the frame's 1/fps sampling
        // window, clamped to the clip length.
        const timestampSeconds = Math.min(durationSeconds, (index + 0.5) / fps);
        let cachedBase64;
        return {
          data,
          mimeType: "image/jpeg",
          // Lazy getter: base64 is computed at most once, on first access.
          get base64() {
            if (cachedBase64 === void 0) {
              cachedBase64 = data.toString("base64");
            }
            return cachedBase64;
          },
          timestampSeconds,
          index
        };
      })
    );
    return { frames, durationSeconds };
  } finally {
    // Frame bytes are already in memory; always remove the temp directory (best-effort).
    try {
      await rm(outputDir, { recursive: true, force: true });
    } catch {
    }
  }
}
1574
+
1575
// src/core/media.ts
// 16 base64 characters decode to exactly the 12 bytes detectVideoMimeType needs.
var VIDEO_MAGIC_BYTE_PREFIX_LEN = 16;
/**
 * Decides whether an input should be treated as video rather than an image:
 * buffers are sniffed by magic bytes, data URLs by declared MIME type, file
 * paths by extension, and bare strings by a base64 prefix check plus a sniff
 * of the decoded leading bytes.
 */
function isVideoInput(input) {
  if (Buffer.isBuffer(input) || input instanceof Uint8Array) {
    const bytes = Buffer.isBuffer(input) ? input : Buffer.from(input);
    return detectVideoMimeType(bytes) !== null;
  }
  if (typeof input !== "string") return false;
  if (isDataUrl(input)) {
    return parseDataUrl(input)?.mimeType.startsWith("video/") ?? false;
  }
  if (isFilePath(input)) {
    return getVideoMimeFromExtension(input) !== void 0;
  }
  if (!looksLikeVideoBase64(input)) return false;
  try {
    const head = decodeBase64(input.slice(0, VIDEO_MAGIC_BYTE_PREFIX_LEN));
    return detectVideoMimeType(head) !== null;
  } catch {
    return false;
  }
}
1600
/**
 * Normalizes any supported input into a tagged media union:
 * `{ kind: "image", image }` or `{ kind: "video", frames, durationSeconds }`.
 * For video, the resolved temp file (if any) is always cleaned up, even when
 * frame extraction throws.
 */
async function normalizeMedia(input, videoOptions) {
  if (!isVideoInput(input)) {
    const image = await normalizeImage(input);
    return { kind: "image", image };
  }
  const { path, cleanup } = await resolveVideoToPath(input);
  try {
    const { frames, durationSeconds } = await extractFrames(path, videoOptions);
    return { kind: "video", frames, durationSeconds };
  } finally {
    // Best-effort cleanup; extraction results are already in memory.
    try {
      await cleanup();
    } catch {
    }
  }
}
1616
+
1141
1617
  // src/types.ts
1142
1618
  import { z } from "zod";
1143
1619
  var IssuePrioritySchema = z.enum(["critical", "major", "minor"]);
@@ -1162,7 +1638,13 @@ var StatementResultSchema = z.object({
1162
1638
  statement: z.string(),
1163
1639
  pass: z.boolean(),
1164
1640
  reasoning: z.string(),
1165
- confidence: ConfidenceSchema.optional()
1641
+ confidence: ConfidenceSchema.optional(),
1642
+ /**
1643
+ * For video inputs, the approximate timestamp (in seconds, from the start of the clip)
1644
+ * of the frame that most clearly demonstrates the statement. `null` when the statement
1645
+ * fails or applies across the whole clip. Always omitted for image inputs.
1646
+ */
1647
+ timestampSeconds: z.number().nonnegative().nullable().optional()
1166
1648
  });
1167
1649
  var UsageInfoSchema = z.object({
1168
1650
  inputTokens: z.number(),
@@ -1191,6 +1673,11 @@ var CompareResultSchema = BaseResultSchema.extend({
1191
1673
  var AskResultSchema = z.object({
1192
1674
  summary: z.string(),
1193
1675
  issues: z.array(IssueSchema),
1676
+ /**
1677
+ * For video inputs, the indices of frames the model relied on to answer.
1678
+ * Indices are 0-based and refer to entries in `frames.timestampsSeconds`.
1679
+ */
1680
+ frameReferences: z.array(z.number().int().nonnegative()).optional(),
1194
1681
  usage: UsageInfoSchema.optional()
1195
1682
  });
1196
1683
 
@@ -1266,6 +1753,29 @@ function createDriver(provider, config) {
1266
1753
  var checkSchemaOptions = toSchemaOptions(CheckResponseSchema);
1267
1754
  var askSchemaOptions = toSchemaOptions(AskResponseSchema);
1268
1755
  var compareSchemaOptions = toSchemaOptions(CompareResponseSchema);
1756
/**
 * Adapts normalized media into the three provider-facing pieces:
 * `images` to attach, `mediaContext` for prompt building, and
 * `framesMetadata` for the result payload (undefined for images).
 */
function mediaToProviderInputs(media) {
  if (media.kind === "image") {
    return { images: [media.image], mediaContext: { kind: "image" }, framesMetadata: void 0 };
  }
  const { frames, durationSeconds } = media;
  const timestamps = frames.map((frame) => frame.timestampSeconds);
  return {
    images: frames,
    mediaContext: { kind: "video", frameTimestamps: timestamps, durationSeconds },
    framesMetadata: { count: frames.length, timestampsSeconds: timestamps, durationSeconds }
  };
}
1269
1779
  function visualAI(config = {}) {
1270
1780
  const resolvedConfig = resolveConfig(config);
1271
1781
  const driverConfig = {
@@ -1294,34 +1804,44 @@ function visualAI(config = {}) {
1294
1804
  });
1295
1805
  }
1296
1806
  return {
1297
- async check(image, statements, options) {
1807
+ async check(input, statements, options) {
1298
1808
  const stmts = Array.isArray(statements) ? statements : [statements];
1299
1809
  if (stmts.length === 0) {
1300
1810
  throw new VisualAIConfigError("At least one statement is required for check()");
1301
1811
  }
1302
1812
  return withErrorDebug(resolvedConfig, "check", async () => {
1303
- const img = await normalizeImage(image);
1304
- const prompt = buildCheckPrompt(stmts, { instructions: options?.instructions });
1813
+ const media = await normalizeMedia(input, options?.video);
1814
+ const { images, mediaContext, framesMetadata } = mediaToProviderInputs(media);
1815
+ const prompt = buildCheckPrompt(stmts, {
1816
+ instructions: options?.instructions,
1817
+ media: mediaContext
1818
+ });
1305
1819
  debugLog(resolvedConfig, "check prompt", prompt, "prompt");
1306
- const response = await timedSendMessage(driver, [img], prompt, checkSchemaOptions);
1820
+ const response = await timedSendMessage(driver, images, prompt, checkSchemaOptions);
1307
1821
  debugLog(resolvedConfig, "check response", response.text, "response");
1308
1822
  const result = parseCheckResponse(response.text);
1309
1823
  return {
1310
1824
  ...result,
1825
+ ...framesMetadata ? { frames: framesMetadata } : {},
1311
1826
  usage: processUsage("check", response.usage, response.durationSeconds, resolvedConfig)
1312
1827
  };
1313
1828
  });
1314
1829
  },
1315
- async ask(image, userPrompt, options) {
1830
+ async ask(input, userPrompt, options) {
1316
1831
  return withErrorDebug(resolvedConfig, "ask", async () => {
1317
- const img = await normalizeImage(image);
1318
- const prompt = buildAskPrompt(userPrompt, { instructions: options?.instructions });
1832
+ const media = await normalizeMedia(input, options?.video);
1833
+ const { images, mediaContext, framesMetadata } = mediaToProviderInputs(media);
1834
+ const prompt = buildAskPrompt(userPrompt, {
1835
+ instructions: options?.instructions,
1836
+ media: mediaContext
1837
+ });
1319
1838
  debugLog(resolvedConfig, "ask prompt", prompt, "prompt");
1320
- const response = await timedSendMessage(driver, [img], prompt, askSchemaOptions);
1839
+ const response = await timedSendMessage(driver, images, prompt, askSchemaOptions);
1321
1840
  debugLog(resolvedConfig, "ask response", response.text, "response");
1322
1841
  const result = parseAskResponse(response.text);
1323
1842
  return {
1324
1843
  ...result,
1844
+ ...framesMetadata ? { frames: framesMetadata } : {},
1325
1845
  usage: processUsage("ask", response.usage, response.durationSeconds, resolvedConfig)
1326
1846
  };
1327
1847
  });
@@ -1509,6 +2029,7 @@ export {
1509
2029
  VisualAIRateLimitError,
1510
2030
  VisualAIResponseParseError,
1511
2031
  VisualAITruncationError,
2032
+ VisualAIVideoError,
1512
2033
  assertVisualCompareResult,
1513
2034
  assertVisualResult,
1514
2035
  formatCheckResult,