visual-ai-assertions 0.7.2 → 0.9.0
This diff reflects the publicly released contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
- package/README.md +69 -16
- package/dist/index.cjs +588 -52
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +178 -25
- package/dist/index.d.ts +178 -25
- package/dist/index.js +587 -52
- package/dist/index.js.map +1 -1
- package/package.json +31 -21
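The headline change in 0.9.0 is video input for check() and ask(), wired through the new src/core/video.ts and src/core/media.ts code in the dist diff below. As orientation, here is a minimal sketch of how that surface appears to be used; the recording path and statement text are illustrative, and visualAI() is assumed to be the package's existing entry point (the README changes themselves are not shown in this diff).

```ts
import { visualAI } from "visual-ai-assertions";

// Sketch only: video inputs (file path, data URL, base64, or Buffer) are auto-detected,
// and options.video maps to the fps / maxFrames / maxDurationSeconds knobs added below.
const ai = visualAI();
const result = await ai.check(
  "./recordings/save-flow.mp4", // illustrative path
  ["A success toast with text 'Saved' appears"],
  { video: { fps: 2, maxFrames: 12, maxDurationSeconds: 15 } }
);
console.log(result.pass);
console.log(result.statements[0]?.timestampSeconds); // e.g. 3.5 (seconds into the clip)
console.log(result.frames?.timestampsSeconds);       // timestamps of the sampled frames
```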
package/dist/index.cjs (CHANGED)
@@ -56,6 +56,7 @@ __export(index_exports, {
  VisualAIRateLimitError: () => VisualAIRateLimitError,
  VisualAIResponseParseError: () => VisualAIResponseParseError,
  VisualAITruncationError: () => VisualAITruncationError,
+ VisualAIVideoError: () => VisualAIVideoError,
  assertVisualCompareResult: () => assertVisualCompareResult,
  assertVisualResult: () => assertVisualResult,
  formatCheckResult: () => formatCheckResult,
@@ -79,11 +80,13 @@ var Provider = {
  };
  var Model = {
  Anthropic: {
+ OPUS_4_7: "claude-opus-4-7",
  OPUS_4_6: "claude-opus-4-6",
  SONNET_4_6: "claude-sonnet-4-6",
  HAIKU_4_5: "claude-haiku-4-5"
  },
  OpenAI: {
+ GPT_5_5: "gpt-5.5",
  GPT_5_4: "gpt-5.4",
  GPT_5_4_PRO: "gpt-5.4-pro",
  GPT_5_4_MINI: "gpt-5.4-mini",
@@ -180,6 +183,12 @@ var VisualAIImageError = class extends VisualAIError {
  this.name = "VisualAIImageError";
  }
  };
+ var VisualAIVideoError = class extends VisualAIError {
+ constructor(message) {
+ super(message, "VIDEO_INVALID");
+ this.name = "VisualAIVideoError";
+ }
+ };
  var VisualAIResponseParseError = class extends VisualAIError {
  rawResponse;
  constructor(message, rawResponse) {
@@ -213,7 +222,7 @@ var VisualAIAssertionError = class extends VisualAIError {
  }
  };
  function isVisualAIKnownError(error) {
- return error instanceof VisualAIAuthError || error instanceof VisualAIRateLimitError || error instanceof VisualAIProviderError || error instanceof VisualAIImageError || error instanceof VisualAIResponseParseError || error instanceof VisualAITruncationError || error instanceof VisualAIConfigError || error instanceof VisualAIAssertionError;
+ return error instanceof VisualAIAuthError || error instanceof VisualAIRateLimitError || error instanceof VisualAIProviderError || error instanceof VisualAIImageError || error instanceof VisualAIVideoError || error instanceof VisualAIResponseParseError || error instanceof VisualAITruncationError || error instanceof VisualAIConfigError || error instanceof VisualAIAssertionError;
  }

  // src/core/prompt.ts
@@ -227,7 +236,7 @@ Each issue must have:
  - "description": what the issue is
  - "suggestion": how to fix or improve it
  `;
- var
+ var CHECK_OUTPUT_SCHEMA_IMAGE = `IMPORTANT: Follow this evaluation order:
  1. First, evaluate EACH statement independently and populate the "statements" array
  2. Then, set "pass" to true ONLY if every statement passed (logical AND of all statement results)
  3. Write "reasoning" as a brief overall summary of the evaluation
@@ -267,7 +276,46 @@ Example for a failing check:
  ]
  }
  ${JSON_INSTRUCTIONS}`;
- var
+ var CHECK_OUTPUT_SCHEMA_VIDEO = `IMPORTANT: Follow this evaluation order:
+ 1. First, evaluate EACH statement independently across the entire timeline and populate the "statements" array
+ 2. A statement passes if it is true at ANY frame of the timeline, unless the wording explicitly says otherwise (e.g. "throughout", "at all times")
+ 3. For each statement that passes, set "timestampSeconds" to the timestamp of the frame that most clearly demonstrates it (or where it first becomes true). Use null when the statement fails or applies across the whole clip.
+ 4. Then, set "pass" to true ONLY if every statement passed (logical AND of all statement results)
+ 5. Write "reasoning" as a brief overall summary of the evaluation
+ 6. Include "issues" only for statements that failed
+
+ Respond with a JSON object matching this exact structure:
+ {
+ "pass": boolean, // true ONLY if ALL statements passed \u2014 derive from statements array
+ "reasoning": string, // brief overall summary of the evaluation
+ "issues": [...], // one issue per failing statement (empty if all pass)
+ "statements": [ // one entry per statement, in order \u2014 evaluate these FIRST
+ {
+ "statement": string, // the original statement text
+ "pass": boolean, // whether this statement is true at any point in the timeline
+ "reasoning": string, // explanation for this statement, citing frame timestamps where relevant
+ "confidence": "high" | "medium" | "low",
+ "timestampSeconds": number | null
+ // seconds from the start of the clip where the statement is most clearly true,
+ // or null if it failed / applies across the whole clip
+ }
+ ]
+ }
+ ${ISSUE_SCHEMA_INSTRUCTIONS}
+
+ Only include issues for statements that fail. If all statements pass, issues should be an empty array.
+
+ Example for a passing video check:
+ {
+ "pass": true,
+ "reasoning": "The success toast appeared briefly around 3.5s.",
+ "issues": [],
+ "statements": [
+ { "statement": "A success toast with text 'Saved' appears", "pass": true, "reasoning": "A green toast labeled 'Saved' is visible in the bottom-right at the 3.5s frame", "confidence": "high", "timestampSeconds": 3.5 }
+ ]
+ }
+ ${JSON_INSTRUCTIONS}`;
+ var ASK_OUTPUT_SCHEMA_IMAGE = `Respond with a JSON object matching this exact structure:
  {
  "summary": string, // high-level analysis summary
  "issues": [...] // list of issues/findings, can be empty
@@ -288,6 +336,17 @@ Example:
  ]
  }
  ${JSON_INSTRUCTIONS}`;
+ var ASK_OUTPUT_SCHEMA_VIDEO = `Respond with a JSON object matching this exact structure:
+ {
+ "summary": string, // high-level summary of what happens across the timeline
+ "issues": [...], // list of issues/findings, can be empty
+ "frameReferences": number[] // 0-based indices of frames the answer relies on (in order)
+ }
+ ${ISSUE_SCHEMA_INSTRUCTIONS}
+
+ Prioritize issues by severity (critical / major / minor) as for image input.
+ Cite frame indices in "frameReferences" so the user can locate the moments you describe.
+ ${JSON_INSTRUCTIONS}`;
  var COMPARE_OUTPUT_SCHEMA = `Respond with a JSON object matching this exact structure:
  {
  "pass": boolean, // true if no critical or major changes found
@@ -306,7 +365,19 @@ var COMPARE_OUTPUT_SCHEMA = `Respond with a JSON object matching this exact stru
  If the images appear identical, set pass to true, explain in reasoning, and return an empty changes array.
  ${JSON_INSTRUCTIONS}`;
  var DEFAULT_CHECK_ROLE = "You are a visual QA assistant. Evaluate the provided image precisely and objectively.";
+ var DEFAULT_CHECK_ROLE_VIDEO = "You are a visual QA assistant. Evaluate the provided sequence of video frames precisely and objectively, treating them as a chronological timeline.";
  var DEFAULT_ASK_ROLE = "You are a visual QA assistant. Analyze the provided image based on the user's request.";
+ var DEFAULT_ASK_ROLE_VIDEO = "You are a visual QA assistant. Analyze the provided sequence of video frames as a chronological timeline based on the user's request.";
+ function buildVideoTimelineSection(frameTimestamps, durationSeconds) {
+ const formatted = frameTimestamps.map((t, i) => ` ${i}: ${t.toFixed(2)}s`).join("\n");
+ return `Video timeline:
+ - Total duration: ${durationSeconds.toFixed(2)}s
+ - ${frameTimestamps.length} frames sampled (in chronological order)
+ - Frame index \u2192 timestamp:
+ ${formatted}
+
+ Treat the attached images as a chronological timeline. The first image is the earliest frame, the last is the latest. Refer to frames by timestamp where helpful.`;
+ }
  var COMPARE_ROLE = "You are performing a visual regression test. Compare the BEFORE image (baseline) to the AFTER image (current) and identify all visual differences. Flag changes that appear unintentional or problematic.";
  var COMPARE_EDGE_RULES = [
  "The BEFORE image is the baseline/expected state.",
@@ -319,22 +390,31 @@ function buildInstructionsSection(instructions) {
  function buildCheckPrompt(statements, options) {
  const stmts = Array.isArray(statements) ? statements : [statements];
  const statementsBlock = stmts.map((s, i) => `${i + 1}. "${s}"`).join("\n");
- const
+ const media = options?.media;
+ const defaultRole = media?.kind === "video" ? DEFAULT_CHECK_ROLE_VIDEO : DEFAULT_CHECK_ROLE;
+ const sections = [options?.role ?? defaultRole];
+ if (media?.kind === "video") {
+ sections.push(buildVideoTimelineSection(media.frameTimestamps, media.durationSeconds));
+ }
  if (options?.instructions && options.instructions.length > 0) {
  sections.push(buildInstructionsSection(options.instructions));
  }
  sections.push(`Statements to evaluate:
  ${statementsBlock}`);
- sections.push(
+ sections.push(media?.kind === "video" ? CHECK_OUTPUT_SCHEMA_VIDEO : CHECK_OUTPUT_SCHEMA_IMAGE);
  return sections.join("\n\n");
  }
  function buildAskPrompt(userPrompt, options) {
- const
+ const media = options?.media;
+ const sections = [media?.kind === "video" ? DEFAULT_ASK_ROLE_VIDEO : DEFAULT_ASK_ROLE];
+ if (media?.kind === "video") {
+ sections.push(buildVideoTimelineSection(media.frameTimestamps, media.durationSeconds));
+ }
  if (options?.instructions && options.instructions.length > 0) {
  sections.push(buildInstructionsSection(options.instructions));
  }
  sections.push(`User request: ${userPrompt}`);
- sections.push(
+ sections.push(media?.kind === "video" ? ASK_OUTPUT_SCHEMA_VIDEO : ASK_OUTPUT_SCHEMA_IMAGE);
  return sections.join("\n\n");
  }
  function buildAiDiffPrompt() {
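For video inputs, buildCheckPrompt() and buildAskPrompt() now prepend a timeline section produced by buildVideoTimelineSection(). A worked example of what that section appears to contain for a 3.20 s clip with three sampled frames, based on the template above (the helper is internal to the bundle, and the exact whitespace of the per-frame lines is approximate):

```ts
// Expected output of buildVideoTimelineSection([0.5, 1.5, 2.5], 3.2), per the template above.
const timelineSection = [
  "Video timeline:",
  "- Total duration: 3.20s",
  "- 3 frames sampled (in chronological order)",
  "- Frame index \u2192 timestamp:",
  " 0: 0.50s",
  " 1: 1.50s",
  " 2: 2.50s",
  "",
  "Treat the attached images as a chronological timeline. The first image is the earliest frame, the last is the latest. Refer to frames by timestamp where helpful.",
].join("\n");
```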
@@ -484,6 +564,10 @@ function parseRetryAfter(value) {
  }

  // src/providers/anthropic.ts
+ function mapEffort(level, model) {
+ if (level !== "xhigh") return level;
+ return model === Model.Anthropic.OPUS_4_7 ? "xhigh" : "max";
+ }
  var AnthropicDriver = class {
  client;
  model;
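mapEffort() keeps the provider call compatible across models: any effort level other than "xhigh" passes through unchanged, and "xhigh" is only forwarded when the model is claude-opus-4-7, with every other model falling back to "max". A small sketch of the resulting mapping (mapEffort is internal; this only restates the behavior of the code above):

```ts
mapEffort("high", Model.Anthropic.OPUS_4_6);    // "high"  (non-xhigh levels pass through)
mapEffort("xhigh", Model.Anthropic.OPUS_4_7);   // "xhigh" (only the new model receives xhigh)
mapEffort("xhigh", Model.Anthropic.SONNET_4_6); // "max"   (other models are mapped down)
```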
@@ -541,7 +625,7 @@ var AnthropicDriver = class {
  if (this.reasoningEffort) {
  requestParams.thinking = { type: "adaptive" };
  requestParams.output_config = {
- effort: this.reasoningEffort
+ effort: mapEffort(this.reasoningEffort, this.model)
  };
  }
  const message = await client.messages.create(requestParams);
@@ -865,6 +949,10 @@ function resolveConfig(config) {
  // src/core/pricing.ts
  var PER_MILLION = 1e6;
  var PRICING_TABLE = {
+ [`${Provider.ANTHROPIC}:${Model.Anthropic.OPUS_4_7}`]: {
+ inputPricePerToken: 5 / PER_MILLION,
+ outputPricePerToken: 25 / PER_MILLION
+ },
  [`${Provider.ANTHROPIC}:${Model.Anthropic.OPUS_4_6}`]: {
  inputPricePerToken: 5 / PER_MILLION,
  outputPricePerToken: 25 / PER_MILLION
@@ -877,6 +965,10 @@ var PRICING_TABLE = {
  inputPricePerToken: 1 / PER_MILLION,
  outputPricePerToken: 5 / PER_MILLION
  },
+ [`${Provider.OPENAI}:${Model.OpenAI.GPT_5_5}`]: {
+ inputPricePerToken: 5 / PER_MILLION,
+ outputPricePerToken: 30 / PER_MILLION
+ },
  [`${Provider.OPENAI}:${Model.OpenAI.GPT_5_4}`]: {
  inputPricePerToken: 2.5 / PER_MILLION,
  outputPricePerToken: 15 / PER_MILLION
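The two new pricing rows follow the existing per-token convention (USD per one million tokens). A worked example of the implied cost of a 1,000-input / 500-output call on each new model, assuming cost is simply tokens multiplied by the per-token price as the table suggests:

```ts
// claude-opus-4-7: $5 per 1M input tokens, $25 per 1M output tokens
const opus47 = 1_000 * (5 / 1e6) + 500 * (25 / 1e6); // 0.005 + 0.0125 = $0.0175
// gpt-5.5: $5 per 1M input tokens, $30 per 1M output tokens
const gpt55 = 1_000 * (5 / 1e6) + 500 * (30 / 1e6);  // 0.005 + 0.015  = $0.02
```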
@@ -1017,6 +1109,51 @@ async function generateAiDiff(imgA, imgB, model, driver) {
  var import_promises = require("fs/promises");
  var import_node_path = require("path");
  var import_sharp2 = __toESM(require("sharp"), 1);
+
+ // src/core/input-detect.ts
+ function isFilePath(input) {
+ return input.startsWith("/") || input.startsWith("./") || input.startsWith("../") || input.includes("\\");
+ }
+ function isUrl(input) {
+ return input.startsWith("http://") || input.startsWith("https://");
+ }
+ function isDataUrl(input) {
+ return input.startsWith("data:");
+ }
+ function parseDataUrl(input) {
+ const match = /^data:([^;]+);base64,(.+)$/.exec(input);
+ if (!match?.[1] || !match[2]) return null;
+ return { mimeType: match[1], base64Payload: match[2] };
+ }
+ function decodeBase64(payload) {
+ if (!/^[A-Za-z0-9+/\n\r]+=*$/.test(payload)) {
+ throw new Error("Invalid base64 string");
+ }
+ return Buffer.from(payload, "base64");
+ }
+ function looksLikeImageBase64(input) {
+ return input.startsWith("iVBOR") || // PNG (0x89 0x50 0x4E 0x47)
+ input.startsWith("/9j/") || // JPEG (0xFF 0xD8 0xFF)
+ input.startsWith("R0lGOD") || // GIF (0x47 0x49 0x46)
+ input.startsWith("UklGR");
+ }
+ function looksLikeVideoBase64(input) {
+ return input.startsWith("GkXf") || input.startsWith("AAAA");
+ }
+ async function fetchToBuffer(url, timeoutMs) {
+ const response = await fetch(url, {
+ signal: AbortSignal.timeout(timeoutMs)
+ });
+ if (!response.ok) {
+ throw new Error(`HTTP ${response.status}`);
+ }
+ const arrayBuffer = await response.arrayBuffer();
+ const data = Buffer.from(arrayBuffer);
+ const contentType = response.headers.get("content-type")?.split(";")[0]?.trim() ?? null;
+ return { data, contentType };
+ }
+
+ // src/core/image.ts
  var SUPPORTED_FORMATS = /* @__PURE__ */ new Set([
  "image/jpeg",
  "image/png",
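The new src/core/input-detect.ts module centralizes the string heuristics that the image loader previously kept privately (removed in the next hunk) and adds video-aware ones. Roughly how the helpers appear to classify typical inputs (sample values are illustrative; the helpers are internal to the bundle):

```ts
isUrl("https://example.com/shot.png");          // true -> fetched via fetchToBuffer()
isDataUrl("data:image/png;base64,iVBORw0KGgo"); // true -> parsed by parseDataUrl()
looksLikeImageBase64("iVBORw0KGgoAAAANSUhEUg"); // true -> PNG magic bytes in base64
looksLikeVideoBase64("AAAAIGZ0eXBpc29t");       // true -> MP4 "ftyp" box prefix in base64
isFilePath("./screenshots/login.png");          // true -> read from disk
```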
@@ -1039,18 +1176,6 @@ function getMimeFromExtension(filePath) {
  const ext = (0, import_node_path.extname)(filePath).toLowerCase();
  return EXTENSION_TO_MIME[ext];
  }
- function isFilePath(input) {
- return input.startsWith("/") || input.startsWith("./") || input.startsWith("../") || input.includes("\\");
- }
- function isUrl(input) {
- return input.startsWith("http://") || input.startsWith("https://");
- }
- function isBase64Image(input) {
- return input.startsWith("iVBOR") || // PNG (0x89 0x50 0x4E 0x47)
- input.startsWith("/9j/") || // JPEG (0xFF 0xD8 0xFF)
- input.startsWith("R0lGOD") || // GIF (0x47 0x49 0x46)
- input.startsWith("UklGR");
- }
  function detectMimeType(data) {
  if (data[0] === 255 && data[1] === 216 && data[2] === 255) {
  return "image/jpeg";
@@ -1104,45 +1229,38 @@ async function loadFromFilePath(filePath) {
  return { data: fileData, mimeType };
  }
  async function loadFromUrl(url) {
- let
+ let result;
  try {
-
- signal: AbortSignal.timeout(URL_FETCH_TIMEOUT_MS)
- });
+ result = await fetchToBuffer(url, URL_FETCH_TIMEOUT_MS);
  } catch (err) {
  throw new VisualAIImageError(
  `Failed to fetch image from URL: ${url} \u2014 ${err instanceof Error ? err.message : String(err)}`
  );
  }
-
- throw new VisualAIImageError(
- `Failed to fetch image from URL: ${url} \u2014 HTTP ${response.status}`
- );
- }
- const arrayBuffer = await response.arrayBuffer();
- const data = Buffer.from(arrayBuffer);
- const contentType = response.headers.get("content-type")?.split(";")[0]?.trim() ?? null;
+ const { data, contentType } = result;
  const mimeType = contentType && isSupportedMimeType(contentType) ? contentType : detectMimeType(data);
  return { data, mimeType };
  }
  function loadFromBase64(input) {
  let base64Data = input;
  let mimeType;
- if (input
- const
- if (!
+ if (isDataUrl(input)) {
+ const parsed = parseDataUrl(input);
+ if (!parsed) {
  throw new VisualAIImageError("Invalid data URL format");
  }
- if (!isSupportedMimeType(
- throw new VisualAIImageError(`Unsupported image format: ${
+ if (!isSupportedMimeType(parsed.mimeType)) {
+ throw new VisualAIImageError(`Unsupported image format: ${parsed.mimeType}`);
  }
- mimeType =
- base64Data =
+ mimeType = parsed.mimeType;
+ base64Data = parsed.base64Payload;
  }
-
+ let data;
+ try {
+ data = decodeBase64(base64Data);
+ } catch {
  throw new VisualAIImageError("Invalid base64 string");
  }
- const data = Buffer.from(base64Data, "base64");
  if (data.length === 0) {
  throw new VisualAIImageError("Empty image data after base64 decode");
  }
@@ -1161,9 +1279,9 @@ async function normalizeImage(input) {
  } else if (typeof input === "string") {
  if (isUrl(input)) {
  ({ data, mimeType } = await loadFromUrl(input));
- } else if (input
+ } else if (isDataUrl(input)) {
  ({ data, mimeType } = loadFromBase64(input));
- } else if (
+ } else if (looksLikeImageBase64(input)) {
  ({ data, mimeType } = loadFromBase64(input));
  } else if (isFilePath(input)) {
  ({ data, mimeType } = await loadFromFilePath(input));
@@ -1191,6 +1309,379 @@ async function normalizeImage(input) {
  };
  }

+ // src/core/video.ts
+ var import_promises2 = require("fs/promises");
+ var import_node_os = require("os");
+ var import_node_path2 = require("path");
+ var FRAME_MAX_DIMENSION = 1568;
+ var DEFAULT_FPS = 1;
+ var DEFAULT_MAX_FRAMES = 10;
+ var DEFAULT_MAX_DURATION_SECONDS = 10;
+ var MAX_FRAMES_HARD_CAP = 60;
+ var FFPROBE_TIMEOUT_MS = 15e3;
+ var FFMPEG_RUN_TIMEOUT_MS = 6e4;
+ var VIDEO_EXTENSIONS = {
+ ".mp4": "video/mp4",
+ ".m4v": "video/mp4",
+ ".webm": "video/webm",
+ ".mov": "video/quicktime",
+ ".qt": "video/quicktime",
+ ".mkv": "video/x-matroska"
+ };
+ var VIDEO_MIME_TYPES = /* @__PURE__ */ new Set([
+ "video/mp4",
+ "video/webm",
+ "video/quicktime",
+ "video/x-matroska"
+ ]);
+ function isSupportedVideoMimeType(value) {
+ return VIDEO_MIME_TYPES.has(value);
+ }
+ function getVideoMimeFromExtension(filePath) {
+ const ext = (0, import_node_path2.extname)(filePath).toLowerCase();
+ return VIDEO_EXTENSIONS[ext];
+ }
+ function detectVideoMimeType(data) {
+ if (data.length < 12) return null;
+ if (data[4] === 102 && data[5] === 116 && data[6] === 121 && data[7] === 112) {
+ if (data[8] === 113 && data[9] === 116 && data[10] === 32 && data[11] === 32) {
+ return "video/quicktime";
+ }
+ return "video/mp4";
+ }
+ if (data[0] === 26 && data[1] === 69 && data[2] === 223 && data[3] === 163) {
+ return "video/webm";
+ }
+ return null;
+ }
+ async function resolveVideoToPath(input) {
+ if (Buffer.isBuffer(input) || input instanceof Uint8Array) {
+ const buf2 = Buffer.isBuffer(input) ? input : Buffer.from(input);
+ const mimeType2 = detectVideoMimeType(buf2);
+ if (!mimeType2) {
+ throw new VisualAIVideoError("Unable to detect video format from buffer contents");
+ }
+ return writeBufferToTemp(buf2, mimeType2);
+ }
+ if (typeof input !== "string") {
+ throw new VisualAIVideoError(
+ "Invalid video input: expected Buffer, Uint8Array, file path, data URL, or base64 string"
+ );
+ }
+ if (isDataUrl(input)) {
+ const parsed = parseDataUrl(input);
+ if (!parsed) {
+ throw new VisualAIVideoError("Invalid data URL format");
+ }
+ if (!isSupportedVideoMimeType(parsed.mimeType)) {
+ throw new VisualAIVideoError(`Unsupported video format: ${parsed.mimeType}`);
+ }
+ let buf2;
+ try {
+ buf2 = decodeBase64(parsed.base64Payload);
+ } catch {
+ throw new VisualAIVideoError("Invalid base64 payload in data URL");
+ }
+ return writeBufferToTemp(buf2, parsed.mimeType);
+ }
+ if (isFilePath(input)) {
+ const mimeType2 = getVideoMimeFromExtension(input);
+ if (!mimeType2) {
+ throw new VisualAIVideoError(
+ `Unsupported video file extension: ${input}. Supported: .mp4, .webm, .mov, .mkv`
+ );
+ }
+ return { path: input, mimeType: mimeType2, cleanup: async () => {
+ } };
+ }
+ let buf;
+ try {
+ buf = decodeBase64(input);
+ } catch {
+ throw new VisualAIVideoError(
+ `Unrecognized video input: "${input.slice(0, 80)}". Expected a file path, data URL, or base64-encoded video string.`
+ );
+ }
+ const mimeType = detectVideoMimeType(buf);
+ if (!mimeType) {
+ throw new VisualAIVideoError(
+ `Unrecognized video input: "${input.slice(0, 80)}". Expected a file path, data URL, or base64-encoded video string.`
+ );
+ }
+ return writeBufferToTemp(buf, mimeType);
+ }
+ async function writeBufferToTemp(data, mimeType) {
+ const dir = await (0, import_promises2.mkdtemp)((0, import_node_path2.join)((0, import_node_os.tmpdir)(), "visual-ai-video-"));
+ try {
+ const ext = extensionFor(mimeType);
+ const path = (0, import_node_path2.join)(dir, `input${ext}`);
+ await (0, import_promises2.writeFile)(path, data);
+ return {
+ path,
+ mimeType,
+ cleanup: async () => {
+ try {
+ await (0, import_promises2.rm)(dir, { recursive: true, force: true });
+ } catch {
+ }
+ }
+ };
+ } catch (err) {
+ try {
+ await (0, import_promises2.rm)(dir, { recursive: true, force: true });
+ } catch {
+ }
+ throw err;
+ }
+ }
+ function extensionFor(mimeType) {
+ switch (mimeType) {
+ case "video/mp4":
+ return ".mp4";
+ case "video/webm":
+ return ".webm";
+ case "video/quicktime":
+ return ".mov";
+ case "video/x-matroska":
+ return ".mkv";
+ }
+ }
+ var cachedFactoryPromise;
+ async function loadFfmpegFactory() {
+ if (cachedFactoryPromise) return cachedFactoryPromise;
+ cachedFactoryPromise = (async () => {
+ let ffmpegModule;
+ try {
+ ffmpegModule = await import("fluent-ffmpeg");
+ } catch (err) {
+ const code = err?.code;
+ if (code === "ERR_MODULE_NOT_FOUND" || code === "MODULE_NOT_FOUND") {
+ throw new VisualAIVideoError(
+ "Video support requires fluent-ffmpeg. Install it with: pnpm add -D fluent-ffmpeg @ffmpeg-installer/ffmpeg @ffprobe-installer/ffprobe @types/fluent-ffmpeg"
+ );
+ }
+ throw new VisualAIVideoError(
+ `Failed to load fluent-ffmpeg: ${err instanceof Error ? err.message : String(err)}`
+ );
+ }
+ const factory = ffmpegModule.default ?? ffmpegModule;
+ try {
+ const installer = await import("@ffmpeg-installer/ffmpeg");
+ const path = (installer.default ?? installer).path;
+ if (path) factory.setFfmpegPath(path);
+ } catch (err) {
+ const code = err?.code;
+ if (code !== "ERR_MODULE_NOT_FOUND" && code !== "MODULE_NOT_FOUND") {
+ process.stderr.write(
+ `[visual-ai-assertions] warning: @ffmpeg-installer/ffmpeg failed to load: ${err instanceof Error ? err.message : String(err)}
+ `
+ );
+ }
+ }
+ try {
+ const installer = await import("@ffprobe-installer/ffprobe");
+ const path = (installer.default ?? installer).path;
+ if (path) factory.setFfprobePath(path);
+ } catch (err) {
+ const code = err?.code;
+ if (code !== "ERR_MODULE_NOT_FOUND" && code !== "MODULE_NOT_FOUND") {
+ process.stderr.write(
+ `[visual-ai-assertions] warning: @ffprobe-installer/ffprobe failed to load: ${err instanceof Error ? err.message : String(err)}
+ `
+ );
+ }
+ }
+ return factory;
+ })();
+ try {
+ return await cachedFactoryPromise;
+ } catch (err) {
+ cachedFactoryPromise = void 0;
+ throw err;
+ }
+ }
+ async function probeDurationSeconds(videoPath) {
+ const ffmpeg = await loadFfmpegFactory();
+ return new Promise((resolve, reject) => {
+ let settled = false;
+ const finish = (fn) => {
+ if (settled) return;
+ settled = true;
+ clearTimeout(timer);
+ fn();
+ };
+ const timer = setTimeout(() => {
+ finish(() => {
+ reject(
+ new VisualAIVideoError(
+ `ffprobe timed out after ${FFPROBE_TIMEOUT_MS}ms while probing ${videoPath}`
+ )
+ );
+ });
+ }, FFPROBE_TIMEOUT_MS);
+ ffmpeg.ffprobe(videoPath, (err, data) => {
+ if (err) {
+ finish(() => {
+ reject(
+ new VisualAIVideoError(
+ `Failed to probe video metadata: ${err.message}. Ensure ffprobe is installed (e.g. via @ffprobe-installer/ffprobe).`
+ )
+ );
+ });
+ return;
+ }
+ const raw = data.format?.duration;
+ const duration = typeof raw === "string" ? Number(raw) : raw;
+ if (!duration || !Number.isFinite(duration) || duration <= 0) {
+ finish(() => {
+ reject(new VisualAIVideoError("Video duration could not be determined"));
+ });
+ return;
+ }
+ finish(() => {
+ resolve(duration);
+ });
+ });
+ });
+ }
+ async function extractFrames(videoPath, options = {}) {
+ const fps = options.fps ?? DEFAULT_FPS;
+ const maxFrames = options.maxFrames ?? DEFAULT_MAX_FRAMES;
+ const maxDurationSeconds = options.maxDurationSeconds ?? DEFAULT_MAX_DURATION_SECONDS;
+ if (!Number.isFinite(fps) || fps <= 0) {
+ throw new VisualAIVideoError(`Invalid fps: ${fps}. Must be a finite number > 0.`);
+ }
+ if (!Number.isFinite(maxFrames) || maxFrames <= 0) {
+ throw new VisualAIVideoError(`Invalid maxFrames: ${maxFrames}. Must be a finite number > 0.`);
+ }
+ if (maxFrames > MAX_FRAMES_HARD_CAP) {
+ throw new VisualAIVideoError(
+ `maxFrames ${maxFrames} exceeds the hard cap of ${MAX_FRAMES_HARD_CAP}. Lower maxFrames or open an issue if you need a larger limit.`
+ );
+ }
+ if (!Number.isFinite(maxDurationSeconds) || maxDurationSeconds <= 0) {
+ throw new VisualAIVideoError(
+ `Invalid maxDurationSeconds: ${maxDurationSeconds}. Must be a finite number > 0.`
+ );
+ }
+ const ffmpeg = await loadFfmpegFactory();
+ const durationSeconds = await probeDurationSeconds(videoPath);
+ if (durationSeconds > maxDurationSeconds) {
+ throw new VisualAIVideoError(
+ `Video duration ${durationSeconds.toFixed(2)}s exceeds limit of ${maxDurationSeconds}s. Pass { maxDurationSeconds: N } to override, or trim the source video.`
+ );
+ }
+ const outputDir = await (0, import_promises2.mkdtemp)((0, import_node_path2.join)((0, import_node_os.tmpdir)(), "visual-ai-frames-"));
+ try {
+ const filter = `fps=${fps},scale='if(gt(iw,ih),min(${FRAME_MAX_DIMENSION},iw),-2)':'if(gt(iw,ih),-2,min(${FRAME_MAX_DIMENSION},ih))':flags=area`;
+ await new Promise((resolve, reject) => {
+ let settled = false;
+ const cmd = ffmpeg(videoPath);
+ const finish = (fn) => {
+ if (settled) return;
+ settled = true;
+ clearTimeout(timer);
+ fn();
+ };
+ const timer = setTimeout(() => {
+ try {
+ cmd.kill("SIGKILL");
+ } catch {
+ }
+ finish(() => {
+ reject(
+ new VisualAIVideoError(
+ `ffmpeg frame extraction timed out after ${FFMPEG_RUN_TIMEOUT_MS}ms`
+ )
+ );
+ });
+ }, FFMPEG_RUN_TIMEOUT_MS);
+ cmd.outputOptions(["-vf", filter, "-vframes", String(maxFrames), "-q:v", "3"]).output((0, import_node_path2.join)(outputDir, "frame-%04d.jpg")).on("end", () => {
+ finish(() => {
+ resolve();
+ });
+ }).on("error", (err) => {
+ finish(() => {
+ reject(new VisualAIVideoError(`ffmpeg frame extraction failed: ${err.message}`));
+ });
+ }).run();
+ });
+ const files = (await (0, import_promises2.readdir)(outputDir)).filter((name) => name.endsWith(".jpg")).sort();
+ if (files.length === 0) {
+ throw new VisualAIVideoError(
+ "No frames could be extracted from the video. The source may be corrupt or empty."
+ );
+ }
+ const frames = await Promise.all(
+ files.map(async (name, index) => {
+ const data = await (0, import_promises2.readFile)((0, import_node_path2.join)(outputDir, name));
+ const timestampSeconds = Math.min(durationSeconds, (index + 0.5) / fps);
+ let cachedBase64;
+ return {
+ data,
+ mimeType: "image/jpeg",
+ get base64() {
+ if (cachedBase64 === void 0) {
+ cachedBase64 = data.toString("base64");
+ }
+ return cachedBase64;
+ },
+ timestampSeconds,
+ index
+ };
+ })
+ );
+ return { frames, durationSeconds };
+ } finally {
+ try {
+ await (0, import_promises2.rm)(outputDir, { recursive: true, force: true });
+ } catch {
+ }
+ }
+ }
+
+ // src/core/media.ts
+ var VIDEO_MAGIC_BYTE_PREFIX_LEN = 16;
+ function isVideoInput(input) {
+ if (Buffer.isBuffer(input) || input instanceof Uint8Array) {
+ const buf = Buffer.isBuffer(input) ? input : Buffer.from(input);
+ return detectVideoMimeType(buf) !== null;
+ }
+ if (typeof input !== "string") return false;
+ if (isDataUrl(input)) {
+ const parsed = parseDataUrl(input);
+ return parsed?.mimeType.startsWith("video/") ?? false;
+ }
+ if (isFilePath(input)) {
+ return getVideoMimeFromExtension(input) !== void 0;
+ }
+ if (looksLikeVideoBase64(input)) {
+ try {
+ const buf = decodeBase64(input.slice(0, VIDEO_MAGIC_BYTE_PREFIX_LEN));
+ return detectVideoMimeType(buf) !== null;
+ } catch {
+ return false;
+ }
+ }
+ return false;
+ }
+ async function normalizeMedia(input, videoOptions) {
+ if (isVideoInput(input)) {
+ const { path, cleanup } = await resolveVideoToPath(input);
+ try {
+ const { frames, durationSeconds } = await extractFrames(path, videoOptions);
+ return { kind: "video", frames, durationSeconds };
+ } finally {
+ try {
+ await cleanup();
+ } catch {
+ }
+ }
+ }
+ const image = await normalizeImage(input);
+ return { kind: "image", image };
+ }
+
  // src/types.ts
  var import_zod = require("zod");
  var IssuePrioritySchema = import_zod.z.enum(["critical", "major", "minor"]);
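Two practical notes on the extraction code above. First, ffmpeg is an optional path: fluent-ffmpeg and the installer packages are imported lazily and only when a video input is detected, and the install command embedded in the error message is what to run if they are missing. Second, frame timestamps are derived arithmetically rather than probed per frame. A small sketch of that arithmetic (values illustrative):

```ts
// Timestamps are (index + 0.5) / fps, capped at the clip duration (see extractFrames above).
// Defaults declared above: fps 1, maxFrames 10 (hard cap 60), maxDurationSeconds 10.
const fps = 1;
const durationSeconds = 3.2;
const timestamps = [0, 1, 2, 3].map((i) => Math.min(durationSeconds, (i + 0.5) / fps));
// -> [0.5, 1.5, 2.5, 3.2]
```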
@@ -1215,7 +1706,13 @@ var StatementResultSchema = import_zod.z.object({
  statement: import_zod.z.string(),
  pass: import_zod.z.boolean(),
  reasoning: import_zod.z.string(),
- confidence: ConfidenceSchema.optional()
+ confidence: ConfidenceSchema.optional(),
+ /**
+ * For video inputs, the approximate timestamp (in seconds, from the start of the clip)
+ * of the frame that most clearly demonstrates the statement. `null` when the statement
+ * fails or applies across the whole clip. Always omitted for image inputs.
+ */
+ timestampSeconds: import_zod.z.number().nonnegative().nullable().optional()
  });
  var UsageInfoSchema = import_zod.z.object({
  inputTokens: import_zod.z.number(),
@@ -1244,6 +1741,11 @@ var CompareResultSchema = BaseResultSchema.extend({
  var AskResultSchema = import_zod.z.object({
  summary: import_zod.z.string(),
  issues: import_zod.z.array(IssueSchema),
+ /**
+ * For video inputs, the indices of frames the model relied on to answer.
+ * Indices are 0-based and refer to entries in `frames.timestampsSeconds`.
+ */
+ frameReferences: import_zod.z.array(import_zod.z.number().int().nonnegative()).optional(),
  usage: UsageInfoSchema.optional()
  });

@@ -1319,6 +1821,29 @@ function createDriver(provider, config) {
  var checkSchemaOptions = toSchemaOptions(CheckResponseSchema);
  var askSchemaOptions = toSchemaOptions(AskResponseSchema);
  var compareSchemaOptions = toSchemaOptions(CompareResponseSchema);
+ function mediaToProviderInputs(media) {
+ if (media.kind === "image") {
+ return {
+ images: [media.image],
+ mediaContext: { kind: "image" },
+ framesMetadata: void 0
+ };
+ }
+ const timestamps = media.frames.map((f) => f.timestampSeconds);
+ return {
+ images: media.frames,
+ mediaContext: {
+ kind: "video",
+ frameTimestamps: timestamps,
+ durationSeconds: media.durationSeconds
+ },
+ framesMetadata: {
+ count: media.frames.length,
+ timestampsSeconds: timestamps,
+ durationSeconds: media.durationSeconds
+ }
+ };
+ }
  function visualAI(config = {}) {
  const resolvedConfig = resolveConfig(config);
  const driverConfig = {
@@ -1347,34 +1872,44 @@ function visualAI(config = {}) {
  });
  }
  return {
- async check(
+ async check(input, statements, options) {
  const stmts = Array.isArray(statements) ? statements : [statements];
  if (stmts.length === 0) {
  throw new VisualAIConfigError("At least one statement is required for check()");
  }
  return withErrorDebug(resolvedConfig, "check", async () => {
- const
- const
+ const media = await normalizeMedia(input, options?.video);
+ const { images, mediaContext, framesMetadata } = mediaToProviderInputs(media);
+ const prompt = buildCheckPrompt(stmts, {
+ instructions: options?.instructions,
+ media: mediaContext
+ });
  debugLog(resolvedConfig, "check prompt", prompt, "prompt");
- const response = await timedSendMessage(driver,
+ const response = await timedSendMessage(driver, images, prompt, checkSchemaOptions);
  debugLog(resolvedConfig, "check response", response.text, "response");
  const result = parseCheckResponse(response.text);
  return {
  ...result,
+ ...framesMetadata ? { frames: framesMetadata } : {},
  usage: processUsage("check", response.usage, response.durationSeconds, resolvedConfig)
  };
  });
  },
- async ask(
+ async ask(input, userPrompt, options) {
  return withErrorDebug(resolvedConfig, "ask", async () => {
- const
- const
+ const media = await normalizeMedia(input, options?.video);
+ const { images, mediaContext, framesMetadata } = mediaToProviderInputs(media);
+ const prompt = buildAskPrompt(userPrompt, {
+ instructions: options?.instructions,
+ media: mediaContext
+ });
  debugLog(resolvedConfig, "ask prompt", prompt, "prompt");
- const response = await timedSendMessage(driver,
+ const response = await timedSendMessage(driver, images, prompt, askSchemaOptions);
  debugLog(resolvedConfig, "ask response", response.text, "response");
  const result = parseAskResponse(response.text);
  return {
  ...result,
+ ...framesMetadata ? { frames: framesMetadata } : {},
  usage: processUsage("ask", response.usage, response.durationSeconds, resolvedConfig)
  };
  });
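With the wiring above, ask() gains the same video path as check(), and video results carry the frames metadata plus the optional frameReferences indices from AskResultSchema. A minimal sketch (the recording path and question are illustrative; visualAI() is assumed to be the package's existing entry point):

```ts
import { visualAI } from "visual-ai-assertions";

const ai = visualAI();
const answer = await ai.ask(
  "./recordings/checkout.webm", // illustrative path
  "What happens after the Pay button is clicked?"
);
console.log(answer.summary);
for (const i of answer.frameReferences ?? []) {
  console.log(`relevant frame ${i} at ${answer.frames?.timestampsSeconds[i]}s`);
}
```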
@@ -1563,6 +2098,7 @@ function assertVisualCompareResult(result, label) {
  VisualAIRateLimitError,
  VisualAIResponseParseError,
  VisualAITruncationError,
+ VisualAIVideoError,
  assertVisualCompareResult,
  assertVisualResult,
  formatCheckResult,