visual-ai-assertions 0.7.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +69 -16
- package/dist/index.cjs +588 -52
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +178 -25
- package/dist/index.d.ts +178 -25
- package/dist/index.js +587 -52
- package/dist/index.js.map +1 -1
- package/package.json +31 -21
package/dist/index.js
CHANGED
@@ -12,11 +12,13 @@ var Provider = {
 };
 var Model = {
   Anthropic: {
+    OPUS_4_7: "claude-opus-4-7",
     OPUS_4_6: "claude-opus-4-6",
     SONNET_4_6: "claude-sonnet-4-6",
     HAIKU_4_5: "claude-haiku-4-5"
   },
   OpenAI: {
+    GPT_5_5: "gpt-5.5",
     GPT_5_4: "gpt-5.4",
     GPT_5_4_PRO: "gpt-5.4-pro",
     GPT_5_4_MINI: "gpt-5.4-mini",
@@ -113,6 +115,12 @@ var VisualAIImageError = class extends VisualAIError {
     this.name = "VisualAIImageError";
   }
 };
+var VisualAIVideoError = class extends VisualAIError {
+  constructor(message) {
+    super(message, "VIDEO_INVALID");
+    this.name = "VisualAIVideoError";
+  }
+};
 var VisualAIResponseParseError = class extends VisualAIError {
   rawResponse;
   constructor(message, rawResponse) {
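For consumers, the new error class can be handled like the existing ones. A minimal sketch, not part of the diff; the file path and statement are illustrative:

import { visualAI, VisualAIVideoError } from "visual-ai-assertions";

const ai = visualAI();
try {
  await ai.check("./clip.mp4", ["A success toast appears"]);
} catch (err) {
  if (err instanceof VisualAIVideoError) {
    // New in 0.9.0: thrown for invalid or unsupported video input.
    console.error(err.name, err.message);
  } else {
    throw err;
  }
}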
@@ -146,7 +154,7 @@ var VisualAIAssertionError = class extends VisualAIError {
   }
 };
 function isVisualAIKnownError(error) {
-  return error instanceof VisualAIAuthError || error instanceof VisualAIRateLimitError || error instanceof VisualAIProviderError || error instanceof VisualAIImageError || error instanceof VisualAIResponseParseError || error instanceof VisualAITruncationError || error instanceof VisualAIConfigError || error instanceof VisualAIAssertionError;
+  return error instanceof VisualAIAuthError || error instanceof VisualAIRateLimitError || error instanceof VisualAIProviderError || error instanceof VisualAIImageError || error instanceof VisualAIVideoError || error instanceof VisualAIResponseParseError || error instanceof VisualAITruncationError || error instanceof VisualAIConfigError || error instanceof VisualAIAssertionError;
 }
 
 // src/core/prompt.ts
@@ -160,7 +168,7 @@ Each issue must have:
 - "description": what the issue is
 - "suggestion": how to fix or improve it
 `;
-var
+var CHECK_OUTPUT_SCHEMA_IMAGE = `IMPORTANT: Follow this evaluation order:
 1. First, evaluate EACH statement independently and populate the "statements" array
 2. Then, set "pass" to true ONLY if every statement passed (logical AND of all statement results)
 3. Write "reasoning" as a brief overall summary of the evaluation
@@ -200,7 +208,46 @@ Example for a failing check:
   ]
 }
 ${JSON_INSTRUCTIONS}`;
-var
+var CHECK_OUTPUT_SCHEMA_VIDEO = `IMPORTANT: Follow this evaluation order:
+1. First, evaluate EACH statement independently across the entire timeline and populate the "statements" array
+2. A statement passes if it is true at ANY frame of the timeline, unless the wording explicitly says otherwise (e.g. "throughout", "at all times")
+3. For each statement that passes, set "timestampSeconds" to the timestamp of the frame that most clearly demonstrates it (or where it first becomes true). Use null when the statement fails or applies across the whole clip.
+4. Then, set "pass" to true ONLY if every statement passed (logical AND of all statement results)
+5. Write "reasoning" as a brief overall summary of the evaluation
+6. Include "issues" only for statements that failed
+
+Respond with a JSON object matching this exact structure:
+{
+  "pass": boolean, // true ONLY if ALL statements passed \u2014 derive from statements array
+  "reasoning": string, // brief overall summary of the evaluation
+  "issues": [...], // one issue per failing statement (empty if all pass)
+  "statements": [ // one entry per statement, in order \u2014 evaluate these FIRST
+    {
+      "statement": string, // the original statement text
+      "pass": boolean, // whether this statement is true at any point in the timeline
+      "reasoning": string, // explanation for this statement, citing frame timestamps where relevant
+      "confidence": "high" | "medium" | "low",
+      "timestampSeconds": number | null
+        // seconds from the start of the clip where the statement is most clearly true,
+        // or null if it failed / applies across the whole clip
+    }
+  ]
+}
+${ISSUE_SCHEMA_INSTRUCTIONS}
+
+Only include issues for statements that fail. If all statements pass, issues should be an empty array.
+
+Example for a passing video check:
+{
+  "pass": true,
+  "reasoning": "The success toast appeared briefly around 3.5s.",
+  "issues": [],
+  "statements": [
+    { "statement": "A success toast with text 'Saved' appears", "pass": true, "reasoning": "A green toast labeled 'Saved' is visible in the bottom-right at the 3.5s frame", "confidence": "high", "timestampSeconds": 3.5 }
+  ]
+}
+${JSON_INSTRUCTIONS}`;
+var ASK_OUTPUT_SCHEMA_IMAGE = `Respond with a JSON object matching this exact structure:
 {
   "summary": string, // high-level analysis summary
   "issues": [...] // list of issues/findings, can be empty
@@ -221,6 +268,17 @@ Example:
   ]
 }
 ${JSON_INSTRUCTIONS}`;
+var ASK_OUTPUT_SCHEMA_VIDEO = `Respond with a JSON object matching this exact structure:
+{
+  "summary": string, // high-level summary of what happens across the timeline
+  "issues": [...], // list of issues/findings, can be empty
+  "frameReferences": number[] // 0-based indices of frames the answer relies on (in order)
+}
+${ISSUE_SCHEMA_INSTRUCTIONS}
+
+Prioritize issues by severity (critical / major / minor) as for image input.
+Cite frame indices in "frameReferences" so the user can locate the moments you describe.
+${JSON_INSTRUCTIONS}`;
 var COMPARE_OUTPUT_SCHEMA = `Respond with a JSON object matching this exact structure:
 {
   "pass": boolean, // true if no critical or major changes found
@@ -239,7 +297,19 @@ var COMPARE_OUTPUT_SCHEMA = `Respond with a JSON object matching this exact stru
 If the images appear identical, set pass to true, explain in reasoning, and return an empty changes array.
 ${JSON_INSTRUCTIONS}`;
 var DEFAULT_CHECK_ROLE = "You are a visual QA assistant. Evaluate the provided image precisely and objectively.";
+var DEFAULT_CHECK_ROLE_VIDEO = "You are a visual QA assistant. Evaluate the provided sequence of video frames precisely and objectively, treating them as a chronological timeline.";
 var DEFAULT_ASK_ROLE = "You are a visual QA assistant. Analyze the provided image based on the user's request.";
+var DEFAULT_ASK_ROLE_VIDEO = "You are a visual QA assistant. Analyze the provided sequence of video frames as a chronological timeline based on the user's request.";
+function buildVideoTimelineSection(frameTimestamps, durationSeconds) {
+  const formatted = frameTimestamps.map((t, i) => ` ${i}: ${t.toFixed(2)}s`).join("\n");
+  return `Video timeline:
+- Total duration: ${durationSeconds.toFixed(2)}s
+- ${frameTimestamps.length} frames sampled (in chronological order)
+- Frame index \u2192 timestamp:
+${formatted}
+
+Treat the attached images as a chronological timeline. The first image is the earliest frame, the last is the latest. Refer to frames by timestamp where helpful.`;
+}
 var COMPARE_ROLE = "You are performing a visual regression test. Compare the BEFORE image (baseline) to the AFTER image (current) and identify all visual differences. Flag changes that appear unintentional or problematic.";
 var COMPARE_EDGE_RULES = [
   "The BEFORE image is the baseline/expected state.",
@@ -252,22 +322,31 @@ function buildInstructionsSection(instructions) {
 function buildCheckPrompt(statements, options) {
   const stmts = Array.isArray(statements) ? statements : [statements];
   const statementsBlock = stmts.map((s, i) => `${i + 1}. "${s}"`).join("\n");
-  const
+  const media = options?.media;
+  const defaultRole = media?.kind === "video" ? DEFAULT_CHECK_ROLE_VIDEO : DEFAULT_CHECK_ROLE;
+  const sections = [options?.role ?? defaultRole];
+  if (media?.kind === "video") {
+    sections.push(buildVideoTimelineSection(media.frameTimestamps, media.durationSeconds));
+  }
   if (options?.instructions && options.instructions.length > 0) {
     sections.push(buildInstructionsSection(options.instructions));
   }
   sections.push(`Statements to evaluate:
 ${statementsBlock}`);
-  sections.push(
+  sections.push(media?.kind === "video" ? CHECK_OUTPUT_SCHEMA_VIDEO : CHECK_OUTPUT_SCHEMA_IMAGE);
   return sections.join("\n\n");
 }
 function buildAskPrompt(userPrompt, options) {
-  const
+  const media = options?.media;
+  const sections = [media?.kind === "video" ? DEFAULT_ASK_ROLE_VIDEO : DEFAULT_ASK_ROLE];
+  if (media?.kind === "video") {
+    sections.push(buildVideoTimelineSection(media.frameTimestamps, media.durationSeconds));
+  }
   if (options?.instructions && options.instructions.length > 0) {
     sections.push(buildInstructionsSection(options.instructions));
   }
   sections.push(`User request: ${userPrompt}`);
-  sections.push(
+  sections.push(media?.kind === "video" ? ASK_OUTPUT_SCHEMA_VIDEO : ASK_OUTPUT_SCHEMA_IMAGE);
   return sections.join("\n\n");
 }
 function buildAiDiffPrompt() {
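For context, a sketch of what the new timeline section produces (derived from buildVideoTimelineSection above; the timestamps are illustrative). Calling buildVideoTimelineSection([0.5, 1.5, 2.5], 3) yields:

Video timeline:
- Total duration: 3.00s
- 3 frames sampled (in chronological order)
- Frame index → timestamp:
 0: 0.50s
 1: 1.50s
 2: 2.50s

Treat the attached images as a chronological timeline. The first image is the earliest frame, the last is the latest. Refer to frames by timestamp where helpful.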
@@ -417,6 +496,10 @@ function parseRetryAfter(value) {
 }
 
 // src/providers/anthropic.ts
+function mapEffort(level, model) {
+  if (level !== "xhigh") return level;
+  return model === Model.Anthropic.OPUS_4_7 ? "xhigh" : "max";
+}
 var AnthropicDriver = class {
   client;
   model;
@@ -474,7 +557,7 @@ var AnthropicDriver = class {
     if (this.reasoningEffort) {
       requestParams.thinking = { type: "adaptive" };
       requestParams.output_config = {
-        effort: this.reasoningEffort
+        effort: mapEffort(this.reasoningEffort, this.model)
       };
     }
     const message = await client.messages.create(requestParams);
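The mapping is easiest to see by example (illustrative calls, not part of the diff):

mapEffort("low", Model.Anthropic.SONNET_4_6);  // "low": non-"xhigh" levels pass through unchanged
mapEffort("xhigh", Model.Anthropic.OPUS_4_7);  // "xhigh": kept only for Opus 4.7
mapEffort("xhigh", Model.Anthropic.OPUS_4_6);  // "max": substituted for every other model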
@@ -798,6 +881,10 @@ function resolveConfig(config) {
 // src/core/pricing.ts
 var PER_MILLION = 1e6;
 var PRICING_TABLE = {
+  [`${Provider.ANTHROPIC}:${Model.Anthropic.OPUS_4_7}`]: {
+    inputPricePerToken: 5 / PER_MILLION,
+    outputPricePerToken: 25 / PER_MILLION
+  },
   [`${Provider.ANTHROPIC}:${Model.Anthropic.OPUS_4_6}`]: {
     inputPricePerToken: 5 / PER_MILLION,
     outputPricePerToken: 25 / PER_MILLION
@@ -810,6 +897,10 @@ var PRICING_TABLE = {
     inputPricePerToken: 1 / PER_MILLION,
     outputPricePerToken: 5 / PER_MILLION
   },
+  [`${Provider.OPENAI}:${Model.OpenAI.GPT_5_5}`]: {
+    inputPricePerToken: 5 / PER_MILLION,
+    outputPricePerToken: 30 / PER_MILLION
+  },
   [`${Provider.OPENAI}:${Model.OpenAI.GPT_5_4}`]: {
     inputPricePerToken: 2.5 / PER_MILLION,
     outputPricePerToken: 15 / PER_MILLION
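A worked cost example using the new gpt-5.5 entry (arithmetic only; the variable names are illustrative):

const inputTokens = 1000;
const outputTokens = 500;
// 1000 * (5 / 1e6) + 500 * (30 / 1e6) = 0.005 + 0.015 = $0.02
const costUsd = inputTokens * (5 / 1e6) + outputTokens * (30 / 1e6);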
@@ -950,6 +1041,51 @@ async function generateAiDiff(imgA, imgB, model, driver) {
 import { readFile } from "fs/promises";
 import { extname } from "path";
 import sharp2 from "sharp";
+
+// src/core/input-detect.ts
+function isFilePath(input) {
+  return input.startsWith("/") || input.startsWith("./") || input.startsWith("../") || input.includes("\\");
+}
+function isUrl(input) {
+  return input.startsWith("http://") || input.startsWith("https://");
+}
+function isDataUrl(input) {
+  return input.startsWith("data:");
+}
+function parseDataUrl(input) {
+  const match = /^data:([^;]+);base64,(.+)$/.exec(input);
+  if (!match?.[1] || !match[2]) return null;
+  return { mimeType: match[1], base64Payload: match[2] };
+}
+function decodeBase64(payload) {
+  if (!/^[A-Za-z0-9+/\n\r]+=*$/.test(payload)) {
+    throw new Error("Invalid base64 string");
+  }
+  return Buffer.from(payload, "base64");
+}
+function looksLikeImageBase64(input) {
+  return input.startsWith("iVBOR") || // PNG (0x89 0x50 0x4E 0x47)
+  input.startsWith("/9j/") || // JPEG (0xFF 0xD8 0xFF)
+  input.startsWith("R0lGOD") || // GIF (0x47 0x49 0x46)
+  input.startsWith("UklGR");
+}
+function looksLikeVideoBase64(input) {
+  return input.startsWith("GkXf") || input.startsWith("AAAA");
+}
+async function fetchToBuffer(url, timeoutMs) {
+  const response = await fetch(url, {
+    signal: AbortSignal.timeout(timeoutMs)
+  });
+  if (!response.ok) {
+    throw new Error(`HTTP ${response.status}`);
+  }
+  const arrayBuffer = await response.arrayBuffer();
+  const data = Buffer.from(arrayBuffer);
+  const contentType = response.headers.get("content-type")?.split(";")[0]?.trim() ?? null;
+  return { data, contentType };
+}
+
+// src/core/image.ts
 var SUPPORTED_FORMATS = /* @__PURE__ */ new Set([
   "image/jpeg",
   "image/png",
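Illustrative behavior of the extracted helpers (a sketch, not part of the diff; the sample strings are assumptions):

isDataUrl("data:video/mp4;base64,AAAAIGZ0eXA=");     // true
parseDataUrl("data:image/png;base64,iVBORw0KGgo=");  // { mimeType: "image/png", base64Payload: "iVBORw0KGgo=" }
looksLikeImageBase64("iVBORw0KGgo=");                // true: "iVBOR" is the base64 PNG magic
looksLikeVideoBase64("GkXfAAAA");                    // true: "GkXf" is the base64 EBML (WebM/MKV) magic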
@@ -972,18 +1108,6 @@ function getMimeFromExtension(filePath) {
   const ext = extname(filePath).toLowerCase();
   return EXTENSION_TO_MIME[ext];
 }
-function isFilePath(input) {
-  return input.startsWith("/") || input.startsWith("./") || input.startsWith("../") || input.includes("\\");
-}
-function isUrl(input) {
-  return input.startsWith("http://") || input.startsWith("https://");
-}
-function isBase64Image(input) {
-  return input.startsWith("iVBOR") || // PNG (0x89 0x50 0x4E 0x47)
-  input.startsWith("/9j/") || // JPEG (0xFF 0xD8 0xFF)
-  input.startsWith("R0lGOD") || // GIF (0x47 0x49 0x46)
-  input.startsWith("UklGR");
-}
 function detectMimeType(data) {
   if (data[0] === 255 && data[1] === 216 && data[2] === 255) {
     return "image/jpeg";
@@ -1037,45 +1161,38 @@ async function loadFromFilePath(filePath) {
   return { data: fileData, mimeType };
 }
 async function loadFromUrl(url) {
-  let
+  let result;
   try {
-
-      signal: AbortSignal.timeout(URL_FETCH_TIMEOUT_MS)
-    });
+    result = await fetchToBuffer(url, URL_FETCH_TIMEOUT_MS);
   } catch (err) {
     throw new VisualAIImageError(
       `Failed to fetch image from URL: ${url} \u2014 ${err instanceof Error ? err.message : String(err)}`
     );
   }
-
-    throw new VisualAIImageError(
-      `Failed to fetch image from URL: ${url} \u2014 HTTP ${response.status}`
-    );
-  }
-  const arrayBuffer = await response.arrayBuffer();
-  const data = Buffer.from(arrayBuffer);
-  const contentType = response.headers.get("content-type")?.split(";")[0]?.trim() ?? null;
+  const { data, contentType } = result;
   const mimeType = contentType && isSupportedMimeType(contentType) ? contentType : detectMimeType(data);
   return { data, mimeType };
 }
 function loadFromBase64(input) {
   let base64Data = input;
   let mimeType;
-  if (input
-  const
-  if (!
+  if (isDataUrl(input)) {
+    const parsed = parseDataUrl(input);
+    if (!parsed) {
       throw new VisualAIImageError("Invalid data URL format");
     }
-  if (!isSupportedMimeType(
-    throw new VisualAIImageError(`Unsupported image format: ${
+    if (!isSupportedMimeType(parsed.mimeType)) {
+      throw new VisualAIImageError(`Unsupported image format: ${parsed.mimeType}`);
     }
-  mimeType =
-  base64Data =
+    mimeType = parsed.mimeType;
+    base64Data = parsed.base64Payload;
   }
-
+  let data;
+  try {
+    data = decodeBase64(base64Data);
+  } catch {
     throw new VisualAIImageError("Invalid base64 string");
   }
-  const data = Buffer.from(base64Data, "base64");
   if (data.length === 0) {
     throw new VisualAIImageError("Empty image data after base64 decode");
   }
@@ -1094,9 +1211,9 @@ async function normalizeImage(input) {
   } else if (typeof input === "string") {
     if (isUrl(input)) {
       ({ data, mimeType } = await loadFromUrl(input));
-    } else if (input
+    } else if (isDataUrl(input)) {
       ({ data, mimeType } = loadFromBase64(input));
-    } else if (
+    } else if (looksLikeImageBase64(input)) {
       ({ data, mimeType } = loadFromBase64(input));
     } else if (isFilePath(input)) {
       ({ data, mimeType } = await loadFromFilePath(input));
@@ -1124,6 +1241,379 @@ async function normalizeImage(input) {
   };
 }
 
+// src/core/video.ts
+import { mkdtemp, readFile as readFile2, readdir, rm, writeFile } from "fs/promises";
+import { tmpdir } from "os";
+import { extname as extname2, join } from "path";
+var FRAME_MAX_DIMENSION = 1568;
+var DEFAULT_FPS = 1;
+var DEFAULT_MAX_FRAMES = 10;
+var DEFAULT_MAX_DURATION_SECONDS = 10;
+var MAX_FRAMES_HARD_CAP = 60;
+var FFPROBE_TIMEOUT_MS = 15e3;
+var FFMPEG_RUN_TIMEOUT_MS = 6e4;
+var VIDEO_EXTENSIONS = {
+  ".mp4": "video/mp4",
+  ".m4v": "video/mp4",
+  ".webm": "video/webm",
+  ".mov": "video/quicktime",
+  ".qt": "video/quicktime",
+  ".mkv": "video/x-matroska"
+};
+var VIDEO_MIME_TYPES = /* @__PURE__ */ new Set([
+  "video/mp4",
+  "video/webm",
+  "video/quicktime",
+  "video/x-matroska"
+]);
+function isSupportedVideoMimeType(value) {
+  return VIDEO_MIME_TYPES.has(value);
+}
+function getVideoMimeFromExtension(filePath) {
+  const ext = extname2(filePath).toLowerCase();
+  return VIDEO_EXTENSIONS[ext];
+}
+function detectVideoMimeType(data) {
+  if (data.length < 12) return null;
+  if (data[4] === 102 && data[5] === 116 && data[6] === 121 && data[7] === 112) {
+    if (data[8] === 113 && data[9] === 116 && data[10] === 32 && data[11] === 32) {
+      return "video/quicktime";
+    }
+    return "video/mp4";
+  }
+  if (data[0] === 26 && data[1] === 69 && data[2] === 223 && data[3] === 163) {
+    return "video/webm";
+  }
+  return null;
+}
+async function resolveVideoToPath(input) {
+  if (Buffer.isBuffer(input) || input instanceof Uint8Array) {
+    const buf2 = Buffer.isBuffer(input) ? input : Buffer.from(input);
+    const mimeType2 = detectVideoMimeType(buf2);
+    if (!mimeType2) {
+      throw new VisualAIVideoError("Unable to detect video format from buffer contents");
+    }
+    return writeBufferToTemp(buf2, mimeType2);
+  }
+  if (typeof input !== "string") {
+    throw new VisualAIVideoError(
+      "Invalid video input: expected Buffer, Uint8Array, file path, data URL, or base64 string"
+    );
+  }
+  if (isDataUrl(input)) {
+    const parsed = parseDataUrl(input);
+    if (!parsed) {
+      throw new VisualAIVideoError("Invalid data URL format");
+    }
+    if (!isSupportedVideoMimeType(parsed.mimeType)) {
+      throw new VisualAIVideoError(`Unsupported video format: ${parsed.mimeType}`);
+    }
+    let buf2;
+    try {
+      buf2 = decodeBase64(parsed.base64Payload);
+    } catch {
+      throw new VisualAIVideoError("Invalid base64 payload in data URL");
+    }
+    return writeBufferToTemp(buf2, parsed.mimeType);
+  }
+  if (isFilePath(input)) {
+    const mimeType2 = getVideoMimeFromExtension(input);
+    if (!mimeType2) {
+      throw new VisualAIVideoError(
+        `Unsupported video file extension: ${input}. Supported: .mp4, .webm, .mov, .mkv`
+      );
+    }
+    return { path: input, mimeType: mimeType2, cleanup: async () => {
+    } };
+  }
+  let buf;
+  try {
+    buf = decodeBase64(input);
+  } catch {
+    throw new VisualAIVideoError(
+      `Unrecognized video input: "${input.slice(0, 80)}". Expected a file path, data URL, or base64-encoded video string.`
+    );
+  }
+  const mimeType = detectVideoMimeType(buf);
+  if (!mimeType) {
+    throw new VisualAIVideoError(
+      `Unrecognized video input: "${input.slice(0, 80)}". Expected a file path, data URL, or base64-encoded video string.`
+    );
+  }
+  return writeBufferToTemp(buf, mimeType);
+}
+async function writeBufferToTemp(data, mimeType) {
+  const dir = await mkdtemp(join(tmpdir(), "visual-ai-video-"));
+  try {
+    const ext = extensionFor(mimeType);
+    const path = join(dir, `input${ext}`);
+    await writeFile(path, data);
+    return {
+      path,
+      mimeType,
+      cleanup: async () => {
+        try {
+          await rm(dir, { recursive: true, force: true });
+        } catch {
+        }
+      }
+    };
+  } catch (err) {
+    try {
+      await rm(dir, { recursive: true, force: true });
+    } catch {
+    }
+    throw err;
+  }
+}
+function extensionFor(mimeType) {
+  switch (mimeType) {
+    case "video/mp4":
+      return ".mp4";
+    case "video/webm":
+      return ".webm";
+    case "video/quicktime":
+      return ".mov";
+    case "video/x-matroska":
+      return ".mkv";
+  }
+}
+var cachedFactoryPromise;
+async function loadFfmpegFactory() {
+  if (cachedFactoryPromise) return cachedFactoryPromise;
+  cachedFactoryPromise = (async () => {
+    let ffmpegModule;
+    try {
+      ffmpegModule = await import("fluent-ffmpeg");
+    } catch (err) {
+      const code = err?.code;
+      if (code === "ERR_MODULE_NOT_FOUND" || code === "MODULE_NOT_FOUND") {
+        throw new VisualAIVideoError(
+          "Video support requires fluent-ffmpeg. Install it with: pnpm add -D fluent-ffmpeg @ffmpeg-installer/ffmpeg @ffprobe-installer/ffprobe @types/fluent-ffmpeg"
+        );
+      }
+      throw new VisualAIVideoError(
+        `Failed to load fluent-ffmpeg: ${err instanceof Error ? err.message : String(err)}`
+      );
+    }
+    const factory = ffmpegModule.default ?? ffmpegModule;
+    try {
+      const installer = await import("@ffmpeg-installer/ffmpeg");
+      const path = (installer.default ?? installer).path;
+      if (path) factory.setFfmpegPath(path);
+    } catch (err) {
+      const code = err?.code;
+      if (code !== "ERR_MODULE_NOT_FOUND" && code !== "MODULE_NOT_FOUND") {
+        process.stderr.write(
+          `[visual-ai-assertions] warning: @ffmpeg-installer/ffmpeg failed to load: ${err instanceof Error ? err.message : String(err)}
+`
+        );
+      }
+    }
+    try {
+      const installer = await import("@ffprobe-installer/ffprobe");
+      const path = (installer.default ?? installer).path;
+      if (path) factory.setFfprobePath(path);
+    } catch (err) {
+      const code = err?.code;
+      if (code !== "ERR_MODULE_NOT_FOUND" && code !== "MODULE_NOT_FOUND") {
+        process.stderr.write(
+          `[visual-ai-assertions] warning: @ffprobe-installer/ffprobe failed to load: ${err instanceof Error ? err.message : String(err)}
+`
+        );
+      }
+    }
+    return factory;
+  })();
+  try {
+    return await cachedFactoryPromise;
+  } catch (err) {
+    cachedFactoryPromise = void 0;
+    throw err;
+  }
+}
+async function probeDurationSeconds(videoPath) {
+  const ffmpeg = await loadFfmpegFactory();
+  return new Promise((resolve, reject) => {
+    let settled = false;
+    const finish = (fn) => {
+      if (settled) return;
+      settled = true;
+      clearTimeout(timer);
+      fn();
+    };
+    const timer = setTimeout(() => {
+      finish(() => {
+        reject(
+          new VisualAIVideoError(
+            `ffprobe timed out after ${FFPROBE_TIMEOUT_MS}ms while probing ${videoPath}`
+          )
+        );
+      });
+    }, FFPROBE_TIMEOUT_MS);
+    ffmpeg.ffprobe(videoPath, (err, data) => {
+      if (err) {
+        finish(() => {
+          reject(
+            new VisualAIVideoError(
+              `Failed to probe video metadata: ${err.message}. Ensure ffprobe is installed (e.g. via @ffprobe-installer/ffprobe).`
+            )
+          );
+        });
+        return;
+      }
+      const raw = data.format?.duration;
+      const duration = typeof raw === "string" ? Number(raw) : raw;
+      if (!duration || !Number.isFinite(duration) || duration <= 0) {
+        finish(() => {
+          reject(new VisualAIVideoError("Video duration could not be determined"));
+        });
+        return;
+      }
+      finish(() => {
+        resolve(duration);
+      });
+    });
+  });
+}
+async function extractFrames(videoPath, options = {}) {
+  const fps = options.fps ?? DEFAULT_FPS;
+  const maxFrames = options.maxFrames ?? DEFAULT_MAX_FRAMES;
+  const maxDurationSeconds = options.maxDurationSeconds ?? DEFAULT_MAX_DURATION_SECONDS;
+  if (!Number.isFinite(fps) || fps <= 0) {
+    throw new VisualAIVideoError(`Invalid fps: ${fps}. Must be a finite number > 0.`);
+  }
+  if (!Number.isFinite(maxFrames) || maxFrames <= 0) {
+    throw new VisualAIVideoError(`Invalid maxFrames: ${maxFrames}. Must be a finite number > 0.`);
+  }
+  if (maxFrames > MAX_FRAMES_HARD_CAP) {
+    throw new VisualAIVideoError(
+      `maxFrames ${maxFrames} exceeds the hard cap of ${MAX_FRAMES_HARD_CAP}. Lower maxFrames or open an issue if you need a larger limit.`
+    );
+  }
+  if (!Number.isFinite(maxDurationSeconds) || maxDurationSeconds <= 0) {
+    throw new VisualAIVideoError(
+      `Invalid maxDurationSeconds: ${maxDurationSeconds}. Must be a finite number > 0.`
+    );
+  }
+  const ffmpeg = await loadFfmpegFactory();
+  const durationSeconds = await probeDurationSeconds(videoPath);
+  if (durationSeconds > maxDurationSeconds) {
+    throw new VisualAIVideoError(
+      `Video duration ${durationSeconds.toFixed(2)}s exceeds limit of ${maxDurationSeconds}s. Pass { maxDurationSeconds: N } to override, or trim the source video.`
+    );
+  }
+  const outputDir = await mkdtemp(join(tmpdir(), "visual-ai-frames-"));
+  try {
+    const filter = `fps=${fps},scale='if(gt(iw,ih),min(${FRAME_MAX_DIMENSION},iw),-2)':'if(gt(iw,ih),-2,min(${FRAME_MAX_DIMENSION},ih))':flags=area`;
+    await new Promise((resolve, reject) => {
+      let settled = false;
+      const cmd = ffmpeg(videoPath);
+      const finish = (fn) => {
+        if (settled) return;
+        settled = true;
+        clearTimeout(timer);
+        fn();
+      };
+      const timer = setTimeout(() => {
+        try {
+          cmd.kill("SIGKILL");
+        } catch {
+        }
+        finish(() => {
+          reject(
+            new VisualAIVideoError(
+              `ffmpeg frame extraction timed out after ${FFMPEG_RUN_TIMEOUT_MS}ms`
+            )
+          );
+        });
+      }, FFMPEG_RUN_TIMEOUT_MS);
+      cmd.outputOptions(["-vf", filter, "-vframes", String(maxFrames), "-q:v", "3"]).output(join(outputDir, "frame-%04d.jpg")).on("end", () => {
+        finish(() => {
+          resolve();
+        });
+      }).on("error", (err) => {
+        finish(() => {
+          reject(new VisualAIVideoError(`ffmpeg frame extraction failed: ${err.message}`));
+        });
+      }).run();
+    });
+    const files = (await readdir(outputDir)).filter((name) => name.endsWith(".jpg")).sort();
+    if (files.length === 0) {
+      throw new VisualAIVideoError(
+        "No frames could be extracted from the video. The source may be corrupt or empty."
+      );
+    }
+    const frames = await Promise.all(
+      files.map(async (name, index) => {
+        const data = await readFile2(join(outputDir, name));
+        const timestampSeconds = Math.min(durationSeconds, (index + 0.5) / fps);
+        let cachedBase64;
+        return {
+          data,
+          mimeType: "image/jpeg",
+          get base64() {
+            if (cachedBase64 === void 0) {
+              cachedBase64 = data.toString("base64");
+            }
+            return cachedBase64;
+          },
+          timestampSeconds,
+          index
+        };
+      })
+    );
+    return { frames, durationSeconds };
+  } finally {
+    try {
+      await rm(outputDir, { recursive: true, force: true });
+    } catch {
+    }
+  }
+}
+
+// src/core/media.ts
+var VIDEO_MAGIC_BYTE_PREFIX_LEN = 16;
+function isVideoInput(input) {
+  if (Buffer.isBuffer(input) || input instanceof Uint8Array) {
+    const buf = Buffer.isBuffer(input) ? input : Buffer.from(input);
+    return detectVideoMimeType(buf) !== null;
+  }
+  if (typeof input !== "string") return false;
+  if (isDataUrl(input)) {
+    const parsed = parseDataUrl(input);
+    return parsed?.mimeType.startsWith("video/") ?? false;
+  }
+  if (isFilePath(input)) {
+    return getVideoMimeFromExtension(input) !== void 0;
+  }
+  if (looksLikeVideoBase64(input)) {
+    try {
+      const buf = decodeBase64(input.slice(0, VIDEO_MAGIC_BYTE_PREFIX_LEN));
+      return detectVideoMimeType(buf) !== null;
+    } catch {
+      return false;
+    }
+  }
+  return false;
+}
+async function normalizeMedia(input, videoOptions) {
+  if (isVideoInput(input)) {
+    const { path, cleanup } = await resolveVideoToPath(input);
+    try {
+      const { frames, durationSeconds } = await extractFrames(path, videoOptions);
+      return { kind: "video", frames, durationSeconds };
+    } finally {
+      try {
+        await cleanup();
+      } catch {
+      }
+    }
+  }
+  const image = await normalizeImage(input);
+  return { kind: "image", image };
+}
+
 // src/types.ts
 import { z } from "zod";
 var IssuePrioritySchema = z.enum(["critical", "major", "minor"]);
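Putting the new pipeline together, a minimal usage sketch (the file name and option values are illustrative):

// normalizeMedia detects video input (extension, data URL, or magic bytes),
// extracts frames via ffmpeg, and falls back to normalizeImage otherwise.
const media = await normalizeMedia("./recording.webm", {
  fps: 2, // defaults to 1 frame per second
  maxFrames: 20, // defaults to 10; hard cap of 60
  maxDurationSeconds: 15 // defaults to 10; longer clips throw VisualAIVideoError
});
if (media.kind === "video") {
  console.log(media.durationSeconds);
  console.log(media.frames[0].timestampSeconds); // (index + 0.5) / fps, capped at the clip duration
}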
@@ -1148,7 +1638,13 @@ var StatementResultSchema = z.object({
   statement: z.string(),
   pass: z.boolean(),
   reasoning: z.string(),
-  confidence: ConfidenceSchema.optional()
+  confidence: ConfidenceSchema.optional(),
+  /**
+   * For video inputs, the approximate timestamp (in seconds, from the start of the clip)
+   * of the frame that most clearly demonstrates the statement. `null` when the statement
+   * fails or applies across the whole clip. Always omitted for image inputs.
+   */
+  timestampSeconds: z.number().nonnegative().nullable().optional()
 });
 var UsageInfoSchema = z.object({
   inputTokens: z.number(),
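With this field, a video statement result can pinpoint the moment a statement was satisfied. An example shape, taken from the CHECK_OUTPUT_SCHEMA_VIDEO example earlier in this diff:

{
  "statement": "A success toast with text 'Saved' appears",
  "pass": true,
  "reasoning": "A green toast labeled 'Saved' is visible in the bottom-right at the 3.5s frame",
  "confidence": "high",
  "timestampSeconds": 3.5
}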
@@ -1177,6 +1673,11 @@ var CompareResultSchema = BaseResultSchema.extend({
 var AskResultSchema = z.object({
   summary: z.string(),
   issues: z.array(IssueSchema),
+  /**
+   * For video inputs, the indices of frames the model relied on to answer.
+   * Indices are 0-based and refer to entries in `frames.timestampsSeconds`.
+   */
+  frameReferences: z.array(z.number().int().nonnegative()).optional(),
   usage: UsageInfoSchema.optional()
 });
 
@@ -1252,6 +1753,29 @@ function createDriver(provider, config) {
 var checkSchemaOptions = toSchemaOptions(CheckResponseSchema);
 var askSchemaOptions = toSchemaOptions(AskResponseSchema);
 var compareSchemaOptions = toSchemaOptions(CompareResponseSchema);
+function mediaToProviderInputs(media) {
+  if (media.kind === "image") {
+    return {
+      images: [media.image],
+      mediaContext: { kind: "image" },
+      framesMetadata: void 0
+    };
+  }
+  const timestamps = media.frames.map((f) => f.timestampSeconds);
+  return {
+    images: media.frames,
+    mediaContext: {
+      kind: "video",
+      frameTimestamps: timestamps,
+      durationSeconds: media.durationSeconds
+    },
+    framesMetadata: {
+      count: media.frames.length,
+      timestampsSeconds: timestamps,
+      durationSeconds: media.durationSeconds
+    }
+  };
+}
 function visualAI(config = {}) {
   const resolvedConfig = resolveConfig(config);
   const driverConfig = {
@@ -1280,34 +1804,44 @@ function visualAI(config = {}) {
     });
   }
   return {
-    async check(
+    async check(input, statements, options) {
       const stmts = Array.isArray(statements) ? statements : [statements];
       if (stmts.length === 0) {
        throw new VisualAIConfigError("At least one statement is required for check()");
       }
       return withErrorDebug(resolvedConfig, "check", async () => {
-        const
-        const
+        const media = await normalizeMedia(input, options?.video);
+        const { images, mediaContext, framesMetadata } = mediaToProviderInputs(media);
+        const prompt = buildCheckPrompt(stmts, {
+          instructions: options?.instructions,
+          media: mediaContext
+        });
         debugLog(resolvedConfig, "check prompt", prompt, "prompt");
-        const response = await timedSendMessage(driver,
+        const response = await timedSendMessage(driver, images, prompt, checkSchemaOptions);
         debugLog(resolvedConfig, "check response", response.text, "response");
         const result = parseCheckResponse(response.text);
         return {
           ...result,
+          ...framesMetadata ? { frames: framesMetadata } : {},
          usage: processUsage("check", response.usage, response.durationSeconds, resolvedConfig)
         };
       });
     },
-    async ask(
+    async ask(input, userPrompt, options) {
       return withErrorDebug(resolvedConfig, "ask", async () => {
-        const
-        const
+        const media = await normalizeMedia(input, options?.video);
+        const { images, mediaContext, framesMetadata } = mediaToProviderInputs(media);
+        const prompt = buildAskPrompt(userPrompt, {
+          instructions: options?.instructions,
+          media: mediaContext
+        });
         debugLog(resolvedConfig, "ask prompt", prompt, "prompt");
-        const response = await timedSendMessage(driver,
+        const response = await timedSendMessage(driver, images, prompt, askSchemaOptions);
         debugLog(resolvedConfig, "ask response", response.text, "response");
         const result = parseAskResponse(response.text);
         return {
           ...result,
+          ...framesMetadata ? { frames: framesMetadata } : {},
          usage: processUsage("ask", response.usage, response.durationSeconds, resolvedConfig)
         };
       });
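End to end, check() and ask() now accept video inputs directly. A minimal sketch (path, statement, and option values are illustrative; the `video` option is forwarded to frame extraction):

import { visualAI } from "visual-ai-assertions";

const ai = visualAI();
const result = await ai.check("./signup-flow.mp4", [
  "A success toast with text 'Saved' appears"
], { video: { fps: 2, maxFrames: 20 } });

console.log(result.pass);
console.log(result.frames?.timestampsSeconds); // frame metadata is attached only for video inputs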
@@ -1495,6 +2029,7 @@ export {
   VisualAIRateLimitError,
   VisualAIResponseParseError,
   VisualAITruncationError,
+  VisualAIVideoError,
   assertVisualCompareResult,
   assertVisualResult,
   formatCheckResult,