video-context-mcp-server 1.1.4 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/README.md +52 -27
  2. package/dist/generated/version.d.ts +1 -1
  3. package/dist/generated/version.js +1 -1
  4. package/dist/index.js +12 -1
  5. package/dist/index.js.map +1 -1
  6. package/dist/services/ffmpeg.d.ts +37 -0
  7. package/dist/services/ffmpeg.d.ts.map +1 -1
  8. package/dist/services/ffmpeg.js +175 -0
  9. package/dist/services/ffmpeg.js.map +1 -1
  10. package/dist/services/providerRouter.d.ts +9 -0
  11. package/dist/services/providerRouter.d.ts.map +1 -1
  12. package/dist/services/providerRouter.js +14 -0
  13. package/dist/services/providerRouter.js.map +1 -1
  14. package/dist/services/redactionDetector.d.ts +187 -0
  15. package/dist/services/redactionDetector.d.ts.map +1 -0
  16. package/dist/services/redactionDetector.js +766 -0
  17. package/dist/services/redactionDetector.js.map +1 -0
  18. package/dist/tools/analyzeVideo.d.ts.map +1 -1
  19. package/dist/tools/analyzeVideo.js +4 -3
  20. package/dist/tools/analyzeVideo.js.map +1 -1
  21. package/dist/tools/redactSensitive.d.ts +11 -0
  22. package/dist/tools/redactSensitive.d.ts.map +1 -0
  23. package/dist/tools/redactSensitive.js +466 -0
  24. package/dist/tools/redactSensitive.js.map +1 -0
  25. package/dist/tools/schemas.d.ts +53 -0
  26. package/dist/tools/schemas.d.ts.map +1 -1
  27. package/dist/tools/schemas.js +87 -0
  28. package/dist/tools/schemas.js.map +1 -1
  29. package/dist/tools/summarizeVideo.d.ts +17 -0
  30. package/dist/tools/summarizeVideo.d.ts.map +1 -1
  31. package/dist/tools/summarizeVideo.js +36 -2
  32. package/dist/tools/summarizeVideo.js.map +1 -1
  33. package/dist/types/redaction.d.ts +32 -0
  34. package/dist/types/redaction.d.ts.map +1 -0
  35. package/dist/types/redaction.js +5 -0
  36. package/dist/types/redaction.js.map +1 -0
  37. package/dist/utils/license.js +1 -1
  38. package/dist/utils/license.js.map +1 -1
  39. package/package.json +1 -1
@@ -0,0 +1,766 @@
1
+ /**
2
+ * Redaction Detector
3
+ * AI-assisted sensitive region detection in video frames.
4
+ * Samples frames, sends to a vision provider, and parses JSON detections.
5
+ */
6
+ import { z } from 'zod';
7
+ import { extractFramesAtTimestamps } from './ffmpeg.js';
8
+ import { bufferToBase64String } from '../utils/base64.js';
9
+ // ── Detection response schema ──────────────────────────────────────────────
10
/**
 * Schema for a bounding box expressed as fractions of the frame:
 * `left`, `top`, `width` and `height` must each be a number in [0, 1].
 */
const detectionBoxSchema = z.object(Object.fromEntries(['left', 'top', 'width', 'height'].map((side) => [
    side,
    z.number().min(0).max(1),
])));
17
/**
 * Extract a normalised `{ left, top, width, height }` box from the
 * bounding-box representations emitted by vision models.
 *
 * Lookup order:
 *   1. `box` — an object with a numeric `left`; returned as-is (not rescaled).
 *   2. The first of `box_2d`, `visual_region`, `region`, `coordinates`,
 *      `bounding_box`, `bbox` that holds an array whose first four entries are
 *      finite numbers. Arrays with non-numeric entries are skipped so that a
 *      malformed field never yields a NaN box.
 *
 * Array interpretation:
 *   - `bbox` is read as `[x1, y1, x2, y2]`.
 *   - every other field is read as `[y1, x1, y2, x2]` (Gemini-style order).
 *   - if any of the four values is greater than 1 the array is treated as
 *     0–1000 scaled and divided by 1000; otherwise values are kept as 0–1.
 *
 * @param obj Raw detection object from the provider.
 * @returns The extracted box, or `null` when no usable representation exists.
 */
function tryExtractBox(obj) {
    // Already in canonical object form.
    const existing = obj['box'];
    if (typeof existing === 'object' &&
        existing !== null &&
        typeof existing['left'] === 'number') {
        return existing;
    }
    const arrayFields = [
        'box_2d',
        'visual_region',
        'region',
        'coordinates',
        'bounding_box',
        'bbox',
    ];
    for (const field of arrayFields) {
        const val = obj[field];
        if (!Array.isArray(val) || val.length < 4) {
            continue;
        }
        // Only the first four entries describe the box; ignore any trailing data.
        const corners = val.slice(0, 4);
        if (!corners.every((n) => typeof n === 'number' && Number.isFinite(n))) {
            continue;
        }
        const [first, second, third, fourth] = corners;
        // `bbox` is x-first; all other array fields are y-first.
        const xFirst = field === 'bbox';
        const x1 = xFirst ? first : second;
        const y1 = xFirst ? second : first;
        const x2 = xFirst ? third : fourth;
        const y2 = xFirst ? fourth : third;
        // Values > 1 are assumed to be 0–1000 scaled; ≤ 1 are already 0–1.
        const scale = Math.max(...corners) > 1 ? 1000 : 1;
        return {
            left: Math.min(x1, x2) / scale,
            top: Math.min(y1, y2) / scale,
            width: Math.abs(x2 - x1) / scale,
            height: Math.abs(y2 - y1) / scale,
        };
    }
    return null;
}
77
/**
 * Normalise a raw provider detection into the canonical shape expected by the
 * detection schema.
 *
 * - Resolves the bounding box via `tryExtractBox` and stores it under `box`,
 *   dropping the alternate coordinate fields it may have come from.
 * - Uses `type` as the label when `label` is not a string, otherwise defaults
 *   the label to `'sensitive'`.
 * - Defaults `confidence` to 0.9 when it is not a number.
 *
 * The input is never mutated: a shallow copy is returned. Non-object values
 * (and arrays) are returned unchanged so schema validation can reject them.
 *
 * @param raw Value found in a frame's `detections` array.
 * @returns A normalised copy, or `raw` itself when it is not a plain object.
 */
function normaliseRawDetection(raw) {
    if (typeof raw !== 'object' || raw === null || Array.isArray(raw)) {
        return raw;
    }
    const alternateBoxFields = [
        'box_2d',
        'visual_region',
        'region',
        'coordinates',
        'bounding_box',
        'bbox',
    ];
    // Work on a copy so the caller's parsed JSON stays intact (e.g. for error previews).
    const detection = { ...raw };
    const box = tryExtractBox(raw);
    if (box) {
        detection['box'] = box;
        for (const field of alternateBoxFields) {
            delete detection[field];
        }
    }
    // Fall back to 'type' when 'label' is missing, then to a generic label.
    if (typeof detection['label'] !== 'string') {
        detection['label'] = typeof detection['type'] === 'string' ? detection['type'] : 'sensitive';
    }
    // Default confidence to 0.9 when not provided.
    if (typeof detection['confidence'] !== 'number') {
        detection['confidence'] = 0.9;
    }
    return detection;
}
116
/**
 * Schema for one detection inside a frame. Input is first passed through
 * `normaliseRawDetection`; the box is optional and its values are not
 * range-checked here.
 */
const rawFrameDetectionSchema = (() => {
    const looseBox = z.object({
        left: z.number(),
        top: z.number(),
        width: z.number(),
        height: z.number(),
    });
    const detectionShape = z.object({
        label: z.string(),
        confidence: z.number().min(0).max(1),
        box: looseBox.optional(),
        reason: z.string().optional(),
    });
    return z.preprocess(normaliseRawDetection, detectionShape);
})();
130
/**
 * Schema for the detections reported for one sampled frame: a numeric
 * `frameIndex`, a numeric `timestampSec`, and a list of detections.
 */
const frameResultSchema = z.object({
    detections: z.array(rawFrameDetectionSchema),
    frameIndex: z.number(),
    timestampSec: z.number(),
});
136
/** Schema for the complete provider response: `{ frames: [...] }`. */
export const detectionResponseSchema = z.object({ frames: z.array(frameResultSchema) });
140
/**
 * Post-parse box normalisation.
 *
 * - Detections without a `box` are dropped.
 * - A box whose four values are all ≤ 1 is treated as already normalised.
 * - Any other box is treated as absolute pixel coordinates and divided by the
 *   matching video dimension.
 * - In both cases every value is clamped to the 0–1 range, so negative or
 *   oversized values never leave this function.
 *
 * @param parsed Validated detection response (`{ frames: [...] }`).
 * @param videoWidth Frame width in pixels, used for pixel-space boxes.
 * @param videoHeight Frame height in pixels, used for pixel-space boxes.
 * @returns A new response object; the input is not modified.
 */
function normaliseBoxCoordinates(parsed, videoWidth, videoHeight) {
    const clampToUnit = (value) => Math.max(0, Math.min(1, value));
    const toUnitBox = (box) => {
        const alreadyNormalised = box.left <= 1 && box.top <= 1 && box.width <= 1 && box.height <= 1;
        const divisorX = alreadyNormalised ? 1 : videoWidth;
        const divisorY = alreadyNormalised ? 1 : videoHeight;
        return {
            left: clampToUnit(box.left / divisorX),
            top: clampToUnit(box.top / divisorY),
            width: clampToUnit(box.width / divisorX),
            height: clampToUnit(box.height / divisorY),
        };
    };
    return {
        frames: parsed.frames.map((frame) => ({
            ...frame,
            detections: frame.detections
                .filter((detection) => detection.box !== undefined)
                .map((detection) => ({ ...detection, box: toUnitBox(detection.box) })),
        })),
    };
}
171
+ // ── Detection prompt ───────────────────────────────────────────────────────
172
+ /**
173
+ * System prompt that instructs the vision model to detect sensitive regions.
174
+ */
175
+ const DETECTION_SYSTEM_PROMPT = `You are a security-focused video analysis assistant. Your task is to identify regions in video frames that may contain sensitive information.
176
+
177
+ Look for:
178
+ - API keys, tokens, passwords, or secrets in code editors or terminals
179
+ - Email addresses, phone numbers, or personal identifiers
180
+ - Internal URLs, IP addresses, or infrastructure details
181
+ - Account IDs, session tokens, or authentication cookies
182
+ - Financial data, credit card numbers, or account balances
183
+ - Any text or UI element that appears to be confidential
184
+
185
+ You will receive multiple images. Each image is a sampled frame from a video, sent in order.
186
+
187
+ Return ONLY a JSON object in EXACTLY this format (no markdown, no extra text):
188
+ {
189
+ "frames": [
190
+ {
191
+ "frameIndex": 0,
192
+ "timestampSec": 0.0,
193
+ "detections": [
194
+ {
195
+ "label": "api_key",
196
+ "confidence": 0.95,
197
+ "box": { "left": 0.1, "top": 0.2, "width": 0.4, "height": 0.05 },
198
+ "reason": "long token-like string visible in terminal"
199
+ }
200
+ ]
201
+ }
202
+ ]
203
+ }
204
+
205
+ Rules:
206
+ - frameIndex is the 0-based index of each image you received (first image = 0, second = 1, etc.)
207
+ - timestampSec is your best estimate of the frame time; use 0 if unknown
208
+ - box coordinates are normalized 0–1 relative to the frame dimensions (left=x, top=y)
209
+ - confidence is 0–1; omit detections you are not at least 40% confident about
210
+ - If a frame has no sensitive content, include it with an empty detections array
211
+ - For credentials in CODE EDITORS (VS Code, etc.) or TERMINALS, set left to the left edge of the editor content area (~0.15 if a sidebar is visible, otherwise 0.0), width to reach the right edge (1.0 minus left), and height to cover all sensitive lines. It is far better to over-cover a row than to clip the start of a key.
212
+ - Do NOT include markdown code fences or any text outside the JSON object`;
213
/**
 * Keyword patterns mapped to detection categories. Order matters: the first
 * matching rule wins. Singular and plural forms are both accepted.
 */
const INTENT_RULES = [
    {
        // Credentials: api key(s), token(s), secret(s), password(s), bearer, oauth, jwt, etc.
        pattern: /\b(api[-_ ]?keys?|secrets?|tokens?|bearer|passwords?|credentials?|auth|oauth|jwts?|private[-_ ]?keys?|access[-_ ]?keys?|api[-_ ]?secrets?)\b/i,
        category: 'credentials',
    },
    {
        // Infrastructure: IP address(es), URL(s), hostname(s), session/account IDs.
        // Listed before PII so that "ip address" is claimed here first.
        pattern: /\b(ip[-_ ]?address(?:es)?|urls?|hostnames?|domains?|account[-_ ]?ids?|session[-_ ]?ids?|cookies?|internal[-_ ]?urls?)\b/i,
        category: 'infrastructure',
    },
    {
        // PII: email, phone, personal identifiers. The bare word "address" is
        // intentionally absent to avoid colliding with "ip address".
        pattern: /\b(emails?|phones?|mobile|personal|pii|full[-_ ]?names?|dob|date[-_ ]?of[-_ ]?birth|ssns?|social[-_ ]?security)\b/i,
        category: 'pii',
    },
    {
        // Financial: cards, bank details, balances.
        pattern: /\b(credit[-_ ]?cards?|card[-_ ]?numbers?|cvv|banks?|financial|balances?|ibans?|bic|routing)\b/i,
        category: 'financial',
    },
];
/**
 * Classify a free-text intent string into one of the detection categories
 * (`credentials`, `infrastructure`, `pii`, `financial`).
 *
 * @param intent Free-text description of what should be detected.
 * @returns The category of the first matching rule, or `'general'` when no
 *   rule matches or `intent` is not a string.
 */
export function classifyIntent(intent) {
    if (typeof intent !== 'string') {
        return 'general';
    }
    const matched = INTENT_RULES.find(({ pattern }) => pattern.test(intent));
    return matched ? matched.category : 'general';
}
250
/** Per-category detection defaults that override the schema defaults. */
const CATEGORY_DEFAULTS = {
    // Credentials change quickly in screen-recordings; sample densely and
    // lower confidence so short-lived terminal pastes aren't missed.
    credentials: { sampleIntervalSeconds: 3, maxFrames: 40, minConfidence: 0.4 },
    pii: { sampleIntervalSeconds: 4, maxFrames: 30, minConfidence: 0.45 },
    infrastructure: {
        sampleIntervalSeconds: 4,
        maxFrames: 30,
        minConfidence: 0.45,
    },
    financial: { sampleIntervalSeconds: 4, maxFrames: 30, minConfidence: 0.45 },
    general: { sampleIntervalSeconds: 5, maxFrames: 20, minConfidence: 0.5 },
};
/**
 * Return the sampling/confidence defaults for a given intent category.
 *
 * A fresh copy is returned on every call so callers cannot alter the shared
 * table. Unrecognised categories (including inherited property names such as
 * `'constructor'`) fall back to the `general` defaults.
 *
 * @param category One of the keys of `CATEGORY_DEFAULTS`.
 * @returns `{ sampleIntervalSeconds, maxFrames, minConfidence }`.
 */
export function getDetectionDefaults(category) {
    const isKnown = Object.prototype.hasOwnProperty.call(CATEGORY_DEFAULTS, category);
    return { ...CATEGORY_DEFAULTS[isKnown ? category : 'general'] };
}
268
+ /** Per-category system prompt addition that sharpens model focus. */
269
+ const CATEGORY_FOCUS = {
270
+ credentials: `PRIORITY: Focus especially on API keys, access tokens, bearer tokens, passwords, and any long alphanumeric secret strings. These often appear in:
271
+ - Code-editor config files (.env, mcp.json, settings.json) — look for lines like "KEY": "...", KEY=..., or token: ...
272
+ - Terminal output after commands such as echo, printenv, cat, or export
273
+ - Browser pages with copy-to-clipboard key fields (Deepgram, OpenAI, Groq dashboards)`,
274
+ pii: `PRIORITY: Focus especially on email addresses, phone numbers, full names, dates of birth, national ID numbers (SSN, etc.), and other personally identifiable information.`,
275
+ infrastructure: `PRIORITY: Focus especially on internal URLs, IP addresses, hostnames, domain names, session IDs, and authentication cookies.`,
276
+ financial: `PRIORITY: Focus especially on credit/debit card numbers, CVV codes, IBAN/routing numbers, bank account details, and financial balances.`,
277
+ general: '',
278
+ };
279
/**
 * Assemble the system prompt for detection.
 *
 * When `category` has a non-empty entry in `CATEGORY_FOCUS`, that text is
 * placed before the generic prompt, separated by a blank line. Otherwise the
 * generic prompt is returned unchanged.
 */
function buildDetectionSystemPrompt(category) {
    if (!category) {
        return DETECTION_SYSTEM_PROMPT;
    }
    const emphasis = CATEGORY_FOCUS[category];
    if (!emphasis) {
        return DETECTION_SYSTEM_PROMPT;
    }
    return [emphasis, DETECTION_SYSTEM_PROMPT].join('\n\n');
}
289
/**
 * Compose the user prompt sent alongside the frames.
 *
 * When timestamps are supplied, a frame-index → timestamp table is appended so
 * the model can label each frame with the expected `frameIndex`.
 *
 * @param customInstructions Optional extra instructions appended at the end.
 * @param timestamps Optional per-frame timestamps (seconds), in frame order.
 * @param frameIndexOffset Value added to each position in `timestamps` to form
 *   the listed frameIndex (0 for single-batch calls).
 */
function buildDetectionUserPrompt(customInstructions, timestamps, frameIndexOffset = 0) {
    let prompt = `Analyze the video frames below and detect any regions that may contain sensitive information. Return your findings as JSON.`;
    if (timestamps && timestamps.length > 0) {
        const rows = timestamps.map((seconds, position) => ` frameIndex ${position + frameIndexOffset} → ${seconds.toFixed(1)}s`);
        prompt += `\n\nFrame index to timestamp mapping (use these exact frameIndex values in your response):\n${rows.join('\n')}`;
    }
    if (customInstructions) {
        prompt += `\n\nAdditional instructions: ${customInstructions}`;
    }
    return prompt;
}
307
/**
 * Fallback sampling for clips where interval-based sampling yields at most one
 * frame: start, midpoint and (for clips longer than 2 s) one second before the
 * end, de-duplicated, sorted, and capped at `maxCount` entries.
 */
function buildFallbackTimestamps(duration, maxCount) {
    const base = [0, duration / 2];
    if (duration > 2) {
        base.push(duration - 1);
    }
    return [...new Set(base)].sort((a, b) => a - b).slice(0, maxCount);
}
/**
 * Compute timestamps for evenly-spaced frame samples.
 *
 * When the requested interval would produce more frames than `maxFrames`, the
 * timestamps are redistributed so that `maxFrames` samples are spread evenly
 * across the **full** video duration rather than being truncated to the first
 * `maxFrames × interval` seconds.
 *
 * Guarantees:
 *   - never more than `max(maxFrames, 1)` timestamps;
 *   - every timestamp is ≥ 0 and at most `duration - 0.1` (or 0 for clips
 *     shorter than 0.1 s);
 *   - a non-positive or NaN duration yields `[0]`.
 *
 * @param options `{ duration, sampleIntervalSeconds = 5, maxFrames = 20 }`;
 *   the interval is floored at 1 second.
 * @returns Sample timestamps in seconds; interval-based results are rounded to
 *   two decimals.
 */
export function computeSampleTimestamps(options) {
    const { duration, sampleIntervalSeconds = 5, maxFrames = 20 } = options;
    if (!(duration > 0)) {
        return [0];
    }
    const interval = Math.max(sampleIntervalSeconds, 1);
    const maxCount = Math.max(maxFrames, 1);
    // Latest timestamp we are willing to sample; never negative for very short clips.
    const lastSafe = Math.max(0, duration - 0.1);
    const timestamps = [];
    // The loop below emits one frame per started interval, i.e. ceil(duration / interval).
    const naturalCount = Math.ceil(duration / interval);
    if (naturalCount <= maxCount) {
        // Requested interval fits within the frame cap — generate at the exact interval.
        for (let t = 0; t < duration; t += interval) {
            timestamps.push(Math.min(t, lastSafe));
        }
    }
    else {
        // Frame cap would truncate coverage. Spread maxCount frames evenly so every
        // portion of the video is sampled (at a coarser effective interval).
        const spread = duration / maxCount;
        for (let i = 0; i < maxCount; i++) {
            timestamps.push(Math.min(i * spread, lastSafe));
        }
    }
    // If the interval exceeds the duration, sample at start/mid/end instead.
    if (timestamps.length <= 1 && duration > 1) {
        return buildFallbackTimestamps(duration, maxCount);
    }
    return timestamps.map((t) => Math.round(t * 100) / 100);
}
350
/**
 * Run AI-assisted detection on a video.
 *
 * 1. Compute sample timestamps and extract a frame at each one
 * 2. Convert frames to base64 JPEG payloads
 * 3. Send the frames to the provider in batches of up to 20 images
 * 4. Parse and validate each JSON response
 * 5. Map every frame result to its global frame index
 * 6. Convert normalized coordinates to padded pixel regions, filtering by
 *    confidence
 *
 * Frame-index handling: the user prompt lists global frame indices for each
 * batch, while the system prompt describes indices as 0-based per request.
 * Models may follow either, so for batches after the first an index that
 * already lies inside the batch's global range is kept, and any other index is
 * treated as batch-local and shifted by the batch start. The two ranges never
 * overlap because every later batch starts at a multiple of the batch size.
 *
 * @param options `{ videoPath, duration, width, height, sampling,
 *   minConfidence = 0.5, paddingPixels = 10, customInstructions,
 *   intentCategory, provider }`; `provider.analyzeImages(images, prompt)` must
 *   resolve to the model's text response.
 * @returns `{ regions, sampledFrames, rawDetectionsCount }` where
 *   `rawDetectionsCount` is counted before confidence filtering.
 * @throws Error when no frames could be extracted, or when a provider response
 *   cannot be parsed/validated.
 */
export async function detectSensitiveRegions(options) {
    const { videoPath, duration, width, height, sampling, minConfidence = 0.5, paddingPixels = 10, customInstructions, intentCategory, provider, } = options;
    // Step 1: Compute sample timestamps
    const timestamps = computeSampleTimestamps({
        ...sampling,
        duration,
    });
    // Step 2: Extract frames at the exact timestamps we tell the model about.
    const frameBuffers = await extractFramesAtTimestamps(videoPath, timestamps);
    if (frameBuffers.length === 0) {
        throw new Error('Failed to extract frames from video for detection.');
    }
    // Step 3: Convert to proper image format for provider
    const images = frameBuffers.map((buf) => ({
        data: bufferToBase64String(buf),
        mimeType: 'image/jpeg',
    }));
    // Step 4: Build prompt and call provider in batches.
    // Some providers (e.g. Qwen) impose a per-request image limit (~20).
    const DETECTION_BATCH_SIZE = 20;
    // The system prompt is identical for every batch, so build it once. It is
    // prepended to the user prompt so providers that use analyzeImages() receive
    // the full JSON schema instruction regardless of how they handle system roles.
    const systemPrompt = buildDetectionSystemPrompt(intentCategory);
    const allFrameResults = [];
    // Batches are sent one at a time on purpose; requests are not parallelised.
    for (let batchStart = 0; batchStart < images.length; batchStart += DETECTION_BATCH_SIZE) {
        const batchImages = images.slice(batchStart, batchStart + DETECTION_BATCH_SIZE);
        const batchTimestamps = timestamps.slice(batchStart, batchStart + DETECTION_BATCH_SIZE);
        const batchPrompt = systemPrompt +
            '\n\n' +
            buildDetectionUserPrompt(customInstructions, batchTimestamps, batchStart);
        const responseText = await provider.analyzeImages(batchImages, batchPrompt);
        const batchParsed = parseDetectionResponse(responseText, width, height);
        // Map frame results to global positions without double-offsetting indices
        // the model already reported in global terms.
        const batchEnd = batchStart + batchImages.length;
        for (const frame of batchParsed.frames) {
            const alreadyGlobal = batchStart > 0 &&
                frame.frameIndex >= batchStart &&
                frame.frameIndex < batchEnd;
            if (!alreadyGlobal) {
                frame.frameIndex += batchStart;
            }
        }
        allFrameResults.push(...batchParsed.frames);
    }
    const parsed = { frames: allFrameResults };
    // Step 5: Count raw detections (before filtering)
    const rawDetectionsCount = parsed.frames.reduce((sum, f) => sum + f.detections.length, 0);
    // Step 6: Convert to pixel regions and filter
    const regions = convertDetectionsToRegions(parsed, width, height, timestamps, minConfidence, paddingPixels);
    return { regions, sampledFrames: timestamps.length, rawDetectionsCount };
}
409
+ /**
410
+ * Parse and validate the AI provider's JSON response.
411
+ * Handles cases where the model wraps JSON in markdown code fences.
412
+ * @param videoWidth Frame width in pixels — used to normalise pixel-space coordinates.
413
+ * @param videoHeight Frame height in pixels — used to normalise pixel-space coordinates.
414
+ */
415
/**
 * Strip markdown code fences from an AI response if present.
 *
 * Resolution order:
 *   1. The whole (trimmed) reply is one fenced block — return its contents.
 *   2. The reply already starts with `{` or `[` — return it trimmed, untouched,
 *      so backticks inside JSON string values are never misread as fences.
 *   3. The reply contains a fenced block surrounded by other text, or uses an
 *      upper-case language tag — return the first block's trimmed contents.
 *   4. Otherwise return the trimmed reply.
 *
 * @param text Raw provider response.
 * @returns The text most likely to be the JSON payload.
 */
function stripCodeFences(text) {
    const trimmed = text.trim();
    const wholeReply = trimmed.match(/^```(?:json)?\s*\n([\s\S]*?)\n```\s*$/);
    if (wholeReply) {
        return wholeReply[1];
    }
    if (trimmed.startsWith('{') || trimmed.startsWith('[')) {
        return trimmed;
    }
    const embedded = trimmed.match(/```(?:json)?[ \t]*\r?\n([\s\S]*?)\r?\n[ \t]*```/i);
    return embedded ? embedded[1].trim() : trimmed;
}
420
/**
 * Bucket a flat list of detection objects by their `frameIndex` and return the
 * canonical `{ frames: [{ frameIndex, timestampSec, detections }] }` shape.
 *
 * Items without a numeric `frameIndex` go to frame 0. The `frameIndex` key is
 * removed from each detection. Frames appear in first-seen order, and
 * `timestampSec` is always emitted as 0.
 */
function groupFlatDetections(arr) {
    const buckets = new Map();
    for (const entry of arr) {
        const key = typeof entry['frameIndex'] === 'number' ? entry['frameIndex'] : 0;
        let bucket = buckets.get(key);
        if (bucket === undefined) {
            bucket = [];
            buckets.set(key, bucket);
        }
        const detection = { ...entry };
        delete detection.frameIndex;
        bucket.push(detection);
    }
    const frames = [];
    for (const [frameIndex, detections] of buckets) {
        // timestampSec is a placeholder; callers resolve the real time by frameIndex.
        frames.push({ frameIndex, timestampSec: 0, detections });
    }
    return { frames };
}
441
/**
 * Normalise a bare JSON array returned by models that don't emit the canonical
 * `{ frames: [...] }` wrapper. The shape is decided from the first element:
 *   1. It has a `detections` array → the array is already a list of frames.
 *   2. It has a `frameIndex`, `box_2d` or `box` key → flat detections, grouped
 *      by frame via `groupFlatDetections`.
 *   3. Anything else (including primitives and `null`) → every element is
 *      placed in frame 0 and left for schema validation to accept or reject.
 *
 * @param arr Parsed top-level JSON array.
 * @returns An object with a `frames` array.
 */
function normaliseRawArray(arr) {
    if (arr.length === 0) {
        return { frames: [] };
    }
    const first = arr[0];
    // The `in` operator throws on primitives/null, so only probe real objects.
    const isRecord = typeof first === 'object' && first !== null;
    if (isRecord && Array.isArray(first['detections'])) {
        return { frames: arr };
    }
    if (isRecord && ('frameIndex' in first || 'box_2d' in first || 'box' in first)) {
        return groupFlatDetections(arr);
    }
    return { frames: [{ frameIndex: 0, timestampSec: 0, detections: arr }] };
}
460
/**
 * Validate a parsed value against `detectionResponseSchema`.
 *
 * @returns The validated data on success.
 * @throws Error carrying the validation message and the first 500 characters
 *   of the JSON-serialised input when validation fails.
 */
function validateDetectionSchema(raw) {
    const outcome = detectionResponseSchema.safeParse(raw);
    if (outcome.success) {
        return outcome.data;
    }
    const preview = JSON.stringify(raw).slice(0, 500);
    throw new Error(`AI provider returned invalid detection JSON: ${outcome.error.message}\nParsed structure preview: ${preview}`);
}
469
/**
 * Turn a provider's text reply into a validated, box-normalised detection
 * response.
 *
 * Code fences are stripped, the text is parsed as JSON, a bare top-level array
 * is converted to the `{ frames }` shape, the result is schema-validated, and
 * box coordinates are normalised using the given video dimensions.
 *
 * @param responseText Raw text returned by the provider.
 * @param videoWidth Frame width in pixels (default 1000).
 * @param videoHeight Frame height in pixels (default 1000).
 * @throws Error when the text is not JSON or fails schema validation.
 */
export function parseDetectionResponse(responseText, videoWidth = 1000, videoHeight = 1000) {
    const jsonText = stripCodeFences(responseText);
    let payload;
    try {
        payload = JSON.parse(jsonText);
    }
    catch {
        throw new Error(`AI provider returned non-JSON response. Response preview: ${jsonText.slice(0, 200)}`);
    }
    const canonical = Array.isArray(payload) ? normaliseRawArray(payload) : payload;
    const validated = validateDetectionSchema(canonical);
    return normaliseBoxCoordinates(validated, videoWidth, videoHeight);
}
483
+ /**
484
+ * Convert normalized AI detections to pixel-coordinate regions.
485
+ * Filters by confidence and applies padding.
486
+ */
487
/**
 * Turn a raw pixel-space detection box into a padded, frame-clamped box.
 *
 * - Vertical: the top edge moves up by twice `paddingPixels` (clamped at 0);
 *   the height grows by three times `paddingPixels` in total and is clamped to
 *   the bottom of the frame.
 * - Horizontal: a box is "row-like" when it starts in the left 20% and is
 *   wider than 25% of the frame, or when it ends beyond 70% of the frame and
 *   is wider than 30%. Row-like boxes extend to the right frame edge; if such
 *   a box starts right of the 20% mark its left edge is pulled back to 15%.
 *   All other boxes receive `paddingPixels` on each side, clamped to the frame.
 *
 * @returns `{ x, y, width, height }` in pixels.
 */
function expandDetectionBox(rawX, rawY, rawW, rawH, videoWidth, videoHeight, paddingPixels) {
    const topPadding = paddingPixels * 2;
    const top = Math.max(0, Math.round(rawY - topPadding));
    const wideFromLeft = rawX < videoWidth * 0.2 && rawW > videoWidth * 0.25;
    const wideToRight = rawX + rawW > videoWidth * 0.7 && rawW > videoWidth * 0.3;
    const coversRow = wideFromLeft || wideToRight;
    // Right-anchored rows are pulled back to the typical editor-content boundary.
    const left = coversRow && rawX > videoWidth * 0.2
        ? Math.round(videoWidth * 0.15)
        : Math.max(0, Math.round(rawX - paddingPixels));
    const remainingWidth = videoWidth - left;
    const boxWidth = coversRow
        ? remainingWidth
        : Math.min(remainingWidth, Math.round(rawW + paddingPixels * 2));
    const boxHeight = Math.min(videoHeight - top, Math.round(rawH + topPadding + paddingPixels));
    return { x: left, y: top, width: boxWidth, height: boxHeight };
}
525
/**
 * Convert normalised detections into padded pixel-space regions.
 *
 * Detections below `minConfidence`, detections without a box, and boxes that
 * end up with a non-positive width or height are skipped. Each region's
 * `startTime` and `endTime` are both set to `timestamps[frameIndex]`, falling
 * back to the frame's own `timestampSec` when that entry is missing.
 *
 * @returns Regions in frame order, then detection order.
 */
export function convertDetectionsToRegions(response, videoWidth, videoHeight, timestamps, minConfidence, paddingPixels) {
    return response.frames.flatMap((frameResult) => {
        const frameTime = timestamps[frameResult.frameIndex] ?? frameResult.timestampSec;
        return frameResult.detections.flatMap((detection) => {
            if (detection.confidence < minConfidence || !detection.box) {
                return [];
            }
            const { left, top, width, height } = detection.box;
            const padded = expandDetectionBox(left * videoWidth, top * videoHeight, width * videoWidth, height * videoHeight, videoWidth, videoHeight, paddingPixels);
            // Skip degenerate boxes
            if (padded.width <= 0 || padded.height <= 0) {
                return [];
            }
            return [
                {
                    x: padded.x,
                    y: padded.y,
                    width: padded.width,
                    height: padded.height,
                    label: detection.label,
                    confidence: detection.confidence,
                    startTime: frameTime,
                    endTime: frameTime,
                },
            ];
        });
    });
}
559
/**
 * Two regions match when they share a label and their top-left corners are
 * strictly closer than `threshold` on both axes.
 */
function regionsMatch(a, b, threshold) {
    return (a.label === b.label &&
        Math.abs(a.x - b.x) < threshold &&
        Math.abs(a.y - b.y) < threshold);
}
569
/**
 * Grow `target` in place so its rectangle is the bounding box of itself and
 * `source`.
 */
function expandRegionBounds(target, source) {
    const left = Math.min(target.x, source.x);
    const top = Math.min(target.y, source.y);
    const right = Math.max(target.x + target.width, source.x + source.width);
    const bottom = Math.max(target.y + target.height, source.y + source.height);
    Object.assign(target, {
        x: left,
        y: top,
        width: right - left,
        height: bottom - top,
    });
}
582
/**
 * Widen `target`'s time range to include `source`'s and keep the higher of the
 * two confidences. Missing times and confidences are treated as 0.
 */
function expandRegionMetadata(target, source) {
    const orZero = (value) => value ?? 0;
    const earliest = Math.min(orZero(target.startTime), orZero(source.startTime));
    const latest = Math.max(orZero(target.endTime), orZero(source.endTime));
    const strongest = Math.max(orZero(target.confidence), orZero(source.confidence));
    target.startTime = earliest;
    target.endTime = latest;
    target.confidence = strongest;
}
594
/**
 * Absorb `source` into `target` in place: first the spatial bounds, then the
 * time range and confidence.
 */
function expandRegion(target, source) {
    for (const absorb of [expandRegionBounds, expandRegionMetadata]) {
        absorb(target, source);
    }
}
601
/**
 * Consolidate per-frame detections into merged regions using label +
 * proximity matching (no IOU).
 *
 * Each region is folded into the first already-merged region it matches;
 * otherwise a shallow copy of it starts a new merged region. Input regions are
 * not modified.
 *
 * @param regions Detections to merge.
 * @param proximityThreshold Maximum corner distance (exclusive) per axis for a match.
 */
export function mergeDetections(regions, proximityThreshold = 50) {
    return regions.reduce((merged, region) => {
        const match = merged.find((candidate) => regionsMatch(candidate, region, proximityThreshold));
        if (match) {
            expandRegion(match, region);
        }
        else {
            merged.push({ ...region });
        }
        return merged;
    }, []);
}
620
/**
 * Intersection-over-Union of two `{ x, y, width, height }` boxes.
 *
 * @returns A value in [0, 1]; 0 when the boxes do not overlap.
 */
function computeIOU(a, b) {
    const overlapWidth = Math.max(0, Math.min(a.x + a.width, b.x + b.width) - Math.max(a.x, b.x));
    const overlapHeight = Math.max(0, Math.min(a.y + a.height, b.y + b.height) - Math.max(a.y, b.y));
    const intersection = overlapWidth * overlapHeight;
    if (intersection === 0) {
        return 0;
    }
    const union = a.width * a.height + b.width * b.height - intersection;
    if (union > 0) {
        return intersection / union;
    }
    return 0;
}
636
/**
 * Pick the track that best overlaps `region`.
 *
 * A track is eligible when its label equals the region's label (or
 * `'unknown'` if the region has none) and, if the region has a start time, the
 * gap since the track's end time does not exceed `maxTimeGap`. Among eligible
 * tracks, the one with the highest IOU that is at least `iouThreshold` wins.
 *
 * @returns The winning track, or `null` when none qualifies.
 */
function findBestTrack(region, tracks, iouThreshold, maxTimeGap) {
    const wantedLabel = region.label ?? 'unknown';
    const regionBox = { x: region.x, y: region.y, width: region.width, height: region.height };
    let winner = null;
    let winnerIOU = 0;
    for (const candidate of tracks) {
        const sameLabel = candidate.label === wantedLabel;
        const tooOld = region.startTime != null && region.startTime - candidate.endTime > maxTimeGap;
        if (!sameLabel || tooOld) {
            continue;
        }
        const overlap = computeIOU(regionBox, {
            x: candidate.x,
            y: candidate.y,
            width: candidate.width,
            height: candidate.height,
        });
        if (overlap > winnerIOU && overlap >= iouThreshold) {
            winnerIOU = overlap;
            winner = candidate;
        }
    }
    return winner;
}
656
/**
 * Fold a new detection into an existing track, in place.
 *
 * The track's end time advances to the region's start time when that is later,
 * its frame count increases by one, its confidence becomes the higher of the
 * two, and its rectangle grows to the bounding box of both.
 */
function extendTrack(track, region) {
    const right = Math.max(track.x + track.width, region.x + region.width);
    const bottom = Math.max(track.y + track.height, region.y + region.height);
    const left = Math.min(track.x, region.x);
    const top = Math.min(track.y, region.y);
    track.endTime = Math.max(track.endTime, region.startTime ?? 0);
    track.frameCount += 1;
    track.confidence = Math.max(track.confidence, region.confidence ?? 0);
    track.x = left;
    track.y = top;
    track.width = right - left;
    track.height = bottom - top;
}
673
/**
 * Group per-frame detections into temporal tracks.
 *
 * Regions are processed in ascending start-time order (missing start times
 * count as 0). Each region either extends the best matching existing track
 * (same label, IOU ≥ `iouThreshold`, and no more than two sample intervals
 * since the track's last detection) or starts a new track.
 *
 * @param regions - Flat list of per-frame detections in pixel coordinates
 * @param iouThreshold - Minimum IOU to treat two boxes as the same track (default: 0.3)
 * @param sampleInterval - Seconds between sampled frames (default: 5)
 * @returns Tracks with label, bounding box, time range, confidence and frame count
 */
export function groupDetectionsIntoTracks(regions, iouThreshold = 0.3, sampleInterval = 5) {
    const startOf = (region) => region.startTime ?? 0;
    const maxTimeGap = sampleInterval * 2;
    const tracks = [];
    const chronological = regions.slice().sort((a, b) => startOf(a) - startOf(b));
    for (const region of chronological) {
        const match = findBestTrack(region, tracks, iouThreshold, maxTimeGap);
        if (match) {
            extendTrack(match, region);
            continue;
        }
        const seenAt = startOf(region);
        tracks.push({
            label: region.label ?? 'unknown',
            x: region.x,
            y: region.y,
            width: region.width,
            height: region.height,
            startTime: seenAt,
            endTime: seenAt,
            confidence: region.confidence ?? 0,
            frameCount: 1,
        });
    }
    return tracks;
}
713
/**
 * Convert temporal tracks into final redaction regions.
 *
 * - Time range: widened by half the sample interval on each side, clamped to
 *   `[0, videoDuration]`.
 * - Box: origin moved up/left by `paddingPixels` (clamped at 0); width and
 *   height grown by twice `paddingPixels`, clamped to the frame.
 *
 * @param tracks - Grouped detection tracks
 * @param videoDuration - Total video duration in seconds
 * @param sampleInterval - Seconds between sampled frames
 * @param paddingPixels - Extra padding to expand each region
 * @param videoWidth - Video width for clamping
 * @param videoHeight - Video height for clamping
 * @returns One region per track, in the same order
 */
export function tracksToRedactionPlan(tracks, videoDuration, sampleInterval, paddingPixels, videoWidth, videoHeight) {
    const timeMargin = sampleInterval / 2;
    const doublePadding = paddingPixels * 2;
    const plan = [];
    for (const { label, confidence, ...track } of tracks) {
        const x = Math.max(0, track.x - paddingPixels);
        const y = Math.max(0, track.y - paddingPixels);
        plan.push({
            x,
            y,
            width: Math.min(videoWidth - x, track.width + doublePadding),
            height: Math.min(videoHeight - y, track.height + doublePadding),
            label,
            confidence,
            startTime: Math.max(0, track.startTime - timeMargin),
            endTime: Math.min(videoDuration, track.endTime + timeMargin),
        });
    }
    return plan;
}
753
/**
 * Full consolidation pipeline: raw detections → stable redaction regions.
 *
 * Groups the detections into temporal tracks, then converts those tracks into
 * time-bounded, padded regions clamped to the video bounds.
 *
 * @param iouThreshold Minimum IOU for two boxes to share a track (default 0.3).
 */
export function consolidateDetections(regions, videoDuration, sampleInterval, paddingPixels, videoWidth, videoHeight, iouThreshold = 0.3) {
    return tracksToRedactionPlan(groupDetectionsIntoTracks(regions, iouThreshold, sampleInterval), videoDuration, sampleInterval, paddingPixels, videoWidth, videoHeight);
}
766
+ //# sourceMappingURL=redactionDetector.js.map