npm - video-context-mcp-server - Versions diffs - 1.1.4 → 1.2.0 - Mend

video-context-mcp-server 1.1.4 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

package/README.md +52 -27
package/dist/generated/version.d.ts +1 -1
package/dist/generated/version.js +1 -1
package/dist/index.js +12 -1
package/dist/index.js.map +1 -1
package/dist/services/ffmpeg.d.ts +37 -0
package/dist/services/ffmpeg.d.ts.map +1 -1
package/dist/services/ffmpeg.js +175 -0
package/dist/services/ffmpeg.js.map +1 -1
package/dist/services/providerRouter.d.ts +9 -0
package/dist/services/providerRouter.d.ts.map +1 -1
package/dist/services/providerRouter.js +14 -0
package/dist/services/providerRouter.js.map +1 -1
package/dist/services/redactionDetector.d.ts +187 -0
package/dist/services/redactionDetector.d.ts.map +1 -0
package/dist/services/redactionDetector.js +766 -0
package/dist/services/redactionDetector.js.map +1 -0
package/dist/tools/analyzeVideo.d.ts.map +1 -1
package/dist/tools/analyzeVideo.js +4 -3
package/dist/tools/analyzeVideo.js.map +1 -1
package/dist/tools/redactSensitive.d.ts +11 -0
package/dist/tools/redactSensitive.d.ts.map +1 -0
package/dist/tools/redactSensitive.js +466 -0
package/dist/tools/redactSensitive.js.map +1 -0
package/dist/tools/schemas.d.ts +53 -0
package/dist/tools/schemas.d.ts.map +1 -1
package/dist/tools/schemas.js +87 -0
package/dist/tools/schemas.js.map +1 -1
package/dist/tools/summarizeVideo.d.ts +17 -0
package/dist/tools/summarizeVideo.d.ts.map +1 -1
package/dist/tools/summarizeVideo.js +36 -2
package/dist/tools/summarizeVideo.js.map +1 -1
package/dist/types/redaction.d.ts +32 -0
package/dist/types/redaction.d.ts.map +1 -0
package/dist/types/redaction.js +5 -0
package/dist/types/redaction.js.map +1 -0
package/dist/utils/license.js +1 -1
package/dist/utils/license.js.map +1 -1
package/package.json +1 -1

package/dist/services/redactionDetector.js ADDED Viewed

@@ -0,0 +1,766 @@
+/**
+ * Redaction Detector
+ * AI-assisted sensitive region detection in video frames.
+ * Samples frames, sends to a vision provider, and parses JSON detections.
+ */
+import { z } from 'zod';
+import { extractFramesAtTimestamps } from './ffmpeg.js';
+import { bufferToBase64String } from '../utils/base64.js';
+// ── Detection response schema ──────────────────────────────────────────────
+/**
+ * Strict Zod schema for a bounding box: `left`, `top`, `width` and `height`
+ * must each be a number inside the normalised 0–1 range.
+ *
+ * NOTE(review): this constant is neither exported nor referenced anywhere else
+ * in this file — confirm whether it is still needed.
+ */
+const detectionBoxSchema = z.object({
+    left: z.number().min(0).max(1),
+    top: z.number().min(0).max(1),
+    width: z.number().min(0).max(1),
+    height: z.number().min(0).max(1),
+});
+/**
+ * Try to extract a normalised {left, top, width, height} box from any known
+ * bounding-box representation used by vision models.
+ *
+ * Handled formats (all with 0–1000 scale unless values ≤ 1.0, which is 0–1):
+ *  - { box: { left, top, width, height } }           — already normalised
+ *  - { box_2d: [y1, x1, y2, x2] }                    — Gemini spatial grounding
+ *  - { visual_region: [y1, x1, y2, x2] }             — Gemini alternate key
+ *  - { bbox: [x1, y1, x2, y2] } or { bounding_box: [...] }  — COCO-style
+ *  - { region: [y1, x1, y2, x2] } / { coordinates: [...] }  — misc
+ */
+function tryExtractBox(obj) {
+    // Already normalised
+    if (typeof obj['box'] === 'object' &&
+        obj['box'] !== null &&
+        typeof obj['box']['left'] === 'number') {
+        return obj['box'];
+    }
+    // Try array-based coordinate fields
+    const arrayFields = [
+        'box_2d',
+        'visual_region',
+        'region',
+        'coordinates',
+        'bounding_box',
+        'bbox',
+    ];
+    for (const field of arrayFields) {
+        const val = obj[field];
+        if (!Array.isArray(val) || val.length < 4)
+            continue;
+        const nums = val;
+        // Detect coordinate order and scale:
+        // Gemini / Xiaomi use [y1, x1, y2, x2] in 0–1000 range.
+        // COCO / standard ML uses [x1, y1, x2, y2].
+        // We infer order from which set of values is larger (y tends to be smaller
+        // for portrait screens but for widescreen desktop recordings x and y are
+        // both reasonable — we default to Gemini order for box_2d/visual_region).
+        let y1, x1, y2, x2;
+        if (field === 'bbox') {
+            // COCO-style [x1, y1, x2, y2] or [x, y, w, h]
+            ;
+            [x1, y1, x2, y2] = nums;
+        }
+        else {
+            // Gemini-style [y1, x1, y2, x2]
+            ;
+            [y1, x1, y2, x2] = nums;
+        }
+        // Values > 1 are assumed to be 0–1000 scaled; ≤ 1 are already 0–1
+        const scale = Math.max(...nums) > 1 ? 1000 : 1;
+        return {
+            left: Math.min(x1, x2) / scale,
+            top: Math.min(y1, y2) / scale,
+            width: Math.abs(x2 - x1) / scale,
+            height: Math.abs(y2 - y1) / scale,
+        };
+    }
+    return null;
+}
+/**
+ * Normalise a raw detection object from the provider into our canonical shape.
+ * Accepts any known bounding-box field name, fills defaults for missing fields.
+ */
+function normaliseRawDetection(raw) {
+    if (typeof raw !== 'object' || raw === null)
+        return raw;
+    const obj = raw;
+    // Resolve bounding box
+    const box = tryExtractBox(obj);
+    if (box) {
+        obj['box'] = box;
+        // Remove alternate coordinate fields so Zod doesn't trip on extra keys
+        for (const f of [
+            'box_2d',
+            'visual_region',
+            'region',
+            'coordinates',
+            'bounding_box',
+            'bbox',
+        ]) {
+            if (f !== 'box')
+                delete obj[f];
+        }
+    }
+    // Fall back to 'type' field when 'label' is missing
+    if (typeof obj['label'] !== 'string' && typeof obj['type'] === 'string') {
+        obj['label'] = obj['type'];
+    }
+    // Default label if still missing
+    if (typeof obj['label'] !== 'string') {
+        obj['label'] = 'sensitive';
+    }
+    // Default confidence to 0.9 when not provided
+    if (typeof obj['confidence'] !== 'number') {
+        obj['confidence'] = 0.9;
+    }
+    return obj;
+}
+/**
+ * A single detection within a frame (permissive — box normalised in post-process).
+ *
+ * The input is first passed through `normaliseRawDetection`, then validated as:
+ *  - `label`: string
+ *  - `confidence`: number in 0–1
+ *  - `box`: optional object of four numbers with no range constraint
+ *  - `reason`: optional string
+ */
+const rawFrameDetectionSchema = z.preprocess(normaliseRawDetection, z.object({
+    label: z.string(),
+    confidence: z.number().min(0).max(1),
+    box: z
+        .object({
+        left: z.number(),
+        top: z.number(),
+        width: z.number(),
+        height: z.number(),
+    })
+        .optional(),
+    reason: z.string().optional(),
+}));
+/**
+ * Detections for a single sampled frame: a numeric `frameIndex`, a numeric
+ * `timestampSec`, and an array of detections validated by
+ * `rawFrameDetectionSchema`.
+ */
+const frameResultSchema = z.object({
+    frameIndex: z.number(),
+    timestampSec: z.number(),
+    detections: z.array(rawFrameDetectionSchema),
+});
+/**
+ * Full detection response from the AI provider: an object whose `frames`
+ * property is an array of per-frame results (`frameResultSchema`).
+ */
+export const detectionResponseSchema = z.object({
+    frames: z.array(frameResultSchema),
+});
+/**
+ * Post-parse box normalisation: clamp all box values to the 0–1 range.
+ * When values > 1, they are in absolute pixel coordinates — divide by the
+ * appropriate video dimension to normalise.
+ */
+function normaliseBoxCoordinates(parsed, videoWidth, videoHeight) {
+    return {
+        frames: parsed.frames.map((frame) => ({
+            ...frame,
+            detections: frame.detections
+                .filter((d) => d.box !== undefined)
+                .map((d) => {
+                const b = d.box;
+                // If values are already 0–1, keep as-is
+                if (b.left <= 1 && b.top <= 1 && b.width <= 1 && b.height <= 1) {
+                    return d;
+                }
+                // Otherwise treat as pixel coordinates
+                return {
+                    ...d,
+                    box: {
+                        left: Math.max(0, Math.min(1, b.left / videoWidth)),
+                        top: Math.max(0, Math.min(1, b.top / videoHeight)),
+                        width: Math.max(0, Math.min(1, b.width / videoWidth)),
+                        height: Math.max(0, Math.min(1, b.height / videoHeight)),
+                    },
+                };
+            }),
+        })),
+    };
+}
+// ── Detection prompt ───────────────────────────────────────────────────────
+/**
+ * System prompt that instructs the vision model to detect sensitive regions.
+ *
+ * It lists the kinds of content to look for, fixes the exact JSON response
+ * shape (`{ frames: [{ frameIndex, timestampSec, detections: [...] }] }`),
+ * and states the rules for indices, normalised 0–1 box coordinates,
+ * confidence, and wide boxes for editor/terminal rows.
+ * `buildDetectionSystemPrompt` returns it either as-is or prefixed with a
+ * category-specific focus paragraph.
+ */
+const DETECTION_SYSTEM_PROMPT = `You are a security-focused video analysis assistant. Your task is to identify regions in video frames that may contain sensitive information.
+Look for:
+- API keys, tokens, passwords, or secrets in code editors or terminals
+- Email addresses, phone numbers, or personal identifiers
+- Internal URLs, IP addresses, or infrastructure details
+- Account IDs, session tokens, or authentication cookies
+- Financial data, credit card numbers, or account balances
+- Any text or UI element that appears to be confidential
+You will receive multiple images. Each image is a sampled frame from a video, sent in order.
+Return ONLY a JSON object in EXACTLY this format (no markdown, no extra text):
+{
+  "frames": [
+    {
+      "frameIndex": 0,
+      "timestampSec": 0.0,
+      "detections": [
+        {
+          "label": "api_key",
+          "confidence": 0.95,
+          "box": { "left": 0.1, "top": 0.2, "width": 0.4, "height": 0.05 },
+          "reason": "long token-like string visible in terminal"
+        }
+      ]
+    }
+  ]
+}
+Rules:
+- frameIndex is the 0-based index of each image you received (first image = 0, second = 1, etc.)
+- timestampSec is your best estimate of the frame time; use 0 if unknown
+- box coordinates are normalized 0–1 relative to the frame dimensions (left=x, top=y)
+- confidence is 0–1; omit detections you are not at least 40% confident about
+- If a frame has no sensitive content, include it with an empty detections array
+- For credentials in CODE EDITORS (VS Code, etc.) or TERMINALS, set left to the left edge of the editor content area (~0.15 if a sidebar is visible, otherwise 0.0), width to reach the right edge (1.0 minus left), and height to cover all sensitive lines. It is far better to over-cover a row than to clip the start of a key.
+- Do NOT include markdown code fences or any text outside the JSON object`;
+/**
+ * Keyword patterns mapped to detection categories. Order matters (first match wins):
+ * `classifyIntent` walks this list from the top and returns the category of
+ * the first pattern that matches. Every pattern is case-insensitive and
+ * anchored on word boundaries.
+ */
+const INTENT_RULES = [
+    {
+        // Credentials: api key(s), token, secret, password, bearer, oauth, jwt, etc.
+        pattern: /\b(api[-_ ]?keys?|secret|token|bearer|password|credential|auth|oauth|jwt|private[-_ ]?key|access[-_ ]?key|api[-_ ]?secret)\b/i,
+        category: 'credentials',
+    },
+    {
+        // Infrastructure: IP address, URL, hostname, session/account IDs — listed before
+        // PII to prevent "ip address" from matching the generic word "address" in the
+        // PII rule.
+        pattern: /\b(ip[-_ ]?address|url|hostname|domain|account[-_ ]?id|session[-_ ]?id|cookie|internal[-_ ]?url)\b/i,
+        category: 'infrastructure',
+    },
+    {
+        // PII: email, phone, personal identifiers — "address" removed to avoid
+        // collision with "ip address" / "url" matches handled above.
+        pattern: /\b(email|phone|mobile|personal|pii|full[-_ ]?name|dob|date[-_ ]?of[-_ ]?birth|ssn|social[-_ ]?security)\b/i,
+        category: 'pii',
+    },
+    {
+        pattern: /\b(credit[-_ ]?card|card[-_ ]?number|cvv|bank|financial|balance|iban|bic|routing)\b/i,
+        category: 'financial',
+    },
+];
+/**
+ * Classify a free-text intent string into one of the allowed detection
+ * categories.  Returns 'general' when no specific category is recognised.
+ * Raw user text is never forwarded to the AI prompt.
+ */
+export function classifyIntent(intent) {
+    for (const { pattern, category } of INTENT_RULES) {
+        if (pattern.test(intent))
+            return category;
+    }
+    return 'general';
+}
+/**
+ * Per-category detection defaults that override the schema defaults.
+ *
+ * Each entry provides `sampleIntervalSeconds`, `maxFrames` and
+ * `minConfidence` for one intent category; `getDetectionDefaults` returns
+ * these entries by key.
+ */
+const CATEGORY_DEFAULTS = {
+    // Credentials change quickly in screen-recordings; sample densely and
+    // lower confidence so short-lived terminal pastes aren't missed.
+    credentials: { sampleIntervalSeconds: 3, maxFrames: 40, minConfidence: 0.4 },
+    pii: { sampleIntervalSeconds: 4, maxFrames: 30, minConfidence: 0.45 },
+    infrastructure: {
+        sampleIntervalSeconds: 4,
+        maxFrames: 30,
+        minConfidence: 0.45,
+    },
+    financial: { sampleIntervalSeconds: 4, maxFrames: 30, minConfidence: 0.45 },
+    general: { sampleIntervalSeconds: 5, maxFrames: 20, minConfidence: 0.5 },
+};
+/**
+ * Return the sampling/confidence defaults for a given intent category.
+ *
+ * This is a direct keyed lookup into `CATEGORY_DEFAULTS`; the shared entry
+ * object itself is returned (not a copy) and `category` is not validated.
+ *
+ * @param category One of the keys of `CATEGORY_DEFAULTS`.
+ * @returns `{ sampleIntervalSeconds, maxFrames, minConfidence }` for that key.
+ */
+export function getDetectionDefaults(category) {
+    return CATEGORY_DEFAULTS[category];
+}
+/**
+ * Per-category system prompt addition that sharpens model focus.
+ *
+ * `buildDetectionSystemPrompt` places the selected text before the generic
+ * system prompt. The `general` entry is the empty string, which makes that
+ * function fall back to the generic prompt alone.
+ */
+const CATEGORY_FOCUS = {
+    credentials: `PRIORITY: Focus especially on API keys, access tokens, bearer tokens, passwords, and any long alphanumeric secret strings. These often appear in:
+- Code-editor config files (.env, mcp.json, settings.json) — look for lines like "KEY": "...", KEY=..., or token: ...
+- Terminal output after commands such as echo, printenv, cat, or export
+- Browser pages with copy-to-clipboard key fields (Deepgram, OpenAI, Groq dashboards)`,
+    pii: `PRIORITY: Focus especially on email addresses, phone numbers, full names, dates of birth, national ID numbers (SSN, etc.), and other personally identifiable information.`,
+    infrastructure: `PRIORITY: Focus especially on internal URLs, IP addresses, hostnames, domain names, session IDs, and authentication cookies.`,
+    financial: `PRIORITY: Focus especially on credit/debit card numbers, CVV codes, IBAN/routing numbers, bank account details, and financial balances.`,
+    general: '',
+};
+/**
+ * Build the detection system prompt, optionally sharpened for a specific
+ * intent category.  When no category is provided the generic prompt is used.
+ */
+function buildDetectionSystemPrompt(category) {
+    const focus = category ? CATEGORY_FOCUS[category] : '';
+    if (!focus)
+        return DETECTION_SYSTEM_PROMPT;
+    return `${focus}\n\n${DETECTION_SYSTEM_PROMPT}`;
+}
+/**
+ * User prompt template for detection.
+ * Including per-frame timestamps helps models correctly assign frameIndex values.
+ * @param frameIndexOffset  Global offset added to each local batch index (0 for single-batch calls).
+ */
+function buildDetectionUserPrompt(customInstructions, timestamps, frameIndexOffset = 0) {
+    const frameList = timestamps && timestamps.length > 0
+        ? `\n\nFrame index to timestamp mapping (use these exact frameIndex values in your response):\n` +
+            timestamps
+                .map((t, i) => `  frameIndex ${i + frameIndexOffset} → ${t.toFixed(1)}s`)
+                .join('\n')
+        : '';
+    const base = `Analyze the video frames below and detect any regions that may contain sensitive information. Return your findings as JSON.${frameList}`;
+    if (customInstructions) {
+        return `${base}\n\nAdditional instructions: ${customInstructions}`;
+    }
+    return base;
+}
+/**
+ * Compute timestamps for evenly-spaced frame samples.
+ *
+ * When the requested interval would produce more frames than `maxFrames`, the
+ * timestamps are redistributed so that `maxFrames` samples are spread evenly
+ * across the **full** video duration rather than being truncated to the first
+ * `maxFrames × interval` seconds.  This guarantees the entire video is always
+ * covered regardless of the frame cap.
+ */
+function buildFallbackTimestamps(duration, maxCount) {
+    const base = [0, duration / 2];
+    if (duration > 2)
+        base.push(duration - 1);
+    return [...new Set(base)].sort((a, b) => a - b).slice(0, maxCount);
+}
+export function computeSampleTimestamps(options) {
+    const { duration, sampleIntervalSeconds = 5, maxFrames = 20 } = options;
+    if (duration <= 0)
+        return [0];
+    const interval = Math.max(sampleIntervalSeconds, 1);
+    const maxCount = Math.max(maxFrames, 1);
+    const timestamps = [];
+    const naturalCount = Math.floor(duration / interval);
+    if (naturalCount <= maxCount) {
+        // Requested interval fits within the frame cap — generate at the exact interval.
+        for (let t = 0; t < duration; t += interval) {
+            timestamps.push(Math.min(t, duration - 0.1));
+        }
+    }
+    else {
+        // Frame cap would truncate coverage.  Spread maxCount frames evenly so every
+        // portion of the video is sampled (at a coarser effective interval).
+        const spread = duration / maxCount;
+        for (let i = 0; i < maxCount; i++) {
+            timestamps.push(Math.min(i * spread, duration - 0.1));
+        }
+    }
+    // If the interval exceeds the duration, sample at start/mid/end
+    if (timestamps.length <= 1 && duration > 1) {
+        return buildFallbackTimestamps(duration, maxCount);
+    }
+    return timestamps.map((t) => Math.round(t * 100) / 100);
+}
+/**
+ * Run AI-assisted detection on a video.
+ *
+ * 1. Extract evenly-spaced frames
+ * 2. Convert frames to base64
+ * 3. Send to the AI provider for analysis
+ * 4. Parse and validate the JSON response
+ * 5. Convert normalized coordinates to pixel coordinates
+ * 6. Filter by confidence threshold
+ * 7. Apply padding expansion
+ *
+ * @returns Detection result with regions, frame count, and raw detection count
+ */
+export async function detectSensitiveRegions(options) {
+    const { videoPath, duration, width, height, sampling, minConfidence = 0.5, paddingPixels = 10, customInstructions, intentCategory, provider, } = options;
+    // Step 1: Compute sample timestamps
+    const timestamps = computeSampleTimestamps({
+        ...sampling,
+        duration,
+    });
+    // Step 2: Extract frames at the exact timestamps we tell the model about.
+    const frameBuffers = await extractFramesAtTimestamps(videoPath, timestamps);
+    if (frameBuffers.length === 0) {
+        throw new Error('Failed to extract frames from video for detection.');
+    }
+    // Step 3: Convert to proper image format for provider
+    const images = frameBuffers.map((buf) => ({
+        data: bufferToBase64String(buf),
+        mimeType: 'image/jpeg',
+    }));
+    // Step 4: Build prompt and call provider in batches.
+    // Some providers (e.g. Qwen) impose a per-request image limit (~20).
+    // Split into chunks and re-index frameIndex values to their global position.
+    const DETECTION_BATCH_SIZE = 20;
+    const allFrameResults = [];
+    for (let batchStart = 0; batchStart < images.length; batchStart += DETECTION_BATCH_SIZE) {
+        const batchImages = images.slice(batchStart, batchStart + DETECTION_BATCH_SIZE);
+        const batchTimestamps = timestamps.slice(batchStart, batchStart + DETECTION_BATCH_SIZE);
+        // Prepend the system prompt so providers that use analyzeImages() receive
+        // the full JSON schema instruction regardless of how they handle system roles.
+        const systemPrompt = buildDetectionSystemPrompt(intentCategory);
+        const batchPrompt = systemPrompt +
+            '\n\n' +
+            buildDetectionUserPrompt(customInstructions, batchTimestamps, batchStart);
+        const responseText = await provider.analyzeImages(batchImages, batchPrompt);
+        const batchParsed = parseDetectionResponse(responseText, width, height);
+        // Re-index frame results from batch-local to global positions
+        for (const frame of batchParsed.frames) {
+            frame.frameIndex += batchStart;
+        }
+        allFrameResults.push(...batchParsed.frames);
+    }
+    const parsed = { frames: allFrameResults };
+    // Step 5: Count raw detections (before filtering)
+    const rawDetectionsCount = parsed.frames.reduce((sum, f) => sum + f.detections.length, 0);
+    // Step 6: Convert to pixel regions and filter
+    const regions = convertDetectionsToRegions(parsed, width, height, timestamps, minConfidence, paddingPixels);
+    return { regions, sampledFrames: timestamps.length, rawDetectionsCount };
+}
+/*
+ * Response parsing helpers.
+ *
+ * The functions below support parseDetectionResponse(), which parses and
+ * validates the AI provider's JSON response and handles cases where the model
+ * wraps the JSON in markdown code fences. Its videoWidth / videoHeight
+ * arguments are the frame dimensions in pixels and are used to normalise
+ * pixel-space coordinates.
+ */
+/** Strip markdown code fences from an AI response if present. */
+function stripCodeFences(text) {
+    const match = text.trim().match(/^```(?:json)?\s*\n([\s\S]*?)\n```\s*$/);
+    return match ? match[1] : text.trim();
+}
+/**
+ * Group a flat array of per-detection objects (each with a frameIndex) into
+ * the canonical frame-result shape: `{ frames: [{ frameIndex, timestampSec, detections }] }`.
+ */
+function groupFlatDetections(arr) {
+    const frameMap = new Map();
+    for (const item of arr) {
+        const fi = typeof item['frameIndex'] === 'number' ? item['frameIndex'] : 0;
+        if (!frameMap.has(fi))
+            frameMap.set(fi, []);
+        const { frameIndex: _fi, ...det } = item;
+        frameMap.get(fi).push(det);
+    }
+    return {
+        frames: Array.from(frameMap.entries()).map(([fi, detections]) => ({
+            frameIndex: fi,
+            timestampSec: 0, // overridden by timestamps[fi] in convertDetectionsToRegions
+            detections,
+        })),
+    };
+}
+/**
+ * Normalise a bare JSON array returned by models that don't emit the canonical
+ * `{ frames: [...] }` wrapper.  Three sub-cases:
+ *   1. Array of frame result objects (already have a `detections` array)
+ *   2. Flat array of detection objects that each carry a `frameIndex` field
+ *   3. Flat list with no frame info → all assigned to frame 0
+ */
+function normaliseRawArray(arr) {
+    if (arr.length === 0)
+        return { frames: [] };
+    const first = arr[0];
+    if ('detections' in first && Array.isArray(first['detections'])) {
+        return { frames: arr };
+    }
+    if ('frameIndex' in first || 'box_2d' in first || 'box' in first) {
+        return groupFlatDetections(arr);
+    }
+    return { frames: [{ frameIndex: 0, timestampSec: 0, detections: arr }] };
+}
+/** Parse and Zod-validate a raw value; throw a descriptive error on failure. */
+function validateDetectionSchema(raw) {
+    const result = detectionResponseSchema.safeParse(raw);
+    if (!result.success) {
+        const preview = JSON.stringify(raw).slice(0, 500);
+        throw new Error(`AI provider returned invalid detection JSON: ${result.error.message}\nParsed structure preview: ${preview}`);
+    }
+    return result.data;
+}
+export function parseDetectionResponse(responseText, videoWidth = 1000, videoHeight = 1000) {
+    const cleaned = stripCodeFences(responseText);
+    let raw;
+    try {
+        raw = JSON.parse(cleaned);
+    }
+    catch {
+        throw new Error(`AI provider returned non-JSON response. Response preview: ${cleaned.slice(0, 200)}`);
+    }
+    if (Array.isArray(raw)) {
+        raw = normaliseRawArray(raw);
+    }
+    return normaliseBoxCoordinates(validateDetectionSchema(raw), videoWidth, videoHeight);
+}
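+/*
+ * Region conversion.
+ *
+ * convertDetectionsToRegions() — defined after the expandDetectionBox() helper
+ * — converts normalized AI detections to pixel-coordinate regions, filtering
+ * by confidence and applying padding.
+ */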
+/**
+ * Convert a single raw detection box to padded pixel coordinates.
+ *
+ * Applies double upward padding to account for the systematic downward drift
+ * common in vision-model bounding boxes (text baseline vs. cap-height).
+ *
+ * Extends the region to full video width when it starts near the left edge
+ * and spans a wide portion of the frame — the pattern for a code-editor or
+ * terminal row where long token values often extend past the detected box.
+ */
+function expandDetectionBox(rawX, rawY, rawW, rawH, videoWidth, videoHeight, paddingPixels) {
+    // Double upward padding compensates for model Y-offset toward the baseline
+    const upPad = paddingPixels * 2;
+    const y = Math.max(0, Math.round(rawY - upPad));
+    // Two cases indicate a full editor/terminal row that should be extended
+    // to the full frame width so the entire value is redacted:
+    //   (a) Detection starts near the left edge (original heuristic)
+    //   (b) Detection is wide AND reaches the right side of the frame — the
+    //       common pattern when an AI anchors its box to where key text is
+    //       visually dense but misses the left portion of the value.
+    const startsNearLeft = rawX < videoWidth * 0.2 && rawW > videoWidth * 0.25;
+    const reachesRightEdge = rawX + rawW > videoWidth * 0.7 && rawW > videoWidth * 0.3;
+    const isFullRowLike = startsNearLeft || reachesRightEdge;
+    // For right-anchored wide detections, pull the left edge back to the typical
+    // editor-content boundary (~15% from left) to cover the start of the value.
+    let x;
+    if (isFullRowLike && rawX > videoWidth * 0.2) {
+        x = Math.round(videoWidth * 0.15);
+    }
+    else {
+        x = Math.max(0, Math.round(rawX - paddingPixels));
+    }
+    const w = isFullRowLike
+        ? videoWidth - x
+        : Math.min(videoWidth - x, Math.round(rawW + paddingPixels * 2));
+    const h = Math.min(videoHeight - y, Math.round(rawH + upPad + paddingPixels));
+    return { x, y, width: w, height: h };
+}
+export function convertDetectionsToRegions(response, videoWidth, videoHeight, timestamps, minConfidence, paddingPixels) {
+    const regions = [];
+    for (const frameResult of response.frames) {
+        const timestamp = timestamps[frameResult.frameIndex] ?? frameResult.timestampSec;
+        for (const detection of frameResult.detections) {
+            // Filter by confidence
+            if (detection.confidence < minConfidence)
+                continue;
+            // Skip detections whose box could not be parsed
+            if (!detection.box)
+                continue;
+            // Convert normalized coords to pixels, applying padding and full-row extension
+            const rawX = detection.box.left * videoWidth;
+            const rawY = detection.box.top * videoHeight;
+            const rawW = detection.box.width * videoWidth;
+            const rawH = detection.box.height * videoHeight;
+            const { x, y, width: w, height: h, } = expandDetectionBox(rawX, rawY, rawW, rawH, videoWidth, videoHeight, paddingPixels);
+            // Skip degenerate boxes
+            if (w <= 0 || h <= 0)
+                continue;
+            regions.push({
+                x,
+                y,
+                width: w,
+                height: h,
+                label: detection.label,
+                confidence: detection.confidence,
+                startTime: timestamp,
+                endTime: timestamp,
+            });
+        }
+    }
+    return regions;
+}
+/**
+ * Check if two regions match by label and proximity.
+ */
+function regionsMatch(a, b, threshold) {
+    if (a.label !== b.label)
+        return false;
+    const dx = Math.abs(a.x - b.x);
+    const dy = Math.abs(a.y - b.y);
+    return dx < threshold && dy < threshold;
+}
+/**
+ * Expand a region's spatial bounds to encompass another region.
+ */
+function expandRegionBounds(target, source) {
+    const startX = Math.min(target.x, source.x);
+    const startY = Math.min(target.y, source.y);
+    const endX = Math.max(target.x + target.width, source.x + source.width);
+    const endY = Math.max(target.y + target.height, source.y + source.height);
+    target.x = startX;
+    target.y = startY;
+    target.width = endX - startX;
+    target.height = endY - startY;
+}
+/**
+ * Update a region's temporal and confidence metadata from another region.
+ */
+function expandRegionMetadata(target, source) {
+    const targetStart = target.startTime ?? 0;
+    const sourceStart = source.startTime ?? 0;
+    const targetEnd = target.endTime ?? 0;
+    const sourceEnd = source.endTime ?? 0;
+    target.startTime = Math.min(targetStart, sourceStart);
+    target.endTime = Math.max(targetEnd, sourceEnd);
+    target.confidence = Math.max(target.confidence ?? 0, source.confidence ?? 0);
+}
+/**
+ * Expand a region to encompass another region.
+ *
+ * Mutates `target` in two steps: its rectangle is grown to cover `source`
+ * (expandRegionBounds), then its time range and confidence are widened from
+ * `source` (expandRegionMetadata). Nothing is returned.
+ */
+function expandRegion(target, source) {
+    expandRegionBounds(target, source);
+    expandRegionMetadata(target, source);
+}
+/**
+ * Merge overlapping detections across frames into consolidated regions.
+ * Uses simple label + proximity matching (no full IOU in v1).
+ */
+export function mergeDetections(regions, proximityThreshold = 50) {
+    if (regions.length === 0)
+        return [];
+    const merged = [];
+    for (const region of regions) {
+        const existing = merged.find((m) => regionsMatch(m, region, proximityThreshold));
+        if (existing) {
+            expandRegion(existing, region);
+        }
+        else {
+            merged.push({ ...region });
+        }
+    }
+    return merged;
+}
+/**
+ * Compute Intersection-over-Union between two boxes.
+ */
+function computeIOU(a, b) {
+    const x1 = Math.max(a.x, b.x);
+    const y1 = Math.max(a.y, b.y);
+    const x2 = Math.min(a.x + a.width, b.x + b.width);
+    const y2 = Math.min(a.y + a.height, b.y + b.height);
+    const intersection = Math.max(0, x2 - x1) * Math.max(0, y2 - y1);
+    if (intersection === 0)
+        return 0;
+    const areaA = a.width * a.height;
+    const areaB = b.width * b.height;
+    const union = areaA + areaB - intersection;
+    return union > 0 ? intersection / union : 0;
+}
+/**
+ * Find the best matching track for a region using IOU and label matching.
+ */
+function findBestTrack(region, tracks, iouThreshold, maxTimeGap) {
+    let bestTrack = null;
+    let bestIOU = 0;
+    for (const track of tracks) {
+        if (track.label !== (region.label ?? 'unknown'))
+            continue;
+        if (region.startTime != null &&
+            region.startTime - track.endTime > maxTimeGap)
+            continue;
+        const iou = computeIOU({ x: region.x, y: region.y, width: region.width, height: region.height }, { x: track.x, y: track.y, width: track.width, height: track.height });
+        if (iou > bestIOU && iou >= iouThreshold) {
+            bestIOU = iou;
+            bestTrack = track;
+        }
+    }
+    return bestTrack;
+}
+/**
+ * Extend a track with a new detection.
+ */
+function extendTrack(track, region) {
+    const ts = region.startTime ?? 0;
+    track.endTime = Math.max(track.endTime, ts);
+    track.frameCount++;
+    track.confidence = Math.max(track.confidence, region.confidence ?? 0);
+    const minX = Math.min(track.x, region.x);
+    const minY = Math.min(track.y, region.y);
+    const maxX = Math.max(track.x + track.width, region.x + region.width);
+    const maxY = Math.max(track.y + track.height, region.y + region.height);
+    track.x = minX;
+    track.y = minY;
+    track.width = maxX - minX;
+    track.height = maxY - minY;
+}
+/**
+ * Group detections into temporal tracks across adjacent sampled frames.
+ *
+ * Uses IOU overlap and label similarity to merge boxes that represent the
+ * same persistent secret across time. Treats persistent detections in
+ * roughly the same area as one track.
+ *
+ * @param regions - Flat list of per-frame detections (already converted to pixels)
+ * @param iouThreshold - Minimum IOU to consider two boxes the same track (default: 0.3)
+ * @param sampleInterval - Seconds between sampled frames (used for gap-filling)
+ * @returns Consolidated tracks with stable time ranges
+ */
+export function groupDetectionsIntoTracks(regions, iouThreshold = 0.3, sampleInterval = 5) {
+    if (regions.length === 0)
+        return [];
+    const sorted = [...regions].sort((a, b) => (a.startTime ?? 0) - (b.startTime ?? 0));
+    const tracks = [];
+    const maxTimeGap = sampleInterval * 2;
+    for (const region of sorted) {
+        const bestTrack = findBestTrack(region, tracks, iouThreshold, maxTimeGap);
+        if (bestTrack) {
+            extendTrack(bestTrack, region);
+        }
+        else {
+            const ts = region.startTime ?? 0;
+            tracks.push({
+                label: region.label ?? 'unknown',
+                x: region.x,
+                y: region.y,
+                width: region.width,
+                height: region.height,
+                startTime: ts,
+                endTime: ts,
+                confidence: region.confidence ?? 0,
+                frameCount: 1,
+            });
+        }
+    }
+    return tracks;
+}
+/**
+ * Convert temporal tracks into a final `RedactionPlan` with time ranges.
+ *
+ * - Start time: detection timestamp minus half the sample interval
+ * - End time: last detection timestamp plus half the sample interval
+ * - Clamp to video duration
+ * - Merge short gaps between nearby detections
+ * - Expand boxes slightly to absorb tiny camera/UI shifts
+ *
+ * @param tracks - Grouped detection tracks
+ * @param videoDuration - Total video duration in seconds
+ * @param sampleInterval - Seconds between sampled frames
+ * @param paddingPixels - Extra padding to expand each region
+ * @param videoWidth - Video width for clamping
+ * @param videoHeight - Video height for clamping
+ * @returns Final consolidated redaction regions
+ */
+export function tracksToRedactionPlan(tracks, videoDuration, sampleInterval, paddingPixels, videoWidth, videoHeight) {
+    const halfInterval = sampleInterval / 2;
+    return tracks.map((track) => {
+        // Expand time range by half the sample interval
+        const startTime = Math.max(0, track.startTime - halfInterval);
+        const endTime = Math.min(videoDuration, track.endTime + halfInterval);
+        // Expand spatial box by padding
+        const x = Math.max(0, track.x - paddingPixels);
+        const y = Math.max(0, track.y - paddingPixels);
+        const width = Math.min(videoWidth - x, track.width + paddingPixels * 2);
+        const height = Math.min(videoHeight - y, track.height + paddingPixels * 2);
+        return {
+            x,
+            y,
+            width,
+            height,
+            label: track.label,
+            confidence: track.confidence,
+            startTime,
+            endTime,
+        };
+    });
+}
+/**
+ * Full consolidation pipeline: raw detections → stable redaction regions.
+ *
+ * 1. Group detections into temporal tracks (IOU + label matching)
+ * 2. Convert tracks to time-bounded regions
+ * 3. Expand boxes and clamp to video bounds
+ *
+ * This replaces the simpler `mergeDetections()` for AI-mode detection.
+ *
+ * Implemented as `groupDetectionsIntoTracks` followed by
+ * `tracksToRedactionPlan`; `sampleInterval` is passed to both.
+ *
+ * @param regions - Per-frame detections in pixel coordinates
+ * @param videoDuration - Total video duration in seconds
+ * @param sampleInterval - Seconds between sampled frames
+ * @param paddingPixels - Extra padding added around each final region
+ * @param videoWidth - Video width used for clamping
+ * @param videoHeight - Video height used for clamping
+ * @param iouThreshold - Minimum IOU for track membership (default: 0.3)
+ * @returns The consolidated redaction regions
+ */
+export function consolidateDetections(regions, videoDuration, sampleInterval, paddingPixels, videoWidth, videoHeight, iouThreshold = 0.3) {
+    const tracks = groupDetectionsIntoTracks(regions, iouThreshold, sampleInterval);
+    return tracksToRedactionPlan(tracks, videoDuration, sampleInterval, paddingPixels, videoWidth, videoHeight);
+}
+//# sourceMappingURL=redactionDetector.js.map