vidclaude 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +237 -0
- package/SKILL.md +138 -0
- package/bin/setup.js +45 -0
- package/bin/vidclaude.js +27 -0
- package/package.json +31 -0
- package/requirements.txt +2 -0
- package/vidclaude/SKILL.md +138 -0
- package/vidclaude/__init__.py +3 -0
- package/vidclaude/__pycache__/__init__.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/audio.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/cli.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/ingest.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/intent.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/memory.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/models.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/ocr.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/reason.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/segment.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/timeline.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/util.cpython-313.pyc +0 -0
- package/vidclaude/audio.py +193 -0
- package/vidclaude/cli.py +389 -0
- package/vidclaude/ingest.py +80 -0
- package/vidclaude/intent.py +116 -0
- package/vidclaude/memory.py +174 -0
- package/vidclaude/models.py +162 -0
- package/vidclaude/ocr.py +110 -0
- package/vidclaude/reason.py +285 -0
- package/vidclaude/segment.py +239 -0
- package/vidclaude/timeline.py +95 -0
- package/vidclaude/util.py +163 -0
- package/video_understand.py +12 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
"""Layer J: Evidence assembly and output generation.
|
|
2
|
+
|
|
3
|
+
Two modes:
|
|
4
|
+
1. Skill mode (--extract): Generates evidence.md for Claude Code to read + reason over.
|
|
5
|
+
2. API mode (--api): Builds Anthropic API message with base64 frames + evidence.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from .models import (
|
|
14
|
+
Evidence, VideoMeta, Frame, TranscriptChunk, OCRResult, TimelineEvent,
|
|
15
|
+
save_json,
|
|
16
|
+
)
|
|
17
|
+
from .util import ms_to_hhmmss, image_to_base64
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger("vidclaude")
|
|
20
|
+
|
|
21
|
+
# Maximum frames to include in evidence.md frame listing
|
|
22
|
+
MAX_EVIDENCE_FRAMES = 20
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def generate_evidence_md(evidence: Evidence, cache_dir: Path) -> str:
    """Render the evidence.md report used by Claude Code in skill mode.

    The report is a markdown document summarising every extracted modality
    (metadata, summaries, frames, transcript, OCR, timeline).  Frame images
    are referenced by filesystem path so Claude Code can open them directly.
    The rendered text is also written to ``cache_dir / "evidence.md"``.
    """
    out: list[str] = []
    emit = out.append

    # Report header.
    emit("# Video Analysis Evidence Report")
    emit("")

    # Basic video metadata.
    meta = evidence.video_meta
    out.extend([
        "## Video Information",
        f"- **File**: `{meta.path}`",
        f"- **Duration**: {ms_to_hhmmss(int(meta.duration_sec * 1000))}",
        f"- **Resolution**: {meta.resolution[0]}x{meta.resolution[1]}",
        f"- **FPS**: {meta.fps:.1f}",
        f"- **Audio tracks**: {meta.audio_tracks}",
        f"- **Format**: {meta.format}",
        "",
    ])

    # User question plus the intent the pipeline classified it as.
    if evidence.question:
        out.extend([
            "## Question",
            evidence.question,
            f"- **Detected intent**: {evidence.intent}",
            "",
        ])

    # Whole-video summary, when one was produced.
    if evidence.global_summary:
        out.extend(["## Global Summary", evidence.global_summary, ""])

    # Per-scene summaries, when available.
    if evidence.scene_summaries:
        emit("## Scene Summaries")
        out.extend(evidence.scene_summaries)
        emit("")

    # Frame listing, capped at MAX_EVIDENCE_FRAMES for readability.
    emit("## Extracted Frames")
    emit(f"Total frames: {len(evidence.frames)}")
    emit("")

    display_frames = _select_display_frames(evidence.frames, MAX_EVIDENCE_FRAMES)
    for frame in display_frames:
        stamp = ms_to_hhmmss(frame.timestamp_ms)
        why = ", ".join(frame.sampling_reason)
        emit(f"- **{frame.frame_id}** @ {stamp} [{why}]: `{frame.image_path}`")
    emit("")

    # Speech transcript, one bullet per chunk, with optional speaker tags.
    if evidence.transcript_chunks:
        emit("## Audio Transcript")
        for chunk in evidence.transcript_chunks:
            prefix = f"[{chunk.speaker}] " if chunk.speaker else ""
            emit(
                f"- [{ms_to_hhmmss(chunk.start_ms)} → {ms_to_hhmmss(chunk.end_ms)}] "
                f"{prefix}{chunk.text}"
            )
        emit("")

    # Text recognised on screen.
    if evidence.ocr_results:
        emit("## On-Screen Text (OCR)")
        for ocr in evidence.ocr_results:
            emit(f"- [{ms_to_hhmmss(ocr.timestamp_ms)}] (conf: {ocr.confidence:.0%}) {ocr.text}")
        emit("")

    # Unified cross-modality timeline.
    if evidence.timeline_events:
        emit("## Timeline")
        for event in evidence.timeline_events:
            tail = f" → {ms_to_hhmmss(event.end_ms)}" if event.end_ms else ""
            emit(f"- [{ms_to_hhmmss(event.start_ms)}{tail}] **{event.modality}**: {event.summary}")
        emit("")

    # Plain path list so the frames are easy to open for visual inspection.
    emit("## Frame Paths (for visual analysis)")
    emit("The following frame images can be viewed for visual analysis:")
    emit("")
    for frame in display_frames:
        emit(f"- `{frame.image_path}`")
    emit("")

    report = "\n".join(out)

    # Persist next to the other cached artefacts.
    target = cache_dir / "evidence.md"
    target.write_text(report, encoding="utf-8")
    logger.info("Evidence report written to %s", target)

    return report
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def build_api_message(evidence: Evidence) -> dict:
    """Assemble the Anthropic API payload for standalone (--api) mode.

    Returns a dict with ``system`` (the system prompt string) and
    ``messages`` (a single user turn mixing text evidence blocks and
    base64-encoded frame images).
    """
    system_prompt = (
        "You are analyzing a video. You will be shown frames extracted from the video "
        "at regular intervals, along with an audio transcript and other evidence if available. "
        "Analyze the visual and audio content to answer the user's question.\n\n"
        "Guidelines:\n"
        "- Ground every claim in timestamps\n"
        "- Distinguish observation from inference\n"
        "- If evidence is insufficient or ambiguous, say so clearly\n"
        "- Rate your confidence as high/medium/low"
    )

    def text_block(text: str) -> dict:
        # Every non-image content part shares this shape.
        return {"type": "text", "text": text}

    parts: list[dict] = []

    # Lead with a compact header describing the video and the frame budget.
    meta = evidence.video_meta
    parts.append(text_block(
        f"Video: {meta.duration_sec:.1f}s, {meta.resolution[0]}x{meta.resolution[1]}, "
        f"{meta.audio_tracks} audio track(s)\n"
        f"Sharing {len(evidence.frames)} frames. Intent: {evidence.intent}"
    ))

    # Optional whole-video summary.
    if evidence.global_summary:
        parts.append(text_block(
            f"<global_summary>\n{evidence.global_summary}\n</global_summary>"
        ))

    # Frames: a timestamp caption followed by the base64 JPEG payload.
    for frame in _select_display_frames(evidence.frames, MAX_EVIDENCE_FRAMES):
        parts.append(text_block(
            f"Frame {frame.frame_id} @ {ms_to_hhmmss(frame.timestamp_ms)}:"
        ))
        try:
            encoded = image_to_base64(frame.image_path)
        except Exception as exc:
            # A single unreadable frame should not sink the whole request.
            logger.warning("Could not encode frame %s: %s", frame.frame_id, exc)
        else:
            parts.append({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": encoded,
                },
            })

    # Timeline, transcript and OCR evidence, each wrapped in an XML-ish tag.
    if evidence.timeline_events:
        rendered = "\n".join(
            f"[{ms_to_hhmmss(e.start_ms)}] {e.modality}: {e.summary}"
            for e in evidence.timeline_events
        )
        parts.append(text_block(f"<timeline>\n{rendered}\n</timeline>"))

    if evidence.transcript_chunks:
        rendered = "\n".join(
            f"[{ms_to_hhmmss(c.start_ms)} → {ms_to_hhmmss(c.end_ms)}] {c.text}"
            for c in evidence.transcript_chunks
        )
        parts.append(text_block(f"<transcript>\n{rendered}\n</transcript>"))

    if evidence.ocr_results:
        rendered = "\n".join(
            f"[{ms_to_hhmmss(o.timestamp_ms)}] {o.text}"
            for o in evidence.ocr_results
        )
        parts.append(text_block(f"<ocr>\n{rendered}\n</ocr>"))

    # The user's question comes last; fall back to a generic prompt.
    parts.append(text_block(
        evidence.question or "Describe what's happening in this video in detail."
    ))

    return {
        "system": system_prompt,
        "messages": [{"role": "user", "content": parts}],
    }
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def call_claude_api(evidence: Evidence) -> str:
    """Make the Claude API call for standalone mode.

    Requires the ``anthropic`` package and the ``ANTHROPIC_API_KEY``
    environment variable; raises ``RuntimeError`` with a remediation hint
    when either is missing.

    Returns:
        The model's answer as plain text (all text blocks joined).
    """
    import os

    try:
        import anthropic
    except ImportError as err:
        # Chain the original ImportError so the true cause stays visible.
        raise RuntimeError(
            "anthropic package not installed. Install with: pip install anthropic\n"
            "Or use skill mode (--extract) which doesn't need the API."
        ) from err

    if not os.environ.get("ANTHROPIC_API_KEY"):
        raise RuntimeError(
            "ANTHROPIC_API_KEY not set. Set it with:\n"
            "  export ANTHROPIC_API_KEY=sk-...\n"
            "Or use skill mode (--extract) which doesn't need the API."
        )

    msg = build_api_message(evidence)
    client = anthropic.Anthropic()

    model = "claude-sonnet-4-20250514"
    logger.info("Calling Claude API (model=%s)...", model)
    response = client.messages.create(
        model=model,
        max_tokens=4096,
        system=msg["system"],
        messages=msg["messages"],
    )

    # The response may contain non-text blocks; collect every text block
    # instead of assuming content[0] is text (which previously raised
    # AttributeError when it wasn't).
    text_parts = [block.text for block in response.content if hasattr(block, "text")]
    if not text_parts:
        raise RuntimeError("Claude API returned no text content")
    return "\n".join(text_parts)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _select_display_frames(frames: list[Frame], max_count: int) -> list[Frame]:
    """Pick at most *max_count* representative frames, in timestamp order.

    Shot-boundary frames are favoured (they may claim up to half the
    budget); the remainder is filled with evenly spaced base frames.
    When the input already fits the budget it is returned unchanged.
    """
    if len(frames) <= max_count:
        return frames

    # Partition on whether the frame was sampled at a shot boundary.
    at_boundary = [f for f in frames if "shot_boundary" in f.sampling_reason]
    regular = [f for f in frames if "shot_boundary" not in f.sampling_reason]

    # Boundary frames get at most half of the budget.
    chosen = at_boundary[: min(len(at_boundary), max_count // 2)]

    # Spread the leftover budget evenly across the remaining frames.
    leftover = max_count - len(chosen)
    if leftover > 0 and regular:
        stride = max(1, len(regular) // leftover)
        chosen = chosen + regular[::stride][:leftover]

    # Present in chronological order.
    return sorted(chosen, key=lambda f: f.timestamp_ms)
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""Layers B+C: Shot boundary detection and adaptive visual sampling.
|
|
2
|
+
|
|
3
|
+
Shot detection uses ffmpeg's scene change filter.
|
|
4
|
+
Adaptive sampling adjusts frame rate based on content and mode.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import re
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from .models import VideoMeta, Shot, Frame
|
|
14
|
+
from .util import run_ffmpeg
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger("vidclaude")
|
|
17
|
+
|
|
18
|
+
# Mode-specific sampling parameters
# Each mode trades analysis depth against frame count / extraction time:
#   base_fps     - uniform sampling rate in frames per second of video
#   max_frames   - hard cap on the total number of extracted frames
#   burst_frames - extra frames captured around each detected shot boundary
MODE_CONFIG = {
    "quick": {
        "base_fps": 0.2,
        "max_frames": 20,
        "burst_frames": 0,  # no extra frames at transitions
    },
    "standard": {
        "base_fps": 0.5,
        "max_frames": 60,
        "burst_frames": 2,  # +2 frames around each shot boundary
    },
    "deep": {
        "base_fps": 1.0,
        "max_frames": 150,
        "burst_frames": 4,  # +4 frames around transitions
    },
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def detect_shots(meta: VideoMeta, threshold: float = 0.3) -> list[Shot]:
    """Detect shot boundaries using ffmpeg's scene-change filter.

    Runs ``select='gt(scene,threshold)'`` with ``showinfo`` and parses the
    selected frames' ``pts_time`` values from stderr.  Consecutive boundary
    pairs become Shot objects; if no scene change is found, the whole video
    is returned as a single shot.

    Args:
        meta: Probed video metadata (path and duration are used).
        threshold: Scene-change score above which a new shot starts.

    Returns:
        A non-empty, time-ordered list of Shot objects covering the video.
    """
    logger.info("Detecting shots (threshold=%.2f)...", threshold)

    # Use select filter with scene detection + showinfo to get timestamps.
    result = run_ffmpeg([
        "-i", meta.path,
        "-vf", f"select='gt(scene,{threshold})',showinfo",
        "-vsync", "vfr",
        "-f", "null",
        "-",
    ])

    # Single pass over stderr: collect boundary timestamps and, where the
    # filter reports them, the scene-change scores.
    # showinfo pattern: [Parsed_showinfo...] n:... pts:... pts_time:12.345 ...
    boundary_times: list[float] = [0.0]  # the video always starts at t=0
    scores: list[float] = []
    for line in result.stderr.split("\n"):
        time_match = re.search(r"pts_time:\s*([\d.]+)", line)
        if time_match:
            t = float(time_match.group(1))
            if t > 0:
                boundary_times.append(t)
        score_match = re.search(r"scene_score=\s*([\d.]+)", line)
        if score_match:
            scores.append(float(score_match.group(1)))

    # Close the final shot at the end of the video.
    boundary_times.append(meta.duration_sec)

    # Remove duplicates and sort.
    # NOTE(review): deduplication can shift score alignment if ffmpeg ever
    # reports duplicate pts_time values — assumed rare; verify if scores
    # start looking wrong.
    boundary_times = sorted(set(boundary_times))

    # Build shots from consecutive boundary pairs.
    shots: list[Shot] = []
    for i in range(len(boundary_times) - 1):
        start_ms = int(boundary_times[i] * 1000)
        end_ms = int(boundary_times[i + 1] * 1000)
        if end_ms <= start_ms:
            continue  # drop zero-length segments from rounding/duplicates
        # Shot i (for i >= 1) begins at detected boundary i-1, so its score
        # is scores[i-1]; the first shot starts at t=0 with no transition.
        # (Previously this indexed scores[i], attributing each score to the
        # wrong shot.)
        if i == 0:
            score = 0.0
        elif i - 1 < len(scores):
            score = scores[i - 1]
        else:
            score = threshold  # score line missing; it at least cleared threshold
        shots.append(Shot(
            shot_id=f"s_{i:04d}",
            start_ms=start_ms,
            end_ms=end_ms,
            score=score,
        ))

    # No scene changes detected: treat the entire video as one shot.
    if not shots:
        shots = [Shot(
            shot_id="s_0000",
            start_ms=0,
            end_ms=int(meta.duration_sec * 1000),
            score=0.0,
        )]

    logger.info("Detected %d shot(s)", len(shots))
    return shots
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def compute_sample_timestamps(
    meta: VideoMeta,
    shots: list[Shot],
    mode: str,
    fps_override: float | None = None,
    max_frames_override: int | None = None,
) -> list[tuple[int, list[str]]]:
    """Decide which timestamps to grab frames at.

    Combines uniform "base" sampling with extra "shot_boundary" samples
    around detected transitions, then trims the result to the mode's
    frame budget.

    Returns:
        Time-sorted list of ``(timestamp_ms, [sampling_reasons])`` pairs.
    """
    config = MODE_CONFIG[mode]
    base_fps = config["base_fps"] if fps_override is None else fps_override
    max_frames = config["max_frames"] if max_frames_override is None else max_frames_override
    burst = config["burst_frames"]

    duration_ms = int(meta.duration_sec * 1000)

    # Shrink the sampling rate up-front when uniform sampling alone would
    # already blow the frame budget.
    if int(meta.duration_sec * base_fps) > max_frames:
        base_fps = max_frames / meta.duration_sec
        logger.info("Auto-reduced fps to %.3f to stay under %d frames", base_fps, max_frames)

    # Phase 1: uniform base sampling.
    samples: dict[int, list[str]] = {}  # timestamp_ms -> reasons
    if base_fps > 0 and meta.duration_sec > 0:
        interval_ms = int(1000 / base_fps)
        ts = 0
        while ts < duration_ms:
            samples[ts] = ["base"]
            ts += interval_ms

    # Guarantee at least one sample even for degenerate inputs.
    if not samples:
        samples[0] = ["base"]

    # Phase 2: burst sampling around shot transitions (200 ms spacing).
    if burst > 0 and len(shots) > 1:
        offsets = range(-burst // 2, burst // 2 + 1)
        for shot in shots[1:]:  # shots[0] starts at t=0, not a transition
            for offset in offsets:
                ts = shot.start_ms + offset * 200
                if not 0 <= ts < duration_ms:
                    continue
                reasons = samples.setdefault(ts, [])
                if "shot_boundary" not in reasons:
                    reasons.append("shot_boundary")

    # Phase 3: enforce the frame budget, preferring boundary frames.
    ordered = sorted(samples.items(), key=lambda item: item[0])
    if len(ordered) > max_frames:
        with_boundary = [s for s in ordered if "shot_boundary" in s[1]]
        base_only = [s for s in ordered if "shot_boundary" not in s[1]]

        # Boundary frames get at most half the budget.
        kept = with_boundary[: min(len(with_boundary), max_frames // 2)]

        # Evenly thin the base frames to fill whatever budget remains.
        room = max_frames - len(kept)
        if room > 0 and base_only:
            stride = max(1, len(base_only) // room)
            kept = kept + base_only[::stride][:room]

        ordered = sorted(kept, key=lambda item: item[0])

    logger.debug("Computed %d sample timestamps", len(ordered))
    return ordered
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def extract_frames(
    meta: VideoMeta,
    shots: list[Shot],
    mode: str,
    cache_dir: Path,
    fps_override: float | None = None,
    max_frames_override: int | None = None,
) -> list[Frame]:
    """Extract frames from the video using adaptive sampling.

    Frames are written as JPEGs to ``cache_dir/frames/`` and returned as
    Frame objects tagged with the shot they belong to and the reason(s)
    they were sampled.  Timestamps that fail to extract are skipped with a
    warning rather than aborting the whole run.
    """
    frames_dir = cache_dir / "frames"
    # parents=True so a missing cache_dir doesn't crash the extraction
    # (the previous exist_ok-only mkdir raised FileNotFoundError then).
    frames_dir.mkdir(parents=True, exist_ok=True)

    timestamps = compute_sample_timestamps(
        meta, shots, mode, fps_override, max_frames_override
    )

    logger.info("Extracting %d frames...", len(timestamps))

    frames: list[Frame] = []
    shot_index = 0  # monotone cursor: timestamps and shots are both sorted

    for seq, (ts_ms, reasons) in enumerate(timestamps):
        # Advance the cursor to the shot containing this timestamp.
        while shot_index < len(shots) - 1 and ts_ms >= shots[shot_index].end_ms:
            shot_index += 1
        shot_id = shots[shot_index].shot_id if shot_index < len(shots) else "s_unknown"

        # Output filename: frame_NNNN_TTTTTTTT.jpg (sequence + ms timestamp).
        out_path = frames_dir / f"frame_{seq:04d}_{ts_ms:08d}.jpg"

        # Fast seek (-ss before -i) to the timestamp and grab a single
        # frame at high JPEG quality (-q:v 2).
        ts_sec = ts_ms / 1000.0
        result = run_ffmpeg([
            "-ss", f"{ts_sec:.3f}",
            "-i", meta.path,
            "-frames:v", "1",
            "-q:v", "2",
            "-y",
            str(out_path),
        ])

        if result.returncode != 0 or not out_path.exists():
            logger.warning("Failed to extract frame at %.3fs", ts_sec)
            continue

        frames.append(Frame(
            frame_id=f"f_{seq:04d}",
            timestamp_ms=ts_ms,
            shot_id=shot_id,
            sampling_reason=reasons,
            image_path=str(out_path),
        ))

    logger.info("Extracted %d frames successfully", len(frames))
    return frames
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Layer G: Temporal event graph (simplified as sorted event list).
|
|
2
|
+
|
|
3
|
+
Merges evidence from all modalities into a unified, time-sorted event list
|
|
4
|
+
that gives Claude explicit temporal ordering for reasoning.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
from .models import Shot, Frame, TranscriptChunk, OCRResult, TimelineEvent
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger("vidclaude")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def build_timeline(
    shots: list[Shot],
    frames: list[Frame],
    transcript: list[TranscriptChunk],
    ocr_results: list[OCRResult],
) -> list[TimelineEvent]:
    """Merge every modality into one time-sorted list of TimelineEvents.

    Sources: shot boundaries (scene changes), transcript chunks (speech),
    OCR hits (on-screen text) and shot-boundary keyframes (visual markers).
    Events sharing a start time are ordered scene_change -> visual ->
    speech -> ocr.
    """
    events: list[TimelineEvent] = []

    def add(start_ms, end_ms, modality, summary, source_id):
        # Centralised construction keeps event ids sequential in
        # insertion order (len(events) tracks the running counter).
        events.append(TimelineEvent(
            event_id=f"te_{len(events):04d}",
            start_ms=start_ms,
            end_ms=end_ms,
            modality=modality,
            summary=summary,
            source_ids=[source_id],
        ))

    # Every shot after the first marks a scene transition
    # (shots[0] is the start of the video, not a cut).
    for shot in shots[1:]:
        add(shot.start_ms, None, "scene_change",
            f"Scene change (confidence: {shot.score:.2f})", shot.shot_id)

    # Speech segments from the transcript.
    for chunk in transcript:
        add(chunk.start_ms, chunk.end_ms, "speech", chunk.text, chunk.chunk_id)

    # Text recognised on screen.
    for ocr in ocr_results:
        add(ocr.timestamp_ms, None, "ocr",
            f"On-screen text: {ocr.text}", ocr.frame_id)

    # Keyframes — only shot-boundary ones, to keep the timeline readable.
    for frame in frames:
        if "shot_boundary" not in frame.sampling_reason:
            continue
        add(frame.timestamp_ms, None, "visual",
            f"Keyframe captured ({', '.join(frame.sampling_reason)})",
            frame.frame_id)

    # Stable sort: start time first, then a fixed modality precedence.
    precedence = {"scene_change": 0, "visual": 1, "speech": 2, "ocr": 3}
    events.sort(key=lambda e: (e.start_ms, precedence.get(e.modality, 9)))

    logger.info(
        "Built timeline with %d events: %d speech, %d ocr, %d scene_change, %d visual",
        len(events),
        sum(1 for e in events if e.modality == "speech"),
        sum(1 for e in events if e.modality == "ocr"),
        sum(1 for e in events if e.modality == "scene_change"),
        sum(1 for e in events if e.modality == "visual"),
    )

    return events
|