vidclaude 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
1
+ """Intent classification for query-conditioned processing.
2
+
3
+ Classifies user questions into intent categories and returns
4
+ processing configuration that adjusts pipeline behavior.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+
11
+
12
# Recognized intent categories.
INTENT_DESCRIBE = "describe"
INTENT_MOMENT = "moment_retrieval"
INTENT_TEMPORAL = "temporal_ordering"
INTENT_COUNTING = "counting"
INTENT_OCR = "ocr_extraction"
INTENT_SPEECH = "speech_understanding"
INTENT_GENERAL = "general_qa"

# Ordered classification rules. They are scanned top to bottom and the
# first intent with any matching pattern wins, so more specific intents
# (counting, temporal) are listed before broader ones (describe).
_INTENT_PATTERNS: list[tuple[str, list[str]]] = [
    (INTENT_COUNTING, [
        r"\bhow many\b", r"\bcount\b", r"\bhow often\b",
        r"\bnumber of\b", r"\bfrequency\b",
    ]),
    (INTENT_TEMPORAL, [
        r"\bbefore\b.*\bafter\b", r"\bafter\b.*\bbefore\b",
        r"\bfirst\b.*\bthen\b", r"\border\b", r"\bsequence\b",
        r"\bbefore or after\b", r"\bwhich came first\b",
        r"\bchronolog", r"\btimeline\b",
    ]),
    (INTENT_MOMENT, [
        r"\bwhen\b", r"\bat what point\b", r"\bat what time\b",
        r"\bwhat time\b", r"\btimestamp\b", r"\bmoment\b",
        r"\bfind the part\b", r"\bshow me where\b",
    ]),
    (INTENT_OCR, [
        r"\btext\b", r"\bread\b", r"\bsign\b", r"\bslide\b",
        r"\bwritten\b", r"\bscreen\b", r"\bdisplay\b",
        r"\btitle\b", r"\bcaption\b", r"\bsubtitle\b",
        r"\bwhat does it say\b", r"\bwhat is written\b",
    ]),
    (INTENT_SPEECH, [
        r"\bsay\b", r"\bsaid\b", r"\bmention\b", r"\bspeak\b",
        r"\bspoke\b", r"\btalk\b", r"\bword\b", r"\bdiscuss\b",
        r"\bconversat", r"\bdialog",
    ]),
    (INTENT_DESCRIBE, [
        r"\bdescribe\b", r"\bsummar", r"\bwhat happens\b",
        r"\bwhat is happening\b", r"\boverview\b", r"\bwhat.s going on\b",
    ]),
]


def classify_intent(question: str | None) -> str:
    """Classify a user question into an intent category.

    Args:
        question: The raw user question, or None/empty when the caller
            has no question (pure "describe the video" usage).

    Returns:
        One of the INTENT_* constants. A missing question defaults to
        INTENT_DESCRIBE; a question matching no rule falls through to
        INTENT_GENERAL.
    """
    if not question:
        return INTENT_DESCRIBE

    normalized = question.strip().lower()

    # First matching rule wins (rules are ordered most-specific first).
    for intent, rules in _INTENT_PATTERNS:
        if any(re.search(rule, normalized) for rule in rules):
            return intent

    return INTENT_GENERAL


def _make_config(
    *,
    prioritize_ocr: bool = False,
    prioritize_transcript: bool = False,
    dense_sampling: bool = False,
) -> dict:
    """Build one processing-config dict; flags default to off."""
    return {
        "prioritize_ocr": prioritize_ocr,
        "prioritize_transcript": prioritize_transcript,
        "dense_sampling": dense_sampling,
    }


# Processing configuration per intent — tunes downstream pipeline behavior.
_INTENT_CONFIGS: dict[str, dict] = {
    INTENT_DESCRIBE: _make_config(),
    INTENT_MOMENT: _make_config(prioritize_transcript=True),
    INTENT_TEMPORAL: _make_config(prioritize_transcript=True),
    INTENT_COUNTING: _make_config(dense_sampling=True),
    INTENT_OCR: _make_config(prioritize_ocr=True),
    INTENT_SPEECH: _make_config(prioritize_transcript=True),
    INTENT_GENERAL: _make_config(),
}


def get_processing_config(intent: str) -> dict:
    """Return the processing configuration for an intent class.

    Unknown intents fall back to the INTENT_GENERAL config. A shallow
    copy is returned so callers may mutate it without affecting the
    shared table.
    """
    config = _INTENT_CONFIGS.get(intent)
    if config is None:
        config = _INTENT_CONFIGS[INTENT_GENERAL]
    return dict(config)
@@ -0,0 +1,174 @@
1
+ """Layer I: Hierarchical memory and summarization.
2
+
3
+ For longer videos, builds multi-level summaries:
4
+ - Atomic: individual timeline events (already exist)
5
+ - Scene: 60-second window summaries
6
+ - Chapter: 5-minute window summaries (for videos >30min)
7
+ - Global: overall video summary
8
+
9
+ In skill mode, summaries are text-based (no API calls).
10
+ In API mode, Claude generates them.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+
17
+ from .models import TimelineEvent
18
+ from .util import ms_to_hhmmss
19
+
20
+ logger = logging.getLogger("vidclaude")
21
+
22
+
23
def build_summaries(
    timeline: list[TimelineEvent],
    duration_sec: float,
    mode: str = "standard",
) -> dict:
    """Build hierarchical summaries from timeline events.

    Args:
        timeline: Unified timeline events from all modalities.
        duration_sec: Total video duration in seconds.
        mode: "quick" (skip summaries), "standard", or "deep".

    Returns:
        Dict with keys scene_summaries, chapter_summaries, global_summary.
        In skill mode these are text-based aggregations (no API calls).
    """
    scenes: list[str] = []
    chapters: list[str] = []
    global_summary = ""

    if mode == "quick":
        # Quick mode produces no summaries at all.
        pass
    elif mode == "standard" and duration_sec < 300:
        # Short videos (<5 min in standard mode): a flat global summary
        # straight from the events is enough; skip the hierarchy.
        global_summary = _build_global_from_events(timeline)
    else:
        # Scene level: 60-second windows.
        scenes = _build_window_summaries(timeline, window_ms=60000)

        # Chapter level (5-minute windows) for long videos (>30 min) or deep mode.
        if duration_sec > 1800 or mode == "deep":
            chapters = _build_window_summaries(timeline, window_ms=300000)

        # Global summary is derived from the scene summaries.
        global_summary = _build_global_from_scenes(scenes, duration_sec)

        logger.info(
            "Built summaries: %d scene, %d chapter",
            len(scenes),
            len(chapters),
        )

    return {
        "scene_summaries": scenes,
        "chapter_summaries": chapters,
        "global_summary": global_summary,
    }
67
+
68
+
69
+ def _build_window_summaries(
70
+ events: list[TimelineEvent],
71
+ window_ms: int,
72
+ ) -> list[str]:
73
+ """Group events into time windows and summarize each."""
74
+ if not events:
75
+ return []
76
+
77
+ max_ms = max(e.start_ms for e in events)
78
+ summaries = []
79
+ window_start = 0
80
+
81
+ while window_start <= max_ms:
82
+ window_end = window_start + window_ms
83
+ window_events = [
84
+ e for e in events
85
+ if window_start <= e.start_ms < window_end
86
+ ]
87
+
88
+ if window_events:
89
+ summary = _summarize_window(window_start, window_end, window_events)
90
+ summaries.append(summary)
91
+
92
+ window_start = window_end
93
+
94
+ return summaries
95
+
96
+
97
def _summarize_window(
    start_ms: int,
    end_ms: int,
    events: list[TimelineEvent],
) -> str:
    """Render a text summary of the events in one time window.

    Produces one header line with the window's time range, then one line
    per present modality group (scene changes, speech, OCR text), each
    truncated to keep the summary compact.
    """
    lines = [f"[{ms_to_hhmmss(start_ms)} - {ms_to_hhmmss(end_ms)}]"]

    # Group events by modality once instead of filtering repeatedly.
    by_modality: dict[str, list[TimelineEvent]] = {}
    for event in events:
        by_modality.setdefault(event.modality, []).append(event)

    cuts = by_modality.get("scene_change", [])
    if cuts:
        lines.append(f" {len(cuts)} scene change(s)")

    spoken = by_modality.get("speech", [])
    if spoken:
        # Concatenate speech text, capped at 300 characters.
        speech_text = " ".join(event.summary for event in spoken)
        if len(speech_text) > 300:
            speech_text = speech_text[:297] + "..."
        lines.append(f" Speech: {speech_text}")

    on_screen = by_modality.get("ocr", [])
    if on_screen:
        # Join OCR snippets, capped at 200 characters.
        ocr_texts = "; ".join(event.summary for event in on_screen)
        if len(ocr_texts) > 200:
            ocr_texts = ocr_texts[:197] + "..."
        lines.append(f" {ocr_texts}")

    return "\n".join(lines)
130
+
131
+
132
+ def _build_global_from_events(events: list[TimelineEvent]) -> str:
133
+ """Build a simple global summary directly from events."""
134
+ if not events:
135
+ return "No events detected in video."
136
+
137
+ modality_counts = {}
138
+ for e in events:
139
+ modality_counts[e.modality] = modality_counts.get(e.modality, 0) + 1
140
+
141
+ parts = ["Video contains:"]
142
+ for mod, count in sorted(modality_counts.items()):
143
+ parts.append(f" - {count} {mod} event(s)")
144
+
145
+ # Include first few speech segments as overview
146
+ speech = [e for e in events if e.modality == "speech"]
147
+ if speech:
148
+ preview = " ".join(e.summary for e in speech[:5])
149
+ if len(preview) > 500:
150
+ preview = preview[:497] + "..."
151
+ parts.append(f"\nSpeech preview: {preview}")
152
+
153
+ return "\n".join(parts)
154
+
155
+
156
+ def _build_global_from_scenes(scene_summaries: list[str], duration_sec: float) -> str:
157
+ """Build global summary from scene-level summaries."""
158
+ if not scene_summaries:
159
+ return "No content detected."
160
+
161
+ duration_str = ms_to_hhmmss(int(duration_sec * 1000))
162
+ parts = [
163
+ f"Video duration: {duration_str}",
164
+ f"Total scenes: {len(scene_summaries)}",
165
+ "",
166
+ "Scene overview:",
167
+ ]
168
+
169
+ for i, summary in enumerate(scene_summaries):
170
+ # Just include the first line of each scene summary
171
+ first_line = summary.split("\n")[0]
172
+ parts.append(f" {i+1}. {first_line}")
173
+
174
+ return "\n".join(parts)
@@ -0,0 +1,162 @@
1
+ """Data models for video understanding pipeline.
2
+
3
+ All types are time-grounded with timestamp_ms or start_ms/end_ms fields.
4
+ Includes JSON serialization for caching.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from dataclasses import dataclass, field, asdict
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+
15
@dataclass
class VideoMeta:
    """Layer A output: validated video metadata."""
    path: str
    duration_sec: float
    fps: float
    resolution: tuple[int, int]
    audio_tracks: int
    format: str
    file_size_bytes: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict (the resolution tuple becomes a list)."""
        payload = asdict(self)
        payload["resolution"] = list(payload["resolution"])
        return payload

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> VideoMeta:
        """Rebuild from a dict produced by to_dict()."""
        # Restore the tuple form of resolution without mutating the input.
        return cls(**{**d, "resolution": tuple(d["resolution"])})
36
+
37
+
38
@dataclass
class Shot:
    """Layer B output: a detected shot boundary segment."""
    shot_id: str
    start_ms: int
    end_ms: int
    # Scene-change confidence at the boundary.
    score: float

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> Shot:
        """Rebuild from a dict produced by to_dict()."""
        return cls(**d)
52
+
53
+
54
@dataclass
class Frame:
    """Layer C output: an extracted frame with metadata."""
    frame_id: str
    timestamp_ms: int
    shot_id: str
    # Why the frame was sampled, e.g. ["base", "shot_boundary", "high_motion"].
    sampling_reason: list[str]
    image_path: str

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> Frame:
        """Rebuild from a dict produced by to_dict()."""
        return cls(**d)
69
+
70
+
71
@dataclass
class TranscriptChunk:
    """Layer D output: a timestamped speech segment."""
    chunk_id: str
    start_ms: int
    end_ms: int
    text: str
    # Speaker label when diarization is available; None otherwise.
    speaker: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TranscriptChunk:
        """Rebuild from a dict produced by to_dict()."""
        return cls(**d)
86
+
87
+
88
@dataclass
class OCRResult:
    """Layer E output: text extracted from a frame."""
    frame_id: str
    timestamp_ms: int
    text: str
    confidence: float

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> OCRResult:
        """Rebuild from a dict produced by to_dict()."""
        return cls(**d)
102
+
103
+
104
@dataclass
class TimelineEvent:
    """Layer G output: a unified event from any modality."""
    event_id: str
    start_ms: int
    # None for instantaneous events.
    end_ms: int | None
    # One of: "visual" | "speech" | "ocr" | "scene_change".
    modality: str
    summary: str
    # IDs of the upstream records this event was derived from.
    source_ids: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TimelineEvent:
        """Rebuild from a dict produced by to_dict()."""
        return cls(**d)
120
+
121
+
122
@dataclass
class Evidence:
    """Layer J input: the assembled context pack for Claude reasoning."""
    video_meta: VideoMeta
    question: str
    intent: str
    frames: list[Frame]
    transcript_chunks: list[TranscriptChunk]
    ocr_results: list[OCRResult]
    timeline_events: list[TimelineEvent]
    scene_summaries: list[str] = field(default_factory=list)
    global_summary: str = ""
    cache_dir: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict, recursing into nested models."""
        payload: dict[str, Any] = {
            "video_meta": self.video_meta.to_dict(),
            "question": self.question,
            "intent": self.intent,
        }
        payload["frames"] = [item.to_dict() for item in self.frames]
        payload["transcript_chunks"] = [item.to_dict() for item in self.transcript_chunks]
        payload["ocr_results"] = [item.to_dict() for item in self.ocr_results]
        payload["timeline_events"] = [item.to_dict() for item in self.timeline_events]
        payload["scene_summaries"] = self.scene_summaries
        payload["global_summary"] = self.global_summary
        payload["cache_dir"] = self.cache_dir
        return payload
149
+
150
+
151
+ # --- JSON file helpers ---
152
+
153
def save_json(data: list[dict] | dict, path: str | Path) -> None:
    """Write *data* to *path* as pretty-printed UTF-8 JSON."""
    text = json.dumps(data, indent=2, ensure_ascii=False)
    Path(path).write_text(text, encoding="utf-8")
157
+
158
+
159
def load_json(path: str | Path) -> Any:
    """Read and parse a UTF-8 JSON file at *path*."""
    return json.loads(Path(path).read_text(encoding="utf-8"))
@@ -0,0 +1,110 @@
1
+ """Layer E: OCR extraction from video frames using pytesseract."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+
8
+ from .models import Frame, OCRResult
9
+
10
+ logger = logging.getLogger("vidclaude")
11
+
12
+
13
def extract_ocr(
    frames: list[Frame],
    no_ocr: bool = False,
    mode: str = "standard",
) -> list[OCRResult]:
    """Extract text from frames using pytesseract.

    Args:
        frames: List of extracted frames.
        no_ocr: If True, skip OCR entirely.
        mode: Processing mode — "quick" skips OCR, "standard" does keyframes,
            "deep" does all frames.

    Returns:
        List of OCRResult for frames where confident text was detected.
        Confidence is the mean Tesseract word confidence, normalized to
        0.0-1.0, computed over exactly the words that were kept.
    """
    if no_ocr:
        logger.info("OCR skipped (--no-ocr)")
        return []

    if mode == "quick":
        logger.info("OCR skipped in quick mode")
        return []

    # pytesseract is an optional dependency; degrade gracefully without it.
    try:
        import pytesseract
        from PIL import Image
    except ImportError:
        logger.warning(
            "pytesseract not installed. Skipping OCR. "
            "Install with: pip install pytesseract "
            "(also requires Tesseract system binary)"
        )
        return []

    # In standard mode, only process keyframes (shot boundaries + every Nth frame)
    selected = _select_keyframes(frames) if mode == "standard" else frames

    logger.info("Running OCR on %d frame(s)...", len(selected))

    results: list[OCRResult] = []
    for frame in selected:
        if not Path(frame.image_path).exists():
            continue

        try:
            img = Image.open(frame.image_path)
            data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

            # Keep word blocks with confidence > 60 and non-empty text.
            # FIX: the confidence average previously summed confidences of
            # ALL blocks with conf > 60 (including empty-text blocks) but
            # divided by the count of kept words, which could inflate the
            # reported confidence above 1.0. Collect (word, conf) pairs in
            # one pass and average over exactly the kept words.
            texts: list[str] = []
            confs: list[float] = []
            for word, conf in zip(data["text"], data["conf"]):
                try:
                    # FIX: newer pytesseract/Tesseract may report
                    # confidences as float strings (e.g. "96.0"), which
                    # int() rejects; parse via float() to accept both.
                    conf_val = float(conf)
                except (ValueError, TypeError):
                    continue
                word = word.strip()
                if conf_val > 60 and word:
                    texts.append(word)
                    confs.append(conf_val)

            if texts:
                results.append(OCRResult(
                    frame_id=frame.frame_id,
                    timestamp_ms=frame.timestamp_ms,
                    text=" ".join(texts),
                    confidence=(sum(confs) / len(confs)) / 100.0,
                ))
        except Exception as e:
            # OCR failure on one frame is non-fatal; keep going.
            logger.debug("OCR failed on %s: %s", frame.frame_id, e)
            continue

    logger.info("OCR found text in %d frame(s)", len(results))
    return results
94
+
95
+
96
+ def _select_keyframes(frames: list[Frame]) -> list[Frame]:
97
+ """Select keyframes for OCR in standard mode.
98
+
99
+ Picks: shot boundary frames + every 5th base frame.
100
+ """
101
+ selected = []
102
+ base_count = 0
103
+ for frame in frames:
104
+ if "shot_boundary" in frame.sampling_reason:
105
+ selected.append(frame)
106
+ elif "base" in frame.sampling_reason:
107
+ if base_count % 5 == 0:
108
+ selected.append(frame)
109
+ base_count += 1
110
+ return selected if selected else frames[:5]