vidclaude 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +237 -0
- package/SKILL.md +138 -0
- package/bin/setup.js +45 -0
- package/bin/vidclaude.js +27 -0
- package/package.json +31 -0
- package/requirements.txt +2 -0
- package/vidclaude/SKILL.md +138 -0
- package/vidclaude/__init__.py +3 -0
- package/vidclaude/__pycache__/__init__.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/audio.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/cli.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/ingest.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/intent.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/memory.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/models.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/ocr.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/reason.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/segment.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/timeline.cpython-313.pyc +0 -0
- package/vidclaude/__pycache__/util.cpython-313.pyc +0 -0
- package/vidclaude/audio.py +193 -0
- package/vidclaude/cli.py +389 -0
- package/vidclaude/ingest.py +80 -0
- package/vidclaude/intent.py +116 -0
- package/vidclaude/memory.py +174 -0
- package/vidclaude/models.py +162 -0
- package/vidclaude/ocr.py +110 -0
- package/vidclaude/reason.py +285 -0
- package/vidclaude/segment.py +239 -0
- package/vidclaude/timeline.py +95 -0
- package/vidclaude/util.py +163 -0
- package/video_understand.py +12 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Layer D: Audio understanding — extract audio, transcribe with faster-whisper.
|
|
2
|
+
|
|
3
|
+
Uses faster-whisper (CTranslate2) with large-v3 by default for strong
|
|
4
|
+
multilingual support including Hindi, Urdu, etc.
|
|
5
|
+
Falls back to openai-whisper if faster-whisper is not installed.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from .models import VideoMeta, TranscriptChunk
|
|
14
|
+
from .util import run_ffmpeg
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger("vidclaude")
|
|
17
|
+
|
|
18
|
+
# Whisper model chosen for each processing mode: "quick" trades accuracy
# for speed with the small "base" model; "standard" and "deep" both use
# "large-v3" for the strongest multilingual accuracy.
MODE_MODELS = dict(
    quick="base",
    standard="large-v3",
    deep="large-v3",
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def has_audio_stream(meta: VideoMeta) -> bool:
    """Return True when the video exposes at least one audio track."""
    return meta.audio_tracks >= 1
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def extract_audio(meta: VideoMeta, cache_dir: Path) -> str | None:
    """Demux the video's audio into a 16 kHz mono PCM WAV file.

    Returns the WAV path on success, or None when the video has no audio
    stream or ffmpeg fails to produce the file.
    """
    if not has_audio_stream(meta):
        logger.info("No audio stream detected, skipping audio extraction")
        return None

    wav_path = str(cache_dir / "audio.wav")
    logger.info("Extracting audio...")

    ffmpeg_args = [
        "-i", meta.path,
        "-vn",                   # drop the video stream
        "-acodec", "pcm_s16le",  # uncompressed 16-bit PCM
        "-ar", "16000",          # 16 kHz sample rate (what whisper expects)
        "-ac", "1",              # downmix to mono
        "-y",
        wav_path,
    ]
    proc = run_ffmpeg(ffmpeg_args)

    if proc.returncode != 0:
        # NOTE(review): assumes run_ffmpeg captures stderr as text — confirm.
        logger.warning("Audio extraction failed: %s", proc.stderr[:200])
        return None

    if not Path(wav_path).exists():
        return None

    logger.info("Audio extracted to %s", wav_path)
    return wav_path
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def transcribe(
    meta: VideoMeta,
    cache_dir: Path,
    no_audio: bool = False,
    model_name: str | None = None,
    mode: str = "standard",
) -> list[TranscriptChunk]:
    """Transcribe the video's audio into timestamped chunks.

    Tries faster-whisper first, then openai-whisper; each backend returns
    None when its package is not installed, in which case the next one is
    attempted.

    Args:
        meta: Video metadata.
        cache_dir: Directory for intermediate files.
        no_audio: If True, skip transcription entirely.
        model_name: Whisper model override. If None, selected by mode.
        mode: Processing mode (quick/standard/deep).

    Returns list of TranscriptChunk with timestamps; empty when audio is
    absent, skipped, or no whisper backend is available.
    """
    if no_audio:
        logger.info("Audio transcription skipped (--no-audio)")
        return []

    # Audio must be demuxed to a 16 kHz mono WAV before transcription.
    wav = extract_audio(meta, cache_dir)
    if wav is None:
        return []

    chosen = model_name if model_name is not None else MODE_MODELS.get(mode, "large-v3")

    # Preferred backend first; a backend signals "not installed" with None.
    for backend in (_transcribe_faster_whisper, _transcribe_openai_whisper):
        chunks = backend(wav, chosen)
        if chunks is not None:
            return chunks

    logger.warning(
        "No whisper implementation found. Skipping transcription.\n"
        "Install with: pip install faster-whisper (recommended)\n"
        "          or: pip install openai-whisper"
    )
    return []
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _transcribe_faster_whisper(
|
|
114
|
+
audio_path: str, model_name: str,
|
|
115
|
+
) -> list[TranscriptChunk] | None:
|
|
116
|
+
"""Transcribe using faster-whisper (CTranslate2 backend)."""
|
|
117
|
+
try:
|
|
118
|
+
from faster_whisper import WhisperModel
|
|
119
|
+
except ImportError:
|
|
120
|
+
return None
|
|
121
|
+
|
|
122
|
+
logger.info("Transcribing with faster-whisper (model=%s)...", model_name)
|
|
123
|
+
|
|
124
|
+
model = WhisperModel(model_name, device="auto", compute_type="auto")
|
|
125
|
+
segments, info = model.transcribe(
|
|
126
|
+
audio_path,
|
|
127
|
+
word_timestamps=True,
|
|
128
|
+
vad_filter=True, # filter out non-speech
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
logger.info(
|
|
132
|
+
"Detected language: %s (probability: %.2f)",
|
|
133
|
+
info.language, info.language_probability,
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
chunks: list[TranscriptChunk] = []
|
|
137
|
+
for i, segment in enumerate(segments):
|
|
138
|
+
text = segment.text.strip()
|
|
139
|
+
if not text:
|
|
140
|
+
continue
|
|
141
|
+
|
|
142
|
+
start_ms = int(segment.start * 1000)
|
|
143
|
+
end_ms = int(segment.end * 1000)
|
|
144
|
+
|
|
145
|
+
chunks.append(TranscriptChunk(
|
|
146
|
+
chunk_id=f"tc_{i:04d}",
|
|
147
|
+
start_ms=start_ms,
|
|
148
|
+
end_ms=end_ms,
|
|
149
|
+
text=text,
|
|
150
|
+
))
|
|
151
|
+
|
|
152
|
+
logger.info("Transcribed %d segment(s), total %d chars",
|
|
153
|
+
len(chunks), sum(len(c.text) for c in chunks))
|
|
154
|
+
return chunks
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _transcribe_openai_whisper(
|
|
158
|
+
audio_path: str, model_name: str,
|
|
159
|
+
) -> list[TranscriptChunk] | None:
|
|
160
|
+
"""Transcribe using openai-whisper (fallback)."""
|
|
161
|
+
try:
|
|
162
|
+
import whisper
|
|
163
|
+
except ImportError:
|
|
164
|
+
return None
|
|
165
|
+
|
|
166
|
+
# openai-whisper doesn't support "large-v3" name, map it
|
|
167
|
+
ow_model = model_name
|
|
168
|
+
if model_name == "large-v3":
|
|
169
|
+
ow_model = "large"
|
|
170
|
+
|
|
171
|
+
logger.info("Transcribing with openai-whisper (model=%s)...", ow_model)
|
|
172
|
+
model = whisper.load_model(ow_model)
|
|
173
|
+
result = model.transcribe(audio_path, word_timestamps=True)
|
|
174
|
+
|
|
175
|
+
chunks: list[TranscriptChunk] = []
|
|
176
|
+
for i, segment in enumerate(result.get("segments", [])):
|
|
177
|
+
text = segment.get("text", "").strip()
|
|
178
|
+
if not text:
|
|
179
|
+
continue
|
|
180
|
+
|
|
181
|
+
start_ms = int(segment["start"] * 1000)
|
|
182
|
+
end_ms = int(segment["end"] * 1000)
|
|
183
|
+
|
|
184
|
+
chunks.append(TranscriptChunk(
|
|
185
|
+
chunk_id=f"tc_{i:04d}",
|
|
186
|
+
start_ms=start_ms,
|
|
187
|
+
end_ms=end_ms,
|
|
188
|
+
text=text,
|
|
189
|
+
))
|
|
190
|
+
|
|
191
|
+
logger.info("Transcribed %d segment(s), total %d chars",
|
|
192
|
+
len(chunks), sum(len(c.text) for c in chunks))
|
|
193
|
+
return chunks
|
package/vidclaude/cli.py
ADDED
|
@@ -0,0 +1,389 @@
|
|
|
1
|
+
"""CLI orchestration: argparse, pipeline execution, caching, batch processing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import logging
|
|
7
|
+
import shutil
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from .models import Evidence, save_json, load_json
|
|
12
|
+
from .util import (
|
|
13
|
+
setup_logging, check_ffmpeg, get_cache_dir, is_cached, find_videos,
|
|
14
|
+
)
|
|
15
|
+
from .ingest import ingest
|
|
16
|
+
from .segment import detect_shots, extract_frames
|
|
17
|
+
from .audio import transcribe
|
|
18
|
+
from .ocr import extract_ocr
|
|
19
|
+
from .intent import classify_intent
|
|
20
|
+
from .timeline import build_timeline
|
|
21
|
+
from .memory import build_summaries
|
|
22
|
+
from .reason import generate_evidence_md, call_claude_api
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger("vidclaude")
|
|
25
|
+
|
|
26
|
+
# Path to the SKILL.md bundled inside the installed package; install_skill()
# uses this as the primary copy source for Claude Code integration.
SKILL_MD_SOURCE = Path(__file__).parent / "SKILL.md"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the vidclaude command-line argument parser."""
    p = argparse.ArgumentParser(
        prog="vidclaude",
        description="Multimodal video understanding powered by Claude.",
        epilog=(
            "Quick start:\n"
            "  vidclaude video.mp4                    # extract + analyze\n"
            "  vidclaude video.mp4 --mode quick       # fast mode\n"
            "  vidclaude ./videos/                    # batch folder\n"
            "  vidclaude --install-skill              # set up Claude Code skill\n"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    p.add_argument("input", nargs="?", default=None,
                   help="Video file or folder of videos")
    p.add_argument("--install-skill", action="store_true",
                   help="Copy SKILL.md to current directory for Claude Code integration")
    p.add_argument("--extract", action="store_true",
                   help="Extract evidence only (for Claude Code skill mode). "
                        "Outputs cache path for Claude to read.")
    p.add_argument("-q", "--question", default=None,
                   help="Question to answer about the video")
    p.add_argument("-f", "--fps", type=float, default=None,
                   help="Frames per second to extract (overrides mode default)")
    p.add_argument("-m", "--max-frames", type=int, default=None,
                   help="Maximum number of frames to extract (overrides mode default)")
    p.add_argument("--no-audio", action="store_true",
                   help="Skip audio transcription")
    p.add_argument("--no-ocr", action="store_true",
                   help="Skip OCR extraction")
    p.add_argument("-o", "--output", default=None,
                   help="Write output to file instead of stdout")
    p.add_argument("--verbose", action="store_true",
                   help="Print detailed progress information")
    p.add_argument("--mode", choices=["quick", "standard", "deep"], default="standard",
                   help="Processing mode (default: standard)")
    p.add_argument("--batch-summary", action="store_true",
                   help="Generate a cross-video summary when processing a folder")
    p.add_argument("--no-cache", action="store_true",
                   help="Force re-extraction even if cache exists")
    p.add_argument("--api", action="store_true",
                   help="Use Anthropic API for reasoning (standalone mode, needs API key)")

    return p
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def install_skill() -> None:
    """Copy the bundled SKILL.md into the current working directory.

    Prompts before overwriting an existing SKILL.md; exits with status 1
    when no bundled copy can be located.
    """
    target = Path.cwd() / "SKILL.md"

    # Prefer the copy shipped inside the package; otherwise look one
    # level up at the package root.
    if SKILL_MD_SOURCE.exists():
        source = SKILL_MD_SOURCE
    else:
        source = Path(__file__).parent.parent / "SKILL.md"

    if not source.exists():
        print("Error: SKILL.md not found in package.", file=sys.stderr)
        sys.exit(1)

    if target.exists():
        print(f"SKILL.md already exists at {target}")
        if input("Overwrite? [y/N] ").strip().lower() != "y":
            print("Skipped.")
            return

    shutil.copy2(source, target)
    print(f"Installed SKILL.md to {target}")
    print()
    print("You're all set! In Claude Code, just say:")
    print('  "analyze the video at path/to/video.mp4"')
    print()
    print("Or from the command line:")
    print("  vidclaude video.mp4 --mode standard --verbose")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def process_video(
    video_path: str,
    args: argparse.Namespace,
) -> tuple[str, Evidence | None]:
    """Run the full extraction pipeline for one video.

    Each stage's output is persisted as JSON under the cache directory so
    later runs can be served from cache. Returns (cache_dir_path, evidence).
    """
    vmeta = ingest(video_path)
    cache_path = get_cache_dir(video_path)

    # Serve from cache unless the user forced re-extraction.
    if not args.no_cache and is_cached(cache_path):
        logger.info("Using cached extraction from %s", cache_path)
        return str(cache_path), _load_cached_evidence(cache_path, args.question)

    question_intent = classify_intent(args.question)
    logger.info("Intent: %s", question_intent)

    # Shot boundaries drive the adaptive frame sampling below.
    shot_list = detect_shots(vmeta)
    save_json([s.to_dict() for s in shot_list], cache_path / "shots.json")

    sampled_frames = extract_frames(
        vmeta, shot_list, args.mode, cache_path,
        fps_override=args.fps,
        max_frames_override=args.max_frames,
    )

    # Whisper model is picked by mode (quick=base, standard/deep=large-v3).
    chunks = transcribe(
        vmeta, cache_path,
        no_audio=args.no_audio,
        mode=args.mode,
    )
    save_json([t.to_dict() for t in chunks], cache_path / "transcript.json")

    ocr_hits = extract_ocr(sampled_frames, no_ocr=args.no_ocr, mode=args.mode)
    save_json([o.to_dict() for o in ocr_hits], cache_path / "ocr.json")

    events = build_timeline(shot_list, sampled_frames, chunks, ocr_hits)
    save_json([e.to_dict() for e in events], cache_path / "timeline.json")

    summary_data = build_summaries(events, vmeta.duration_sec, mode=args.mode)
    save_json(summary_data, cache_path / "summaries.json")

    save_json(vmeta.to_dict(), cache_path / "meta.json")

    bundle = Evidence(
        video_meta=vmeta,
        question=args.question or "",
        intent=question_intent,
        frames=sampled_frames,
        transcript_chunks=chunks,
        ocr_results=ocr_hits,
        timeline_events=events,
        scene_summaries=summary_data.get("scene_summaries", []),
        global_summary=summary_data.get("global_summary", ""),
        cache_dir=str(cache_path),
    )

    # evidence.md is always written so Claude Code skill mode can read it.
    generate_evidence_md(bundle, cache_path)

    return str(cache_path), bundle
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _load_cached_evidence(cache_dir: Path, question: str | None) -> Evidence:
    """Rebuild an Evidence object from a previous run's cached JSON files."""
    from .models import (
        VideoMeta, Frame, TranscriptChunk, OCRResult, TimelineEvent,
    )

    def _load_list(filename, factory):
        # Deserialize a cached JSON list through `factory`; missing file -> [].
        fpath = cache_dir / filename
        if not fpath.exists():
            return []
        return [factory(d) for d in load_json(fpath)]

    meta = VideoMeta.from_dict(load_json(cache_dir / "meta.json"))

    # Frame objects are reconstructed from the image filenames, which
    # encode sequence and timestamp as frame_NNNN_TTTTTTTT.jpg.
    frames = []
    frames_dir = cache_dir / "frames"
    if frames_dir.exists():
        for img_path in sorted(frames_dir.glob("frame_*.jpg")):
            pieces = img_path.stem.split("_")
            if len(pieces) < 3:
                continue
            seq = int(pieces[1])
            ts_ms = int(pieces[2])
            frames.append(Frame(
                frame_id=f"f_{seq:04d}",
                timestamp_ms=ts_ms,
                shot_id="",
                sampling_reason=["cached"],
                image_path=str(img_path),
            ))

    transcript = _load_list("transcript.json", TranscriptChunk.from_dict)
    ocr_results = _load_list("ocr.json", OCRResult.from_dict)
    timeline = _load_list("timeline.json", TimelineEvent.from_dict)

    summary_file = cache_dir / "summaries.json"
    summaries = load_json(summary_file) if summary_file.exists() else {}

    intent = classify_intent(question)

    return Evidence(
        video_meta=meta,
        question=question or "",
        intent=intent,
        frames=frames,
        transcript_chunks=transcript,
        ocr_results=ocr_results,
        timeline_events=timeline,
        scene_summaries=summaries.get("scene_summaries", []),
        global_summary=summaries.get("global_summary", ""),
        cache_dir=str(cache_dir),
    )
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _print_extraction_summary(
    results: list[tuple[str, str, Evidence | None]],
    include_ocr: bool,
) -> None:
    """Print per-video extraction stats to stdout.

    `include_ocr` controls whether the OCR-result count line is shown:
    it is shown in --extract (skill) mode and omitted in the default mode,
    matching the two previously-duplicated print blocks.
    """
    for video_path, cache_dir, evidence in results:
        print(f"Extracted: {video_path}")
        print(f"  Cache: {cache_dir}")
        if evidence:
            print(f"  Frames: {len(evidence.frames)}")
            print(f"  Transcript chunks: {len(evidence.transcript_chunks)}")
            if include_ocr:
                print(f"  OCR results: {len(evidence.ocr_results)}")
            print(f"  Timeline events: {len(evidence.timeline_events)}")
        print(f"  Evidence report: {cache_dir}/evidence.md")


def _append_batch_summary(
    output_text: str,
    results: list[tuple[str, str, Evidence | None]],
) -> str:
    """Append a per-video stats bullet list for multi-video runs."""
    output_text += "\n## Batch Summary\n\n"
    for video_path, _cache_dir, evidence in results:
        if evidence:
            output_text += f"- **{Path(video_path).name}**: "
            output_text += f"{len(evidence.frames)} frames, "
            output_text += f"{len(evidence.transcript_chunks)} transcript segments, "
            output_text += f"{len(evidence.timeline_events)} timeline events\n"
    return output_text


def main() -> None:
    """Main entry point for the CLI.

    Handles --install-skill, preflight checks, per-video pipeline runs
    (with per-video error isolation in batch mode), and output rendering.
    """
    parser = build_parser()
    args = parser.parse_args()

    # --install-skill is standalone: handle it before any other checks.
    if args.install_skill:
        install_skill()
        return

    # Every other operation needs a video file or folder.
    if args.input is None:
        parser.print_help()
        sys.exit(1)

    setup_logging(args.verbose)

    # Preflight: the whole pipeline shells out to ffmpeg/ffprobe.
    if not check_ffmpeg():
        print(
            "Error: ffmpeg not found on PATH.\n"
            "Install from https://ffmpeg.org or via your package manager.\n"
            "  Windows: winget install ffmpeg\n"
            "  macOS: brew install ffmpeg\n"
            "  Linux: sudo apt install ffmpeg",
            file=sys.stderr,
        )
        sys.exit(1)

    try:
        video_paths = find_videos(args.input)
    except (FileNotFoundError, ValueError) as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    logger.info("Found %d video(s) to process", len(video_paths))

    results: list[tuple[str, str, Evidence | None]] = []  # (path, cache_dir, evidence)

    for i, video_path in enumerate(video_paths):
        if len(video_paths) > 1:
            print(f"\n--- Processing [{i+1}/{len(video_paths)}]: {video_path} ---",
                  file=sys.stderr)

        try:
            cache_dir, evidence = process_video(video_path, args)
            results.append((video_path, cache_dir, evidence))
        except KeyboardInterrupt:
            print("\nInterrupted.", file=sys.stderr)
            sys.exit(130)
        except Exception as e:
            # Batch mode: one bad video must not abort the rest.
            logger.error("Failed to process %s: %s", video_path, e)
            if args.verbose:
                import traceback
                traceback.print_exc()
            continue

    if not results:
        print("No videos were successfully processed.", file=sys.stderr)
        sys.exit(1)

    output_text = ""

    if args.extract:
        # Skill mode: print cache paths and full stats (including OCR).
        _print_extraction_summary(results, include_ocr=True)
    elif args.api:
        # Standalone mode: ask Claude directly via the Anthropic API.
        for video_path, cache_dir, evidence in results:
            if evidence:
                try:
                    answer = call_claude_api(evidence)
                    output_text += f"## {video_path}\n\n{answer}\n\n"
                except Exception as e:
                    logger.error("API call failed for %s: %s", video_path, e)
                    output_text += f"## {video_path}\n\nError: {e}\n\n"
    else:
        # Default mirrors extract mode, minus the OCR line.
        _print_extraction_summary(results, include_ocr=False)

    if args.batch_summary and len(results) > 1:
        output_text = _append_batch_summary(output_text, results)

    # Write collected markdown (API answers / batch summary), if any.
    if output_text:
        if args.output:
            Path(args.output).write_text(output_text, encoding="utf-8")
            print(f"Output written to {args.output}", file=sys.stderr)
        else:
            print(output_text)
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Layer A: Video ingestion — validate format, extract metadata via ffprobe."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .models import VideoMeta
|
|
9
|
+
from .util import run_ffprobe, VIDEO_EXTENSIONS
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger("vidclaude")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _safe_float(value, default: float = 0.0) -> float:
    """Parse a float from ffprobe output; 'N/A' or missing -> default."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _safe_int(value, default: int = 0) -> int:
    """Parse an int from ffprobe output; 'N/A' or missing -> default."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def _parse_fps(rate: str) -> float:
    """Parse ffprobe's rational frame rate (e.g. '30000/1001') to float.

    Falls back to 30.0 when the string is malformed or the denominator
    is zero.
    """
    try:
        num, den = rate.split("/")
        return float(num) / float(den)
    except (ValueError, ZeroDivisionError):
        return 30.0


def ingest(video_path: str) -> VideoMeta:
    """Validate video file and extract metadata.

    Returns VideoMeta with duration, fps, resolution, audio track info.

    Raises:
        FileNotFoundError: if the path does not exist.
        ValueError: if the path is not a file, has an unsupported
            extension, or contains no video stream.
    """
    p = Path(video_path).resolve()

    if not p.exists():
        raise FileNotFoundError(f"Video not found: {video_path}")
    if not p.is_file():
        raise ValueError(f"Not a file: {video_path}")
    if p.suffix.lower() not in VIDEO_EXTENSIONS:
        raise ValueError(
            f"Unsupported format '{p.suffix}'. "
            f"Supported: {', '.join(sorted(VIDEO_EXTENSIONS))}"
        )

    probe = run_ffprobe(str(p))

    # Extract container-level info. ffprobe reports the literal string
    # "N/A" for fields it cannot determine, so parse defensively instead
    # of letting float()/int() raise ValueError.
    fmt = probe.get("format", {})
    duration = _safe_float(fmt.get("duration", 0))
    file_size = _safe_int(fmt.get("size", 0))
    format_name = fmt.get("format_name", "unknown")

    # Take the first video stream; count all audio streams.
    video_stream = None
    audio_count = 0
    for stream in probe.get("streams", []):
        if stream.get("codec_type") == "video" and video_stream is None:
            video_stream = stream
        elif stream.get("codec_type") == "audio":
            audio_count += 1

    if video_stream is None:
        raise ValueError(f"No video stream found in {video_path}")

    fps = _parse_fps(video_stream.get("r_frame_rate", "30/1"))

    width = _safe_int(video_stream.get("width", 0))
    height = _safe_int(video_stream.get("height", 0))

    meta = VideoMeta(
        path=str(p),
        duration_sec=duration,
        fps=fps,
        resolution=(width, height),
        audio_tracks=audio_count,
        format=format_name,
        file_size_bytes=file_size,
    )

    logger.info(
        "Ingested: %.1fs, %.1ffps, %dx%d, %d audio track(s), %s",
        meta.duration_sec, meta.fps,
        width, height,
        audio_count, format_name,
    )

    return meta
|