media-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/clip.py +79 -0
- cli/faces.py +91 -0
- cli/metadata.py +68 -0
- cli/motion.py +77 -0
- cli/objects.py +94 -0
- cli/ocr.py +93 -0
- cli/scenes.py +57 -0
- cli/telemetry.py +65 -0
- cli/transcript.py +76 -0
- media_engine/__init__.py +7 -0
- media_engine/_version.py +34 -0
- media_engine/app.py +80 -0
- media_engine/batch/__init__.py +56 -0
- media_engine/batch/models.py +99 -0
- media_engine/batch/processor.py +1131 -0
- media_engine/batch/queue.py +232 -0
- media_engine/batch/state.py +30 -0
- media_engine/batch/timing.py +321 -0
- media_engine/cli.py +17 -0
- media_engine/config.py +674 -0
- media_engine/extractors/__init__.py +75 -0
- media_engine/extractors/clip.py +401 -0
- media_engine/extractors/faces.py +459 -0
- media_engine/extractors/frame_buffer.py +351 -0
- media_engine/extractors/frames.py +402 -0
- media_engine/extractors/metadata/__init__.py +127 -0
- media_engine/extractors/metadata/apple.py +169 -0
- media_engine/extractors/metadata/arri.py +118 -0
- media_engine/extractors/metadata/avchd.py +208 -0
- media_engine/extractors/metadata/avchd_gps.py +270 -0
- media_engine/extractors/metadata/base.py +688 -0
- media_engine/extractors/metadata/blackmagic.py +139 -0
- media_engine/extractors/metadata/camera_360.py +276 -0
- media_engine/extractors/metadata/canon.py +290 -0
- media_engine/extractors/metadata/dji.py +371 -0
- media_engine/extractors/metadata/dv.py +121 -0
- media_engine/extractors/metadata/ffmpeg.py +76 -0
- media_engine/extractors/metadata/generic.py +119 -0
- media_engine/extractors/metadata/gopro.py +256 -0
- media_engine/extractors/metadata/red.py +305 -0
- media_engine/extractors/metadata/registry.py +114 -0
- media_engine/extractors/metadata/sony.py +442 -0
- media_engine/extractors/metadata/tesla.py +157 -0
- media_engine/extractors/motion.py +765 -0
- media_engine/extractors/objects.py +245 -0
- media_engine/extractors/objects_qwen.py +754 -0
- media_engine/extractors/ocr.py +268 -0
- media_engine/extractors/scenes.py +82 -0
- media_engine/extractors/shot_type.py +217 -0
- media_engine/extractors/telemetry.py +262 -0
- media_engine/extractors/transcribe.py +579 -0
- media_engine/extractors/translate.py +121 -0
- media_engine/extractors/vad.py +263 -0
- media_engine/main.py +68 -0
- media_engine/py.typed +0 -0
- media_engine/routers/__init__.py +15 -0
- media_engine/routers/batch.py +78 -0
- media_engine/routers/health.py +93 -0
- media_engine/routers/models.py +211 -0
- media_engine/routers/settings.py +87 -0
- media_engine/routers/utils.py +135 -0
- media_engine/schemas.py +581 -0
- media_engine/utils/__init__.py +5 -0
- media_engine/utils/logging.py +54 -0
- media_engine/utils/memory.py +49 -0
- media_engine-0.1.0.dist-info/METADATA +276 -0
- media_engine-0.1.0.dist-info/RECORD +70 -0
- media_engine-0.1.0.dist-info/WHEEL +4 -0
- media_engine-0.1.0.dist-info/entry_points.txt +11 -0
- media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
media_engine/schemas.py
ADDED
|
@@ -0,0 +1,581 @@
|
|
|
1
|
+
"""Pydantic schemas for request/response models."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from enum import StrEnum
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class MediaDeviceType(StrEnum):
    """Type of media capture device.

    Values are stable string identifiers used in API responses, so they
    must not be renamed.
    """

    DRONE = "drone"
    CAMERA = "camera"
    CINEMA_CAMERA = "cinema_camera"
    PHONE = "phone"
    ACTION_CAMERA = "action_camera"
    # Member name differs from value: "360_camera" is not a valid identifier.
    CAMERA_360 = "360_camera"
    DASHCAM = "dashcam"
    UNKNOWN = "unknown"


class DetectionMethod(StrEnum):
    """Method used for detection."""

    METADATA = "metadata"  # Parsed from metadata embedded in the file
    XML_SIDECAR = "xml_sidecar"  # Parsed from an XML sidecar file
    CLIP = "clip"  # Inferred by the CLIP model


class MediaType(StrEnum):
    """Type of media file."""

    VIDEO = "video"
    IMAGE = "image"
    AUDIO = "audio"
    UNKNOWN = "unknown"  # Extension not recognized; see get_media_type()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# File extension sets for media type detection (matches Rust MediaType::from_extension)
# Built from whitespace-separated literals to keep the lists compact and easy to diff.
VIDEO_EXTENSIONS: set[str] = set(
    ".mp4 .mov .mxf .avi .mkv .m4v .webm .mts .m2ts .ts .vob .mpg .mpeg .wmv .flv "
    # RAW video formats
    ".braw .r3d .ari".split()
)

IMAGE_EXTENSIONS: set[str] = set(
    ".jpg .jpeg .png .gif .webp .heic .heif .tiff .tif .bmp "
    # RAW image formats
    ".arw .cr2 .cr3 .nef .dng .raf .orf .rw2 .pef .srw .x3f".split()
)

AUDIO_EXTENSIONS: set[str] = set(
    ".wav .mp3 .aac .m4a .flac .ogg .aiff .wma .opus .ape .wv".split()
)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def get_media_type(file_path: str) -> MediaType:
    """Determine media type from file extension.

    Comparison is case-insensitive; unrecognized extensions map to
    MediaType.UNKNOWN.
    """
    from pathlib import Path

    suffix = Path(file_path).suffix.lower()
    for extensions, media_type in (
        (VIDEO_EXTENSIONS, MediaType.VIDEO),
        (IMAGE_EXTENSIONS, MediaType.IMAGE),
        (AUDIO_EXTENSIONS, MediaType.AUDIO),
    ):
        if suffix in extensions:
            return media_type
    return MediaType.UNKNOWN
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# === Request Models ===
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# === Response Models ===
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class Resolution(BaseModel):
    """Video resolution."""

    width: int  # Frame width in pixels
    height: int  # Frame height in pixels


class VideoCodec(BaseModel):
    """Video codec details."""

    name: str  # h264, hevc, prores, etc.
    profile: str | None = None  # Main 10, High, etc.
    bit_depth: int | None = None  # 8, 10, 12
    pixel_format: str | None = None  # yuv420p, yuv420p10le, etc.


class AudioInfo(BaseModel):
    """Audio stream information."""

    codec: str | None = None  # pcm_s16be, aac, etc.
    sample_rate: int | None = None  # 48000, 44100, etc. (Hz)
    channels: int | None = None  # 1, 2, 6, etc.
    bit_depth: int | None = None  # 16, 24, 32
    bitrate: int | None = None  # Audio bitrate in bps


class Codec(BaseModel):
    """Video/audio codec info (simplified for backwards compat).

    Prefer Metadata.video_codec / Metadata.audio for detailed info.
    """

    video: str | None = None  # Video codec name (e.g., "h264")
    audio: str | None = None  # Audio codec name (e.g., "aac")
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class GPS(BaseModel):
    """GPS coordinates (single point)."""

    latitude: float  # Presumably decimal degrees — confirm against extractors
    longitude: float  # Presumably decimal degrees — confirm against extractors
    altitude: float | None = None  # Meters, when available


class GPSTrackPoint(BaseModel):
    """Single point in a GPS track."""

    latitude: float
    longitude: float
    altitude: float | None = None
    timestamp: float | None = None  # Video timestamp in seconds
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
class GPSTrack(BaseModel):
    """GPS track extracted from video."""

    points: list[GPSTrackPoint]
    source: str  # Source of track data (e.g., "avchd_sei", "srt_sidecar")

    @property
    def count(self) -> int:
        """Number of points in track."""
        return len(self.points)

    @property
    def bounds(self) -> dict[str, float] | None:
        """Bounding box of track (min/max lat/lon).

        Returns None when the track has no points.
        """
        if not self.points:
            return None
        latitudes = [point.latitude for point in self.points]
        longitudes = [point.longitude for point in self.points]
        return {
            "min_lat": min(latitudes),
            "max_lat": max(latitudes),
            "min_lon": min(longitudes),
            "max_lon": max(longitudes),
        }
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
class ColorSpace(BaseModel):
    """Color space information for LOG/HDR footage."""

    transfer: str | None = None  # Gamma/transfer function (e.g., "slog3", "bt709", "hlg")
    primaries: str | None = None  # Color primaries (e.g., "sgamut3", "bt709", "bt2020")
    matrix: str | None = None  # Color matrix (e.g., "bt709", "bt2020nc")
    lut_file: str | None = None  # Reference to LUT file for conversion
    detection_method: DetectionMethod = DetectionMethod.METADATA  # How this info was obtained


class Stereo3DMode(StrEnum):
    """3D video format/layout."""

    MVC = "mvc"  # H.264 Multiview Video Coding (3D Blu-ray, consumer 3D camcorders)
    SIDE_BY_SIDE = "side_by_side"  # Left/right frames side by side (half width each)
    SIDE_BY_SIDE_FULL = "side_by_side_full"  # Full width SBS (doubled width)
    TOP_BOTTOM = "top_bottom"  # Left/right frames stacked (half height each)
    TOP_BOTTOM_FULL = "top_bottom_full"  # Full height TAB (doubled height)
    FRAME_SEQUENTIAL = "frame_sequential"  # Alternating L/R frames
    DUAL_STREAM = "dual_stream"  # Separate files for each eye
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
class Stereo3D(BaseModel):
    """Stereoscopic 3D video information."""

    mode: Stereo3DMode  # Frame layout / encoding of the two views
    eye_count: int = 2  # Number of views (usually 2)
    has_left_eye: bool = True
    has_right_eye: bool = True
    detection_method: DetectionMethod = DetectionMethod.METADATA


class LensInfo(BaseModel):
    """Lens and camera settings."""

    model: str | None = None  # Lens model name (e.g., "XT14X5.8")
    focal_length: float | None = None  # Focal length in mm
    focal_length_35mm: float | None = None  # 35mm equivalent focal length
    aperture: float | None = None  # f-number (e.g., 2.8)
    focus_distance: float | None = None  # Focus distance in meters
    iris: str | None = None  # Iris setting as string (e.g., "F2.8")
    detection_method: DetectionMethod = DetectionMethod.METADATA


class DeviceInfo(BaseModel):
    """Source device information."""

    make: str | None = None  # Manufacturer (e.g., "Sony")
    model: str | None = None  # Device model name
    serial_number: str | None = None
    software: str | None = None  # Firmware/software version string
    type: MediaDeviceType | None = None  # Device category, when classifiable
    detection_method: DetectionMethod = DetectionMethod.METADATA
    confidence: float = 1.0  # Confidence of the device classification
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
class ShotType(BaseModel):
    """Shot type classification."""

    primary: str  # aerial, interview, b-roll, studio, etc.
    confidence: float
    # NOTE(review): plain str here while sibling models use the
    # DetectionMethod enum — consider aligning (schema-compatible values).
    detection_method: str = "clip"


class KeyframeInfo(BaseModel):
    """Keyframe (I-frame) information from video stream.

    Useful for detecting scene cuts: irregular keyframe intervals
    often indicate actual cuts, while fixed intervals (e.g., every 2s)
    indicate standard GOP compression.
    """

    timestamps: list[float]  # Keyframe timestamps in seconds
    count: int  # Number of keyframes
    is_fixed_interval: bool  # True if keyframes are at regular intervals (GOP)
    avg_interval: float | None = None  # Average interval between keyframes

class SpannedRecording(BaseModel):
    """Information about spanned recordings (e.g., AVCHD files split at 2GB).

    When a camera splits a long recording across multiple files, this tracks
    which files belong together and the total recording duration.
    """

    is_continuation: bool  # True if this file is NOT the first of the recording
    sibling_files: list[str]  # Other files in this recording (filenames only)
    total_duration: float  # Total duration of the complete recording in seconds
    file_index: int  # Position of this file in the recording (0-based)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
class Metadata(BaseModel):
    """Video metadata.

    Aggregates technical stream info plus optional device, GPS, color and
    3D details; most sub-structures are None when not detectable.
    """

    duration: float  # Duration in seconds
    resolution: Resolution
    codec: Codec  # Simplified codec info for backwards compat
    video_codec: VideoCodec | None = None  # Detailed video codec info
    audio: AudioInfo | None = None  # Audio stream info
    fps: float | None = None  # Frames per second
    bitrate: int | None = None  # Total bitrate in bps
    file_size: int  # File size in bytes
    timecode: str | None = None  # Start timecode (e.g., "01:15:07:17")
    created_at: datetime | None = None  # Recording creation time, if known
    device: DeviceInfo | None = None
    gps: GPS | None = None
    gps_track: GPSTrack | None = None  # Full GPS track if available
    color_space: ColorSpace | None = None
    lens: LensInfo | None = None
    shot_type: ShotType | None = None
    keyframes: KeyframeInfo | None = None
    spanned_recording: SpannedRecording | None = None  # For split recordings (AVCHD)
    stereo_3d: Stereo3D | None = None  # Stereoscopic 3D video info
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
class TranscriptSegment(BaseModel):
    """Single transcript segment."""

    start: float  # Segment start time in seconds
    end: float  # Segment end time in seconds
    text: str
    speaker: str | None = None  # Speaker ID from diarization (e.g., "SPEAKER_00")


class TranscriptHints(BaseModel):
    """Language hints used during transcription."""

    language_hints: list[str] = Field(default_factory=list)  # Candidate language codes
    context_hint: str | None = None  # Free-text context supplied to the transcriber
    fallback_applied: bool = False  # True if the configured fallback language was used


class Transcript(BaseModel):
    """Full transcript result."""

    language: str  # Detected (or fallback) language code
    confidence: float  # Language-detection / transcription confidence
    duration: float  # Audio duration in seconds
    speaker_count: int | None = None  # Number of speakers detected (None if diarization disabled)
    hints_used: TranscriptHints
    segments: list[TranscriptSegment]
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
class BoundingBox(BaseModel):
    """Bounding box for detected objects.

    Pixel coordinates; presumably origin at the frame's top-left —
    confirm against the extractors that produce these.
    """

    x: int
    y: int
    width: int
    height: int


class FaceDetection(BaseModel):
    """Single face detection."""

    timestamp: float  # Video timestamp in seconds
    bbox: BoundingBox
    confidence: float
    embedding: list[float]  # Face embedding vector for identity clustering
    image_base64: str | None = None  # Base64-encoded JPEG of cropped face
    needs_review: bool = False  # Flag for uncertain detections
    review_reason: str | None = None  # Why review is needed


class FacesResult(BaseModel):
    """Face detection results."""

    count: int  # Total number of detections
    unique_estimate: int  # Estimated number of distinct people
    detections: list[FaceDetection]
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
class SceneDetection(BaseModel):
    """Single scene segment."""

    index: int  # 0-based scene index
    start: float  # Scene start in seconds
    end: float  # Scene end in seconds
    duration: float  # Scene length in seconds


class ScenesResult(BaseModel):
    """Scene detection results."""

    count: int  # Number of scenes
    detections: list[SceneDetection]


class ObjectDetection(BaseModel):
    """Single object detection."""

    timestamp: float  # Video timestamp in seconds
    label: str  # Detected class label
    confidence: float
    bbox: BoundingBox


class ObjectsResult(BaseModel):
    """Object detection results."""

    summary: dict[str, int]  # Label -> occurrence count
    detections: list[ObjectDetection]
    descriptions: list[str] | None = None  # Scene descriptions from VLM
    error: str | None = None  # Error code if extraction failed (e.g., "out_of_memory")
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
class ClipSegment(BaseModel):
    """CLIP embedding for a segment."""

    start: float  # Segment start in seconds
    end: float  # Segment end in seconds
    scene_index: int | None = None  # Index into ScenesResult, when segmented by scene
    embedding: list[float]  # CLIP image embedding vector


class ClipResult(BaseModel):
    """CLIP embedding results."""

    model: str  # Name of the CLIP model used
    segments: list[ClipSegment]


class OcrDetection(BaseModel):
    """Single OCR detection."""

    timestamp: float  # Video timestamp in seconds
    text: str  # Recognized text
    confidence: float
    bbox: BoundingBox


class OcrResult(BaseModel):
    """OCR results."""

    detections: list[OcrDetection]
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
class MotionSegment(BaseModel):
    """A segment of video with consistent camera motion."""

    start: float  # Segment start in seconds
    end: float  # Segment end in seconds
    motion_type: str  # static, pan_left, pan_right, tilt_up, tilt_down, zoom_in, zoom_out, handheld
    intensity: float  # Average flow magnitude


class MotionResult(BaseModel):
    """Camera motion analysis results."""

    duration: float  # Analyzed duration in seconds
    fps: float  # Frame rate used for analysis
    primary_motion: str  # Most common motion type
    segments: list[MotionSegment]
    avg_intensity: float  # Mean flow magnitude over the whole video
    is_stable: bool  # True if mostly static/tripod
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
class TelemetryPoint(BaseModel):
    """Single telemetry point from drone/camera."""

    timestamp: float  # Seconds from start of video
    recorded_at: datetime | None = None  # Actual datetime from telemetry
    latitude: float
    longitude: float
    altitude: float | None = None  # Absolute altitude in meters
    relative_altitude: float | None = None  # Altitude above takeoff
    # Camera settings
    iso: int | None = None
    shutter: float | None = None  # Shutter speed as fraction (1/100 = 0.01)
    aperture: float | None = None  # f-number
    focal_length: float | None = None  # Focal length in mm — TODO confirm unit
    color_mode: str | None = None  # d_log, d_cinelike, etc.
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
class TelemetryResult(BaseModel):
    """Telemetry/flight path results."""

    source: str  # "dji_srt", "gopro", etc.
    sample_rate: float  # Points per second
    duration: float  # Total duration in seconds
    points: list[TelemetryPoint]

    def to_gpx(self) -> str:
        """Export telemetry as a GPX 1.1 track.

        Returns:
            A GPX XML document with a single <trk>/<trkseg> containing one
            <trkpt> per telemetry point. <ele> is emitted whenever altitude
            is present and <time> whenever recorded_at is set.
        """
        lines = [
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<gpx version="1.1" creator="Polybos Media Engine">',
            "  <trk>",
            "    <name>Flight Path</name>",
            "    <trkseg>",
        ]
        for pt in self.points:
            # Bug fix: compare against None rather than truthiness so a valid
            # altitude of exactly 0.0 (sea level) is not silently dropped.
            ele = f"<ele>{pt.altitude}</ele>" if pt.altitude is not None else ""
            time = ""
            if pt.recorded_at:
                # Millisecond precision (truncate microseconds to 3 digits).
                # NOTE(review): "Z" is appended unconditionally — assumes
                # recorded_at is naive UTC; confirm with the telemetry parsers.
                time = f"<time>{pt.recorded_at.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3]}Z</time>"
            lines.append(f'      <trkpt lat="{pt.latitude}" lon="{pt.longitude}">{ele}{time}</trkpt>')
        lines.extend(["    </trkseg>", "  </trk>", "</gpx>"])
        return "\n".join(lines)
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
class HealthResponse(BaseModel):
    """Response from /health endpoint."""

    status: str  # Service status string
    version: str  # Package version
    api_version: str  # API contract version


class SettingsResponse(BaseModel):
    """Response from GET /settings endpoint.

    All settings are returned, with sensitive values (hf_token) masked.
    """

    # API settings
    api_version: str
    log_level: str

    # Whisper settings
    whisper_model: str
    fallback_language: str

    # Speaker diarization
    hf_token_set: bool  # True if token is configured (actual value is masked)
    diarization_model: str

    # Processing settings
    face_sample_fps: float  # Face-detection sampling rate (frames/second)
    object_sample_fps: float  # Object-detection sampling rate (frames/second)
    min_face_size: int  # Minimum face size in pixels — TODO confirm unit

    # Object detection
    object_detector: str  # Which backend: e.g. "qwen" or "yolo" — confirm valid values
    qwen_model: str
    qwen_frames_per_scene: int
    yolo_model: str

    # CLIP
    clip_model: str

    # OCR
    ocr_languages: list[str]  # Language codes passed to the OCR engine

    # Temp directory
    temp_dir: str
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
class SettingsUpdate(BaseModel):
    """Request body for PUT /settings endpoint.

    All fields are optional - only provided fields are updated.
    Field names mirror SettingsResponse (except hf_token, which is
    write-only here and only reported as hf_token_set there).
    """

    # API settings
    log_level: str | None = None

    # Whisper settings
    whisper_model: str | None = None
    fallback_language: str | None = None

    # Speaker diarization
    hf_token: str | None = None  # Set to empty string to clear
    diarization_model: str | None = None

    # Processing settings
    face_sample_fps: float | None = None
    object_sample_fps: float | None = None
    min_face_size: int | None = None

    # Object detection
    object_detector: str | None = None
    qwen_model: str | None = None
    qwen_frames_per_scene: int | None = None
    yolo_model: str | None = None

    # CLIP
    clip_model: str | None = None

    # OCR
    ocr_languages: list[str] | None = None

    # Temp directory
    temp_dir: str | None = None
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Non-blocking queue-based logging setup."""
|
|
2
|
+
|
|
3
|
+
import atexit
import logging
import logging.handlers
import os
import queue
import sys
import tempfile
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def setup_logging() -> logging.handlers.QueueListener:
    """Configure non-blocking logging using a queue.

    All records go through a QueueHandler installed on the root logger;
    a QueueListener thread performs the actual (blocking) I/O to a file
    handler and, when running interactively, also to stderr.

    Returns the QueueListener so it can be stopped on shutdown (it is
    additionally registered with atexit as a safety net).
    """

    def _is_tty(stream: object) -> bool:
        # sys.stdout / sys.stderr can be None (e.g. pythonw, embedded
        # interpreters); calling .isatty() unguarded would raise.
        isatty = getattr(stream, "isatty", None)
        return bool(isatty and isatty())

    # When running under a parent process that doesn't read our stdout/stderr,
    # writes block when the pipe buffer fills up. Redirect to /dev/null to prevent this.
    is_interactive = _is_tty(sys.stdout) and _is_tty(sys.stderr)
    if not is_interactive:
        # Redirect stdout/stderr to /dev/null to prevent blocking writes.
        # Deliberately never closed: it must live for the whole process.
        devnull = open(os.devnull, "w")
        sys.stdout = devnull
        sys.stderr = devnull

    # Configure non-blocking logging using a queue
    log_queue: queue.Queue[logging.LogRecord] = queue.Queue(-1)  # Unlimited size
    queue_handler = logging.handlers.QueueHandler(log_queue)

    # Always log to file (this is the only output when running non-interactively).
    # Use the platform temp directory instead of a hard-coded "/tmp" so this
    # also works on Windows.
    log_path = os.path.join(tempfile.gettempdir(), "media_engine.log")
    file_handler = logging.FileHandler(log_path)
    log_formatter = logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")
    file_handler.setFormatter(log_formatter)

    # Build handler list - only include stderr if running interactively
    handlers: list[logging.Handler] = [file_handler]
    if is_interactive:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(log_formatter)
        handlers.append(stream_handler)

    # QueueListener handles the actual I/O in a separate thread
    queue_listener = logging.handlers.QueueListener(log_queue, *handlers, respect_handler_level=True)
    queue_listener.start()

    # Register cleanup on exit
    atexit.register(queue_listener.stop)

    # Configure root logger to use queue handler (non-blocking).
    # NOTE(review): basicConfig is a no-op if the root logger already has
    # handlers; this is expected to run before any other logging setup.
    logging.basicConfig(
        level=logging.INFO,
        handlers=[queue_handler],
    )

    return queue_listener
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Memory management utilities."""
|
|
2
|
+
|
|
3
|
+
import gc
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def clear_memory() -> None:
    """Force garbage collection and clear GPU/MPS caches.

    Call before loading heavy AI models to free up memory. Safe to call
    when torch/mlx are not installed.
    """
    # Several collection passes help break up circular-reference chains.
    for _pass in range(3):
        gc.collect()

    try:
        import torch
    except ImportError:
        pass
    else:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        mps = getattr(torch, "mps", None)
        if mps is not None:
            if hasattr(mps, "empty_cache"):
                mps.empty_cache()
            if hasattr(mps, "synchronize"):
                mps.synchronize()

    # Also try mlx cleanup
    try:
        import mlx.core as mx

        mx.metal.clear_cache()
    except (ImportError, AttributeError):
        pass

    # Final gc pass after GPU cleanup
    gc.collect()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_memory_mb() -> int:
    """Get current process memory usage (RSS) in MB.

    Returns 0 when psutil is not installed.
    """
    try:
        import psutil  # type: ignore[import-not-found]
    except ImportError:
        return 0
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes // (1024 * 1024)
|