npm - @exulu/backend - Versions diffs - 1.60.0 → 1.61.1 - Mend

@exulu/backend 1.60.0 → 1.61.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

package/bin/backend.cjs +60 -0
package/dist/{catalog-EOKGOHTY.js → catalog-BWE6SLE2.js} +1 -1
package/dist/chunk-IDHS2BZO.js +210 -0
package/dist/{chunk-YS27XOXI.js → chunk-ILAHW4UT.js} +5 -1
package/dist/{chunk-23YNGK3V.js → chunk-MPV7HBV6.js} +63 -2
package/dist/cli/start-whisper.cjs +240 -0
package/dist/cli/start-whisper.d.cts +1 -0
package/dist/cli/start-whisper.d.ts +1 -0
package/dist/cli/start-whisper.js +204 -0
package/dist/{convert-exulu-tools-to-ai-sdk-tools-PLLM2CJL.js → convert-exulu-tools-to-ai-sdk-tools-CULC37U6.js} +1 -1
package/dist/index.cjs +1827 -346
package/dist/index.d.cts +2 -1
package/dist/index.d.ts +2 -1
package/dist/index.js +1447 -249
package/ee/python/requirements.txt +18 -0
package/ee/python/setup.sh +44 -0
package/ee/python/transcription/__init__.py +0 -0
package/ee/python/transcription/pipeline.py +232 -0
package/ee/python/transcription/server.py +151 -0
package/ee/python/transcription/tests/__init__.py +0 -0
package/ee/python/transcription/tests/test_server.py +111 -0
package/ee/python/transcription/worker.py +135 -0
package/package.json +5 -2

package/ee/python/requirements.txt CHANGED Viewed

@@ -3,6 +3,24 @@ transformers
 pyinstaller
 docling-hierarchical-pdf
 defusedxml
+# Whisper transcription server. Used by `npx @exulu/backend exulu-start-whisper`.
+#
+# Notes on the pins:
+# - whisperx >= 3.4: earlier versions transitively pin faster-whisper 1.0.0 →
+#   tokenizers <0.16, which collides with litellm's tokenizers >=0.21.
+#   whisperx 3.4+ uses faster-whisper 1.1+ on the modern tokenizers track.
+# - torch / torchaudio pinned because pyannote.audio (≤3.4) still imports
+#   `torchaudio.AudioMetaData`, which was removed in newer torchaudio. The
+#   2.5 line is the most recent series where the old API is still present.
+torch==2.5.1
+torchaudio==2.5.1
+torchvision==0.20.1
+whisperx>=3.4.0
+pyannote.audio>=3.3.0
+fastapi
+uvicorn
+python-multipart
+requests
 # LiteLLM proxy — only used when EXULU_USE_LITELLM=true. Always installed so
 # the dep is ready when the env var is flipped. Pinned to a tested version;
 # upgrade deliberately.

package/ee/python/setup.sh CHANGED Viewed

@@ -199,6 +199,43 @@ fi
 print_info "Installing packages from requirements.txt..."
 echo ""
+# Conditional torch wheel for the whisper transcription server.
+# WhisperX depends on torch transitively; by default pip would install the
+# CPU build, which works everywhere but is slow. On CUDA hosts we want the
+# CUDA build instead. Selection rule:
+#   - WHISPER_GPU=cuda explicit OR `nvidia-smi` present → install CUDA wheel
+#   - WHISPER_GPU=cpu/mps/skip OR no GPU detected         → fall through to default
+#
+# The versions installed here must equal the pins in requirements.txt
+# (torch==2.5.1, torchaudio==2.5.1, torchvision==0.20.1). pip treats a
+# local-version build such as 2.5.1+cu124 as satisfying `==2.5.1`, so the
+# `pip install -r` below keeps the CUDA wheels. With any other version it
+# would uninstall them again and fetch the default build instead.
+WHISPER_TORCH_VERSION="2.5.1"
+WHISPER_TORCHAUDIO_VERSION="2.5.1"
+WHISPER_TORCHVISION_VERSION="0.20.1"
+WHISPER_GPU_MODE="${WHISPER_GPU:-auto}"
+if [ "$WHISPER_GPU_MODE" = "auto" ]; then
+    # Auto-detect: an NVIDIA driver on PATH wins, then Apple Silicon, else CPU.
+    if command -v nvidia-smi &> /dev/null; then
+        WHISPER_GPU_MODE="cuda"
+    elif [ "$(uname -s)" = "Darwin" ] && [ "$(uname -m)" = "arm64" ]; then
+        WHISPER_GPU_MODE="mps"
+    else
+        WHISPER_GPU_MODE="cpu"
+    fi
+fi
+case "$WHISPER_GPU_MODE" in
+    cuda)
+        print_info "Installing CUDA torch wheel (WHISPER_GPU=$WHISPER_GPU_MODE)…"
+        # Non-fatal: on failure the requirements.txt install below still
+        # provides a working (default) torch build.
+        pip install \
+            "torch==${WHISPER_TORCH_VERSION}" \
+            "torchaudio==${WHISPER_TORCHAUDIO_VERSION}" \
+            "torchvision==${WHISPER_TORCHVISION_VERSION}" \
+            --index-url https://download.pytorch.org/whl/cu124 || {
+            print_warning "CUDA torch install failed; falling back to default torch."
+        }
+        ;;
+    mps|cpu)
+        print_info "Installing default torch wheel (WHISPER_GPU=$WHISPER_GPU_MODE)…"
+        ;;
+    skip)
+        # NOTE(review): requirements.txt pins torch, so the `pip install -r`
+        # below presumably still installs it — confirm this message is accurate.
+        print_warning "WHISPER_GPU=skip — torch install skipped; transcription server will not work."
+        ;;
+    *)
+        print_warning "Unknown WHISPER_GPU=$WHISPER_GPU_MODE — falling back to default torch."
+        ;;
+esac
 pip install -r "$REQUIREMENTS_FILE"
 print_success "All dependencies installed successfully"
@@ -225,6 +262,13 @@ print_info "Testing critical imports..."
 $PYTHON_CMD -c "import docling" 2>/dev/null && print_success "docling imported successfully" || print_error "Failed to import docling"
 $PYTHON_CMD -c "import transformers" 2>/dev/null && print_success "transformers imported successfully" || print_error "Failed to import transformers"
+# Whisper transcription server imports — non-fatal: only needed for
+# `npx @exulu/backend exulu-start-whisper`. If these fail, the rest of
+# the @exulu/backend package still works fine.
+# Each probe discards the import traceback (2>/dev/null) and reports a
+# warning rather than an error, unlike the docling/transformers checks above.
+$PYTHON_CMD -c "import whisperx" 2>/dev/null && print_success "whisperx imported successfully" || print_warning "whisperx not importable (transcription server will not start)"
+$PYTHON_CMD -c "import pyannote.audio" 2>/dev/null && print_success "pyannote.audio imported successfully" || print_warning "pyannote.audio not importable (diarization will be disabled even with HF_AUTH_TOKEN)"
+$PYTHON_CMD -c "import fastapi, uvicorn" 2>/dev/null && print_success "fastapi/uvicorn imported successfully" || print_warning "fastapi/uvicorn not importable (transcription server will not start)"
 # Step 8: Display summary
 echo ""
 echo -e "${GREEN}========================================${NC}"

package/ee/python/transcription/__init__.py ADDED Viewed

File without changes

package/ee/python/transcription/pipeline.py ADDED Viewed

@@ -0,0 +1,232 @@
+"""
+WhisperX + pyannote transcription pipeline.
+Loads models once at startup; one transcribe() call per audio file.
+Adapted from audio-transcription/src/transcription.py, but reshaped so the
+pipeline can serve multiple jobs from a long-running FastAPI process.
+"""
+import os
+import time
+from dataclasses import dataclass, field
+from typing import Callable, Optional
+import pandas as pd
+import torch
+import whisperx
+from whisperx.audio import SAMPLE_RATE
+@dataclass
+class TranscriptionOptions:
+    """Per-job settings consumed by TranscriptionPipeline.transcribe()."""
+    # Forwarded as `language=` to the whisper model's transcribe call.
+    language: Optional[str] = None  # None = auto-detect
+    # Forwarded as `num_speakers=` to the diarization model when not None.
+    num_speakers: Optional[int] = None  # None = auto-detect
+    # Joined with single spaces and applied as the whisper decoding prefix.
+    hotwords: list[str] = field(default_factory=list)
+@dataclass
+class TranscriptionResult:
+    """Outcome of one TranscriptionPipeline.transcribe() call."""
+    # Each dict has the keys "start", "end", "text" and "speaker".
+    segments: list[dict]
+    # Language reported by the whisper transcription result.
+    language: str
+    # Number of loaded audio samples divided by whisperx's SAMPLE_RATE.
+    duration_seconds: float
+class CancelledError(Exception):
+    """Raised by TranscriptionPipeline.transcribe() when `is_cancelled()` returns True.
+
+    This is a module-local exception type derived from Exception.
+    """
+    pass
+def detect_device(requested: str = "auto") -> str:
+    """Resolve the torch device name to run on.
+
+    Any value other than "auto" is returned unchanged. For "auto" the
+    preference order is CUDA, then Apple MPS, then CPU.
+    """
+    if requested == "auto":
+        if torch.cuda.is_available():
+            return "cuda"
+        # Older torch builds have no `torch.backends.mps` attribute at all.
+        mps_backend = getattr(torch.backends, "mps", None)
+        if mps_backend is not None and mps_backend.is_available():
+            return "mps"
+        return "cpu"
+    return requested
+def describe_gpu(device: str) -> dict:
+    """Describe the accelerator behind *device* for logs and /healthz.
+
+    Returns a dict with the keys "available", "kind", "name" and "vram_gb".
+    Any device other than "cuda" or "mps" is reported as CPU.
+    """
+    if device == "mps":
+        return {"available": True, "kind": "mps", "name": "Apple Silicon MPS", "vram_gb": None}
+    if device != "cuda":
+        return {"available": False, "kind": "cpu", "name": None, "vram_gb": None}
+    try:
+        gpu_name = torch.cuda.get_device_name(0)
+        total_bytes = torch.cuda.get_device_properties(0).total_memory
+        vram_gb = round(total_bytes / (1024 ** 3), 1)
+    except Exception:
+        # CUDA was requested but could not be queried: still report it, without details.
+        return {"available": True, "kind": "cuda", "name": "unknown", "vram_gb": None}
+    return {"available": True, "kind": "cuda", "name": gpu_name, "vram_gb": vram_gb}
+class TranscriptionPipeline:
+    """
+    Loads whisper + pyannote once; serves multiple transcribe() calls.
+    Not thread-safe — designed for the single-consumer worker.
+
+    After load(), `diarization_enabled` tells whether speaker labels are
+    produced; when it is False, `diarization_disabled_reason` says why.
+    """
+    def __init__(self, model_name: str, device: str, batch_size: int):
+        self.model_name = model_name
+        self.device = device
+        self.batch_size = batch_size
+        self.model = None
+        self.diarize_model = None
+        self.diarization_enabled = False
+        self.diarization_disabled_reason: str = "not attempted"
+        # Alignment models are loaded lazily and cached per language code.
+        self.align_models: dict[str, tuple] = {}
+    @property
+    def _whisperx_device(self) -> str:
+        """Device for the whisper and alignment models ("cpu" when device is "mps")."""
+        # whisperx doesn't ship MPS support; run whisper on CPU when DEVICE=mps
+        # (faster than torch CPU fallback because whisperx uses CTranslate2 int8).
+        return "cpu" if self.device == "mps" else self.device
+    @staticmethod
+    def _with_prefix(asr_options, prefix):
+        """Return a copy of *asr_options* whose `prefix` field is *prefix*.
+
+        Handles both option shapes: a NamedTuple (copied via `_replace`) and
+        a dataclass (copied via `dataclasses.replace`), which newer
+        faster-whisper releases appear to use.
+        """
+        import dataclasses
+        if dataclasses.is_dataclass(asr_options):
+            return dataclasses.replace(asr_options, prefix=prefix)
+        return asr_options._replace(prefix=prefix)
+    def load(self) -> None:
+        """Load the whisper model and, when possible, the pyannote diarizer.
+
+        Diarization failures never raise: they leave `diarization_enabled`
+        False and record the cause in `diarization_disabled_reason`.
+        """
+        whisper_device = self._whisperx_device
+        compute_type = (
+            "float16" if self.device == "cuda" else "int8"
+        )
+        print(f"[pipeline] Loading whisper '{self.model_name}' on {whisper_device} (compute_type={compute_type})", flush=True)
+        self.model = whisperx.load_model(
+            self.model_name,
+            device=whisper_device,
+            compute_type=compute_type,
+        )
+        hf_token = os.getenv("HF_AUTH_TOKEN")
+        if not hf_token:
+            self.diarization_disabled_reason = "HF_AUTH_TOKEN not set"
+            print(f"[pipeline] {self.diarization_disabled_reason}; diarization disabled", flush=True)
+            return
+        try:
+            from pyannote.audio import Pipeline as PyannotePipeline
+            self.diarize_model = PyannotePipeline.from_pretrained(
+                "pyannote/speaker-diarization-3.1",
+                use_auth_token=hf_token,
+            )
+            if self.diarize_model is None:
+                # pyannote.from_pretrained returns None (rather than raising)
+                # when the user has the token but hasn't accepted the gated
+                # repo's terms of use. Surface that distinctly.
+                self.diarization_disabled_reason = (
+                    "pyannote model could not be loaded — likely a gated-repo "
+                    "ToS not accepted. Accept both at "
+                    "https://huggingface.co/pyannote/segmentation-3.0 and "
+                    "https://huggingface.co/pyannote/speaker-diarization-3.1 "
+                    "using the account that owns HF_AUTH_TOKEN"
+                )
+                raise RuntimeError(self.diarization_disabled_reason)
+            if self.device == "cuda":
+                self.diarize_model.to(torch.device("cuda"))
+            self.diarization_enabled = True
+            self.diarization_disabled_reason = ""
+            print("[pipeline] Diarization enabled (pyannote)", flush=True)
+        except Exception as e:
+            # Any failure (import error, download error, the RuntimeError
+            # above) degrades to "no diarization" instead of aborting startup.
+            self.diarization_disabled_reason = f"{type(e).__name__}: {e}"
+            print(f"[pipeline] Failed to load pyannote ({self.diarization_disabled_reason}); diarization disabled", flush=True)
+    def _get_align_model(self, language_code: str):
+        """Return the cached `whisperx.load_align_model` result for *language_code*, loading it on first use."""
+        if language_code not in self.align_models:
+            print(f"[pipeline] Loading align model for {language_code}", flush=True)
+            self.align_models[language_code] = whisperx.load_align_model(
+                language_code=language_code, device=self._whisperx_device
+            )
+        return self.align_models[language_code]
+    def transcribe(
+        self,
+        audio_path: str,
+        options: TranscriptionOptions,
+        is_cancelled: Callable[[], bool] = lambda: False,
+        on_audio_loaded: Optional[Callable[[float], None]] = None,
+    ) -> TranscriptionResult:
+        """Transcribe, align and (when enabled) diarize one audio file.
+
+        Args:
+            audio_path: Path handed to `whisperx.load_audio`.
+            options: Language, speaker-count and hotword settings for this job.
+            is_cancelled: Polled after loading, transcription and alignment;
+                a True result aborts the job.
+            on_audio_loaded: Optional hook called with the audio duration in
+                seconds once the file is loaded. Its failures are logged and
+                otherwise ignored.
+
+        Returns:
+            TranscriptionResult with the non-empty segments, the detected
+            language and the audio duration.
+
+        Raises:
+            RuntimeError: load() has not been called.
+            CancelledError: `is_cancelled()` returned True at a checkpoint.
+        """
+        if self.model is None:
+            raise RuntimeError("Pipeline not loaded; call load() first")
+        t0 = time.time()
+        audio = whisperx.load_audio(audio_path)
+        duration_seconds = len(audio) / SAMPLE_RATE
+        if on_audio_loaded is not None:
+            try:
+                on_audio_loaded(duration_seconds)
+            except Exception as e:
+                # Best-effort hook: report the failure but keep transcribing.
+                print(f"[pipeline] on_audio_loaded callback failed ({type(e).__name__}: {e}); continuing", flush=True)
+        if is_cancelled():
+            raise CancelledError()
+        hotwords = options.hotwords or []
+        original_options = None
+        if hotwords:
+            # Hotwords are applied as a decoding prefix for this call only.
+            original_options = self.model.options
+            self.model.options = self._with_prefix(original_options, " ".join(hotwords))
+        try:
+            transcribe_result = self.model.transcribe(
+                audio,
+                batch_size=self.batch_size,
+                language=options.language,
+            )
+        finally:
+            if hotwords:
+                # Restore the exact previous options so one job's hotwords
+                # never leak into the next job on this shared model.
+                self.model.options = original_options
+        if is_cancelled():
+            raise CancelledError()
+        language = transcribe_result["language"]
+        model_a, metadata = self._get_align_model(language)
+        aligned = whisperx.align(
+            transcribe_result["segments"],
+            model_a,
+            metadata,
+            audio,
+            self._whisperx_device,
+            return_char_alignments=False,
+        )
+        if is_cancelled():
+            raise CancelledError()
+        if self.diarization_enabled:
+            audio_data = {
+                "waveform": torch.from_numpy(audio[None, :]),
+                "sample_rate": SAMPLE_RATE,
+            }
+            kwargs = {}
+            if options.num_speakers is not None:
+                kwargs["num_speakers"] = options.num_speakers
+            diarize_segments = self.diarize_model(audio_data, **kwargs)
+            diarize_df = pd.DataFrame(
+                diarize_segments.itertracks(yield_label=True),
+                columns=["segment", "label", "speaker"],
+            )
+            diarize_df["start"] = diarize_df["segment"].apply(lambda x: x.start)
+            diarize_df["end"] = diarize_df["segment"].apply(lambda x: x.end)
+            assigned = whisperx.assign_word_speakers(diarize_df, aligned)
+        else:
+            assigned = aligned
+            for seg in assigned["segments"]:
+                seg["speaker"] = "unknown"
+        # Release cached accelerator memory between jobs.
+        if self.device == "cuda":
+            torch.cuda.empty_cache()
+        elif self.device == "mps":
+            try:
+                torch.mps.empty_cache()
+            except Exception:
+                pass
+        segments: list[dict] = []
+        for seg in assigned["segments"]:
+            text = (seg.get("text") or "").strip()
+            if not text:
+                # Drop segments with no text.
+                continue
+            segments.append({
+                "start": float(seg["start"]),
+                "end": float(seg["end"]),
+                "text": text,
+                "speaker": seg.get("speaker") or "unknown",
+            })
+        print(f"[pipeline] Done in {time.time() - t0:.1f}s ({len(segments)} segments)", flush=True)
+        return TranscriptionResult(
+            segments=segments,
+            language=language,
+            duration_seconds=duration_seconds,
+        )

package/ee/python/transcription/server.py ADDED Viewed

@@ -0,0 +1,151 @@
+"""
+FastAPI server exposing the whisper transcription API.
+Spawned as a sidecar by the @exulu/backend whisper supervisor, but also
+runnable standalone (typically on a separate GPU box) by setting
+TRANSCRIPTION_SERVER on the main app to point here.
+"""
+import os
+from contextlib import asynccontextmanager
+from typing import Optional
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from pipeline import (
+    TranscriptionOptions,
+    TranscriptionPipeline,
+    describe_gpu,
+    detect_device,
+)
+from worker import TranscriptionWorker
+# Runtime configuration, read from the environment once at import time.
+# Model name handed to TranscriptionPipeline and reported by /healthz.
+WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3")
+# Device request passed to detect_device(); "auto" lets it choose.
+WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "auto")
+# Batch size handed to TranscriptionPipeline; a non-integer value raises ValueError here.
+WHISPER_BATCH_SIZE = int(os.getenv("WHISPER_BATCH_SIZE", "4"))
+def _log(msg: str) -> None:
+    """Print *msg* to stdout with the [EXULU-WHISPER] tag, flushing immediately."""
+    tagged = "[EXULU-WHISPER] {}".format(msg)
+    print(tagged, flush=True)
+# Process-wide registry filled by `lifespan` and read by the route handlers.
+state: dict = {}
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Start-up hook: pick the device, load the models and start the worker.
+
+    Stores "device", "gpu", "pipeline" and "worker" in `state` before
+    yielding. Nothing runs after the yield, so there is no shutdown cleanup.
+    """
+    device = detect_device(WHISPER_DEVICE)
+    gpu = describe_gpu(device)
+    if not gpu["available"]:
+        _log("GPU support: disabled (CPU only). Transcription will be slow.")
+        _log("  To enable on Linux/Windows: install CUDA + re-run `WHISPER_GPU=cuda npm install`")
+        _log("  To enable on macOS Apple Silicon: no setup needed (will auto-use MPS)")
+    elif gpu["kind"] == "cuda":
+        _log(f"GPU support: enabled (CUDA, {gpu['name']}, {gpu['vram_gb']} GB VRAM)")
+    else:
+        _log("GPU support: enabled (Apple Silicon MPS)")
+    _log(f"Model: {WHISPER_MODEL} (loading… ~30s first run)")
+    pipeline = TranscriptionPipeline(WHISPER_MODEL, device, WHISPER_BATCH_SIZE)
+    pipeline.load()
+    if pipeline.diarization_enabled:
+        _log("Diarization: enabled (pyannote)")
+    else:
+        reason = pipeline.diarization_disabled_reason or "unknown reason"
+        _log(f'Diarization: disabled ({reason}). All segments will get speaker="unknown".')
+        # The first hint line depends on whether a token exists; the two
+        # gated-repo URLs are the same in both cases.
+        if os.getenv("HF_AUTH_TOKEN"):
+            first_hint = "  Token is set; the missing piece is usually accepting the gated-repo ToS at"
+        else:
+            first_hint = "  To enable: set HF_AUTH_TOKEN to a Hugging Face token, then accept ToS at"
+        hint_lines = (
+            first_hint,
+            "  https://huggingface.co/pyannote/segmentation-3.0 and",
+            "  https://huggingface.co/pyannote/speaker-diarization-3.1",
+        )
+        for hint in hint_lines:
+            _log(hint)
+    worker = TranscriptionWorker(pipeline)
+    worker.start()
+    state.update(device=device, gpu=gpu, pipeline=pipeline, worker=worker)
+    _log("Ready.")
+    yield
+# ASGI application; model loading and worker start-up happen in `lifespan`.
+app = FastAPI(lifespan=lifespan)
+class JobCreate(BaseModel):
+    """Request body for POST /jobs."""
+    # Passed unchanged to worker.submit(); presumably fetched by the worker — verify there.
+    audio_url: str
+    # Copied into TranscriptionOptions.language (None = auto-detect).
+    language: Optional[str] = None
+    # Copied into TranscriptionOptions.num_speakers (None = auto-detect).
+    num_speakers: Optional[int] = None
+    # Copied into TranscriptionOptions.hotwords; None or empty becomes [].
+    hotwords: Optional[list[str]] = None
+def _job_to_dict(job) -> dict:
+    """Copy the job attributes exposed by the API into a plain dict."""
+    exposed_fields = (
+        "job_id",
+        "status",
+        "started_at",
+        "finished_at",
+        "segments",
+        "language",
+        "duration_seconds",
+        "error",
+    )
+    return {field_name: getattr(job, field_name) for field_name in exposed_fields}
+@app.post("/jobs")
+async def create_job(body: JobCreate):
+    """Submit a transcription job to the worker and return its id with status "queued"."""
+    worker: TranscriptionWorker = state["worker"]
+    # A missing or empty hotword list is normalised to [].
+    hotword_list = body.hotwords if body.hotwords else []
+    options = TranscriptionOptions(
+        language=body.language,
+        num_speakers=body.num_speakers,
+        hotwords=hotword_list,
+    )
+    new_job_id = worker.submit(body.audio_url, options)
+    return {"job_id": new_job_id, "status": "queued"}
+@app.get("/jobs")
+async def list_jobs():
+    """Return every job reported by the worker, serialised with _job_to_dict."""
+    worker: TranscriptionWorker = state["worker"]
+    serialised = []
+    for job in worker.list_jobs():
+        serialised.append(_job_to_dict(job))
+    return serialised
+@app.get("/jobs/{job_id}")
+async def get_job(job_id: str):
+    """Return one job as a dict, or respond 404 when the worker has no such job."""
+    worker: TranscriptionWorker = state["worker"]
+    found = worker.get(job_id)
+    if found:
+        return _job_to_dict(found)
+    raise HTTPException(status_code=404, detail="job not found")
+@app.delete("/jobs/{job_id}")
+async def cancel_job(job_id: str):
+    """Ask the worker to cancel a job; respond 404 when the job is unknown.
+
+    The returned status is read from the job after the cancel request.
+    """
+    worker: TranscriptionWorker = state["worker"]
+    target = worker.get(job_id)
+    if target:
+        worker.cancel(job_id)
+        return {"job_id": job_id, "status": target.status}
+    raise HTTPException(status_code=404, detail="job not found")
+@app.get("/healthz")
+async def healthz():
+    """Report liveness plus the device, model, GPU description and diarization flag."""
+    pipeline: Optional[TranscriptionPipeline] = state.get("pipeline")
+    # Before the pipeline has been stored in `state`, diarization reads as off.
+    diarization = False
+    if pipeline:
+        diarization = pipeline.diarization_enabled
+    return {
+        "ok": True,
+        "device": state.get("device"),
+        "model": WHISPER_MODEL,
+        "gpu": state.get("gpu"),
+        "diarization": diarization,
+    }

package/ee/python/transcription/tests/__init__.py ADDED Viewed

File without changes

package/ee/python/transcription/tests/test_server.py ADDED Viewed

@@ -0,0 +1,111 @@
+"""
+FastAPI testclient tests for the whisper server endpoints.
+Stubs the pipeline so we exercise routing/state transitions, not the actual
+whisperx/pyannote stack — which is slow and needs a GPU + model downloads.
+Run from the repo root with the venv active:
+    cd ee/python/transcription && ../.venv/bin/python -m pytest tests
+"""
+from unittest.mock import patch
+import asyncio
+import sys
+from pathlib import Path
+import pytest
+# Make ee/python/transcription importable.
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+class StubPipeline:
+    """Stand-in for TranscriptionPipeline: accepts any constructor arguments and loads nothing."""
+    def __init__(self, *_args, **_kwargs):
+        # The server only reads this flag when diarization is reported as enabled.
+        self.diarization_enabled = True
+    def load(self):
+        """Do nothing; there are no models to load."""
+        return None
+class StubResult:
+    """Fixed two-segment English result returned by the stubbed transcribe()."""
+    def __init__(self):
+        first = dict(start=0.0, end=1.0, text="Hello", speaker="SPEAKER_00")
+        second = dict(start=1.0, end=2.0, text="Hi there", speaker="SPEAKER_01")
+        self.segments = [first, second]
+        self.language = "en"
+        self.duration_seconds = 2.0
+@pytest.fixture
+def client(monkeypatch):
+    """Yield a TestClient for a freshly imported `server` wired to stubs.
+
+    The pipeline class, device detection, GPU description and the worker's
+    download helper are replaced so no model, GPU or network is needed.
+    """
+    monkeypatch.setattr("pipeline.TranscriptionPipeline", StubPipeline)
+    monkeypatch.setattr("pipeline.detect_device", lambda _r="auto": "cpu")
+    monkeypatch.setattr(
+        "pipeline.describe_gpu",
+        lambda _d: {"available": False, "kind": "cpu", "name": None, "vram_gb": None},
+    )
+    # Patch the worker's download to avoid hitting the network.
+    monkeypatch.setattr("worker._download_sync", lambda _u: "/tmp/dummy")
+    from fastapi.testclient import TestClient
+    # Reload server so it picks up the patched pipeline at import time.
+    sys.modules.pop("server", None)
+    import server
+    def _stub_transcribe(self, path, options, is_cancelled=None, on_audio_loaded=None, **_kwargs):
+        # Accepts the same keywords as the real transcribe(), including
+        # on_audio_loaded, so a caller passing them does not hit a TypeError.
+        return StubResult()
+    # Make the pipeline's transcribe return a deterministic result.
+    monkeypatch.setattr(
+        "pipeline.TranscriptionPipeline.transcribe",
+        _stub_transcribe,
+        raising=False,
+    )
+    with TestClient(server.app) as c:
+        yield c
+def test_healthz_shape(client):
+    """GET /healthz answers 200 with ok, device, model, gpu and a boolean diarization flag."""
+    response = client.get("/healthz")
+    assert response.status_code == 200
+    payload = response.json()
+    assert payload["ok"] is True
+    for required_key in ("device", "model", "gpu"):
+        assert required_key in payload
+    assert isinstance(payload["diarization"], bool)
+def test_create_job_returns_job_id(client):
+    """POST /jobs answers 200 with a job_id and the status "queued"."""
+    request_body = {"audio_url": "http://example.com/audio.wav"}
+    response = client.post("/jobs", json=request_body)
+    assert response.status_code == 200
+    payload = response.json()
+    assert "job_id" in payload
+    assert payload["status"] == "queued"
+def test_get_unknown_job_is_404(client):
+    """GET on an id the worker does not know answers 404."""
+    response = client.get("/jobs/nonexistent-id")
+    assert response.status_code == 404
+def test_list_jobs_returns_array(client):
+    """GET /jobs answers 200 with a JSON array."""
+    response = client.get("/jobs")
+    assert response.status_code == 200
+    payload = response.json()
+    assert isinstance(payload, list)
+def test_delete_unknown_job_is_404(client):
+    """DELETE on an id the worker does not know answers 404."""
+    response = client.delete("/jobs/nonexistent-id")
+    assert response.status_code == 404
+def test_create_then_get_job(client):
+    """A job created via POST /jobs can be fetched by id and is in a known state."""
+    known_states = {"queued", "running", "completed", "failed", "cancelled"}
+    created = client.post("/jobs", json={"audio_url": "http://example.com/audio.wav"}).json()
+    job_id = created["job_id"]
+    response = client.get(f"/jobs/{job_id}")
+    assert response.status_code == 200
+    payload = response.json()
+    assert payload["job_id"] == job_id
+    assert payload["status"] in known_states