@exulu/backend 1.60.0 → 1.61.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,24 @@ transformers
3
3
  pyinstaller
4
4
  docling-hierarchical-pdf
5
5
  defusedxml
6
+ # Whisper transcription server. Used by `npx @exulu/backend exulu-start-whisper`.
7
+ #
8
+ # Notes on the pins:
9
+ # - whisperx >= 3.4: earlier versions transitively pin faster-whisper 1.0.0 →
10
+ # tokenizers <0.16, which collides with litellm's tokenizers >=0.21.
11
+ # whisperx 3.4+ uses faster-whisper 1.1+ on the modern tokenizers track.
12
+ # - torch / torchaudio pinned because pyannote.audio (≤3.4) still imports
13
+ # `torchaudio.AudioMetaData`, which was removed in newer torchaudio. The
14
+ # 2.5 line is the most recent series where the old API is still present.
15
+ torch==2.5.1
16
+ torchaudio==2.5.1
17
+ torchvision==0.20.1
18
+ whisperx>=3.4.0
19
+ pyannote.audio>=3.3.0
20
+ fastapi
21
+ uvicorn
22
+ python-multipart
23
+ requests
6
24
  # LiteLLM proxy — only used when EXULU_USE_LITELLM=true. Always installed so
7
25
  # the dep is ready when the env var is flipped. Pinned to a tested version;
8
26
  # upgrade deliberately.
@@ -199,6 +199,43 @@ fi
199
199
 
200
200
  print_info "Installing packages from requirements.txt..."
201
201
  echo ""
202
+
203
+ # Conditional torch wheel for the whisper transcription server.
204
+ # WhisperX depends on torch transitively; by default pip would install the
205
+ # CPU build, which works everywhere but is slow. On CUDA hosts we want the
206
+ # CUDA build instead. Selection rule:
207
+ # - WHISPER_GPU=cuda explicit OR `nvidia-smi` present → install CUDA wheel
208
+ # - WHISPER_GPU=cpu/mps/skip OR no GPU detected → fall through to default
209
+ WHISPER_GPU_MODE="${WHISPER_GPU:-auto}"
210
+ if [ "$WHISPER_GPU_MODE" = "auto" ]; then
211
+ if command -v nvidia-smi &> /dev/null; then
212
+ WHISPER_GPU_MODE="cuda"
213
+ elif [ "$(uname -s)" = "Darwin" ] && [ "$(uname -m)" = "arm64" ]; then
214
+ WHISPER_GPU_MODE="mps"
215
+ else
216
+ WHISPER_GPU_MODE="cpu"
217
+ fi
218
+ fi
219
+
220
+ case "$WHISPER_GPU_MODE" in
221
+ cuda)
222
+ print_info "Installing CUDA torch wheel (WHISPER_GPU=$WHISPER_GPU_MODE)…"
223
+ pip install torch==2.5.0 torchaudio==2.5.0 \
224
+ --index-url https://download.pytorch.org/whl/cu124 || {
225
+ print_warning "CUDA torch install failed; falling back to default torch."
226
+ }
227
+ ;;
228
+ mps|cpu)
229
+ print_info "Installing default torch wheel (WHISPER_GPU=$WHISPER_GPU_MODE)…"
230
+ ;;
231
+ skip)
232
+ print_warning "WHISPER_GPU=skip — torch install skipped; transcription server will not work."
233
+ ;;
234
+ *)
235
+ print_warning "Unknown WHISPER_GPU=$WHISPER_GPU_MODE — falling back to default torch."
236
+ ;;
237
+ esac
238
+
202
239
  pip install -r "$REQUIREMENTS_FILE"
203
240
 
204
241
  print_success "All dependencies installed successfully"
@@ -225,6 +262,13 @@ print_info "Testing critical imports..."
225
262
  $PYTHON_CMD -c "import docling" 2>/dev/null && print_success "docling imported successfully" || print_error "Failed to import docling"
226
263
  $PYTHON_CMD -c "import transformers" 2>/dev/null && print_success "transformers imported successfully" || print_error "Failed to import transformers"
227
264
 
265
+ # Whisper transcription server imports — non-fatal: only needed for
266
+ # `npx @exulu/backend exulu-start-whisper`. If these fail, the rest of
267
+ # the @exulu/backend package still works fine.
268
+ $PYTHON_CMD -c "import whisperx" 2>/dev/null && print_success "whisperx imported successfully" || print_warning "whisperx not importable (transcription server will not start)"
269
+ $PYTHON_CMD -c "import pyannote.audio" 2>/dev/null && print_success "pyannote.audio imported successfully" || print_warning "pyannote.audio not importable (diarization will be disabled even with HF_AUTH_TOKEN)"
270
+ $PYTHON_CMD -c "import fastapi, uvicorn" 2>/dev/null && print_success "fastapi/uvicorn imported successfully" || print_warning "fastapi/uvicorn not importable (transcription server will not start)"
271
+
228
272
  # Step 8: Display summary
229
273
  echo ""
230
274
  echo -e "${GREEN}========================================${NC}"
File without changes
@@ -0,0 +1,232 @@
1
+ """
2
+ WhisperX + pyannote transcription pipeline.
3
+
4
+ Loads models once at startup; one transcribe() call per audio file.
5
+ Adapted from audio-transcription/src/transcription.py, but reshaped so the
6
+ pipeline can serve multiple jobs from a long-running FastAPI process.
7
+ """
8
+
9
+ import os
10
+ import time
11
+ from dataclasses import dataclass, field
12
+ from typing import Callable, Optional
13
+
14
+ import pandas as pd
15
+ import torch
16
+ import whisperx
17
+ from whisperx.audio import SAMPLE_RATE
18
+
19
+
20
+ @dataclass
21
+ class TranscriptionOptions:
22
+ language: Optional[str] = None # None = auto-detect
23
+ num_speakers: Optional[int] = None # None = auto-detect
24
+ hotwords: list[str] = field(default_factory=list)
25
+
26
+
27
+ @dataclass
28
+ class TranscriptionResult:
29
+ segments: list[dict]
30
+ language: str
31
+ duration_seconds: float
32
+
33
+
34
+ class CancelledError(Exception):
35
+ pass
36
+
37
+
38
+ def detect_device(requested: str = "auto") -> str:
39
+ if requested != "auto":
40
+ return requested
41
+ if torch.cuda.is_available():
42
+ return "cuda"
43
+ if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
44
+ return "mps"
45
+ return "cpu"
46
+
47
+
48
+ def describe_gpu(device: str) -> dict:
49
+ if device == "cuda":
50
+ try:
51
+ name = torch.cuda.get_device_name(0)
52
+ vram_gb = round(torch.cuda.get_device_properties(0).total_memory / (1024 ** 3), 1)
53
+ return {"available": True, "kind": "cuda", "name": name, "vram_gb": vram_gb}
54
+ except Exception:
55
+ return {"available": True, "kind": "cuda", "name": "unknown", "vram_gb": None}
56
+ if device == "mps":
57
+ return {"available": True, "kind": "mps", "name": "Apple Silicon MPS", "vram_gb": None}
58
+ return {"available": False, "kind": "cpu", "name": None, "vram_gb": None}
59
+
60
+
61
+ class TranscriptionPipeline:
62
+ """
63
+ Loads whisper + pyannote once; serves multiple transcribe() calls.
64
+ Not thread-safe — designed for the single-consumer worker.
65
+ """
66
+
67
+ def __init__(self, model_name: str, device: str, batch_size: int):
68
+ self.model_name = model_name
69
+ self.device = device
70
+ self.batch_size = batch_size
71
+ self.model = None
72
+ self.diarize_model = None
73
+ self.diarization_enabled = False
74
+ self.diarization_disabled_reason: str = "not attempted"
75
+ self.align_models: dict[str, tuple] = {}
76
+
77
+ def load(self) -> None:
78
+ # whisperx doesn't ship MPS support; run whisper on CPU when DEVICE=mps
79
+ # (faster than torch CPU fallback because whisperx uses CTranslate2 int8).
80
+ whisper_device = "cpu" if self.device == "mps" else self.device
81
+ compute_type = (
82
+ "float16" if self.device == "cuda" else "int8"
83
+ )
84
+ print(f"[pipeline] Loading whisper '{self.model_name}' on {whisper_device} (compute_type={compute_type})", flush=True)
85
+ self.model = whisperx.load_model(
86
+ self.model_name,
87
+ device=whisper_device,
88
+ compute_type=compute_type,
89
+ )
90
+
91
+ hf_token = os.getenv("HF_AUTH_TOKEN")
92
+ if not hf_token:
93
+ self.diarization_disabled_reason = "HF_AUTH_TOKEN not set"
94
+ print(f"[pipeline] {self.diarization_disabled_reason}; diarization disabled", flush=True)
95
+ return
96
+
97
+ try:
98
+ from pyannote.audio import Pipeline as PyannotePipeline
99
+ self.diarize_model = PyannotePipeline.from_pretrained(
100
+ "pyannote/speaker-diarization-3.1",
101
+ use_auth_token=hf_token,
102
+ )
103
+ if self.diarize_model is None:
104
+ # pyannote.from_pretrained returns None (rather than raising)
105
+ # when the user has the token but hasn't accepted the gated
106
+ # repo's terms of use. Surface that distinctly.
107
+ self.diarization_disabled_reason = (
108
+ "pyannote model could not be loaded — likely a gated-repo "
109
+ "ToS not accepted. Accept both at "
110
+ "https://huggingface.co/pyannote/segmentation-3.0 and "
111
+ "https://huggingface.co/pyannote/speaker-diarization-3.1 "
112
+ "using the account that owns HF_AUTH_TOKEN"
113
+ )
114
+ raise RuntimeError(self.diarization_disabled_reason)
115
+ if self.device == "cuda":
116
+ self.diarize_model.to(torch.device("cuda"))
117
+ self.diarization_enabled = True
118
+ self.diarization_disabled_reason = ""
119
+ print("[pipeline] Diarization enabled (pyannote)", flush=True)
120
+ except Exception as e:
121
+ self.diarization_disabled_reason = f"{type(e).__name__}: {e}"
122
+ print(f"[pipeline] Failed to load pyannote ({self.diarization_disabled_reason}); diarization disabled", flush=True)
123
+
124
+ def _get_align_model(self, language_code: str):
125
+ if language_code not in self.align_models:
126
+ device = "cpu" if self.device == "mps" else self.device
127
+ print(f"[pipeline] Loading align model for {language_code}", flush=True)
128
+ self.align_models[language_code] = whisperx.load_align_model(
129
+ language_code=language_code, device=device
130
+ )
131
+ return self.align_models[language_code]
132
+
133
+ def transcribe(
134
+ self,
135
+ audio_path: str,
136
+ options: TranscriptionOptions,
137
+ is_cancelled: Callable[[], bool] = lambda: False,
138
+ on_audio_loaded: Optional[Callable[[float], None]] = None,
139
+ ) -> TranscriptionResult:
140
+ if self.model is None:
141
+ raise RuntimeError("Pipeline not loaded; call load() first")
142
+
143
+ t0 = time.time()
144
+ audio = whisperx.load_audio(audio_path)
145
+ duration_seconds = len(audio) / SAMPLE_RATE
146
+ if on_audio_loaded is not None:
147
+ try:
148
+ on_audio_loaded(duration_seconds)
149
+ except Exception:
150
+ pass
151
+
152
+ if is_cancelled():
153
+ raise CancelledError()
154
+
155
+ hotwords = options.hotwords or []
156
+ if hotwords:
157
+ self.model.options = self.model.options._replace(prefix=" ".join(hotwords))
158
+ try:
159
+ transcribe_result = self.model.transcribe(
160
+ audio,
161
+ batch_size=self.batch_size,
162
+ language=options.language,
163
+ )
164
+ finally:
165
+ if hotwords:
166
+ self.model.options = self.model.options._replace(prefix=None)
167
+
168
+ if is_cancelled():
169
+ raise CancelledError()
170
+
171
+ language = transcribe_result["language"]
172
+ align_device = "cpu" if self.device == "mps" else self.device
173
+ model_a, metadata = self._get_align_model(language)
174
+ aligned = whisperx.align(
175
+ transcribe_result["segments"],
176
+ model_a,
177
+ metadata,
178
+ audio,
179
+ align_device,
180
+ return_char_alignments=False,
181
+ )
182
+
183
+ if is_cancelled():
184
+ raise CancelledError()
185
+
186
+ if self.diarization_enabled:
187
+ audio_data = {
188
+ "waveform": torch.from_numpy(audio[None, :]),
189
+ "sample_rate": SAMPLE_RATE,
190
+ }
191
+ kwargs = {}
192
+ if options.num_speakers is not None:
193
+ kwargs["num_speakers"] = options.num_speakers
194
+ diarize_segments = self.diarize_model(audio_data, **kwargs)
195
+ diarize_df = pd.DataFrame(
196
+ diarize_segments.itertracks(yield_label=True),
197
+ columns=["segment", "label", "speaker"],
198
+ )
199
+ diarize_df["start"] = diarize_df["segment"].apply(lambda x: x.start)
200
+ diarize_df["end"] = diarize_df["segment"].apply(lambda x: x.end)
201
+ assigned = whisperx.assign_word_speakers(diarize_df, aligned)
202
+ else:
203
+ assigned = aligned
204
+ for seg in assigned["segments"]:
205
+ seg["speaker"] = "unknown"
206
+
207
+ if self.device == "cuda":
208
+ torch.cuda.empty_cache()
209
+ elif self.device == "mps":
210
+ try:
211
+ torch.mps.empty_cache()
212
+ except Exception:
213
+ pass
214
+
215
+ segments: list[dict] = []
216
+ for seg in assigned["segments"]:
217
+ text = (seg.get("text") or "").strip()
218
+ if not text:
219
+ continue
220
+ segments.append({
221
+ "start": float(seg["start"]),
222
+ "end": float(seg["end"]),
223
+ "text": text,
224
+ "speaker": seg.get("speaker") or "unknown",
225
+ })
226
+
227
+ print(f"[pipeline] Done in {time.time() - t0:.1f}s ({len(segments)} segments)", flush=True)
228
+ return TranscriptionResult(
229
+ segments=segments,
230
+ language=language,
231
+ duration_seconds=duration_seconds,
232
+ )
@@ -0,0 +1,151 @@
1
+ """
2
+ FastAPI server exposing the whisper transcription API.
3
+
4
+ Spawned as a sidecar by the @exulu/backend whisper supervisor, but also
5
+ runnable standalone (typically on a separate GPU box) by setting
6
+ TRANSCRIPTION_SERVER on the main app to point here.
7
+ """
8
+
9
+ import os
10
+ from contextlib import asynccontextmanager
11
+ from typing import Optional
12
+
13
+ from fastapi import FastAPI, HTTPException
14
+ from pydantic import BaseModel
15
+
16
+ from pipeline import (
17
+ TranscriptionOptions,
18
+ TranscriptionPipeline,
19
+ describe_gpu,
20
+ detect_device,
21
+ )
22
+ from worker import TranscriptionWorker
23
+
24
+
25
+ WHISPER_MODEL = os.getenv("WHISPER_MODEL", "large-v3")
26
+ WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "auto")
27
+ WHISPER_BATCH_SIZE = int(os.getenv("WHISPER_BATCH_SIZE", "4"))
28
+
29
+
30
+ def _log(msg: str) -> None:
31
+ print(f"[EXULU-WHISPER] {msg}", flush=True)
32
+
33
+
34
+ state: dict = {}
35
+
36
+
37
+ @asynccontextmanager
38
+ async def lifespan(app: FastAPI):
39
+ device = detect_device(WHISPER_DEVICE)
40
+ gpu = describe_gpu(device)
41
+
42
+ if gpu["available"]:
43
+ if gpu["kind"] == "cuda":
44
+ _log(f"GPU support: enabled (CUDA, {gpu['name']}, {gpu['vram_gb']} GB VRAM)")
45
+ else:
46
+ _log("GPU support: enabled (Apple Silicon MPS)")
47
+ else:
48
+ _log("GPU support: disabled (CPU only). Transcription will be slow.")
49
+ _log(" To enable on Linux/Windows: install CUDA + re-run `WHISPER_GPU=cuda npm install`")
50
+ _log(" To enable on macOS Apple Silicon: no setup needed (will auto-use MPS)")
51
+
52
+ _log(f"Model: {WHISPER_MODEL} (loading… ~30s first run)")
53
+ pipeline = TranscriptionPipeline(WHISPER_MODEL, device, WHISPER_BATCH_SIZE)
54
+ pipeline.load()
55
+
56
+ if pipeline.diarization_enabled:
57
+ _log("Diarization: enabled (pyannote)")
58
+ else:
59
+ reason = pipeline.diarization_disabled_reason or "unknown reason"
60
+ _log(f'Diarization: disabled ({reason}). All segments will get speaker="unknown".')
61
+ if not os.getenv("HF_AUTH_TOKEN"):
62
+ _log(" To enable: set HF_AUTH_TOKEN to a Hugging Face token, then accept ToS at")
63
+ _log(" https://huggingface.co/pyannote/segmentation-3.0 and")
64
+ _log(" https://huggingface.co/pyannote/speaker-diarization-3.1")
65
+ else:
66
+ _log(" Token is set; the missing piece is usually accepting the gated-repo ToS at")
67
+ _log(" https://huggingface.co/pyannote/segmentation-3.0 and")
68
+ _log(" https://huggingface.co/pyannote/speaker-diarization-3.1")
69
+
70
+ worker = TranscriptionWorker(pipeline)
71
+ worker.start()
72
+
73
+ state["device"] = device
74
+ state["gpu"] = gpu
75
+ state["pipeline"] = pipeline
76
+ state["worker"] = worker
77
+
78
+ _log("Ready.")
79
+ yield
80
+
81
+
82
+ app = FastAPI(lifespan=lifespan)
83
+
84
+
85
+ class JobCreate(BaseModel):
86
+ audio_url: str
87
+ language: Optional[str] = None
88
+ num_speakers: Optional[int] = None
89
+ hotwords: Optional[list[str]] = None
90
+
91
+
92
+ def _job_to_dict(job) -> dict:
93
+ return {
94
+ "job_id": job.job_id,
95
+ "status": job.status,
96
+ "started_at": job.started_at,
97
+ "finished_at": job.finished_at,
98
+ "segments": job.segments,
99
+ "language": job.language,
100
+ "duration_seconds": job.duration_seconds,
101
+ "error": job.error,
102
+ }
103
+
104
+
105
+ @app.post("/jobs")
106
+ async def create_job(body: JobCreate):
107
+ worker: TranscriptionWorker = state["worker"]
108
+ opts = TranscriptionOptions(
109
+ language=body.language,
110
+ num_speakers=body.num_speakers,
111
+ hotwords=body.hotwords or [],
112
+ )
113
+ job_id = worker.submit(body.audio_url, opts)
114
+ return {"job_id": job_id, "status": "queued"}
115
+
116
+
117
+ @app.get("/jobs")
118
+ async def list_jobs():
119
+ worker: TranscriptionWorker = state["worker"]
120
+ return [_job_to_dict(j) for j in worker.list_jobs()]
121
+
122
+
123
+ @app.get("/jobs/{job_id}")
124
+ async def get_job(job_id: str):
125
+ worker: TranscriptionWorker = state["worker"]
126
+ job = worker.get(job_id)
127
+ if not job:
128
+ raise HTTPException(status_code=404, detail="job not found")
129
+ return _job_to_dict(job)
130
+
131
+
132
+ @app.delete("/jobs/{job_id}")
133
+ async def cancel_job(job_id: str):
134
+ worker: TranscriptionWorker = state["worker"]
135
+ job = worker.get(job_id)
136
+ if not job:
137
+ raise HTTPException(status_code=404, detail="job not found")
138
+ worker.cancel(job_id)
139
+ return {"job_id": job_id, "status": job.status}
140
+
141
+
142
+ @app.get("/healthz")
143
+ async def healthz():
144
+ pipeline: Optional[TranscriptionPipeline] = state.get("pipeline")
145
+ return {
146
+ "ok": True,
147
+ "device": state.get("device"),
148
+ "model": WHISPER_MODEL,
149
+ "gpu": state.get("gpu"),
150
+ "diarization": pipeline.diarization_enabled if pipeline else False,
151
+ }
File without changes
@@ -0,0 +1,111 @@
1
+ """
2
+ FastAPI testclient tests for the whisper server endpoints.
3
+
4
+ Stubs the pipeline so we exercise routing/state transitions, not the actual
5
+ whisperx/pyannote stack — which is slow and needs a GPU + model downloads.
6
+
7
+ Run from the repo root with the venv active:
8
+ cd ee/python/transcription && ../.venv/bin/python -m pytest tests
9
+ """
10
+
11
+ from unittest.mock import patch
12
+ import asyncio
13
+ import sys
14
+ from pathlib import Path
15
+
16
+ import pytest
17
+
18
+ # Make ee/python/transcription importable.
19
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
20
+
21
+
22
+ class StubPipeline:
23
+ def __init__(self, *args, **kwargs):
24
+ self.diarization_enabled = True
25
+
26
+ def load(self):
27
+ return None
28
+
29
+
30
+ class StubResult:
31
+ def __init__(self):
32
+ self.segments = [
33
+ {"start": 0.0, "end": 1.0, "text": "Hello", "speaker": "SPEAKER_00"},
34
+ {"start": 1.0, "end": 2.0, "text": "Hi there", "speaker": "SPEAKER_01"},
35
+ ]
36
+ self.language = "en"
37
+ self.duration_seconds = 2.0
38
+
39
+
40
+ @pytest.fixture
41
+ def client(monkeypatch):
42
+ monkeypatch.setattr("pipeline.TranscriptionPipeline", StubPipeline)
43
+ monkeypatch.setattr("pipeline.detect_device", lambda _r="auto": "cpu")
44
+ monkeypatch.setattr(
45
+ "pipeline.describe_gpu",
46
+ lambda _d: {"available": False, "kind": "cpu", "name": None, "vram_gb": None},
47
+ )
48
+ # Patch the worker's download to avoid hitting the network.
49
+ monkeypatch.setattr("worker._download_sync", lambda _u: "/tmp/dummy")
50
+
51
+ from fastapi.testclient import TestClient
52
+
53
+ # Reload server so it picks up the patched pipeline at import time.
54
+ if "server" in sys.modules:
55
+ del sys.modules["server"]
56
+ import server
57
+
58
+ # Make the pipeline's transcribe return a deterministic result.
59
+ monkeypatch.setattr(
60
+ "pipeline.TranscriptionPipeline.transcribe",
61
+ lambda self, path, options, is_cancelled=None: StubResult(),
62
+ raising=False,
63
+ )
64
+
65
+ with TestClient(server.app) as c:
66
+ yield c
67
+
68
+
69
+ def test_healthz_shape(client):
70
+ r = client.get("/healthz")
71
+ assert r.status_code == 200
72
+ body = r.json()
73
+ assert body["ok"] is True
74
+ assert "device" in body
75
+ assert "model" in body
76
+ assert "gpu" in body
77
+ assert isinstance(body["diarization"], bool)
78
+
79
+
80
+ def test_create_job_returns_job_id(client):
81
+ r = client.post("/jobs", json={"audio_url": "http://example.com/audio.wav"})
82
+ assert r.status_code == 200
83
+ body = r.json()
84
+ assert "job_id" in body
85
+ assert body["status"] == "queued"
86
+
87
+
88
+ def test_get_unknown_job_is_404(client):
89
+ r = client.get("/jobs/nonexistent-id")
90
+ assert r.status_code == 404
91
+
92
+
93
+ def test_list_jobs_returns_array(client):
94
+ r = client.get("/jobs")
95
+ assert r.status_code == 200
96
+ assert isinstance(r.json(), list)
97
+
98
+
99
+ def test_delete_unknown_job_is_404(client):
100
+ r = client.delete("/jobs/nonexistent-id")
101
+ assert r.status_code == 404
102
+
103
+
104
+ def test_create_then_get_job(client):
105
+ create = client.post("/jobs", json={"audio_url": "http://example.com/audio.wav"}).json()
106
+ job_id = create["job_id"]
107
+ r = client.get(f"/jobs/{job_id}")
108
+ assert r.status_code == 200
109
+ body = r.json()
110
+ assert body["job_id"] == job_id
111
+ assert body["status"] in {"queued", "running", "completed", "failed", "cancelled"}