agent-cli 0.70.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. agent_cli/__init__.py +5 -0
  2. agent_cli/__main__.py +6 -0
  3. agent_cli/_extras.json +14 -0
  4. agent_cli/_requirements/.gitkeep +0 -0
  5. agent_cli/_requirements/audio.txt +79 -0
  6. agent_cli/_requirements/faster-whisper.txt +215 -0
  7. agent_cli/_requirements/kokoro.txt +425 -0
  8. agent_cli/_requirements/llm.txt +183 -0
  9. agent_cli/_requirements/memory.txt +355 -0
  10. agent_cli/_requirements/mlx-whisper.txt +222 -0
  11. agent_cli/_requirements/piper.txt +176 -0
  12. agent_cli/_requirements/rag.txt +402 -0
  13. agent_cli/_requirements/server.txt +154 -0
  14. agent_cli/_requirements/speed.txt +77 -0
  15. agent_cli/_requirements/vad.txt +155 -0
  16. agent_cli/_requirements/wyoming.txt +71 -0
  17. agent_cli/_tools.py +368 -0
  18. agent_cli/agents/__init__.py +23 -0
  19. agent_cli/agents/_voice_agent_common.py +136 -0
  20. agent_cli/agents/assistant.py +383 -0
  21. agent_cli/agents/autocorrect.py +284 -0
  22. agent_cli/agents/chat.py +496 -0
  23. agent_cli/agents/memory/__init__.py +31 -0
  24. agent_cli/agents/memory/add.py +190 -0
  25. agent_cli/agents/memory/proxy.py +160 -0
  26. agent_cli/agents/rag_proxy.py +128 -0
  27. agent_cli/agents/speak.py +209 -0
  28. agent_cli/agents/transcribe.py +671 -0
  29. agent_cli/agents/transcribe_daemon.py +499 -0
  30. agent_cli/agents/voice_edit.py +291 -0
  31. agent_cli/api.py +22 -0
  32. agent_cli/cli.py +106 -0
  33. agent_cli/config.py +503 -0
  34. agent_cli/config_cmd.py +307 -0
  35. agent_cli/constants.py +27 -0
  36. agent_cli/core/__init__.py +1 -0
  37. agent_cli/core/audio.py +461 -0
  38. agent_cli/core/audio_format.py +299 -0
  39. agent_cli/core/chroma.py +88 -0
  40. agent_cli/core/deps.py +191 -0
  41. agent_cli/core/openai_proxy.py +139 -0
  42. agent_cli/core/process.py +195 -0
  43. agent_cli/core/reranker.py +120 -0
  44. agent_cli/core/sse.py +87 -0
  45. agent_cli/core/transcription_logger.py +70 -0
  46. agent_cli/core/utils.py +526 -0
  47. agent_cli/core/vad.py +175 -0
  48. agent_cli/core/watch.py +65 -0
  49. agent_cli/dev/__init__.py +14 -0
  50. agent_cli/dev/cli.py +1588 -0
  51. agent_cli/dev/coding_agents/__init__.py +19 -0
  52. agent_cli/dev/coding_agents/aider.py +24 -0
  53. agent_cli/dev/coding_agents/base.py +167 -0
  54. agent_cli/dev/coding_agents/claude.py +39 -0
  55. agent_cli/dev/coding_agents/codex.py +24 -0
  56. agent_cli/dev/coding_agents/continue_dev.py +15 -0
  57. agent_cli/dev/coding_agents/copilot.py +24 -0
  58. agent_cli/dev/coding_agents/cursor_agent.py +48 -0
  59. agent_cli/dev/coding_agents/gemini.py +28 -0
  60. agent_cli/dev/coding_agents/opencode.py +15 -0
  61. agent_cli/dev/coding_agents/registry.py +49 -0
  62. agent_cli/dev/editors/__init__.py +19 -0
  63. agent_cli/dev/editors/base.py +89 -0
  64. agent_cli/dev/editors/cursor.py +15 -0
  65. agent_cli/dev/editors/emacs.py +46 -0
  66. agent_cli/dev/editors/jetbrains.py +56 -0
  67. agent_cli/dev/editors/nano.py +31 -0
  68. agent_cli/dev/editors/neovim.py +33 -0
  69. agent_cli/dev/editors/registry.py +59 -0
  70. agent_cli/dev/editors/sublime.py +20 -0
  71. agent_cli/dev/editors/vim.py +42 -0
  72. agent_cli/dev/editors/vscode.py +15 -0
  73. agent_cli/dev/editors/zed.py +20 -0
  74. agent_cli/dev/project.py +568 -0
  75. agent_cli/dev/registry.py +52 -0
  76. agent_cli/dev/skill/SKILL.md +141 -0
  77. agent_cli/dev/skill/examples.md +571 -0
  78. agent_cli/dev/terminals/__init__.py +19 -0
  79. agent_cli/dev/terminals/apple_terminal.py +82 -0
  80. agent_cli/dev/terminals/base.py +56 -0
  81. agent_cli/dev/terminals/gnome.py +51 -0
  82. agent_cli/dev/terminals/iterm2.py +84 -0
  83. agent_cli/dev/terminals/kitty.py +77 -0
  84. agent_cli/dev/terminals/registry.py +48 -0
  85. agent_cli/dev/terminals/tmux.py +58 -0
  86. agent_cli/dev/terminals/warp.py +132 -0
  87. agent_cli/dev/terminals/zellij.py +78 -0
  88. agent_cli/dev/worktree.py +856 -0
  89. agent_cli/docs_gen.py +417 -0
  90. agent_cli/example-config.toml +185 -0
  91. agent_cli/install/__init__.py +5 -0
  92. agent_cli/install/common.py +89 -0
  93. agent_cli/install/extras.py +174 -0
  94. agent_cli/install/hotkeys.py +48 -0
  95. agent_cli/install/services.py +87 -0
  96. agent_cli/memory/__init__.py +7 -0
  97. agent_cli/memory/_files.py +250 -0
  98. agent_cli/memory/_filters.py +63 -0
  99. agent_cli/memory/_git.py +157 -0
  100. agent_cli/memory/_indexer.py +142 -0
  101. agent_cli/memory/_ingest.py +408 -0
  102. agent_cli/memory/_persistence.py +182 -0
  103. agent_cli/memory/_prompt.py +91 -0
  104. agent_cli/memory/_retrieval.py +294 -0
  105. agent_cli/memory/_store.py +169 -0
  106. agent_cli/memory/_streaming.py +44 -0
  107. agent_cli/memory/_tasks.py +48 -0
  108. agent_cli/memory/api.py +113 -0
  109. agent_cli/memory/client.py +272 -0
  110. agent_cli/memory/engine.py +361 -0
  111. agent_cli/memory/entities.py +43 -0
  112. agent_cli/memory/models.py +112 -0
  113. agent_cli/opts.py +433 -0
  114. agent_cli/py.typed +0 -0
  115. agent_cli/rag/__init__.py +3 -0
  116. agent_cli/rag/_indexer.py +67 -0
  117. agent_cli/rag/_indexing.py +226 -0
  118. agent_cli/rag/_prompt.py +30 -0
  119. agent_cli/rag/_retriever.py +156 -0
  120. agent_cli/rag/_store.py +48 -0
  121. agent_cli/rag/_utils.py +218 -0
  122. agent_cli/rag/api.py +175 -0
  123. agent_cli/rag/client.py +299 -0
  124. agent_cli/rag/engine.py +302 -0
  125. agent_cli/rag/models.py +55 -0
  126. agent_cli/scripts/.runtime/.gitkeep +0 -0
  127. agent_cli/scripts/__init__.py +1 -0
  128. agent_cli/scripts/check_plugin_skill_sync.py +50 -0
  129. agent_cli/scripts/linux-hotkeys/README.md +63 -0
  130. agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
  131. agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
  132. agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
  133. agent_cli/scripts/macos-hotkeys/README.md +45 -0
  134. agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
  135. agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
  136. agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
  137. agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
  138. agent_cli/scripts/nvidia-asr-server/README.md +99 -0
  139. agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
  140. agent_cli/scripts/nvidia-asr-server/server.py +255 -0
  141. agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
  142. agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
  143. agent_cli/scripts/run-openwakeword.sh +11 -0
  144. agent_cli/scripts/run-piper-windows.ps1 +30 -0
  145. agent_cli/scripts/run-piper.sh +24 -0
  146. agent_cli/scripts/run-whisper-linux.sh +40 -0
  147. agent_cli/scripts/run-whisper-macos.sh +6 -0
  148. agent_cli/scripts/run-whisper-windows.ps1 +51 -0
  149. agent_cli/scripts/run-whisper.sh +9 -0
  150. agent_cli/scripts/run_faster_whisper_server.py +136 -0
  151. agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
  152. agent_cli/scripts/setup-linux.sh +108 -0
  153. agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
  154. agent_cli/scripts/setup-macos.sh +76 -0
  155. agent_cli/scripts/setup-windows.ps1 +63 -0
  156. agent_cli/scripts/start-all-services-windows.ps1 +53 -0
  157. agent_cli/scripts/start-all-services.sh +178 -0
  158. agent_cli/scripts/sync_extras.py +138 -0
  159. agent_cli/server/__init__.py +3 -0
  160. agent_cli/server/cli.py +721 -0
  161. agent_cli/server/common.py +222 -0
  162. agent_cli/server/model_manager.py +288 -0
  163. agent_cli/server/model_registry.py +225 -0
  164. agent_cli/server/proxy/__init__.py +3 -0
  165. agent_cli/server/proxy/api.py +444 -0
  166. agent_cli/server/streaming.py +67 -0
  167. agent_cli/server/tts/__init__.py +3 -0
  168. agent_cli/server/tts/api.py +335 -0
  169. agent_cli/server/tts/backends/__init__.py +82 -0
  170. agent_cli/server/tts/backends/base.py +139 -0
  171. agent_cli/server/tts/backends/kokoro.py +403 -0
  172. agent_cli/server/tts/backends/piper.py +253 -0
  173. agent_cli/server/tts/model_manager.py +201 -0
  174. agent_cli/server/tts/model_registry.py +28 -0
  175. agent_cli/server/tts/wyoming_handler.py +249 -0
  176. agent_cli/server/whisper/__init__.py +3 -0
  177. agent_cli/server/whisper/api.py +413 -0
  178. agent_cli/server/whisper/backends/__init__.py +89 -0
  179. agent_cli/server/whisper/backends/base.py +97 -0
  180. agent_cli/server/whisper/backends/faster_whisper.py +225 -0
  181. agent_cli/server/whisper/backends/mlx.py +270 -0
  182. agent_cli/server/whisper/languages.py +116 -0
  183. agent_cli/server/whisper/model_manager.py +157 -0
  184. agent_cli/server/whisper/model_registry.py +28 -0
  185. agent_cli/server/whisper/wyoming_handler.py +203 -0
  186. agent_cli/services/__init__.py +343 -0
  187. agent_cli/services/_wyoming_utils.py +64 -0
  188. agent_cli/services/asr.py +506 -0
  189. agent_cli/services/llm.py +228 -0
  190. agent_cli/services/tts.py +450 -0
  191. agent_cli/services/wake_word.py +142 -0
  192. agent_cli-0.70.5.dist-info/METADATA +2118 -0
  193. agent_cli-0.70.5.dist-info/RECORD +196 -0
  194. agent_cli-0.70.5.dist-info/WHEEL +4 -0
  195. agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
  196. agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,67 @@
1
+ """Core streaming types for subprocess-based audio generation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from dataclasses import dataclass
7
+ from typing import TYPE_CHECKING, Any, Literal
8
+
9
+ if TYPE_CHECKING:
10
+ from collections.abc import AsyncIterator
11
+ from multiprocessing import Queue
12
+
13
+
14
@dataclass
class StreamChunk:
    """A chunk of streamed data from a subprocess worker.

    Instances travel over a ``multiprocessing.Queue``: ``QueueWriter``
    produces them in the worker process and ``AsyncQueueReader`` drains
    them on the asyncio side.
    """

    # NOTE: field order is part of the contract — QueueWriter constructs
    # chunks positionally, e.g. StreamChunk("data", data, metadata).
    # "data" carries a payload, "error" a message string, "done" is the
    # end-of-stream sentinel.
    chunk_type: Literal["data", "error", "done"]
    payload: bytes | str | None = None
    metadata: dict[str, Any] | None = None
21
+
22
+
23
class AsyncQueueReader:
    """Asynchronously iterate over a ``multiprocessing.Queue``.

    Each blocking ``queue.get()`` is off-loaded to the default thread-pool
    executor so the event loop never stalls; a single read that exceeds
    ``timeout`` seconds raises ``TimeoutError``.
    """

    def __init__(self, queue: Queue, *, timeout: float = 30.0) -> None:
        """Remember the queue to drain and the per-read timeout in seconds."""
        self._queue = queue
        self._timeout = timeout

    def __aiter__(self) -> AsyncIterator[StreamChunk]:
        """An AsyncQueueReader is its own async iterator."""
        return self

    async def __anext__(self) -> StreamChunk:
        """Wait (off the event loop) for the next chunk or time out."""
        # Run the blocking Queue.get in a worker thread so other tasks keep running.
        pending = asyncio.get_running_loop().run_in_executor(None, self._queue.get)
        try:
            return await asyncio.wait_for(pending, timeout=self._timeout)
        except TimeoutError as e:
            msg = f"Queue read timeout after {self._timeout}s"
            raise TimeoutError(msg) from e
47
+
48
+
49
class QueueWriter:
    """Producer-side helper for pushing chunks, errors, and the done sentinel.

    Meant to run inside the subprocess worker; the consumer side drains the
    same queue with ``AsyncQueueReader``.
    """

    def __init__(self, queue: Queue) -> None:
        """Keep a reference to the shared queue."""
        self._queue = queue

    def send_data(self, data: bytes, metadata: dict[str, Any] | None = None) -> None:
        """Enqueue a payload chunk, optionally tagged with metadata."""
        self._queue.put(StreamChunk("data", data, metadata))

    def send_error(self, error: str | Exception) -> None:
        """Enqueue an error chunk; exceptions are stringified first."""
        message = str(error) if isinstance(error, Exception) else error
        self._queue.put(StreamChunk("error", message))

    def send_done(self, metadata: dict[str, Any] | None = None) -> None:
        """Enqueue the terminal "done" sentinel."""
        self._queue.put(StreamChunk("done", metadata=metadata))
@@ -0,0 +1,3 @@
1
+ """TTS server module with TTL-based model management."""
2
+
3
+ from __future__ import annotations
@@ -0,0 +1,335 @@
1
+ """FastAPI application for TTS server with OpenAI-compatible API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import TYPE_CHECKING, Annotated, Literal
7
+
8
+ from fastapi import FastAPI, HTTPException, Query
9
+ from fastapi.responses import StreamingResponse
10
+ from pydantic import BaseModel
11
+
12
+ from agent_cli import constants
13
+ from agent_cli.core.audio_format import check_ffmpeg_available, convert_to_mp3
14
+ from agent_cli.server.common import configure_app, create_lifespan
15
+ from agent_cli.server.tts.backends.base import InvalidTextError
16
+
17
+ if TYPE_CHECKING:
18
+ from collections.abc import AsyncIterator
19
+
20
+ from agent_cli.server.tts.model_registry import TTSModelRegistry
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
def _format_audio_response(
    audio: bytes,
    response_format: str,
    sample_rate: int,
    sample_width: int,
    channels: int,
) -> StreamingResponse:
    """Wrap synthesized WAV bytes in a StreamingResponse for the requested format.

    Supports "wav" (pass-through), "pcm" (header stripped, PCM parameters
    exposed via X-* headers), and "mp3" (ffmpeg transcode).
    """
    if response_format == "wav":
        # Backends produce WAV, so pass the bytes through untouched.
        return StreamingResponse(iter([audio]), media_type="audio/wav")

    if response_format == "pcm":
        # Strip the RIFF/WAV header to expose raw samples; the PCM
        # parameters travel in custom response headers instead.
        header_len = constants.WAV_HEADER_SIZE
        raw = audio[header_len:] if len(audio) > header_len else audio
        pcm_headers = {
            "X-Sample-Rate": str(sample_rate),
            "X-Sample-Width": str(sample_width),
            "X-Channels": str(channels),
        }
        return StreamingResponse(iter([raw]), media_type="audio/pcm", headers=pcm_headers)

    if response_format == "mp3":
        if not check_ffmpeg_available():
            raise HTTPException(
                status_code=422,
                detail="MP3 format requires ffmpeg to be installed",
            )
        try:
            mp3_data = convert_to_mp3(audio, input_format="wav")
        except RuntimeError as e:
            # A failed conversion is a server-side problem, not a bad request.
            raise HTTPException(status_code=500, detail=str(e)) from e
        return StreamingResponse(iter([mp3_data]), media_type="audio/mpeg")

    # Callers validate response_format up front, so this is unreachable.
    msg = f"Unsupported response_format: {response_format}"
    raise HTTPException(status_code=422, detail=msg)  # pragma: no cover
65
+
66
+
67
+ # --- Pydantic Models ---
68
+
69
+
70
class ModelStatusResponse(BaseModel):
    """Status of a single model.

    Mirrors the per-model status objects produced by
    ``registry.list_status()`` for the ``/health`` endpoint.
    """

    name: str
    loaded: bool
    device: str | None  # presumably None while unloaded — confirm against manager
    ttl_seconds: int
    ttl_remaining: float | None
    active_requests: int
    # Stats
    load_count: int
    unload_count: int
    total_requests: int
    total_characters: int  # filled from status.extra in the /health handler
    total_audio_seconds: float
    total_synthesis_seconds: float  # filled from status.extra in the /health handler
    last_load_time: float | None
    last_request_time: float | None
    load_duration_seconds: float | None
89
+
90
+
91
class HealthResponse(BaseModel):
    """Health check response."""

    status: str  # the /health handler always emits "healthy"
    models: list[ModelStatusResponse]  # one entry per registered model
96
+
97
+
98
class UnloadResponse(BaseModel):
    """Response from model unload request."""

    status: str  # "success" when the request completed
    model: str  # resolved model name (manager.config.model_name)
    was_loaded: bool  # whether the model was actually in memory before unload
104
+
105
+
106
class SpeechRequest(BaseModel):
    """Request body for JSON speech synthesis endpoint.

    Mirrors OpenAI's ``/v1/audio/speech`` request schema.
    """

    input: str  # text to synthesize
    model: str = "tts-1"  # OpenAI aliases "tts-1"/"tts-1-hd" resolve to the default model
    voice: str = "alloy"
    response_format: Literal["mp3", "wav", "pcm"] = "mp3"
    speed: float = 1.0  # clamped server-side to 0.25-4.0
    stream_format: Literal["audio"] | None = None  # "audio" enables PCM streaming
115
+
116
+
117
class VoiceInfo(BaseModel):
    """Information about an available voice."""

    voice_id: str
    name: str
    description: str
    preview_url: str | None = None  # not populated by the /v1/voices handler
    labels: dict[str, str] | None = None  # e.g. {"language": ...}
125
+
126
+
127
class VoicesResponse(BaseModel):
    """Response containing available voices."""

    voices: list[VoiceInfo]  # one entry per registered model (model == voice for Piper)
131
+
132
+
133
+ # --- App Factory ---
134
+
135
+
136
def create_app(
    registry: TTSModelRegistry,
    *,
    enable_wyoming: bool = True,
    wyoming_uri: str = "tcp://0.0.0.0:10200",
) -> FastAPI:
    """Create the FastAPI application.

    Args:
        registry: The model registry to use.
        enable_wyoming: Whether to start Wyoming server.
        wyoming_uri: URI for Wyoming server.

    Returns:
        Configured FastAPI application.

    """
    # Lifespan hook optionally starts the Wyoming protocol server alongside HTTP.
    lifespan = create_lifespan(
        registry,
        wyoming_handler_module="agent_cli.server.tts.wyoming_handler",
        enable_wyoming=enable_wyoming,
        wyoming_uri=wyoming_uri,
    )

    app = FastAPI(
        title="TTS Server",
        description="OpenAI-compatible TTS server with TTL-based model unloading",
        version="1.0.0",
        lifespan=lifespan,
    )

    configure_app(app)

    # --- Health & Status Endpoints ---

    @app.get("/health", response_model=HealthResponse)
    async def health_check() -> HealthResponse:
        """Health check endpoint."""
        # total_characters / total_synthesis_seconds live in the
        # backend-specific "extra" dict, not as first-class status fields.
        models = [
            ModelStatusResponse(
                name=s.name,
                loaded=s.loaded,
                device=s.device,
                ttl_seconds=s.ttl_seconds,
                ttl_remaining=s.ttl_remaining,
                active_requests=s.active_requests,
                load_count=s.load_count,
                unload_count=s.unload_count,
                total_requests=s.total_requests,
                total_characters=int(s.extra.get("total_characters", 0.0)),
                total_audio_seconds=s.total_audio_seconds,
                total_synthesis_seconds=s.extra.get("total_synthesis_seconds", 0.0),
                last_load_time=s.last_load_time,
                last_request_time=s.last_request_time,
                load_duration_seconds=s.load_duration_seconds,
            )
            for s in registry.list_status()
        ]
        return HealthResponse(status="healthy", models=models)

    @app.post("/v1/model/unload", response_model=UnloadResponse)
    async def unload_model(
        model: Annotated[str | None, Query(description="Model to unload")] = None,
    ) -> UnloadResponse:
        """Manually unload a model from memory."""
        # get_manager raises ValueError for unknown model names -> 404.
        try:
            manager = registry.get_manager(model)
            was_loaded = await manager.unload()
            return UnloadResponse(
                status="success",
                model=manager.config.model_name,
                was_loaded=was_loaded,
            )
        except ValueError as e:
            raise HTTPException(status_code=404, detail=str(e)) from e

    @app.get("/v1/voices", response_model=VoicesResponse)
    async def list_voices() -> VoicesResponse:
        """List available voices (models).

        For Piper TTS, each model IS a voice. This endpoint returns
        the list of registered models as available voices.
        """
        # Piper model names look like "en_US-..." — the leading segment is
        # used as a best-effort language label.
        voices = [
            VoiceInfo(
                voice_id=s.name,
                name=s.name,
                description=f"Piper TTS voice: {s.name}",
                labels={"language": s.name.split("_")[0] if "_" in s.name else "en"},
            )
            for s in registry.list_status()
        ]
        return VoicesResponse(voices=voices)

    # --- OpenAI-Compatible TTS Endpoint ---

    async def _synthesize(
        input_text: str,
        model: str,
        voice: str,
        response_format: str,
        speed: float,
        stream_format: str | None,
    ) -> StreamingResponse:
        """Core synthesis logic shared by JSON and form endpoints."""
        # Resolve model name - "tts-1" and "tts-1-hd" are OpenAI's model names
        model_name = None if model in ("tts-1", "tts-1-hd") else model

        try:
            manager = registry.get_manager(model_name)
        except ValueError as e:
            raise HTTPException(status_code=400, detail=str(e)) from e

        if not input_text.strip():
            raise HTTPException(status_code=400, detail="Input text cannot be empty")

        # Clamp speed to valid range
        # Out-of-range speeds are silently coerced rather than rejected.
        speed = max(0.25, min(4.0, speed))

        # Handle streaming mode (OpenAI uses stream_format=audio with response_format=pcm)
        if stream_format is not None:
            if stream_format != "audio":
                raise HTTPException(
                    status_code=422,
                    detail="Only 'audio' stream_format is supported",
                )
            if response_format != "pcm":
                raise HTTPException(
                    status_code=422,
                    detail="Streaming requires response_format=pcm",
                )
            if not manager.supports_streaming:
                raise HTTPException(
                    status_code=422,
                    detail="This model does not support streaming synthesis",
                )

            async def generate_audio() -> AsyncIterator[bytes]:
                async for chunk in manager.synthesize_stream(
                    input_text,
                    voice=voice,
                    speed=speed,
                ):
                    yield chunk

            # NOTE(review): assumes streaming backends emit 16-bit mono at
            # KOKORO_DEFAULT_SAMPLE_RATE — confirm for non-Kokoro backends.
            return StreamingResponse(
                generate_audio(),
                media_type="audio/pcm",
                headers={
                    "X-Sample-Rate": str(constants.KOKORO_DEFAULT_SAMPLE_RATE),
                    "X-Sample-Width": "2",
                    "X-Channels": "1",
                },
            )

        # Non-streaming mode: validate format and synthesize complete audio
        valid_formats = ("wav", "pcm", "mp3")
        if response_format not in valid_formats:
            raise HTTPException(
                status_code=422,
                detail=f"Unsupported response_format: {response_format}. Supported: {', '.join(valid_formats)}",
            )

        try:
            result = await manager.synthesize(
                input_text,
                voice=voice,
                speed=speed,
            )
        except InvalidTextError as e:
            raise HTTPException(status_code=400, detail=str(e)) from e
        except Exception as e:
            logger.exception("Synthesis failed")
            raise HTTPException(status_code=500, detail=str(e)) from e

        return _format_audio_response(
            result.audio,
            response_format,
            result.sample_rate,
            result.sample_width,
            result.channels,
        )

    @app.post("/v1/audio/speech")
    async def synthesize_speech(request: SpeechRequest) -> StreamingResponse:
        """OpenAI-compatible text-to-speech endpoint.

        Accepts JSON body with input, model, voice, response_format, speed,
        and optional stream_format parameters.
        """
        return await _synthesize(
            input_text=request.input,
            model=request.model,
            voice=request.voice,
            response_format=request.response_format,
            speed=request.speed,
            stream_format=request.stream_format,
        )

    return app
@@ -0,0 +1,82 @@
1
+ """TTS backend factory with platform auto-detection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import TYPE_CHECKING, Literal
7
+
8
+ if TYPE_CHECKING:
9
+ from agent_cli.server.tts.backends.base import TTSBackend
10
+
11
+ from agent_cli.server.tts.backends.base import (
12
+ BackendConfig,
13
+ SynthesisResult,
14
+ has_gpu,
15
+ )
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ BackendType = Literal["piper", "kokoro", "auto"]
20
+
21
+
22
def detect_backend() -> Literal["piper", "kokoro"]:
    """Pick the best TTS backend for this machine.

    Returns:
        "kokoro" when a GPU is present and the kokoro package imports
        cleanly; "piper" in every other case.

    """
    if not has_gpu():
        return "piper"
    try:
        import kokoro  # noqa: F401, PLC0415
    except ImportError:
        # GPU present but kokoro not installed — fall back to piper.
        return "piper"
    return "kokoro"
37
+
38
+
39
def create_backend(
    config: BackendConfig,
    backend_type: BackendType = "auto",
) -> TTSBackend:
    """Instantiate a TTS backend.

    Args:
        config: Backend configuration.
        backend_type: "piper", "kokoro", or "auto" for platform detection.

    Returns:
        Configured TTSBackend instance.

    Raises:
        ImportError: If the required backend package is not installed.
        ValueError: If an unknown backend type is specified.

    """
    resolved = detect_backend() if backend_type == "auto" else backend_type

    logger.debug("Creating %s backend for model %s", resolved, config.model_name)

    # Backend modules are imported lazily so an uninstalled backend only
    # fails when it is actually requested.
    if resolved == "kokoro":
        from agent_cli.server.tts.backends.kokoro import KokoroBackend  # noqa: PLC0415

        return KokoroBackend(config)

    if resolved == "piper":
        from agent_cli.server.tts.backends.piper import PiperBackend  # noqa: PLC0415

        return PiperBackend(config)

    msg = f"Unknown backend type: {resolved}"
    raise ValueError(msg)
74
+
75
+
76
# Public surface of the backends package; TTSBackend itself is imported
# here only under TYPE_CHECKING and lives in .base.
__all__ = [
    "BackendConfig",
    "BackendType",
    "SynthesisResult",
    "create_backend",
    "detect_backend",
]
@@ -0,0 +1,139 @@
1
+ """Base types and protocol for TTS backends."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Protocol, runtime_checkable
8
+
9
+ if TYPE_CHECKING:
10
+ from collections.abc import AsyncIterator
11
+
12
+
13
def get_torch_device() -> str:
    """Return the preferred PyTorch device: "cuda", "mps", or "cpu".

    Falls back to "cpu" when torch is not installed at all.
    """
    try:
        import torch  # noqa: PLC0415
    except ImportError:
        return "cpu"
    if torch.cuda.is_available():
        return "cuda"
    # Older torch builds may lack the mps backend attribute entirely.
    mps = getattr(torch.backends, "mps", None)
    if mps is not None and mps.is_available():
        return "mps"
    return "cpu"
25
+
26
+
27
def has_gpu() -> bool:
    """Return True when an accelerator (CUDA or Apple MPS) is available."""
    device = get_torch_device()
    return device == "cuda" or device == "mps"
30
+
31
+
32
def get_backend_cache_dir(backend_name: str) -> Path:
    """Return ``~/.cache/<backend_name>``, creating it if necessary."""
    target = Path.home().joinpath(".cache", backend_name)
    # Idempotent: repeated calls on an existing directory are fine.
    target.mkdir(parents=True, exist_ok=True)
    return target
37
+
38
+
39
@dataclass
class SynthesisResult:
    """Result of a synthesis operation.

    Field order is part of the positional-constructor contract; do not reorder.
    """

    audio: bytes  # encoded audio bytes (consumers strip a WAV header — confirm per backend)
    sample_rate: int  # frames per second
    sample_width: int  # bytes per sample
    channels: int  # channel count (1 = mono)
    duration: float  # audio length in seconds
48
+
49
+
50
@dataclass
class BackendConfig:
    """Configuration for a TTS backend.

    Field order is part of the positional-constructor contract; do not reorder.
    """

    model_name: str  # identifier of the model to load
    device: str = "auto"  # presumably resolved via get_torch_device — confirm per backend
    cache_dir: Path | None = None  # model cache location; None = backend default
57
+
58
+
59
class InvalidTextError(ValueError):
    """Raised when input text cannot be synthesized because it is invalid or unsupported."""
61
+
62
+
63
@runtime_checkable
class TTSBackend(Protocol):
    """Protocol for TTS synthesis backends.

    Backends handle model loading, unloading, and synthesis.
    The ModelManager handles TTL, stats, and lifecycle.

    runtime_checkable allows ``isinstance(obj, TTSBackend)`` checks, which
    verify method presence only, not signatures.
    """

    @property
    def is_loaded(self) -> bool:
        """Check if the model is currently loaded."""
        ...

    @property
    def device(self) -> str | None:
        """Get the device the model is loaded on, or None if not loaded."""
        ...

    async def load(self) -> float:
        """Load the model into memory.

        Returns:
            Load duration in seconds.

        """
        ...

    async def unload(self) -> None:
        """Unload the model and free memory."""
        ...

    async def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
    ) -> SynthesisResult:
        """Synthesize text to audio.

        Args:
            text: Text to synthesize.
            voice: Voice to use (optional, uses model default if not specified).
            speed: Speech speed multiplier (0.25 to 4.0).

        Returns:
            SynthesisResult with audio data and metadata.

        """
        ...

    @property
    def supports_streaming(self) -> bool:
        """Check if backend supports streaming synthesis."""
        # Concrete default: backends that do not override this report False
        # and are rejected by the streaming endpoint.
        return False

    def synthesize_stream(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
    ) -> AsyncIterator[bytes]:
        """Stream synthesized audio chunks as they are generated.

        Implementations should be async generators (async def with yield).

        Args:
            text: Text to synthesize.
            voice: Voice to use (optional).
            speed: Speech speed multiplier (0.25 to 4.0).

        Yields:
            Raw PCM audio chunks (int16, mono).

        """
        ...