inferall 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. inferall-0.2.2/LICENSE +21 -0
  2. inferall-0.2.2/PKG-INFO +636 -0
  3. inferall-0.2.2/README.md +568 -0
  4. inferall-0.2.2/inferall/__init__.py +3 -0
  5. inferall-0.2.2/inferall/__main__.py +6 -0
  6. inferall-0.2.2/inferall/api/__init__.py +0 -0
  7. inferall-0.2.2/inferall/api/server.py +2228 -0
  8. inferall-0.2.2/inferall/api/websocket.py +175 -0
  9. inferall-0.2.2/inferall/auth/__init__.py +0 -0
  10. inferall-0.2.2/inferall/auth/key_store.py +218 -0
  11. inferall-0.2.2/inferall/auth/middleware.py +115 -0
  12. inferall-0.2.2/inferall/backends/__init__.py +0 -0
  13. inferall-0.2.2/inferall/backends/asr_backend.py +189 -0
  14. inferall-0.2.2/inferall/backends/base.py +802 -0
  15. inferall-0.2.2/inferall/backends/classification_backend.py +444 -0
  16. inferall-0.2.2/inferall/backends/diffusion_backend.py +155 -0
  17. inferall-0.2.2/inferall/backends/embedding_backend.py +183 -0
  18. inferall-0.2.2/inferall/backends/img2img_backend.py +164 -0
  19. inferall-0.2.2/inferall/backends/llamacpp_backend.py +213 -0
  20. inferall-0.2.2/inferall/backends/ollama_cloud_backend.py +279 -0
  21. inferall-0.2.2/inferall/backends/rerank_backend.py +263 -0
  22. inferall-0.2.2/inferall/backends/seq2seq_backend.py +188 -0
  23. inferall-0.2.2/inferall/backends/transformers_backend.py +683 -0
  24. inferall-0.2.2/inferall/backends/tts_backend.py +167 -0
  25. inferall-0.2.2/inferall/backends/video_backend.py +227 -0
  26. inferall-0.2.2/inferall/backends/vllm_backend.py +585 -0
  27. inferall-0.2.2/inferall/backends/vllm_runtime.py +158 -0
  28. inferall-0.2.2/inferall/backends/vlm_backend.py +501 -0
  29. inferall-0.2.2/inferall/cli/__init__.py +0 -0
  30. inferall-0.2.2/inferall/cli/app.py +54 -0
  31. inferall-0.2.2/inferall/cli/commands/__init__.py +0 -0
  32. inferall-0.2.2/inferall/cli/commands/keys.py +118 -0
  33. inferall-0.2.2/inferall/cli/commands/list_cmd.py +82 -0
  34. inferall-0.2.2/inferall/cli/commands/login.py +29 -0
  35. inferall-0.2.2/inferall/cli/commands/pull.py +308 -0
  36. inferall-0.2.2/inferall/cli/commands/remove.py +96 -0
  37. inferall-0.2.2/inferall/cli/commands/run.py +281 -0
  38. inferall-0.2.2/inferall/cli/commands/serve.py +148 -0
  39. inferall-0.2.2/inferall/cli/commands/status.py +83 -0
  40. inferall-0.2.2/inferall/cli/commands/vllm_cmd.py +129 -0
  41. inferall-0.2.2/inferall/config.py +153 -0
  42. inferall-0.2.2/inferall/gpu/__init__.py +0 -0
  43. inferall-0.2.2/inferall/gpu/allocator.py +432 -0
  44. inferall-0.2.2/inferall/gpu/manager.py +358 -0
  45. inferall-0.2.2/inferall/orchestrator.py +873 -0
  46. inferall-0.2.2/inferall/registry/__init__.py +0 -0
  47. inferall-0.2.2/inferall/registry/assistants_store.py +390 -0
  48. inferall-0.2.2/inferall/registry/file_store.py +164 -0
  49. inferall-0.2.2/inferall/registry/hf_resolver.py +549 -0
  50. inferall-0.2.2/inferall/registry/jobs_store.py +439 -0
  51. inferall-0.2.2/inferall/registry/metadata.py +175 -0
  52. inferall-0.2.2/inferall/registry/ollama_resolver.py +416 -0
  53. inferall-0.2.2/inferall/registry/registry.py +409 -0
  54. inferall-0.2.2/inferall/scheduling/__init__.py +0 -0
  55. inferall-0.2.2/inferall/scheduling/batcher.py +147 -0
  56. inferall-0.2.2/inferall/scheduling/dispatcher.py +194 -0
  57. inferall-0.2.2/inferall/scheduling/gpu_scheduler.py +117 -0
  58. inferall-0.2.2/inferall/tui/__init__.py +1 -0
  59. inferall-0.2.2/inferall/tui/__main__.py +19 -0
  60. inferall-0.2.2/inferall/tui/app.py +557 -0
  61. inferall-0.2.2/inferall.egg-info/PKG-INFO +636 -0
  62. inferall-0.2.2/inferall.egg-info/SOURCES.txt +95 -0
  63. inferall-0.2.2/inferall.egg-info/dependency_links.txt +1 -0
  64. inferall-0.2.2/inferall.egg-info/entry_points.txt +3 -0
  65. inferall-0.2.2/inferall.egg-info/requires.txt +49 -0
  66. inferall-0.2.2/inferall.egg-info/top_level.txt +1 -0
  67. inferall-0.2.2/pyproject.toml +92 -0
  68. inferall-0.2.2/setup.cfg +4 -0
  69. inferall-0.2.2/tests/test_api.py +270 -0
  70. inferall-0.2.2/tests/test_api_multimodal.py +168 -0
  71. inferall-0.2.2/tests/test_api_streaming.py +106 -0
  72. inferall-0.2.2/tests/test_assistants_api.py +273 -0
  73. inferall-0.2.2/tests/test_auth.py +102 -0
  74. inferall-0.2.2/tests/test_backends.py +172 -0
  75. inferall-0.2.2/tests/test_backends_multimodal.py +74 -0
  76. inferall-0.2.2/tests/test_classification.py +313 -0
  77. inferall-0.2.2/tests/test_cli_commands.py +88 -0
  78. inferall-0.2.2/tests/test_config.py +119 -0
  79. inferall-0.2.2/tests/test_dispatcher.py +143 -0
  80. inferall-0.2.2/tests/test_files_api.py +240 -0
  81. inferall-0.2.2/tests/test_gpu_allocator.py +220 -0
  82. inferall-0.2.2/tests/test_gpu_scheduler.py +166 -0
  83. inferall-0.2.2/tests/test_hf_resolver.py +189 -0
  84. inferall-0.2.2/tests/test_img2img.py +275 -0
  85. inferall-0.2.2/tests/test_jobs_api.py +224 -0
  86. inferall-0.2.2/tests/test_ollama_resolver.py +78 -0
  87. inferall-0.2.2/tests/test_openai_compat.py +235 -0
  88. inferall-0.2.2/tests/test_orchestrator.py +245 -0
  89. inferall-0.2.2/tests/test_orchestrator_multimodal.py +120 -0
  90. inferall-0.2.2/tests/test_registry.py +146 -0
  91. inferall-0.2.2/tests/test_registry_v2.py +138 -0
  92. inferall-0.2.2/tests/test_remaining_tasks.py +276 -0
  93. inferall-0.2.2/tests/test_rerank.py +327 -0
  94. inferall-0.2.2/tests/test_seq2seq.py +289 -0
  95. inferall-0.2.2/tests/test_tool_calling.py +247 -0
  96. inferall-0.2.2/tests/test_video.py +306 -0
  97. inferall-0.2.2/tests/test_vllm_backend.py +432 -0
inferall-0.2.2/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 GravenSm
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,636 @@
1
+ Metadata-Version: 2.4
2
+ Name: inferall
3
+ Version: 0.2.2
4
+ Summary: Universal inference engine for every AI model — chat, embeddings, vision, ASR, diffusion, TTS, with optional vLLM acceleration
5
+ Author-email: GravenSm <GravenSm@users.noreply.github.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://inferall.dev
8
+ Project-URL: Repository, https://github.com/GravenSm/inferall
9
+ Project-URL: Issues, https://github.com/GravenSm/inferall/issues
10
+ Project-URL: Changelog, https://github.com/GravenSm/inferall/releases
11
+ Keywords: inference,llm,openai-compatible,huggingface,vllm,transformers,gguf,embeddings,vision-language,whisper,diffusion,self-hosted,ai-server
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Environment :: GPU :: NVIDIA CUDA
15
+ Classifier: Environment :: Web Environment
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Science/Research
18
+ Classifier: Operating System :: POSIX :: Linux
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Classifier: Topic :: Internet :: WWW/HTTP :: HTTP Servers
26
+ Requires-Python: >=3.10
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: torch>=2.0
30
+ Requires-Dist: transformers>=4.36
31
+ Requires-Dist: accelerate>=0.25
32
+ Requires-Dist: huggingface-hub>=0.20
33
+ Requires-Dist: fastapi>=0.104
34
+ Requires-Dist: uvicorn[standard]>=0.24
35
+ Requires-Dist: typer>=0.9
36
+ Requires-Dist: rich>=13.0
37
+ Requires-Dist: pyyaml>=6.0
38
+ Requires-Dist: nvidia-ml-py>=11.5
39
+ Requires-Dist: python-multipart>=0.0.6
40
+ Requires-Dist: textual>=0.40
41
+ Provides-Extra: gguf
42
+ Requires-Dist: llama-cpp-python>=0.2; extra == "gguf"
43
+ Provides-Extra: gptq
44
+ Requires-Dist: auto-gptq>=0.6; extra == "gptq"
45
+ Provides-Extra: awq
46
+ Requires-Dist: autoawq>=0.1; extra == "awq"
47
+ Provides-Extra: bnb
48
+ Requires-Dist: bitsandbytes>=0.41; extra == "bnb"
49
+ Provides-Extra: embeddings
50
+ Requires-Dist: sentence-transformers>=2.2; extra == "embeddings"
51
+ Provides-Extra: diffusion
52
+ Requires-Dist: diffusers>=0.25; extra == "diffusion"
53
+ Requires-Dist: Pillow>=10.0; extra == "diffusion"
54
+ Provides-Extra: asr
55
+ Requires-Dist: soundfile>=0.12; extra == "asr"
56
+ Requires-Dist: librosa>=0.10; extra == "asr"
57
+ Provides-Extra: tts
58
+ Requires-Dist: scipy>=1.10; extra == "tts"
59
+ Provides-Extra: multimodal
60
+ Requires-Dist: inferall[asr,diffusion,embeddings,tts]; extra == "multimodal"
61
+ Provides-Extra: dev
62
+ Requires-Dist: pytest>=7.0; extra == "dev"
63
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
64
+ Requires-Dist: httpx>=0.25; extra == "dev"
65
+ Provides-Extra: all
66
+ Requires-Dist: inferall[awq,bnb,dev,gguf,gptq,multimodal]; extra == "all"
67
+ Dynamic: license-file
68
+
69
+ # InferAll
70
+
71
+ **Run any AI model locally — one unified API for chat, vision, speech, images, video, and more. Built for multi-user serving.**
72
+
73
+ InferAll is a self-hosted inference server that exposes an **OpenAI-compatible REST API** for every type of AI model. Point any OpenAI SDK client, LangChain, LlamaIndex, or custom application at InferAll and it just works — no code changes needed.
74
+
75
+ ### What it does
76
+
77
+ - **One API for everything** — 17 model types through standard OpenAI endpoints (`/v1/chat/completions`, `/v1/embeddings`, `/v1/images/generations`, `/v1/audio/transcriptions`, and 50+ more)
78
+ - **Runs as a server** — start it with `inferall serve` and any client on your network can connect
79
+ - **Multi-user ready** — per-API-key rate limiting, priority levels, and per-model request queuing so one user's request never blocks another's
80
+ - **Pull from anywhere** — models from HuggingFace Hub, Ollama registry, or Ollama cloud, all through one CLI
81
+ - **GPU optimized** — multi-GPU scheduling with load balancing, VRAM-aware allocation, GGUF at full speed (113 tok/s on RTX 4090), plus fp16/GPTQ/AWQ/BNB quantization
82
+ - **Optional vLLM acceleration** — opt any chat or VLM model into a high-throughput vLLM backend for ~50% faster single-stream inference and continuous batching under load (chandra-ocr-2: 31.6 → 48.2 tok/s on RTX 4090)
83
+ - **Production features** — Assistants API with threads and runs, Files API, Batch processing, Fine-tuning API, tool/function calling, structured JSON output
84
+ - **Built-in dashboard** — terminal UI for real-time GPU monitoring, request queues, performance metrics, and model management
85
+
86
+ ### Supported model types
87
+
88
+ Chat/LLM · Embeddings · Reranking · Vision-Language · Speech Recognition · Text-to-Speech · Image Generation · Image-to-Image · Video Generation · Translation · Summarization · Classification · Object Detection · Segmentation · Depth Estimation · Document QA · Audio Processing
89
+
90
+ ## Requirements
91
+
92
+ - Python 3.10+
93
+ - NVIDIA GPU with CUDA (CPU fallback available)
94
+ - ~2GB disk for base install (models downloaded separately)
95
+
96
+ ## Installation
97
+
98
+ ### 1. Clone and create virtual environment
99
+
100
+ ```bash
101
+ git clone https://github.com/GravenSm/inferall.git
102
+ cd inferall
103
+ python3 -m venv .venv
104
+ source .venv/bin/activate
105
+ ```
106
+
107
+ > **Note:** If your filesystem doesn't support symlinks (NTFS, exFAT), use `python3 -m venv --copies .venv` or create the venv on a native Linux filesystem.
108
+
109
+ ### 2. Install PyTorch first (with CUDA)
110
+
111
+ ```bash
112
+ pip install "torch>=2.0"
113
+ ```
114
+
115
+ Verify CUDA works:
116
+ ```bash
117
+ python -c "import torch; print(f'CUDA: {torch.cuda.is_available()}, GPUs: {torch.cuda.device_count()}')"
118
+ ```
119
+
120
+ ### 3. Install InferAll
121
+
122
+ **Minimal** (chat + embeddings):
123
+ ```bash
124
+ pip install -e .
125
+ ```
126
+
127
+ **Full install** (all model types):
128
+ ```bash
129
+ pip install -e ".[all]"
130
+ ```
131
+
132
+ **Custom install** (pick what you need):
133
+ ```bash
134
+ # GGUF support (llama.cpp)
135
+ pip install -e ".[gguf]"
136
+
137
+ # Quantized models
138
+ pip install -e ".[bnb]" # bitsandbytes 4/8-bit
139
+ pip install -e ".[gptq]" # GPTQ models
140
+ pip install -e ".[awq]" # AWQ models
141
+
142
+ # Multi-modal
143
+ pip install -e ".[multimodal]" # embeddings + diffusion + ASR + TTS
144
+
145
+ # Development
146
+ pip install -e ".[dev]" # pytest + httpx
147
+ ```
148
+
149
+ ### 4. Extra dependencies for specific tasks
150
+
151
+ ```bash
152
+ # SSE streaming (required for streaming chat)
153
+ pip install sse-starlette
154
+
155
+ # Object detection (DETR, YOLO)
156
+ pip install timm
157
+
158
+ # Document QA (LayoutLM — needs Tesseract OCR)
159
+ pip install pytesseract
160
+ # Also install system tesseract: sudo apt install tesseract-ocr
161
+
162
+ # Video generation (optional MP4 encoding)
163
+ pip install "imageio[ffmpeg]"
164
+
165
+ # VLM models (Qwen-VL, etc.)
166
+ pip install torchvision
167
+ ```
168
+
169
+ ### 5. GGUF with CUDA (for GPU-accelerated llama.cpp)
170
+
171
+ The default `llama-cpp-python` pip install is CPU-only. For GPU acceleration:
172
+
173
+ ```bash
174
+ # Install pre-built CUDA wheel
175
+ pip install llama-cpp-python --force-reinstall \
176
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
177
+
178
+ # Set library path (add to your shell profile)
179
+ export LD_LIBRARY_PATH="$(python -c 'import nvidia.cuda_runtime; print(nvidia.cuda_runtime.__path__[0])')/lib:$LD_LIBRARY_PATH"
180
+ ```
181
+
182
+ ## Quick Start
183
+
184
+ ### Pull a model
185
+
186
+ Models can be pulled from **HuggingFace Hub** or **Ollama's registry**. The source is auto-detected:
187
+
188
+ ```bash
189
+ # From HuggingFace (org/model format)
190
+ inferall pull Qwen/Qwen2.5-1.5B-Instruct
191
+ inferall pull sentence-transformers/all-MiniLM-L6-v2
192
+
193
+ # From Ollama (short name = Ollama registry)
194
+ inferall pull llama3.1
195
+ inferall pull llama3.1:70b
196
+ inferall pull codellama
197
+
198
+ # Force a specific source
199
+ inferall pull --source ollama gemma2
200
+ inferall pull --source hf google/gemma-2-2b-it
201
+ ```
202
+
203
+ Ollama models are GGUF files served from `registry.ollama.ai` — they work with the llama.cpp backend just like HuggingFace GGUF models.
204
+
205
+ ### Chat interactively
206
+
207
+ ```bash
208
+ inferall run Qwen/Qwen2.5-1.5B-Instruct
209
+ ```
210
+
211
+ Commands inside the REPL:
212
+ - Type your message and press Enter
213
+ - `/system <prompt>` — set system prompt
214
+ - `/clear` — reset conversation
215
+ - `/params` — show generation parameters
216
+ - `/exit` or Ctrl+D — quit
217
+ - End a line with `\` for multi-line input
218
+
219
+ ### Start the API server
220
+
221
+ ```bash
222
+ inferall serve
223
+ ```
224
+
225
+ With options:
226
+ ```bash
227
+ inferall serve --port 8080 --host 0.0.0.0 --api-key mykey --workers 4
228
+ ```
229
+
230
+ Or via environment variables:
231
+ ```bash
232
+ INFERALL_PORT=8080 INFERALL_API_KEY=mykey inferall serve
233
+ ```
234
+
235
+ ### List pulled models
236
+
237
+ ```bash
238
+ inferall list
239
+ ```
240
+
241
+ ### Check GPU status
242
+
243
+ ```bash
244
+ inferall status
245
+ ```
246
+
247
+ ### Remove a model
248
+
249
+ ```bash
250
+ inferall remove Qwen/Qwen2.5-1.5B-Instruct
251
+ ```
252
+
253
+ ### vLLM acceleration (optional)
254
+
255
+ ```bash
256
+ inferall vllm install # bootstrap isolated vllm venv
257
+ inferall vllm enable datalab-to/chandra-ocr-2 # opt a model in
258
+ inferall vllm disable datalab-to/chandra-ocr-2 # revert to default backend
259
+ inferall vllm status # show runtime location
260
+ ```
261
+
262
+ See the [vLLM Backend](#vllm-backend-optional-high-throughput) section below for details.
263
+
264
+ ## API Reference
265
+
266
+ All endpoints are OpenAI-compatible where applicable. The server runs at `http://127.0.0.1:8000` by default.
267
+
268
+ ### Chat Completion
269
+
270
+ ```bash
271
+ # Non-streaming
272
+ curl http://localhost:8000/v1/chat/completions \
273
+ -H "Content-Type: application/json" \
274
+ -d '{
275
+ "model": "Qwen/Qwen2.5-1.5B-Instruct",
276
+ "messages": [{"role": "user", "content": "Hello!"}],
277
+ "max_tokens": 256
278
+ }'
279
+
280
+ # Streaming
281
+ curl http://localhost:8000/v1/chat/completions \
282
+ -H "Content-Type: application/json" \
283
+ -d '{
284
+ "model": "Qwen/Qwen2.5-1.5B-Instruct",
285
+ "messages": [{"role": "user", "content": "Hello!"}],
286
+ "stream": true
287
+ }'
288
+ ```
289
+
290
+ ### Embeddings
291
+
292
+ ```bash
293
+ curl http://localhost:8000/v1/embeddings \
294
+ -H "Content-Type: application/json" \
295
+ -d '{
296
+ "model": "sentence-transformers/all-MiniLM-L6-v2",
297
+ "input": ["Hello world", "How are you?"]
298
+ }'
299
+ ```
300
+
301
+ ### Reranking
302
+
303
+ ```bash
304
+ curl http://localhost:8000/v1/rerank \
305
+ -H "Content-Type: application/json" \
306
+ -d '{
307
+ "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
308
+ "query": "What is Python?",
309
+ "documents": ["Python is a snake", "Python is a programming language"],
310
+ "top_n": 2,
311
+ "return_documents": true
312
+ }'
313
+ ```
314
+
315
+ ### Translation (Seq2seq)
316
+
317
+ ```bash
318
+ curl http://localhost:8000/v1/text/generate \
319
+ -H "Content-Type: application/json" \
320
+ -d '{
321
+ "model": "Helsinki-NLP/opus-mt-en-fr",
322
+ "input": "Hello, how are you today?",
323
+ "num_beams": 4
324
+ }'
325
+ ```
326
+
327
+ ### Image Generation
328
+
329
+ ```bash
330
+ curl http://localhost:8000/v1/images/generations \
331
+ -H "Content-Type: application/json" \
332
+ -d '{
333
+ "model": "stabilityai/sdxl-turbo",
334
+ "prompt": "a cat sitting on a chair",
335
+ "size": "512x512",
336
+ "num_inference_steps": 1,
337
+ "guidance_scale": 0.0
338
+ }'
339
+ ```
340
+
341
+ ### Image-to-Image
342
+
343
+ ```bash
344
+ curl http://localhost:8000/v1/images/edits \
345
+ -H "Content-Type: application/json" \
346
+ -d '{
347
+ "model": "your-img2img-model",
348
+ "prompt": "make it a watercolor painting",
349
+ "image": "<base64-encoded-image>",
350
+ "strength": 0.7
351
+ }'
352
+ ```
353
+
354
+ ### Video Generation
355
+
356
+ ```bash
357
+ curl http://localhost:8000/v1/videos/generations \
358
+ -H "Content-Type: application/json" \
359
+ -d '{
360
+ "model": "your-video-model",
361
+ "prompt": "a cat running in a field",
362
+ "num_frames": 16,
363
+ "fps": 8,
364
+ "size": "512x512"
365
+ }'
366
+ ```
367
+
368
+ ### Speech Recognition (ASR)
369
+
370
+ ```bash
371
+ curl http://localhost:8000/v1/audio/transcriptions \
372
+ -F "file=@audio.wav" \
373
+ -F "model=openai/whisper-tiny"
374
+ ```
375
+
376
+ ### Text-to-Speech
377
+
378
+ ```bash
379
+ curl http://localhost:8000/v1/audio/speech \
380
+ -H "Content-Type: application/json" \
381
+ -d '{
382
+ "model": "suno/bark-small",
383
+ "input": "Hello world"
384
+ }' -o speech.wav
385
+ ```
386
+
387
+ ### Classification
388
+
389
+ ```bash
390
+ # Image classification
391
+ curl http://localhost:8000/v1/classify \
392
+ -H "Content-Type: application/json" \
393
+ -d '{
394
+ "model": "google/vit-base-patch16-224",
395
+ "image": "<base64-image>",
396
+ "top_k": 5
397
+ }'
398
+
399
+ # Zero-shot text classification
400
+ curl http://localhost:8000/v1/classify \
401
+ -H "Content-Type: application/json" \
402
+ -d '{
403
+ "model": "facebook/bart-large-mnli",
404
+ "text": "The stock market crashed today",
405
+ "candidate_labels": ["politics", "finance", "sports"]
406
+ }'
407
+ ```
408
+
409
+ ### Object Detection
410
+
411
+ ```bash
412
+ curl http://localhost:8000/v1/detect \
413
+ -H "Content-Type: application/json" \
414
+ -d '{
415
+ "model": "facebook/detr-resnet-50",
416
+ "image": "<base64-image>",
417
+ "threshold": 0.5
418
+ }'
419
+ ```
420
+
421
+ ### Image Segmentation
422
+
423
+ ```bash
424
+ curl http://localhost:8000/v1/segment \
425
+ -H "Content-Type: application/json" \
426
+ -d '{
427
+ "model": "mattmdjaga/segformer_b2_clothes",
428
+ "image": "<base64-image>"
429
+ }'
430
+ ```
431
+
432
+ ### Depth Estimation
433
+
434
+ ```bash
435
+ curl http://localhost:8000/v1/depth \
436
+ -H "Content-Type: application/json" \
437
+ -d '{
438
+ "model": "LiheYoung/depth-anything-small-hf",
439
+ "image": "<base64-image>"
440
+ }'
441
+ ```
442
+
443
+ ### Document QA
444
+
445
+ ```bash
446
+ curl http://localhost:8000/v1/document-qa \
447
+ -H "Content-Type: application/json" \
448
+ -d '{
449
+ "model": "impira/layoutlm-document-qa",
450
+ "image": "<base64-document-image>",
451
+ "question": "What is the invoice number?"
452
+ }'
453
+ ```
454
+
455
+ ### Audio Processing
456
+
457
+ ```bash
458
+ curl http://localhost:8000/v1/audio/process \
459
+ -H "Content-Type: application/json" \
460
+ -d '{
461
+ "model": "your-audio-model",
462
+ "audio": "<base64-audio>"
463
+ }'
464
+ ```
465
+
466
+ ### Health Check
467
+
468
+ ```bash
469
+ curl http://localhost:8000/health
470
+ ```
471
+
472
+ ### List Models
473
+
474
+ ```bash
475
+ curl http://localhost:8000/v1/models
476
+ ```
477
+
478
+ ## Supported Model Types
479
+
480
+ | Task | Endpoint | Example Models | Quantization |
481
+ |------|----------|---------------|-------------|
482
+ | Chat / LLM | `/v1/chat/completions` | Llama, Qwen, Mistral | fp16, GPTQ, AWQ, BNB 4/8bit, GGUF |
483
+ | Embeddings | `/v1/embeddings` | all-MiniLM, BGE, E5 | fp16 |
484
+ | Reranking | `/v1/rerank` | ms-marco, bge-reranker | fp16 |
485
+ | Vision-Language | `/v1/chat/completions` | Qwen-VL, LLaVA | fp16 |
486
+ | Translation | `/v1/text/generate` | OPUS-MT, NLLB, mBART | fp16 |
487
+ | Summarization | `/v1/text/generate` | T5, FLAN-T5, BART | fp16 |
488
+ | Image Generation | `/v1/images/generations` | SDXL, Stable Diffusion | fp16 |
489
+ | Image-to-Image | `/v1/images/edits` | SD img2img, ControlNet | fp16 |
490
+ | Video Generation | `/v1/videos/generations` | CogVideoX, AnimateDiff | fp16 |
491
+ | Speech Recognition | `/v1/audio/transcriptions` | Whisper | fp16 |
492
+ | Text-to-Speech | `/v1/audio/speech` | Bark, SpeechT5 | fp16 |
493
+ | Classification | `/v1/classify` | ViT, CLIP, BART-MNLI | fp16 |
494
+ | Object Detection | `/v1/detect` | DETR, OWL-ViT | fp16 |
495
+ | Segmentation | `/v1/segment` | SAM, Mask2Former | fp16 |
496
+ | Depth Estimation | `/v1/depth` | Depth Anything, DPT | fp16 |
497
+ | Document QA | `/v1/document-qa` | LayoutLM, Donut | fp16 |
498
+ | Audio Processing | `/v1/audio/process` | Voice conversion | fp16 |
499
+
500
+ ## Configuration
501
+
502
+ Configuration is loaded in layers (highest priority first):
503
+
504
+ 1. **CLI flags** (`--port`, `--host`, etc.)
505
+ 2. **Environment variables** (`INFERALL_PORT`, `INFERALL_HOST`, etc.)
506
+ 3. **Config file** (`~/.inferall/config.yaml`)
507
+ 4. **Built-in defaults**
508
+
509
+ ### Config file example
510
+
511
+ ```yaml
512
+ # ~/.inferall/config.yaml
513
+ default_port: 8000
514
+ default_host: "127.0.0.1"
515
+ idle_timeout: 300 # seconds before idle models are unloaded
516
+ vram_buffer_mb: 512 # VRAM headroom to keep free
517
+ max_loaded_models: 3 # max models in GPU memory simultaneously
518
+ inference_workers: 2 # thread pool size for inference
519
+ trust_remote_code: false
520
+ ```
521
+
522
+ ### Environment variables
523
+
524
+ | Variable | Default | Description |
525
+ |----------|---------|-------------|
526
+ | `INFERALL_PORT` | 8000 | API server port |
527
+ | `INFERALL_HOST` | 127.0.0.1 | Bind address |
528
+ | `INFERALL_API_KEY` | None | API key for auth |
529
+ | `INFERALL_IDLE_TIMEOUT` | 300 | Idle model eviction (seconds) |
530
+ | `INFERALL_VRAM_BUFFER_MB` | 512 | VRAM headroom (MB) |
531
+ | `INFERALL_MAX_LOADED` | 3 | Max loaded models |
532
+ | `INFERALL_WORKERS` | 2 | Inference threads |
533
+ | `INFERALL_BASE_DIR` | ~/.inferall | Data directory |
534
+
535
+ ## vLLM Backend (optional, high-throughput)
536
+
537
+ For chat and vision-language models, InferAll can route inference through [vLLM](https://github.com/vllm-project/vllm) instead of the default HuggingFace transformers backend. vLLM uses PagedAttention, custom CUDA kernels, and continuous batching to deliver substantially higher throughput — especially for models with linear-attention layers (Qwen3-Next, chandra-ocr-2, etc.) where HF transformers can't apply its standard cache optimizations.
538
+
539
+ ### Why a separate venv?
540
+
541
+ vLLM pins `transformers<5` while InferAll uses transformers 5.x, so embedding it directly would force a downgrade and rewrite of every existing backend. Instead, the vLLM backend runs vLLM as a subprocess in its own isolated venv and proxies requests over its OpenAI-compatible HTTP server. This is also how chandra and most production deployments run vLLM.
542
+
543
+ ### Setup
544
+
545
+ ```bash
546
+ # One-time bootstrap — creates ~/.cache/inferall/vllm-venv and installs vllm
547
+ inferall vllm install
548
+
549
+ # Check that the runtime is detected
550
+ inferall vllm status
551
+
552
+ # Opt a model into the vLLM backend (persists in the registry)
553
+ inferall vllm enable datalab-to/chandra-ocr-2
554
+
555
+ # Revert to the default backend
556
+ inferall vllm disable datalab-to/chandra-ocr-2
557
+
558
+ # Or point at an existing vllm install instead of bootstrapping
559
+ export INFERALL_VLLM_PYTHON=/path/to/vllm-venv/bin/python
560
+ ```
561
+
562
+ After enabling, the next request to that model will spawn a vLLM subprocess and serve through it. Unloading the model (idle eviction, manual unload, or `inferall serve` shutdown) cleans up the subprocess and frees the GPU.
563
+
564
+ ### Tuning
565
+
566
+ vLLM's defaults (`gpu_memory_utilization=0.9`, `max_num_seqs=256`) are sized for shared serving infrastructure and can OOM almost immediately when other processes already use any GPU memory. InferAll picks conservative defaults and exposes four env vars for tuning:
567
+
568
+ | Variable | Default | Purpose |
569
+ |----------|---------|---------|
570
+ | `INFERALL_VLLM_GPU_MEMORY_UTILIZATION` | auto (≤0.85) | Fraction of total GPU memory vLLM may claim |
571
+ | `INFERALL_VLLM_MAX_MODEL_LEN` | 4096 | Cap on context length (larger = bigger KV cache) |
572
+ | `INFERALL_VLLM_MAX_NUM_SEQS` | 8 | Concurrent in-flight sequences |
573
+ | `INFERALL_VLLM_PYTHON` | (auto-detect) | Path override for the vLLM interpreter |
574
+
575
+ The auto memory budget is computed from currently free VRAM minus a 1.5 GiB safety buffer, then clamped to `[0.30, 0.85]`. Override it explicitly if you know exactly how much vLLM should claim.
576
+
577
+ ## Performance
578
+
579
+ All responses include a `performance` section with timing data:
580
+
581
+ ```json
582
+ {
583
+ "performance": {
584
+ "total_time_ms": 647.0,
585
+ "tokens_per_second": 18.5
586
+ }
587
+ }
588
+ ```
589
+
590
+ Streaming responses include performance in the final SSE chunk.
591
+
592
+ ### Benchmarks (RTX 4090)
593
+
594
+ | Model | Backend | tok/s |
595
+ |-------|---------|-------|
596
+ | Llama 3.1 8B | GGUF Q4_K_M | ~113 |
597
+ | Qwen 2.5 1.5B | Transformers fp16 | ~18.5 |
598
+ | chandra-ocr-2 (5.3B VLM) | Transformers fp16 (default) | 31.6 |
599
+ | chandra-ocr-2 (5.3B VLM) | **vLLM** | **48.2** |
600
+
601
+ ## Architecture
602
+
603
+ ```
604
+ inferall/
605
+ ├── api/server.py # FastAPI server, OpenAI-compatible endpoints
606
+ ├── backends/
607
+ │ ├── base.py # ABCs and data structures
608
+ │ ├── transformers_backend.py # HF transformers (fp16/GPTQ/AWQ/BNB)
609
+ │ ├── llamacpp_backend.py # GGUF via llama.cpp
610
+ │ ├── vllm_backend.py # vLLM via subprocess + HTTP (opt-in)
611
+ │ ├── vllm_runtime.py # vLLM venv discovery + bootstrap
612
+ │ ├── embedding_backend.py # Sentence embeddings
613
+ │ ├── rerank_backend.py # Cross-encoder reranking
614
+ │ ├── vlm_backend.py # Vision-language models
615
+ │ ├── asr_backend.py # Whisper ASR
616
+ │ ├── tts_backend.py # Bark/SpeechT5 TTS
617
+ │ ├── diffusion_backend.py # Text-to-image (diffusers)
618
+ │ ├── img2img_backend.py # Image-to-image
619
+ │ ├── video_backend.py # Text-to-video
620
+ │ ├── seq2seq_backend.py # Translation/summarization
621
+ │ └── classification_backend.py # Classification, detection, segmentation, etc.
622
+ ├── cli/ # Typer CLI (pull, run, serve, list, status, remove, login, vllm)
623
+ ├── gpu/
624
+ │ ├── manager.py # GPU enumeration, VRAM tracking (pynvml)
625
+ │ └── allocator.py # VRAM estimation, multi-GPU allocation, load balancing
626
+ ├── registry/
627
+ │ ├── registry.py # SQLite model registry with migrations
628
+ │ ├── metadata.py # ModelTask, ModelFormat enums, preferred_engine
629
+ │ └── hf_resolver.py # HuggingFace download + format auto-detection
630
+ ├── orchestrator.py # Model lifecycle, LRU eviction, ref counting
631
+ └── config.py # Layered configuration
632
+ ```
633
+
634
+ ## License
635
+
636
+ MIT