ai-track 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. ai_track-0.1.0/MANIFEST.in +2 -0
  2. ai_track-0.1.0/PKG-INFO +236 -0
  3. ai_track-0.1.0/README.md +212 -0
  4. ai_track-0.1.0/ai_track.egg-info/PKG-INFO +236 -0
  5. ai_track-0.1.0/ai_track.egg-info/SOURCES.txt +52 -0
  6. ai_track-0.1.0/ai_track.egg-info/dependency_links.txt +1 -0
  7. ai_track-0.1.0/ai_track.egg-info/requires.txt +23 -0
  8. ai_track-0.1.0/ai_track.egg-info/top_level.txt +1 -0
  9. ai_track-0.1.0/assets/logo_light.png +0 -0
  10. ai_track-0.1.0/pyproject.toml +44 -0
  11. ai_track-0.1.0/setup.cfg +4 -0
  12. ai_track-0.1.0/tests/test_cuda_factories.py +72 -0
  13. ai_track-0.1.0/tests/test_cuda_runtime.py +127 -0
  14. ai_track-0.1.0/tests/test_hub.py +152 -0
  15. ai_track-0.1.0/tests/test_inference_primitives.py +51 -0
  16. ai_track-0.1.0/tests/test_openai_compat.py +59 -0
  17. ai_track-0.1.0/tests/test_public_api.py +26 -0
  18. ai_track-0.1.0/track/__init__.py +17 -0
  19. ai_track-0.1.0/track/hub/__init__.py +255 -0
  20. ai_track-0.1.0/track/inference/__init__.py +52 -0
  21. ai_track-0.1.0/track/inference/ai_model.py +63 -0
  22. ai_track-0.1.0/track/inference/audio/__init__.py +32 -0
  23. ai_track-0.1.0/track/inference/audio/base.py +34 -0
  24. ai_track-0.1.0/track/inference/audio/mlx.py +158 -0
  25. ai_track-0.1.0/track/inference/audio/models.py +48 -0
  26. ai_track-0.1.0/track/inference/audio/transformers.py +147 -0
  27. ai_track-0.1.0/track/inference/audio/utils.py +72 -0
  28. ai_track-0.1.0/track/inference/chat/__init__.py +24 -0
  29. ai_track-0.1.0/track/inference/chat/base.py +38 -0
  30. ai_track-0.1.0/track/inference/chat/mlx.py +200 -0
  31. ai_track-0.1.0/track/inference/chat/models.py +27 -0
  32. ai_track-0.1.0/track/inference/chat/utils.py +92 -0
  33. ai_track-0.1.0/track/inference/chat/vllm.py +165 -0
  34. ai_track-0.1.0/track/inference/embedding/__init__.py +28 -0
  35. ai_track-0.1.0/track/inference/embedding/base.py +15 -0
  36. ai_track-0.1.0/track/inference/embedding/mlx.py +88 -0
  37. ai_track-0.1.0/track/inference/embedding/models.py +37 -0
  38. ai_track-0.1.0/track/inference/embedding/transformers.py +165 -0
  39. ai_track-0.1.0/track/inference/head.py +469 -0
  40. ai_track-0.1.0/track/inference/image/__init__.py +32 -0
  41. ai_track-0.1.0/track/inference/image/base.py +46 -0
  42. ai_track-0.1.0/track/inference/image/diffusers.py +158 -0
  43. ai_track-0.1.0/track/inference/image/mflux.py +128 -0
  44. ai_track-0.1.0/track/inference/image/models.py +39 -0
  45. ai_track-0.1.0/track/inference/model_storage.py +47 -0
  46. ai_track-0.1.0/track/inference/openai.py +1214 -0
  47. ai_track-0.1.0/track/inference/transcription/__init__.py +30 -0
  48. ai_track-0.1.0/track/inference/transcription/base.py +31 -0
  49. ai_track-0.1.0/track/inference/transcription/models.py +39 -0
  50. ai_track-0.1.0/track/inference/transcription/transformers.py +124 -0
  51. ai_track-0.1.0/track/inference/transcription/utils.py +47 -0
  52. ai_track-0.1.0/track/inference/types.py +78 -0
  53. ai_track-0.1.0/track/providers/__init__.py +0 -0
  54. ai_track-0.1.0/track/server/__init__.py +0 -0
@@ -0,0 +1,2 @@
include README.md
recursive-include assets *.png
@@ -0,0 +1,236 @@
Metadata-Version: 2.4
Name: ai-track
Version: 0.1.0
Summary: Universal AI runtime for local and remote inference.
Requires-Python: >=3.12
Description-Content-Type: text/markdown
Requires-Dist: openai>=1.0
Requires-Dist: pydantic>=2.0
Requires-Dist: numpy>=1.26
Requires-Dist: pillow>=10.0
Requires-Dist: huggingface-hub>=0.24
Requires-Dist: tqdm>=4.66
Provides-Extra: macos
Requires-Dist: mlx-lm>=0.24; sys_platform == "darwin" and extra == "macos"
Requires-Dist: mlx-vlm>=0.4; sys_platform == "darwin" and extra == "macos"
Requires-Dist: mlx-audio>=0.4; sys_platform == "darwin" and extra == "macos"
Requires-Dist: mflux>=0.5; sys_platform == "darwin" and extra == "macos"
Provides-Extra: cuda
Requires-Dist: torch>=2.2; sys_platform == "linux" and extra == "cuda"
Requires-Dist: vllm>=0.11; sys_platform == "linux" and extra == "cuda"
Requires-Dist: transformers>=4.45; sys_platform == "linux" and extra == "cuda"
Requires-Dist: accelerate>=1.0; sys_platform == "linux" and extra == "cuda"
Requires-Dist: diffusers>=0.30; sys_platform == "linux" and extra == "cuda"

[![CI](https://github.com/langelabs/ai-track/actions/workflows/ci.yml/badge.svg)](https://github.com/langelabs/ai-track/actions/workflows/ci.yml)

![ai-track logo](assets/logo_light.png)

`ai-track` is a universal AI runtime library for local and remote inference.
It chooses the best available execution tier automatically, keeps the core
package lightweight, and exposes an OpenAI-style client surface so application
code can stay backend-agnostic.

## What it does

- Routes requests through local or remote inference automatically.
- Supports macOS MLX backends for on-device inference.
- Supports CUDA backends for GPU inference with vLLM and Hugging Face models.
- Falls back to a remote OpenAI-compatible client when no local backend fits.
- Exposes a familiar client surface for chat, embeddings, images, audio, and
  transcription.

## Architecture

The codebase is split into two major layers:

- `track.inference` contains the runtime primitives and backend implementations.
- `track.hub` contains the public routing layer that decides whether a model
  should use local inference or a remote client.

The runtime is centered around `LocalAI`, which can manage:

- chat generation
- embeddings
- image generation
- text-to-speech
- speech-to-text transcription

### Runtime selection

The runtime chooses a backend automatically when you do not pass one explicitly:

- macOS resolves to the MLX backend
- CUDA-capable Linux systems resolve to the CUDA backend
- everything else stays available through the remote OpenAI-compatible path

You can still force a backend explicitly when you need to.

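As a minimal sketch, forcing a backend might look like the call below. The
`backend="mlx"` value is an assumption that mirrors the `backend="cuda"`
argument used in the transcription example later in this README; check the
`LocalAI` signature for the accepted values.

```python
from track.inference import LocalAI

# Pin the runtime to a specific backend instead of relying on auto-detection.
runtime = LocalAI(backend="mlx")  # assumed value; "cuda" appears later in this README
client = runtime.get_client()     # OpenAI-style client backed by the forced backend
```
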
## Local-first routing

Routing is local-first:

1. The hub checks whether the selected model is local.
2. If the runtime can serve it locally, the request stays on-device.
3. Otherwise the hub falls back to a remote OpenAI-compatible client.

This keeps local inference fast and private when available while preserving a
reliable remote fallback.

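A hedged sketch of the fallback path. Only `location="local"` appears in the
examples below, so the `location="remote"` marker and the field values here are
assumptions about how a non-local model is described:

```python
from track.hub import Hub
from track.inference import AiModel, LocalAI

remote_model = AiModel(
    default=False,
    location="remote",           # assumed marker for a model the runtime does not serve locally
    type="llm",
    status="available",
    model="openai/gpt-4o-mini",  # illustrative id; any model the remote endpoint understands
    alias="Remote GPT",
)

runtime = LocalAI(
    remote_api_key="sk-example",
    remote_base_url="https://openrouter.ai/api/v1",
)

hub = Hub(local_ai=runtime)
client = hub.get_client(remote_model)  # resolves to the remote OpenAI-compatible client
```
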
## Public API

The main entrypoints are:

```python
from track.hub import Hub
from track.inference import LocalAI
```

`LocalAI` exposes the local runtime directly and can also return an
OpenAI-style client.

`Hub` resolves a final client for a selected model and is the preferred way to
route requests from application code.

## OpenAI-style client

The local compatibility layer mirrors the shape of the OpenAI Python client.
It supports:

- `client.chat.completions.create(...)`
- `client.embeddings.create(...)`
- `client.images.generate(...)`
- `client.audio.speech.create(...)`
- `client.audio.transcriptions.create(...)`

### Example: chat

```python
from track.hub import Hub
from track.inference import AiModel, InferenceConfig, LocalAI

chat_model = AiModel(
    default=True,
    location="local",
    type="llm",
    status="available",
    model="mlx-community/qwen2",
    alias="Qwen2",
    inference_config=InferenceConfig(max_tokens=256, temperature=0.2),
)

runtime = LocalAI(
    chat_config=chat_model,
    remote_api_key="sk-example",
    remote_base_url="https://openrouter.ai/api/v1",
)

hub = Hub(local_ai=runtime)
client = hub.get_client(chat_model)

response = client.chat.completions.create(
    model=chat_model.model,
    messages=[
        {"role": "user", "content": "Summarize this architecture."},
    ],
)

print(response.choices[0].message["content"])
```

### Example: transcription

```python
from track.inference import LocalAI, TranscriptionModelConfig

runtime = LocalAI(
    backend="cuda",
    transcription_config=TranscriptionModelConfig(
        model_id="openai/whisper-small",
        alias="Whisper Small",
    ),
)

result = runtime.transcribe("sample.wav")
print(result.text)
```

### Example: OpenAI-style transcription

```python
client = runtime.get_client()
result = client.audio.transcriptions.create(
    model="openai/whisper-small",
    file="sample.wav",
)
print(result.text)
```

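### Example: embeddings (sketch)

`client.embeddings.create(...)` is listed above but not demonstrated. The call
below assumes the parameters and response shape mirror the OpenAI Python client
(`input`, `response.data[0].embedding`), so treat it as a sketch rather than
the package's documented API:

```python
client = runtime.get_client()
response = client.embeddings.create(
    model="your-embedding-model",      # hypothetical id; use a model the embedding backend supports
    input="A short sentence to embed.",
)
print(len(response.data[0].embedding))  # vector dimensionality, if the OpenAI shape holds
```
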
## Installation

The core package is intentionally small and works without the optional local
backends.

### Core install

```bash
uv sync
```

If you want to install from PyPI with `pip`, use:

```bash
pip install ai-track
```

### macOS MLX extras

```bash
uv sync --extra macos
```

For `pip`:

```bash
pip install "ai-track[macos]"
```

### CUDA extras

```bash
uv sync --extra cuda
```

For `pip`:

```bash
pip install "ai-track[cuda]"
```

The CUDA extra brings in the GPU-oriented runtime stack, including vLLM,
Transformers, Diffusers, and PyTorch-based helpers.

## Testing

Run the full unit suite with:

```bash
uv run pytest -q tests
```

The tests focus on:

- hub routing decisions
- backend selection
- OpenAI-style client compatibility
- multimodal cleanup behavior
- transcription support
- CUDA factory selection

## Development notes

- Prefer `track.hub` for routing decisions.
- Keep optional imports lazy so the core package stays importable without MLX
  or CUDA dependencies (see the sketch after this list).
- Add docstrings and type hints to new helpers and edited functions.
- Reuse shared helpers where both MLX and CUDA backends need the same logic.
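
A minimal sketch of the lazy-import pattern the second note refers to; the
helper name and error message are illustrative, not taken from the package:

```python
def _load_mlx_lm():
    """Import the optional mlx-lm dependency only when an MLX backend is used."""
    try:
        # Heavy optional import: resolved only when MLX inference is requested,
        # so a core install stays importable without the "macos" extra.
        from mlx_lm import generate, load
    except ImportError as exc:
        raise RuntimeError(
            "MLX inference requires the optional dependencies: "
            "pip install 'ai-track[macos]'"
        ) from exc
    return load, generate
```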