mlx-code 0.0.24__tar.gz → 0.0.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {mlx_code-0.0.24 → mlx_code-0.0.26}/PKG-INFO +83 -38
  2. {mlx_code-0.0.24 → mlx_code-0.0.26}/README.md +80 -37
  3. mlx_code-0.0.24/mlx_code/ntui.py → mlx_code-0.0.26/mlx_code/bare.py +1 -0
  4. mlx_code-0.0.26/mlx_code/bats.py +299 -0
  5. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/main.py +65 -11
  6. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/repl.py +7 -7
  7. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/view_log.py +1 -1
  8. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code.egg-info/PKG-INFO +83 -38
  9. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code.egg-info/SOURCES.txt +2 -1
  10. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code.egg-info/requires.txt +2 -0
  11. {mlx_code-0.0.24 → mlx_code-0.0.26}/setup.py +4 -1
  12. {mlx_code-0.0.24 → mlx_code-0.0.26}/LICENSE +0 -0
  13. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/__init__.py +0 -0
  14. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/apis.py +0 -0
  15. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/gits.py +0 -0
  16. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/lsp_tool.py +0 -0
  17. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/mcb.py +0 -0
  18. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/mcb_tool.py +0 -0
  19. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/stream_log.py +0 -0
  20. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/tools.py +0 -0
  21. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/util.py +0 -0
  22. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code/view_git.py +0 -0
  23. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code.egg-info/dependency_links.txt +0 -0
  24. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code.egg-info/entry_points.txt +0 -0
  25. {mlx_code-0.0.24 → mlx_code-0.0.26}/mlx_code.egg-info/top_level.txt +0 -0
  26. {mlx_code-0.0.24 → mlx_code-0.0.26}/setup.cfg +0 -0
  27. {mlx_code-0.0.24 → mlx_code-0.0.26}/tests/__init__.py +0 -0
  28. {mlx_code-0.0.24 → mlx_code-0.0.26}/tests/test.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mlx-code
3
- Version: 0.0.24
3
+ Version: 0.0.26
4
4
  Summary: Coding Agent for Mac
5
5
  Home-page: https://josefalbers.github.io/mlx-code/
6
6
  Author: J Joe
@@ -17,6 +17,8 @@ Requires-Dist: httpx
17
17
  Requires-Dist: pydantic
18
18
  Requires-Dist: textual>=8.2.7
19
19
  Requires-Dist: rich>=15.0.0
20
+ Requires-Dist: starlette
21
+ Requires-Dist: uvicorn
20
22
  Provides-Extra: all
21
23
  Requires-Dist: python-lsp-server[all]; extra == "all"
22
24
  Requires-Dist: GitPython; extra == "all"
@@ -47,7 +49,7 @@ A Git-native coding agent that can run entirely on your Mac. No API keys, no clo
47
49
  ```
48
50
  Conversation tree (nodes = git commits with embedded chat history)
49
51
 
50
- main ──●──●──●──●──●──●──●──●──●──●
52
+ main ──●──●──●──●──●──●──●──●──●──●──●──●──●──●
51
53
  │ │
52
54
  │ └── branch-1 ──●──●──●
53
55
  │ │ ┌────────────┐
@@ -66,21 +68,21 @@ REPL tabs (each tab = a git branch + agent) │
66
68
  │ └──────┘ └────┬─────┘ └──────────┘ └────────────┘ │
67
69
  └─────────────────┼──────────────────────────────────────┘
68
70
 
69
- ├────────────────────────────────────► each tab is an independent Agent
71
+ ├─────────────────────────────────────────► Each tab is an independent Agent
70
72
 
71
- ┌────┴─────────────────────────────────┐
72
- │ Agent
73
- ┌──────────────┐ ┌──────────────┐
74
- │ │ API: │ │ Tools: │ │
75
- │ │ MLX (local) │ │ Read Write │ │
76
- │ │ Claude │ │ Edit Bash │ │
77
- │ │ Gemini │ │ Grep Find │ │
78
- │ │ OpenAI │ │ Ls Skill │ │
79
- └──────────────┘ │ Agent ───────┼──┼───► spawns child Agent
80
- └──────────────┘ │ (each with own tools + worktree + etc)
81
- │ Git worktree
82
- │ (isolation + session state)
83
- └──────────────────────────────────────┘
73
+ ┌────┴─────────────────────────────────────┐
74
+ │ Agent
75
+ ┌────────────────┐ ┌────────────────┐
76
+ │ │ API: │ │ Tools: │ │
77
+ │ │ Local (mlx-lm) │ │ Read Write │ │
78
+ │ │ Claude │ │ Edit Bash │ │
79
+ │ │ Gemini │ │ Grep Find │ │
80
+ │ │ OpenAI │ │ Ls Skill │ │
81
+ └────────────────┘ │ Agent ─────────┼──┼───► Spawns child Agent
82
+ └────────────────┘ │ (each with own tools + worktree + etc)
83
+ │ Git worktree
84
+ │ (isolation + session state)
85
+ └──────────────────────────────────────────┘
84
86
  ```
85
87
 
86
88
  Each layer is importable and composable on its own. A commit records state, a branch records an alternative path, and a tab is just a live view over an `Agent`.
@@ -98,10 +100,15 @@ result = await agent.run('refactor utils.py to use dataclasses')
98
100
  ## Quick start
99
101
 
100
102
  ```bash
103
+ # ephemeral run (no installation)
104
+ uvx --from mlx-code mlc
105
+
106
+ # or install into the current environment
101
107
  pip install mlx-code
102
- mlc # launch with local MLX model
108
+
109
+ # launch
110
+ mlc # with a local MLX model
103
111
  mlc-run --api gemini # or use a remote provider
104
- mlc-run --api deepseek --model deepseek-v4-flash
105
112
  ```
106
113
 
107
114
  That's it. The first run starts a local inference server and drops you into the REPL.
@@ -123,12 +130,12 @@ That's it. The first run starts a local inference server and drops you into the
123
130
 
124
131
  **Git is the database.** When the agent makes file changes, they’re committed to a git worktree with the full conversation embedded in the commit message. Resume any past session by hash, branch from any checkpoint, and inspect the agent timeline with `git log`. No proprietary state files, just Git.
125
132
 
126
- **Your working directory is never at risk.** The agent operates inside a `git worktree`, not your checkout. It can make a mess, and you can inspect or discard it without ever touching `main`.
127
-
128
- **Built-in safety nets.** Subprocess environment variables go through an explicit allowlist, so secrets in your shell are never leaked to agent-spawned processes.
133
+ **Built-in safety nets.** Your working directory is never at risk. The agent operates inside a `git worktree`, not your checkout. It can make a mess, and you can inspect or discard it without ever touching `main`. Subprocess environment variables go through an explicit allowlist, so secrets in your shell are never leaked to agent-spawned processes.
129
134
 
130
135
  **Batteries included.** Everything ships in one pip install: the MLX inference engine, the multi-protocol API server, the agent loop, the tools, and the TUI. No llama.cpp, no ollama, no vLLM bridge to find and configure. And the server natively speaks OpenAI, Anthropic, Gemini, and Codex wire formats simultaneously, so `claude`, `codex`, and `gemini` CLIs can all work against your local model without a translation layer.
131
136
 
137
+ **Continuous batching.** With `--engine batch`, the local inference server runs a continuous batching engine that processes multiple sequences concurrently. When you spawn parallel agents (e.g., multiple tabs, `asyncio.gather` pipelines, or delegated sub-tasks), they all share the same GPU context and are stepped together each tick. A prefix cache persists KV snapshots to disk, so repeated system prompts and conversation prefixes are prefilled once and reused across sessions. No request queueing, no waiting for the previous agent to finish.
138
+
132
139
  ---
133
140
 
134
141
  ## Agent primitive
@@ -166,12 +173,12 @@ agent.messages = messages
166
173
  await agent.run("now add unit tests")
167
174
  ```
168
175
 
169
- Branch from any point in the conversation each branch gets its own worktree:
176
+ Branch from any point in the conversation. Each branch gets its own worktree:
170
177
 
171
178
  ```
172
179
  /branch # branch from current state
173
180
  /branch --rev 2 # branch from the 2nd user turn
174
- /branch --rev 3 --as-worktree try different approach
181
+ /branch --rev 3 make it use httpx instead
175
182
  ```
176
183
 
177
184
  Since it's just git, you can inspect the timeline outside the REPL:
@@ -236,6 +243,43 @@ Reliability comes from specialization plus constraint. A read-only reviewer can'
236
243
 
237
244
  ---
238
245
 
246
+ ## Continuous batching
247
+
248
+ The local server can run multiple inference sequences concurrently inside a single batch step. Instead of a global lock that serialises one request at a time, the batching engine maintains a live set of active sequences and yields tokens for all of them on every step.
249
+
250
+ ```bash
251
+ mlc --engine batch # continuous batching + built-in REPL
252
+ ```
253
+
254
+ This unlocks true parallelism for multi-agent workloads:
255
+
256
+ ```python
257
+ import asyncio
258
+ from mlx_code.repl import Agent
259
+
260
+ async def main():
261
+ agents = [Agent() for _ in range(4)]
262
+ await asyncio.gather(*[
263
+ a.run(f"Research topic: {t}")
264
+ for a, t in zip(agents, ["consensus", "cryptography", "networking", "storage"])
265
+ ])
266
+
267
+ asyncio.run(main())
268
+ ```
269
+
270
+ All four agents generate simultaneously inside the same batch. No sequential blocking.
271
+
272
+ ### Health endpoint
273
+
274
+ ```bash
275
+ curl http://127.0.0.1:8000/health
276
+ # {"status":"ok","model":"mlx-community/Qwen3.5-4B-OptiQ-4bit","active_sequences":2,"prefix_cache_files":5}
277
+ ```
278
+
279
+ `active_sequences` shows how many agents are generating right now; `prefix_cache_files` shows how many prefix KV snapshots are stored on disk.
280
+
281
+ ---
282
+
239
283
  ## Command Line
240
284
 
241
285
  ### `mlc`: local server + harness
@@ -243,20 +287,20 @@ Reliability comes from specialization plus constraint. A read-only reviewer can'
243
287
  Starts the MLX inference server and launches the built-in TUI harness against it.
244
288
 
245
289
  ```bash
246
- # Default: local server + default TUI
290
+ # Default: local server + default harness
247
291
  mlc
248
292
 
249
- # Use a simple terminal REPL instead of the TUI
250
- mlc --notui
293
+ # Continuous batching mode (default is sequential caching mode)
294
+ mlc --engine batch
295
+
296
+ # Server only, no harness
297
+ mlc --leash none
251
298
 
252
299
  # Use a different harness (routes traffic through the local server)
253
300
  mlc --leash claude
254
301
  mlc --leash gemini
255
302
  mlc --leash codex
256
303
 
257
- # Server only, no harness
258
- mlc --leash none
259
-
260
304
  # Specify a model
261
305
  mlc --model mlx-community/Qwen3.5-4B-OptiQ-4bit
262
306
 
@@ -307,7 +351,7 @@ mlc-run --api codex
307
351
  echo "explain lsp.py" | mlc-run -a deepseek | cat - PLAN.md | mlc-run --url http://localhost:9000
308
352
 
309
353
  # Simple terminal REPL (no TUI)
310
- mlc-run --notui
354
+ mlc-run --bare
311
355
  ```
312
356
 
313
357
  ---
@@ -432,18 +476,19 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
432
476
 
433
477
  | Command | Description |
434
478
  |---|---|
435
- | `/help` | Show command reference |
479
+ | `/branch [--rev N] [prompt]` | Open a new branch tab from the current (or earlier) checkpoint |
480
+ | `/diff [--all]` | Show a side-by-side diff of changes in the worktree |
436
481
  | `/clear [--config F]` | Clear conversation; `--config` reloads agent from a JSON/YAML file |
482
+ | `/tab [N]` | Jump to tab N |
437
483
  | `/history [--raw]` | Show conversation transcript; `--raw` shows the raw API message log |
438
- | `/diff [--all]` | Show a side-by-side diff of changes in the worktree |
439
- | `/errors` | Show timestamped error log for the current tab |
440
484
  | `/tools` | List active tools |
441
- | `/branch [--rev N] [prompt]` | Open a new branch tab from the current (or earlier) checkpoint |
442
485
  | `/abort` | Abort the running agent |
486
+ | `/errors` | Show timestamped error log for the current tab |
443
487
  | `/export [path]` | Export session to JSON |
444
488
  | `/exit [--all]` | Close branch tab, or exit the app |
445
- | `!command` | Run a shell command; output captured in the TUI |
446
- | `$command` | Run an interactive command (TUI suspends, terminal handed to process) |
489
+ | `/help` | Show command reference |
490
+ | `!command` | Run a shell command; output captured in the TUI (eg, `ls`, `cat hello.c`) |
491
+ | `$command` | Run an interactive command (eg, `vim`, `yazi`, `less hello.c`) |
447
492
 
448
493
  ### Key bindings
449
494
 
@@ -453,7 +498,7 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
453
498
  | `Ctrl-J` | Insert newline |
454
499
  | `Ctrl-1` … `Ctrl-9` | Jump to tab N |
455
500
  | `Ctrl-,` / `Ctrl-.` | Cycle through tabs |
456
- | `Ctrl-C` | Abort running agent |
501
+ | `Ctrl-C` | Clear input, or abort running agent |
457
502
  | `Ctrl-D` | Close branch tab, or exit app |
458
503
  | `Ctrl-R` | Recall last prompt into editor |
459
504
 
@@ -471,7 +516,7 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
471
516
  | `Skill` | Retrieve named skill instructions from config |
472
517
  | `Agent` | Spawn an autonomous sub-agent for delegated work |
473
518
 
474
- All file tools enforce path sandboxing the agent cannot read or write outside the worktree.
519
+ All file tools enforce path sandboxing. The agent cannot read or write outside the worktree.
475
520
 
476
521
  ### Backends
477
522
 
@@ -11,7 +11,7 @@ A Git-native coding agent that can run entirely on your Mac. No API keys, no clo
11
11
  ```
12
12
  Conversation tree (nodes = git commits with embedded chat history)
13
13
 
14
- main ──●──●──●──●──●──●──●──●──●──●
14
+ main ──●──●──●──●──●──●──●──●──●──●──●──●──●──●
15
15
  │ │
16
16
  │ └── branch-1 ──●──●──●
17
17
  │ │ ┌────────────┐
@@ -30,21 +30,21 @@ REPL tabs (each tab = a git branch + agent) │
30
30
  │ └──────┘ └────┬─────┘ └──────────┘ └────────────┘ │
31
31
  └─────────────────┼──────────────────────────────────────┘
32
32
 
33
- ├────────────────────────────────────► each tab is an independent Agent
33
+ ├─────────────────────────────────────────► Each tab is an independent Agent
34
34
 
35
- ┌────┴─────────────────────────────────┐
36
- │ Agent
37
- ┌──────────────┐ ┌──────────────┐
38
- │ │ API: │ │ Tools: │ │
39
- │ │ MLX (local) │ │ Read Write │ │
40
- │ │ Claude │ │ Edit Bash │ │
41
- │ │ Gemini │ │ Grep Find │ │
42
- │ │ OpenAI │ │ Ls Skill │ │
43
- └──────────────┘ │ Agent ───────┼──┼───► spawns child Agent
44
- └──────────────┘ │ (each with own tools + worktree + etc)
45
- │ Git worktree
46
- │ (isolation + session state)
47
- └──────────────────────────────────────┘
35
+ ┌────┴─────────────────────────────────────┐
36
+ │ Agent
37
+ ┌────────────────┐ ┌────────────────┐
38
+ │ │ API: │ │ Tools: │ │
39
+ │ │ Local (mlx-lm) │ │ Read Write │ │
40
+ │ │ Claude │ │ Edit Bash │ │
41
+ │ │ Gemini │ │ Grep Find │ │
42
+ │ │ OpenAI │ │ Ls Skill │ │
43
+ └────────────────┘ │ Agent ─────────┼──┼───► Spawns child Agent
44
+ └────────────────┘ │ (each with own tools + worktree + etc)
45
+ │ Git worktree
46
+ │ (isolation + session state)
47
+ └──────────────────────────────────────────┘
48
48
  ```
49
49
 
50
50
  Each layer is importable and composable on its own. A commit records state, a branch records an alternative path, and a tab is just a live view over an `Agent`.
@@ -62,10 +62,15 @@ result = await agent.run('refactor utils.py to use dataclasses')
62
62
  ## Quick start
63
63
 
64
64
  ```bash
65
+ # ephemeral run (no installation)
66
+ uvx --from mlx-code mlc
67
+
68
+ # or install into the current environment
65
69
  pip install mlx-code
66
- mlc # launch with local MLX model
70
+
71
+ # launch
72
+ mlc # with a local MLX model
67
73
  mlc-run --api gemini # or use a remote provider
68
- mlc-run --api deepseek --model deepseek-v4-flash
69
74
  ```
70
75
 
71
76
  That's it. The first run starts a local inference server and drops you into the REPL.
@@ -87,12 +92,12 @@ That's it. The first run starts a local inference server and drops you into the
87
92
 
88
93
  **Git is the database.** When the agent makes file changes, they’re committed to a git worktree with the full conversation embedded in the commit message. Resume any past session by hash, branch from any checkpoint, and inspect the agent timeline with `git log`. No proprietary state files, just Git.
89
94
 
90
- **Your working directory is never at risk.** The agent operates inside a `git worktree`, not your checkout. It can make a mess, and you can inspect or discard it without ever touching `main`.
91
-
92
- **Built-in safety nets.** Subprocess environment variables go through an explicit allowlist, so secrets in your shell are never leaked to agent-spawned processes.
95
+ **Built-in safety nets.** Your working directory is never at risk. The agent operates inside a `git worktree`, not your checkout. It can make a mess, and you can inspect or discard it without ever touching `main`. Subprocess environment variables go through an explicit allowlist, so secrets in your shell are never leaked to agent-spawned processes.
93
96
 
94
97
  **Batteries included.** Everything ships in one pip install: the MLX inference engine, the multi-protocol API server, the agent loop, the tools, and the TUI. No llama.cpp, no ollama, no vLLM bridge to find and configure. And the server natively speaks OpenAI, Anthropic, Gemini, and Codex wire formats simultaneously, so `claude`, `codex`, and `gemini` CLIs can all work against your local model without a translation layer.
95
98
 
99
+ **Continuous batching.** With `--engine batch`, the local inference server runs a continuous batching engine that processes multiple sequences concurrently. When you spawn parallel agents (e.g., multiple tabs, `asyncio.gather` pipelines, or delegated sub-tasks), they all share the same GPU context and are stepped together each tick. A prefix cache persists KV snapshots to disk, so repeated system prompts and conversation prefixes are prefilled once and reused across sessions. No request queueing, no waiting for the previous agent to finish.
100
+
96
101
  ---
97
102
 
98
103
  ## Agent primitive
@@ -130,12 +135,12 @@ agent.messages = messages
130
135
  await agent.run("now add unit tests")
131
136
  ```
132
137
 
133
- Branch from any point in the conversation each branch gets its own worktree:
138
+ Branch from any point in the conversation. Each branch gets its own worktree:
134
139
 
135
140
  ```
136
141
  /branch # branch from current state
137
142
  /branch --rev 2 # branch from the 2nd user turn
138
- /branch --rev 3 --as-worktree try different approach
143
+ /branch --rev 3 make it use httpx instead
139
144
  ```
140
145
 
141
146
  Since it's just git, you can inspect the timeline outside the REPL:
@@ -200,6 +205,43 @@ Reliability comes from specialization plus constraint. A read-only reviewer can'
200
205
 
201
206
  ---
202
207
 
208
+ ## Continuous batching
209
+
210
+ The local server can run multiple inference sequences concurrently inside a single batch step. Instead of a global lock that serialises one request at a time, the batching engine maintains a live set of active sequences and yields tokens for all of them on every step.
211
+
212
+ ```bash
213
+ mlc --engine batch # continuous batching + built-in REPL
214
+ ```
215
+
216
+ This unlocks true parallelism for multi-agent workloads:
217
+
218
+ ```python
219
+ import asyncio
220
+ from mlx_code.repl import Agent
221
+
222
+ async def main():
223
+ agents = [Agent() for _ in range(4)]
224
+ await asyncio.gather(*[
225
+ a.run(f"Research topic: {t}")
226
+ for a, t in zip(agents, ["consensus", "cryptography", "networking", "storage"])
227
+ ])
228
+
229
+ asyncio.run(main())
230
+ ```
231
+
232
+ All four agents generate simultaneously inside the same batch. No sequential blocking.
233
+
234
+ ### Health endpoint
235
+
236
+ ```bash
237
+ curl http://127.0.0.1:8000/health
238
+ # {"status":"ok","model":"mlx-community/Qwen3.5-4B-OptiQ-4bit","active_sequences":2,"prefix_cache_files":5}
239
+ ```
240
+
241
+ `active_sequences` shows how many agents are generating right now; `prefix_cache_files` shows how many prefix KV snapshots are stored on disk.
242
+
243
+ ---
244
+
203
245
  ## Command Line
204
246
 
205
247
  ### `mlc`: local server + harness
@@ -207,20 +249,20 @@ Reliability comes from specialization plus constraint. A read-only reviewer can'
207
249
  Starts the MLX inference server and launches the built-in TUI harness against it.
208
250
 
209
251
  ```bash
210
- # Default: local server + default TUI
252
+ # Default: local server + default harness
211
253
  mlc
212
254
 
213
- # Use a simple terminal REPL instead of the TUI
214
- mlc --notui
255
+ # Continuous batching mode (default is sequential caching mode)
256
+ mlc --engine batch
257
+
258
+ # Server only, no harness
259
+ mlc --leash none
215
260
 
216
261
  # Use a different harness (routes traffic through the local server)
217
262
  mlc --leash claude
218
263
  mlc --leash gemini
219
264
  mlc --leash codex
220
265
 
221
- # Server only, no harness
222
- mlc --leash none
223
-
224
266
  # Specify a model
225
267
  mlc --model mlx-community/Qwen3.5-4B-OptiQ-4bit
226
268
 
@@ -271,7 +313,7 @@ mlc-run --api codex
271
313
  echo "explain lsp.py" | mlc-run -a deepseek | cat - PLAN.md | mlc-run --url http://localhost:9000
272
314
 
273
315
  # Simple terminal REPL (no TUI)
274
- mlc-run --notui
316
+ mlc-run --bare
275
317
  ```
276
318
 
277
319
  ---
@@ -396,18 +438,19 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
396
438
 
397
439
  | Command | Description |
398
440
  |---|---|
399
- | `/help` | Show command reference |
441
+ | `/branch [--rev N] [prompt]` | Open a new branch tab from the current (or earlier) checkpoint |
442
+ | `/diff [--all]` | Show a side-by-side diff of changes in the worktree |
400
443
  | `/clear [--config F]` | Clear conversation; `--config` reloads agent from a JSON/YAML file |
444
+ | `/tab [N]` | Jump to tab N |
401
445
  | `/history [--raw]` | Show conversation transcript; `--raw` shows the raw API message log |
402
- | `/diff [--all]` | Show a side-by-side diff of changes in the worktree |
403
- | `/errors` | Show timestamped error log for the current tab |
404
446
  | `/tools` | List active tools |
405
- | `/branch [--rev N] [prompt]` | Open a new branch tab from the current (or earlier) checkpoint |
406
447
  | `/abort` | Abort the running agent |
448
+ | `/errors` | Show timestamped error log for the current tab |
407
449
  | `/export [path]` | Export session to JSON |
408
450
  | `/exit [--all]` | Close branch tab, or exit the app |
409
- | `!command` | Run a shell command; output captured in the TUI |
410
- | `$command` | Run an interactive command (TUI suspends, terminal handed to process) |
451
+ | `/help` | Show command reference |
452
+ | `!command` | Run a shell command; output captured in the TUI (eg, `ls`, `cat hello.c`) |
453
+ | `$command` | Run an interactive command (eg, `vim`, `yazi`, `less hello.c`) |
411
454
 
412
455
  ### Key bindings
413
456
 
@@ -417,7 +460,7 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
417
460
  | `Ctrl-J` | Insert newline |
418
461
  | `Ctrl-1` … `Ctrl-9` | Jump to tab N |
419
462
  | `Ctrl-,` / `Ctrl-.` | Cycle through tabs |
420
- | `Ctrl-C` | Abort running agent |
463
+ | `Ctrl-C` | Clear input, or abort running agent |
421
464
  | `Ctrl-D` | Close branch tab, or exit app |
422
465
  | `Ctrl-R` | Recall last prompt into editor |
423
466
 
@@ -435,7 +478,7 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
435
478
  | `Skill` | Retrieve named skill instructions from config |
436
479
  | `Agent` | Spawn an autonomous sub-agent for delegated work |
437
480
 
438
- All file tools enforce path sandboxing the agent cannot read or write outside the worktree.
481
+ All file tools enforce path sandboxing. The agent cannot read or write outside the worktree.
439
482
 
440
483
  ### Backends
441
484
 
@@ -110,6 +110,7 @@ class SimpleRepl:
110
110
  if out_text:
111
111
  self._write_delta(prefix + out_text, 'tool_result')
112
112
  self._last_stream_type = t
113
+ print()
113
114
  elif t == 'commit':
114
115
  self._pending_nls = 0
115
116
  self._awaiting_content = False
@@ -0,0 +1,299 @@
1
+ import asyncio
2
+ import json
3
+ import queue as _queue
4
+ import time
5
+ import uuid
6
+ import threading
7
+ import hashlib
8
+ from array import array
9
+ from contextlib import asynccontextmanager
10
+ from pathlib import Path
11
+ import mlx.core as mx
12
+ from starlette.applications import Starlette
13
+ from starlette.requests import Request
14
+ from starlette.responses import StreamingResponse, JSONResponse
15
+ from starlette.routing import Route
16
+ import logging
17
+ logger = logging.getLogger(__name__)
18
+ MIN_PREFIX_TOKENS = 256
19
+
20
def _hash_tokens(tokens):
    """Return a short hex fingerprint for a sequence of token ids.

    The ids are packed as unsigned ints (array typecode ``'I'``) and the raw
    bytes are hashed with BLAKE2b using an 8-byte digest, so the result is a
    16-character hexadecimal string.
    """
    packed = array('I', tokens).tobytes()
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(packed)
    return hasher.hexdigest()
23
+
24
class PrefixCache:
    """Disk-backed store of prompt-prefix KV caches, one file per prefix.

    Each entry lives in ``cache_dir`` under the name
    ``<model>_<prefix length>_<token hash>.safetensors``.  Prefixes that are
    empty or shorter than ``MIN_PREFIX_TOKENS`` are never looked up or stored.
    """

    def __init__(self, model_name, cache_dir):
        """Remember the model name and create ``cache_dir`` if it is missing."""
        self.model_name = model_name
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _path(self, prefix_tokens):
        """Return the cache file path for ``prefix_tokens``."""
        # Keep only alphanumerics so the model name is safe inside a file name.
        safe = ''.join(c for c in self.model_name if c.isalnum())
        h = _hash_tokens(prefix_tokens)
        return self.cache_dir / f'{safe}_{len(prefix_tokens)}_{h}.safetensors'

    @staticmethod
    def _discard(path):
        """Best-effort removal of a cache file; a missing file is not an error."""
        try:
            path.unlink(missing_ok=True)
        except OSError as exc:
            logger.info(f'[batch] failed to remove prefix cache {path.name}: {exc}')

    def lookup(self, prefix_tokens):
        """Return the stored cache for ``prefix_tokens``, or ``None`` on a miss.

        A file that exists but cannot be loaded is deleted.  ``store`` skips
        paths that already exist, so leaving a truncated or corrupt file in
        place would turn that prefix into a permanent miss.
        """
        if not prefix_tokens or len(prefix_tokens) < MIN_PREFIX_TOKENS:
            return None
        path = self._path(prefix_tokens)
        if not path.exists():
            return None
        try:
            from mlx_lm.models.cache import load_prompt_cache
        except Exception as exc:
            # The loader itself is unavailable: the file may be fine, keep it.
            logger.info(f'[batch] failed to load prefix cache {path.name}: {exc}')
            return None
        try:
            cache, _ = load_prompt_cache(str(path), return_metadata=True)
        except Exception as exc:
            logger.info(f'[batch] failed to load prefix cache {path.name}: {exc}')
            self._discard(path)
            return None
        try:
            # NOTE(review): ``cache`` is whatever load_prompt_cache returns;
            # confirm async_eval actually schedules its arrays.
            mx.async_eval(cache)
        except Exception as exc:
            logger.info(f'[batch] failed to load prefix cache {path.name}: {exc}')
            return None
        return cache

    def store(self, prefix_tokens, kv_cache):
        """Persist ``kv_cache`` for ``prefix_tokens`` unless an entry already exists.

        Failures are logged and never raised.  A partially written file is
        removed so that a later call can write the entry again.
        """
        if not prefix_tokens or len(prefix_tokens) < MIN_PREFIX_TOKENS:
            return
        path = self._path(prefix_tokens)
        if path.exists():
            return
        try:
            from mlx_lm.models.cache import save_prompt_cache
            save_prompt_cache(str(path), kv_cache)
            logger.info(f'[batch] saved prefix cache len={len(prefix_tokens)} file={path.name}')
        except Exception as exc:
            logger.info(f'[batch] failed to save prefix cache: {exc}')
            # The path did not exist before this call, so anything there now
            # is a leftover from the failed write.
            self._discard(path)
63
+
64
def _prefill_prefix(model, tokens, prefill_step_size=2048):
    """Run ``tokens`` through ``model`` in chunks and return the filled prompt cache.

    A fresh cache is created with ``make_prompt_cache(model)``.  The prompt is
    then fed at most ``prefill_step_size`` tokens at a time; after every chunk
    the cache state is evaluated and the MLX buffer cache is cleared.
    """
    from mlx_lm.models.cache import make_prompt_cache
    kv = make_prompt_cache(model)
    ids = mx.array(tokens)
    total = ids.shape[0]
    done = 0
    while total - done > 0:
        width = min(prefill_step_size, total - done)
        chunk = ids[done:done + width]
        # Add a leading batch axis before calling the model.
        model(chunk[None], cache=kv)
        mx.eval([layer.state for layer in kv])
        done += width
        mx.clear_cache()
    return kv
75
+
76
def _get_prefix(tokens, ckpts):
    """Split off the cacheable prefix of ``tokens`` at the earliest checkpoint.

    Returns ``(prefix_tokens, prefix_len)``.  When ``ckpts`` is empty or the
    smallest checkpoint is below ``MIN_PREFIX_TOKENS`` the result is
    ``(None, 0)``.
    """
    nothing = (None, 0)
    if not ckpts:
        return nothing
    cut = min(ckpts)
    if cut < MIN_PREFIX_TOKENS:
        return nothing
    head = tokens[:cut]
    return (head, cut)
83
+
84
+ def make_batch_app(model_name: str, cache_dir: str='.cache'):
85
+ state = {'model': None, 'tokenizer': None, 'batch_gen': None, 'request_queue': _queue.Queue(), 'active': {}, 'loop': None, 'prefix_cache': None}
86
+
87
def _engine():
    """Run the batching loop forever; intended to be the engine thread's target.

    Each pass admits every queued request, blocks for one request when nothing
    is active, advances the batch generator by one step, and forwards the
    detokenized text of each result to that request's queue through
    ``loop.call_soon_threadsafe``.  ``None`` on a queue marks end of stream.
    """
    rq = state['request_queue']
    active = state['active']
    bg = state['batch_gen']
    tok = state['tokenizer']
    loop = state['loop']
    model = state['model']
    pcache = state['prefix_cache']

    def _admit(request):
        """Insert one queued request; on failure end only that request's stream."""
        tokens, max_tokens, token_queue, ckpts = request
        try:
            _insert(bg, active, pcache, model, tok, loop, tokens, max_tokens, token_queue, ckpts)
        except Exception:
            # An escaping exception would kill this thread and leave every
            # current and future request waiting forever.
            logger.exception('[batch] failed to admit request')
            loop.call_soon_threadsafe(token_queue.put_nowait, None)

    while True:
        # Drain everything that is already waiting without blocking.
        while True:
            try:
                request = rq.get_nowait()
            except _queue.Empty:
                break
            _admit(request)
        if not active:
            # Nothing to generate: block until a request arrives.
            _admit(rq.get())
            if not active:
                # Admission failed, so there is still nothing to step.
                continue
        try:
            results = bg.next_generated()
        except Exception:
            logger.exception('[batch] generation step failed; ending %d active stream(s)', len(active))
            for meta in list(active.values()):
                loop.call_soon_threadsafe(meta['q'].put_nowait, None)
            active.clear()
            continue
        for r in results:
            meta = active.get(r.uid)
            if meta is None:
                continue
            detok = meta['detok']
            detok.add_token(r.token)
            seg = detok.last_segment
            if r.finish_reason is not None:
                # NOTE(review): ``seg`` read above is not forwarded on this
                # branch, only the text produced by finalize(); confirm no
                # visible text from the last token is lost.
                detok.finalize()
                if (final := detok.last_segment):
                    loop.call_soon_threadsafe(meta['q'].put_nowait, final)
                loop.call_soon_threadsafe(meta['q'].put_nowait, None)
                del active[r.uid]
            elif seg:
                loop.call_soon_threadsafe(meta['q'].put_nowait, seg)
127
+
128
def _insert(bg, active, pcache, model, tok, loop, tokens, max_tokens, token_queue, ckpts):
    """Add one request to the batch generator and register it in ``active``.

    When ``_get_prefix`` yields a prefix, its cache is taken from ``pcache`` or
    prefilled and stored, and only the remaining suffix is inserted together
    with that cache.  If that insert raises, the full prompt is inserted
    without a cache instead.  Requests without a prefix are inserted whole.
    """
    prefix_tokens, prefix_len = _get_prefix(tokens, ckpts)
    if prefix_tokens is None:
        uids = bg.insert([tokens], [max_tokens])
        logger.info(f'[batch] no cache prompt={len(tokens)}')
    else:
        kv = pcache.lookup(prefix_tokens)
        was_hit = kv is not None
        if not was_hit:
            logger.info(f'[batch] prefilling prefix prefix={prefix_len} suffix={len(tokens) - prefix_len}')
            kv = _prefill_prefix(model, prefix_tokens)
            pcache.store(prefix_tokens, kv)
        suffix = tokens[prefix_len:]
        try:
            uids = bg.insert([suffix], [max_tokens], caches=[kv])
        except Exception as exc:
            logger.info(f'[batch] cache insert failed ({exc}), falling back to full prompt')
            uids = bg.insert([tokens], [max_tokens])
        else:
            if was_hit:
                logger.info(f'[batch] cache HIT prefix={prefix_len} suffix={len(suffix)}')
        # Drop the local reference to the prefix cache before clearing buffers.
        del kv
        mx.clear_cache()
    entry = {'q': token_queue, 'detok': tok.detokenizer}
    active[uids[0]] = entry
162
+
163
@asynccontextmanager
async def lifespan(_app):
    """Load the model on startup, start the engine thread, and close the generator on shutdown.

    Populates ``state`` with the model, tokenizer, batch generator, the running
    event loop, and a ``PrefixCache`` before the engine thread starts.
    """
    from mlx_lm import load
    from mlx_lm.generate import BatchGenerator
    from mlx_lm.tokenizer_utils import TokenizerWrapper
    logger.info(f'[batch] Loading model {model_name!r} …')
    model, tokenizer = load(model_name)
    if not isinstance(tokenizer, TokenizerWrapper):
        tokenizer = TokenizerWrapper(tokenizer)
    eos = set(tokenizer.eos_token_ids) | {tokenizer.eos_token_id}
    # A tokenizer without a primary EOS id reports None; that is not a token.
    eos.discard(None)
    # Each EOS id becomes its own single-token stop sequence.
    stop_tokens = [[t] for t in eos]
    batch_gen = BatchGenerator(model, stop_tokens=stop_tokens)
    state.update(model=model, tokenizer=tokenizer, batch_gen=batch_gen, loop=asyncio.get_running_loop(), prefix_cache=PrefixCache(model_name, cache_dir))
    logger.info('[batch] Model ready. Starting engine thread.')
    threading.Thread(target=_engine, daemon=True).start()
    try:
        yield
    finally:
        # Close the generator even when shutdown is triggered by an exception.
        batch_gen.close()
180
+
181
def _detect_api(path: str) -> str:
    """Return the wire-protocol name implied by a request path.

    ``'gemini'`` for ``/v1beta/models/...``, ``'claude'`` for ``/v1/messages``,
    ``'codex'`` for ``/v1/responses``, and ``'noapi'`` for anything else.

    This is a plain function, not a method.  The former ``@staticmethod``
    decorator made the name uncallable on Python versions before 3.10.
    """
    if path.startswith('/v1beta/models/'):
        return 'gemini'
    if path.startswith('/v1/messages'):
        return 'claude'
    if path.startswith('/v1/responses'):
        return 'codex'
    return 'noapi'
190
+
191
async def _stream_sse(token_queue, api, msg_id, in_tokens):
    """Turn queued text segments into adapter-formatted stream chunks.

    Segments are read from ``token_queue`` until ``None`` arrives.  Text is
    routed to the adapter as ``'thinking'`` or ``'text'`` according to
    ``<think>`` / ``</think>`` tags; the stream starts in the ``'thinking'``
    state.  After the stream ends, the full output is parsed for tool calls
    and the adapter's end chunk is emitted.
    """
    from . import main as _m
    adapters = {'claude': _m.ClaudeAdapter, 'codex': _m.CodexAdapter, 'gemini': _m.GeminiAdapter, 'noapi': _m.DefaultAdapter}
    adapter = adapters.get(api, _m.DefaultAdapter)(msg_id, in_tokens)
    yield adapter.start()
    st = 'thinking'
    pieces = []
    open_tag, close_tag = '<think>', '</think>'
    while True:
        text = await token_queue.get()
        if text is None:
            break
        pieces.append(text)
        seg = text
        # Apply every tag that is valid in the current state.
        while True:
            if st == 'text' and open_tag in seg:
                before, _, seg = seg.partition(open_tag)
                if before:
                    yield adapter.text('text', before)
                st = 'thinking'
            elif st == 'thinking' and close_tag in seg:
                before, _, seg = seg.partition(close_tag)
                if before:
                    yield adapter.text('thinking', before)
                st = 'text'
            else:
                break
        # Whatever tag is left does not apply to the current state (e.g. the
        # model emitted '<think>' while already thinking).  Looping on it would
        # never terminate and, with no await, would freeze the event loop.
        stray = open_tag if st == 'thinking' else close_tag
        if stray in seg:
            seg = seg.replace(stray, '')
        if seg:
            yield adapter.text(st, seg)
    buf = ''.join(pieces)
    if (tools := _m._parse_tools_xml(buf)):
        for tool in tools:
            yield adapter.tool(tool)
        yield adapter.end(True)
    else:
        yield adapter.end(False)
224
+
225
async def generate_endpoint(request: Request):
    """Handle a chat/messages/responses/Gemini request and stream the reply.

    The protocol is inferred from the request path. The prompt is encoded
    with ``main.encode``, queued for the engine thread, and the generated
    text is streamed back as ``text/event-stream``.

    Returns:
        A ``StreamingResponse`` on success, or a ``JSONResponse`` with
        status 503 (model not loaded), 400 (malformed request or empty
        prompt) or 500 (prompt encoding failed). Non-streaming Gemini calls
        receive a fixed canned JSON answer.
    """
    from . import main as _m
    if state['batch_gen'] is None:
        return JSONResponse({'error': 'model not loaded'}, status_code=503)
    path = request.url.path.split('?')[0].rstrip('/')
    api = _detect_api(path)
    if api == 'gemini':
        q = str(request.url.query) or ''
        if 'alt=sse' not in q and 'streamGenerateContent' not in path:
            # Non-streaming Gemini requests get a fixed reply without
            # running the model.
            return JSONResponse({'candidates': [{'content': {'role': 'model', 'parts': [{'text': '{"complexity_reasoning":"local","complexity_score":50}'}]}, 'finishReason': 'STOP'}], 'usageMetadata': {'promptTokenCount': 0, 'candidatesTokenCount': 0}})
    # Client mistakes are reported as 400 rather than surfacing as an
    # unhandled exception (HTTP 500).
    try:
        body = await request.json()
    except ValueError:
        return JSONResponse({'error': 'invalid JSON body'}, status_code=400)
    if not isinstance(body, dict):
        return JSONResponse({'error': 'request body must be a JSON object'}, status_code=400)
    try:
        max_tokens = int(body.get('max_tokens', body.get('max_completion_tokens', 8192)))
    except (TypeError, ValueError):
        return JSONResponse({'error': 'max_tokens must be an integer'}, status_code=400)
    try:
        prompt, ckpts = _m.encode(body, api, state['tokenizer'], None, None, None)
    except Exception as exc:
        return JSONResponse({'error': f'encode: {exc}'}, status_code=500)
    if ckpts is None or not prompt:
        return JSONResponse({'error': 'empty prompt'}, status_code=400)
    msg_id = f'msg_{uuid.uuid4().hex}'
    token_queue = asyncio.Queue()
    state['request_queue'].put((prompt, max_tokens, token_queue, ckpts))
    # _stream_sse is already an async generator, so it is passed straight
    # to the response without a wrapper.
    return StreamingResponse(_stream_sse(token_queue, api, msg_id, len(prompt)), media_type='text/event-stream')
251
+
252
async def simple_generate(request: Request):
    """Generate text from a raw prompt or a chat-message list.

    The JSON body may carry ``messages`` (rendered through the tokenizer's
    chat template) or ``prompt`` (used verbatim), plus optional
    ``max_tokens`` (default 256) and ``stream`` (default ``True``).

    Returns:
        A ``text/plain`` ``StreamingResponse`` when streaming, otherwise a
        ``JSONResponse`` of the form ``{'text': ...}``. Errors are reported
        as JSON with status 503 (model not loaded) or 400 (malformed request
        or empty prompt).
    """
    if state['batch_gen'] is None:
        return JSONResponse({'error': 'model not loaded'}, status_code=503)
    try:
        body = await request.json()
    except ValueError:
        return JSONResponse({'error': 'invalid JSON body'}, status_code=400)
    if not isinstance(body, dict):
        return JSONResponse({'error': 'request body must be a JSON object'}, status_code=400)
    try:
        # Coerce here, as generate_endpoint does, so a bad value is rejected
        # at the HTTP boundary instead of reaching the engine thread.
        max_tokens = int(body.get('max_tokens', 256))
    except (TypeError, ValueError):
        return JSONResponse({'error': 'max_tokens must be an integer'}, status_code=400)
    tok = state['tokenizer']
    if 'messages' in body:
        text = tok.apply_chat_template(body['messages'], tokenize=False, add_generation_prompt=True)
    else:
        text = body.get('prompt', '')
    tokens = tok.encode(text)
    if not tokens:
        return JSONResponse({'error': 'empty prompt'}, status_code=400)
    token_queue = asyncio.Queue()
    state['request_queue'].put((tokens, max_tokens, token_queue, []))

    async def _chunks():
        # Drain the queue until the None sentinel that marks end of output.
        while (chunk := await token_queue.get()) is not None:
            yield chunk

    if body.get('stream', True):
        return StreamingResponse(_chunks(), media_type='text/plain')
    return JSONResponse({'text': ''.join([chunk async for chunk in _chunks()])})
283
+
284
async def list_models(_req):
    """Return a model listing containing the single entry ``'local'``."""
    entry = {
        'id': 'local',
        'object': 'model',
        'created': int(time.time()),
        'owned_by': 'local',
    }
    return JSONResponse({'data': [entry]})
286
+
287
async def count_tokens(_req):
    """Answer every request with a fixed count of zero input tokens."""
    payload = {'input_tokens': 0}
    return JSONResponse(payload)
289
+
290
async def health(_req):
    """Report status, model name, active sequences and cached snapshots.

    ``prefix_cache_files`` is the number of ``*.safetensors`` files in the
    prefix cache directory, or 0 when there is no cache or directory.
    """
    cache = state['prefix_cache']
    if cache and cache.cache_dir.exists():
        snapshot_count = len(list(cache.cache_dir.glob('*.safetensors')))
    else:
        snapshot_count = 0
    report = {
        'status': 'ok',
        'model': model_name,
        'active_sequences': len(state['active']),
        'prefix_cache_files': snapshot_count,
    }
    return JSONResponse(report)
296
+ return Starlette(routes=[Route('/v1/models', list_models, methods=['GET']), Route('/v1/messages/count_tokens', count_tokens, methods=['POST']), Route('/v1/chat/completions', generate_endpoint, methods=['POST']), Route('/v1/messages', generate_endpoint, methods=['POST']), Route('/v1/responses', generate_endpoint, methods=['POST']), Route('/v1beta/models/{rest:path}', generate_endpoint, methods=['POST']), Route('/generate', simple_generate, methods=['POST']), Route('/health', health, methods=['GET'])], lifespan=lifespan)
297
# Script entry point: serve the batch app for the default model on port 8000.
# NOTE(review): host '0.0.0.0' listens on every network interface, and none of
# the visible routes check credentials; confirm this exposure is intended.
if __name__ == '__main__':
    import uvicorn
    uvicorn.run(make_batch_app('mlx-community/Qwen3.5-4B-OptiQ-4bit'), host='0.0.0.0', port=8000)
@@ -871,13 +871,13 @@ def make_handler(model_name, cache_dir, system, names, skips, gwt=None, parse_th
871
871
  raise
872
872
  return Handler
873
873
 
874
- def serve(host: str, port: int, model: str, cache: str, system: str | None, tools: list[str], skips: list[str], *, fixed_port: bool=False, gwt=None) -> tuple[HTTPServer, str]:
874
+ def _serve_cache(host, port, model, cache, system, tools, skips, *, fixed_port=False, gwt=None):
875
875
  handler = make_handler(model, cache, system, tools, skips, gwt)
876
876
  while True:
877
877
  try:
878
878
  server = HTTPServer((host, port), handler)
879
879
  url = f'http://{host}:{port}'
880
- logger.debug(f'Server bound to {url}')
880
+ logger.debug(f'Cache server bound to {url}')
881
881
  return (server, url)
882
882
  except OSError as e:
883
883
  if e.errno in (48, 98):
@@ -888,12 +888,52 @@ def serve(host: str, port: int, model: str, cache: str, system: str | None, tool
888
888
  else:
889
889
  raise
890
890
 
891
def _serve_batch(host, port, model, cache_dir='.cache', *, fixed_port=False):
    """Start the batching server in a background thread and wait until it accepts connections.

    Args:
        host: Interface to bind.
        port: First port to try.
        model: Model path or repo id passed to ``make_batch_app``.
        cache_dir: Directory passed to ``make_batch_app``.
        fixed_port: When true, exit instead of trying the next port if
            ``port`` is taken.

    Returns:
        ``(uv_server, url)``: the ``uvicorn.Server`` instance and its base URL.

    Exits the process with status 1 when a fixed port is already in use, or
    when the server thread terminates before it starts listening.
    """
    import uvicorn
    from .bats import make_batch_app
    import socket
    import time
    app = make_batch_app(model, cache_dir=cache_dir)
    # Probe for a free port by binding a throwaway socket; uvicorn binds the
    # real one later.
    while True:
        try:
            with socket.socket() as s:
                s.bind((host, port))
        except OSError as e:
            # 48 / 98 are EADDRINUSE on macOS / Linux (same check as _serve_cache).
            if e.errno not in (48, 98):
                raise
            if fixed_port:
                logger.error(f'Port {port} is already in use.')
                sys.exit(1)
            port += 1
        else:
            break
    config = uvicorn.Config(app, host=host, port=port, loop='asyncio', log_level='warning')
    uv_server = uvicorn.Server(config)
    t = threading.Thread(target=uv_server.run, daemon=True)
    t.start()
    start_time = time.monotonic()
    notified = False
    # Poll until the server accepts a TCP connection.
    while True:
        try:
            with socket.create_connection((host, port), timeout=0.1):
                break
        except OSError:
            if not t.is_alive():
                # The server thread has ended (e.g. startup failed), so the
                # port will never open; fail instead of polling forever.
                logger.error('Batch server exited before it started accepting connections.')
                sys.exit(1)
            if not notified and time.monotonic() - start_time > 3.0:
                logger.info('Waiting for batch server to start (model may be downloading)...')
                notified = True
            time.sleep(0.2)
    url = f'http://{host}:{port}'
    logger.debug(f'Batch server bound to {url}')
    return (uv_server, url)
929
+
891
930
  def main():
892
931
  parser = argparse.ArgumentParser(description='mlx-code MAIN')
893
932
  parser.add_argument('-p', '--prompt', default=None, help='Initial prompt sent automatically when the REPL starts')
894
933
  parser.add_argument('-r', '--resume', default=None, metavar='COMMIT', help='Resume a previous session from the given git commit hash')
895
934
  parser.add_argument('-m', '--model', default='mlx-community/Qwen3.5-4B-OptiQ-4bit', help='MLX model path or HuggingFace repo ID (default: Qwen3.5-4B-OptiQ-4bit)')
896
935
  parser.add_argument('-l', '--leash', choices=['claude', 'codex', 'gemini', 'noapi', 'none'], default='noapi', help="AI harness to launch against the server; 'noapi' starts the built-in REPL, 'none' runs the server only")
936
+ parser.add_argument('--engine', choices=['cache', 'batch'], default='cache', help="'cache' uses PromptCache + single-sequence (default); 'batch' uses BatchGenerator for concurrent sequences (only compatible with --leash none or noapi)")
897
937
  parser.add_argument('--skill', default=None, help='Directory to scan recursively for SKILL.md files')
898
938
  parser.add_argument('--tools', nargs='+', default=None, help='Whitelist of tool names to enable; allows all tools when omitted')
899
939
  parser.add_argument('--system', type=str, default=None, help='System prompt override passed to the model')
@@ -903,10 +943,14 @@ def main():
903
943
  parser.add_argument('--port', type=int, default=None, help='Port to listen on; auto-increments if already in use (default: 8000)')
904
944
  parser.add_argument('--skips', nargs='+', default=['(?m)^\\[SUGGESTION MODE[\\s\\S]*', '(?m)^<system-reminder>[\\s\\S]*?^</system-reminder>\\s*'], help='Regex patterns stripped from model output before it is returned to the client')
905
945
  parser.add_argument('--stream', default=None, help='File to stream log into')
906
- parser.add_argument('--notui', action='store_true', help='Use simple terminal REPL instead of TUI')
946
+ parser.add_argument('--bare', action='store_true', help='Use simple terminal REPL instead of TUI')
907
947
  args, leash_args = parser.parse_known_args()
908
948
  logger.debug(f'args={args!r} leash_args={leash_args!r}')
949
+ if args.engine == 'batch' and args.leash not in ('none', 'noapi'):
950
+ parser.error('--engine batch only supports --leash none or --leash noapi for now')
909
951
  cache = os.path.abspath(args.cache)
952
+ port = args.port if args.port is not None else 8000
953
+ fixed_port = args.port is not None
910
954
  with tempfile.TemporaryDirectory(dir='/tmp') as _home:
911
955
  env = os.environ.copy()
912
956
  home = Path(_home)
@@ -915,18 +959,28 @@ def main():
915
959
  env['HOME'] = _home
916
960
  env['SHELL'] = '/bin/bash'
917
961
  env['PWD'] = cwd
918
- server, url = serve(host=args.host, port=args.port if args.port is not None else 8000, model=args.model, cache=cache, system=None if args.leash in ('none', 'noapi') else args.system, tools=args.tools, skips=args.skips, fixed_port=args.port is not None, gwt=gwt)
962
+ if args.engine == 'batch':
963
+ server, url = _serve_batch(args.host, port, args.model, cache_dir=cache, fixed_port=fixed_port)
964
+ else:
965
+ server, url = _serve_cache(host=args.host, port=port, model=args.model, cache=cache, system=None if args.leash in ('none', 'noapi') else args.system, tools=args.tools, skips=args.skips, fixed_port=fixed_port, gwt=gwt)
919
966
  if args.leash == 'none':
920
- try:
921
- server.serve_forever()
922
- except KeyboardInterrupt:
923
- print('\nShutting down server...')
924
- server.server_close()
967
+ if args.engine == 'batch':
968
+ try:
969
+ threading.Event().wait()
970
+ except KeyboardInterrupt:
971
+ print('\nShutting down server...')
972
+ else:
973
+ try:
974
+ server.serve_forever()
975
+ except KeyboardInterrupt:
976
+ print('\nShutting down server...')
977
+ server.server_close()
925
978
  else:
926
- threading.Thread(target=server.serve_forever, daemon=True).start()
979
+ if args.engine == 'cache':
980
+ threading.Thread(target=server.serve_forever, daemon=True).start()
927
981
  if args.leash == 'noapi':
928
982
  from .repl import run_repl
929
- run_repl(base_url=url, api=args.leash, repo=cwd, env=env, system=args.system, tool_names=args.tools, sdir=args.skill, init_prompt=args.prompt, resume=args.resume, stream=args.stream, notui=args.notui)
983
+ run_repl(base_url=url, api=args.leash, repo=cwd, env=env, system=args.system, tool_names=args.tools, sdir=args.skill, init_prompt=args.prompt, resume=args.resume, stream=args.stream, bare=args.bare)
930
984
  else:
931
985
  env['GOOGLE_GEMINI_BASE_URL'] = url
932
986
  env['GEMINI_API_KEY'] = 'mc'
@@ -980,10 +980,10 @@ async def _stream_to_stdout(agent: Agent, user_input: str) -> None:
980
980
  if text:
981
981
  print(text)
982
982
 
983
- async def repl(agent, init_prompt=None, notui=False):
983
+ async def repl(agent, init_prompt=None, bare=False):
984
984
  is_tty = sys.stdin.isatty() and sys.stdout.isatty()
985
- if notui and is_tty:
986
- from .ntui import SimpleRepl
985
+ if bare and is_tty:
986
+ from .bare import SimpleRepl
987
987
  sr = SimpleRepl(agent, init_prompt=init_prompt)
988
988
  await sr.run()
989
989
  return None
@@ -1025,7 +1025,7 @@ _AGENT_ENV_ALLOWLIST: re.Pattern = re.compile('\n ^(\n # ── Execution
1025
1025
  def _make_agent_env(base: dict[str, str]) -> dict[str, str]:
1026
1026
  return {k: v for k, v in base.items() if _AGENT_ENV_ALLOWLIST.match(k)}
1027
1027
 
1028
- def run_repl(*, base_url=None, model=None, api: Literal['claude', 'codex', 'gemini', 'deepseek', 'noapi']='noapi', system='', sdir=None, skills=None, env=None, tool_names=None, extra_tool_classes=None, api_key=None, gwt=None, ctx=None, init_prompt=None, resume_messages=None, repo=None, resume=None, stream=None, verbose_transcript=False, notui=False):
1028
+ def run_repl(*, base_url=None, model=None, api: Literal['claude', 'codex', 'gemini', 'deepseek', 'noapi']='noapi', system='', sdir=None, skills=None, env=None, tool_names=None, extra_tool_classes=None, api_key=None, gwt=None, ctx=None, init_prompt=None, resume_messages=None, repo=None, resume=None, stream=None, verbose_transcript=False, bare=False):
1029
1029
  repo = os.path.abspath(repo or os.getcwd())
1030
1030
  with tempfile.TemporaryDirectory(dir=tempfile.gettempdir()) as _home:
1031
1031
  if gwt is None:
@@ -1064,7 +1064,7 @@ def run_repl(*, base_url=None, model=None, api: Literal['claude', 'codex', 'gemi
1064
1064
  print(f'[resumed {len(resume_messages)} messages from checkpoint]')
1065
1065
  app_instance = None
1066
1066
  try:
1067
- app_instance = asyncio.run(repl(agent, init_prompt=init_prompt, notui=notui))
1067
+ app_instance = asyncio.run(repl(agent, init_prompt=init_prompt, bare=bare))
1068
1068
  finally:
1069
1069
  if log_fp:
1070
1070
  log_fp.close()
@@ -1103,7 +1103,7 @@ def main():
1103
1103
  parser.add_argument('--key', default=None, help='API key')
1104
1104
  parser.add_argument('--stream', default=None, help='File to stream log into')
1105
1105
  parser.add_argument('--verbose-transcript', action='store_true', help='Reserved; not yet implemented')
1106
- parser.add_argument('--notui', action='store_true', help='Use simple terminal REPL instead of TUI')
1106
+ parser.add_argument('--bare', action='store_true', help='Use simple terminal REPL instead of TUI')
1107
1107
  args = parser.parse_args()
1108
1108
  logger.debug(args)
1109
1109
  url, model, tool_names, api_key = (args.url, args.model, args.tools, args.key)
@@ -1117,6 +1117,6 @@ def main():
1117
1117
  url = 'https://generativelanguage.googleapis.com' if api_key else url
1118
1118
  model = 'gemini-3.1-flash-lite' if model is None else model
1119
1119
  tool_names = [] if tool_names is None else tool_names
1120
- run_repl(api=args.api, system=args.system, repo=args.cwd, model=model, base_url=url, tool_names=tool_names, sdir=args.skill, api_key=api_key, init_prompt=args.prompt, resume=args.resume, stream=args.stream, notui=args.notui)
1120
+ run_repl(api=args.api, system=args.system, repo=args.cwd, model=model, base_url=url, tool_names=tool_names, sdir=args.skill, api_key=api_key, init_prompt=args.prompt, resume=args.resume, stream=args.stream, bare=args.bare)
1121
1121
  if __name__ == '__main__':
1122
1122
  main()
@@ -597,7 +597,7 @@ def tui(stdscr, entries, log_file, initial_filter='', initial_visible=None):
597
597
  def main():
598
598
  parser = argparse.ArgumentParser(description='TUI viewer for JSON log files')
599
599
  parser.add_argument('logfile', nargs='?', default='.log.json', help='Path to log file (default: .log.json)')
600
- parser.add_argument('-f', '--filter', default=f'lvl:10;file:main,repl,gits,apis,tools', help='Initial filter string (same syntax as in UI)')
600
+ parser.add_argument('-f', '--filter', default=f'lvl:10;file:main,bats,repl,bare,gits,apis,tools', help='Initial filter string (same syntax as in UI)')
601
601
  parser.add_argument('-o', '--out', dest='out', metavar='FILE', help='Write marked entries to FILE (JSON lines format) instead of stdout')
602
602
  args = parser.parse_args()
603
603
  log_path = args.logfile
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mlx-code
3
- Version: 0.0.24
3
+ Version: 0.0.26
4
4
  Summary: Coding Agent for Mac
5
5
  Home-page: https://josefalbers.github.io/mlx-code/
6
6
  Author: J Joe
@@ -17,6 +17,8 @@ Requires-Dist: httpx
17
17
  Requires-Dist: pydantic
18
18
  Requires-Dist: textual>=8.2.7
19
19
  Requires-Dist: rich>=15.0.0
20
+ Requires-Dist: starlette
21
+ Requires-Dist: uvicorn
20
22
  Provides-Extra: all
21
23
  Requires-Dist: python-lsp-server[all]; extra == "all"
22
24
  Requires-Dist: GitPython; extra == "all"
@@ -47,7 +49,7 @@ A Git-native coding agent that can run entirely on your Mac. No API keys, no clo
47
49
  ```
48
50
  Conversation tree (nodes = git commits with embedded chat history)
49
51
 
50
- main ──●──●──●──●──●──●──●──●──●──●
52
+ main ──●──●──●──●──●──●──●──●──●──●──●──●──●──●
51
53
  │ │
52
54
  │ └── branch-1 ──●──●──●
53
55
  │ │ ┌────────────┐
@@ -66,21 +68,21 @@ REPL tabs (each tab = a git branch + agent) │
66
68
  │ └──────┘ └────┬─────┘ └──────────┘ └────────────┘ │
67
69
  └─────────────────┼──────────────────────────────────────┘
68
70
 
69
- ├────────────────────────────────────► each tab is an independent Agent
71
+ ├─────────────────────────────────────────► Each tab is an independent Agent
70
72
 
71
- ┌────┴─────────────────────────────────┐
72
- │ Agent
73
- ┌──────────────┐ ┌──────────────┐
74
- │ │ API: │ │ Tools: │ │
75
- │ │ MLX (local) │ │ Read Write │ │
76
- │ │ Claude │ │ Edit Bash │ │
77
- │ │ Gemini │ │ Grep Find │ │
78
- │ │ OpenAI │ │ Ls Skill │ │
79
- └──────────────┘ │ Agent ───────┼──┼───► spawns child Agent
80
- └──────────────┘ │ (each with own tools + worktree + etc)
81
- │ Git worktree
82
- │ (isolation + session state)
83
- └──────────────────────────────────────┘
73
+ ┌────┴─────────────────────────────────────┐
74
+ │ Agent
75
+ ┌────────────────┐ ┌────────────────┐
76
+ │ │ API: │ │ Tools: │ │
77
+ │ │ Local (mlx-lm) │ │ Read Write │ │
78
+ │ │ Claude │ │ Edit Bash │ │
79
+ │ │ Gemini │ │ Grep Find │ │
80
+ │ │ OpenAI │ │ Ls Skill │ │
81
+ └────────────────┘ │ Agent ─────────┼──┼───► Spawns child Agent
82
+ └────────────────┘ │ (each with own tools + worktree + etc)
83
+ │ Git worktree
84
+ │ (isolation + session state)
85
+ └──────────────────────────────────────────┘
84
86
  ```
85
87
 
86
88
  Each layer is importable and composable on its own. A commit records state, a branch records an alternative path, and a tab is just a live view over an `Agent`.
@@ -98,10 +100,15 @@ result = await agent.run('refactor utils.py to use dataclasses')
98
100
  ## Quick start
99
101
 
100
102
  ```bash
103
+ # ephemeral run (no installation)
104
+ uvx --from mlx-code mlc
105
+
106
+ # or install into the current environment
101
107
  pip install mlx-code
102
- mlc # launch with local MLX model
108
+
109
+ # launch
110
+ mlc # with a local MLX model
103
111
  mlc-run --api gemini # or use a remote provider
104
- mlc-run --api deepseek --model deepseek-v4-flash
105
112
  ```
106
113
 
107
114
  That's it. The first run starts a local inference server and drops you into the REPL.
@@ -123,12 +130,12 @@ That's it. The first run starts a local inference server and drops you into the
123
130
 
124
131
  **Git is the database.** When the agent makes file changes, they’re committed to a git worktree with the full conversation embedded in the commit message. Resume any past session by hash, branch from any checkpoint, and inspect the agent timeline with `git log`. No proprietary state files, just Git.
125
132
 
126
- **Your working directory is never at risk.** The agent operates inside a `git worktree`, not your checkout. It can make a mess, and you can inspect or discard it without ever touching `main`.
127
-
128
- **Built-in safety nets.** Subprocess environment variables go through an explicit allowlist, so secrets in your shell are never leaked to agent-spawned processes.
133
+ **Built-in safety nets.** Your working directory is never at risk. The agent operates inside a `git worktree`, not your checkout. It can make a mess, and you can inspect or discard it without ever touching `main`. Subprocess environment variables go through an explicit allowlist, so secrets in your shell are never leaked to agent-spawned processes.
129
134
 
130
135
  **Batteries included.** Everything ships in one pip install: the MLX inference engine, the multi-protocol API server, the agent loop, the tools, and the TUI. No llama.cpp, no ollama, no vLLM bridge to find and configure. And the server natively speaks OpenAI, Anthropic, Gemini, and Codex wire formats simultaneously, so `claude`, `codex`, and `gemini` CLIs can all work against your local model without a translation layer.
131
136
 
137
+ **Continuous batching.** The local inference server runs a continuous batching engine that processes multiple sequences concurrently. When you spawn parallel agents (e.g., multiple tabs, `asyncio.gather` pipelines, or delegated sub-tasks) they all share the same GPU context and are stepped together each tick. A prefix cache persists KV snapshots to disk, so repeated system prompts and conversation prefixes are prefilled once and reused across sessions. No request queueing, no waiting for the previous agent to finish.
138
+
132
139
  ---
133
140
 
134
141
  ## Agent primitive
@@ -166,12 +173,12 @@ agent.messages = messages
166
173
  await agent.run("now add unit tests")
167
174
  ```
168
175
 
169
- Branch from any point in the conversation each branch gets its own worktree:
176
+ Branch from any point in the conversation. Each branch gets its own worktree:
170
177
 
171
178
  ```
172
179
  /branch # branch from current state
173
180
  /branch --rev 2 # branch from the 2nd user turn
174
- /branch --rev 3 --as-worktree try different approach
181
+ /branch --rev 3 make it use httpx instead
175
182
  ```
176
183
 
177
184
  Since it's just git, you can inspect the timeline outside the REPL:
@@ -236,6 +243,43 @@ Reliability comes from specialization plus constraint. A read-only reviewer can'
236
243
 
237
244
  ---
238
245
 
246
+ ## Continuous batching
247
+
248
+ The local server can run multiple inference sequences concurrently inside a single batch step. Instead of a global lock that serialises one request at a time, the batching engine maintains a live set of active sequences and yields tokens for all of them on every step.
249
+
250
+ ```bash
251
+ mlc --engine batch # continuous batching + built-in REPL
252
+ ```
253
+
254
+ This unlocks true parallelism for multi-agent workloads:
255
+
256
+ ```python
257
+ import asyncio
258
+ from mlx_code.repl import Agent
259
+
260
+ async def main():
261
+ agents = [Agent() for _ in range(4)]
262
+ await asyncio.gather(*[
263
+ a.run(f"Research topic: {t}")
264
+ for a, t in zip(agents, ["consensus", "cryptography", "networking", "storage"])
265
+ ])
266
+
267
+ asyncio.run(main())
268
+ ```
269
+
270
+ All four agents generate simultaneously inside the same batch. No sequential blocking.
271
+
272
+ ### Health endpoint
273
+
274
+ ```bash
275
+ curl http://127.0.0.1:8000/health
276
+ # {"status":"ok","model":"mlx-community/Qwen3.5-4B-OptiQ-4bit","active_sequences":2,"prefix_cache_files":5}
277
+ ```
278
+
279
+ `active_sequences` shows how many agents are generating right now; `prefix_cache_files` shows how many prefix KV snapshots are stored on disk.
280
+
281
+ ---
282
+
239
283
  ## Command Line
240
284
 
241
285
  ### `mlc`: local server + harness
@@ -243,20 +287,20 @@ Reliability comes from specialization plus constraint. A read-only reviewer can'
243
287
  Starts the MLX inference server and launches the built-in TUI harness against it.
244
288
 
245
289
  ```bash
246
- # Default: local server + default TUI
290
+ # Default: local server + default harness
247
291
  mlc
248
292
 
249
- # Use a simple terminal REPL instead of the TUI
250
- mlc --notui
293
+ # Continuous batching mode (default is sequential caching mode)
294
+ mlc --engine batch
295
+
296
+ # Server only, no harness
297
+ mlc --leash none
251
298
 
252
299
  # Use a different harness (routes traffic through the local server)
253
300
  mlc --leash claude
254
301
  mlc --leash gemini
255
302
  mlc --leash codex
256
303
 
257
- # Server only, no harness
258
- mlc --leash none
259
-
260
304
  # Specify a model
261
305
  mlc --model mlx-community/Qwen3.5-4B-OptiQ-4bit
262
306
 
@@ -307,7 +351,7 @@ mlc-run --api codex
307
351
  echo "explain lsp.py" | mlc-run -a deepseek | cat - PLAN.md | mlc-run --url http://localhost:9000
308
352
 
309
353
  # Simple terminal REPL (no TUI)
310
- mlc-run --notui
354
+ mlc-run --bare
311
355
  ```
312
356
 
313
357
  ---
@@ -432,18 +476,19 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
432
476
 
433
477
  | Command | Description |
434
478
  |---|---|
435
- | `/help` | Show command reference |
479
+ | `/branch [--rev N] [prompt]` | Open a new branch tab from the current (or earlier) checkpoint |
480
+ | `/diff [--all]` | Show a side-by-side diff of changes in the worktree |
436
481
  | `/clear [--config F]` | Clear conversation; `--config` reloads agent from a JSON/YAML file |
482
+ | `/tab [N]` | Jump to tab N |
437
483
  | `/history [--raw]` | Show conversation transcript; `--raw` shows the raw API message log |
438
- | `/diff [--all]` | Show a side-by-side diff of changes in the worktree |
439
- | `/errors` | Show timestamped error log for the current tab |
440
484
  | `/tools` | List active tools |
441
- | `/branch [--rev N] [prompt]` | Open a new branch tab from the current (or earlier) checkpoint |
442
485
  | `/abort` | Abort the running agent |
486
+ | `/errors` | Show timestamped error log for the current tab |
443
487
  | `/export [path]` | Export session to JSON |
444
488
  | `/exit [--all]` | Close branch tab, or exit the app |
445
- | `!command` | Run a shell command; output captured in the TUI |
446
- | `$command` | Run an interactive command (TUI suspends, terminal handed to process) |
489
+ | `/help` | Show command reference |
490
+ | `!command` | Run a shell command; output captured in the TUI (e.g., `ls`, `cat hello.c`) |
491
+ | `$command` | Run an interactive command (e.g., `vim`, `yazi`, `less hello.c`) |
447
492
 
448
493
  ### Key bindings
449
494
 
@@ -453,7 +498,7 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
453
498
  | `Ctrl-J` | Insert newline |
454
499
  | `Ctrl-1` … `Ctrl-9` | Jump to tab N |
455
500
  | `Ctrl-,` / `Ctrl-.` | Cycle through tabs |
456
- | `Ctrl-C` | Abort running agent |
501
+ | `Ctrl-C` | Clear input, or abort running agent |
457
502
  | `Ctrl-D` | Close branch tab, or exit app |
458
503
  | `Ctrl-R` | Recall last prompt into editor |
459
504
 
@@ -471,7 +516,7 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
471
516
  | `Skill` | Retrieve named skill instructions from config |
472
517
  | `Agent` | Spawn an autonomous sub-agent for delegated work |
473
518
 
474
- All file tools enforce path sandboxing the agent cannot read or write outside the worktree.
519
+ All file tools enforce path sandboxing. The agent cannot read or write outside the worktree.
475
520
 
476
521
  ### Backends
477
522
 
@@ -3,12 +3,13 @@ README.md
3
3
  setup.py
4
4
  mlx_code/__init__.py
5
5
  mlx_code/apis.py
6
+ mlx_code/bare.py
7
+ mlx_code/bats.py
6
8
  mlx_code/gits.py
7
9
  mlx_code/lsp_tool.py
8
10
  mlx_code/main.py
9
11
  mlx_code/mcb.py
10
12
  mlx_code/mcb_tool.py
11
- mlx_code/ntui.py
12
13
  mlx_code/repl.py
13
14
  mlx_code/stream_log.py
14
15
  mlx_code/tools.py
@@ -2,6 +2,8 @@ httpx
2
2
  pydantic
3
3
  textual>=8.2.7
4
4
  rich>=15.0.0
5
+ starlette
6
+ uvicorn
5
7
 
6
8
  [:platform_system == "Darwin"]
7
9
  mlx-lm>=0.31.3
@@ -11,7 +11,7 @@ setup(
11
11
  author_email="albersj66@gmail.com",
12
12
  author="J Joe",
13
13
  license="Apache-2.0",
14
- version="0.0.24",
14
+ version="0.0.26",
15
15
  readme="README.md",
16
16
  description="Coding Agent for Mac",
17
17
  long_description=open("README.md").read(),
@@ -24,6 +24,9 @@ setup(
24
24
 
25
25
  "textual>=8.2.7",
26
26
  "rich>=15.0.0",
27
+
28
+ "starlette",
29
+ "uvicorn",
27
30
  ],
28
31
  extras_require={"all": [
29
32
  "python-lsp-server[all]",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes