mlx-code 0.0.25__tar.gz → 0.0.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {mlx_code-0.0.25 → mlx_code-0.0.26}/PKG-INFO +78 -38
  2. {mlx_code-0.0.25 → mlx_code-0.0.26}/README.md +75 -37
  3. mlx_code-0.0.25/mlx_code/ntui.py → mlx_code-0.0.26/mlx_code/bare.py +1 -0
  4. mlx_code-0.0.26/mlx_code/bats.py +299 -0
  5. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/main.py +65 -11
  6. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/repl.py +7 -7
  7. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/view_log.py +1 -1
  8. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code.egg-info/PKG-INFO +78 -38
  9. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code.egg-info/SOURCES.txt +2 -1
  10. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code.egg-info/requires.txt +2 -0
  11. {mlx_code-0.0.25 → mlx_code-0.0.26}/setup.py +4 -1
  12. {mlx_code-0.0.25 → mlx_code-0.0.26}/LICENSE +0 -0
  13. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/__init__.py +0 -0
  14. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/apis.py +0 -0
  15. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/gits.py +0 -0
  16. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/lsp_tool.py +0 -0
  17. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/mcb.py +0 -0
  18. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/mcb_tool.py +0 -0
  19. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/stream_log.py +0 -0
  20. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/tools.py +0 -0
  21. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/util.py +0 -0
  22. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code/view_git.py +0 -0
  23. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code.egg-info/dependency_links.txt +0 -0
  24. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code.egg-info/entry_points.txt +0 -0
  25. {mlx_code-0.0.25 → mlx_code-0.0.26}/mlx_code.egg-info/top_level.txt +0 -0
  26. {mlx_code-0.0.25 → mlx_code-0.0.26}/setup.cfg +0 -0
  27. {mlx_code-0.0.25 → mlx_code-0.0.26}/tests/__init__.py +0 -0
  28. {mlx_code-0.0.25 → mlx_code-0.0.26}/tests/test.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mlx-code
3
- Version: 0.0.25
3
+ Version: 0.0.26
4
4
  Summary: Coding Agent for Mac
5
5
  Home-page: https://josefalbers.github.io/mlx-code/
6
6
  Author: J Joe
@@ -17,6 +17,8 @@ Requires-Dist: httpx
17
17
  Requires-Dist: pydantic
18
18
  Requires-Dist: textual>=8.2.7
19
19
  Requires-Dist: rich>=15.0.0
20
+ Requires-Dist: starlette
21
+ Requires-Dist: uvicorn
20
22
  Provides-Extra: all
21
23
  Requires-Dist: python-lsp-server[all]; extra == "all"
22
24
  Requires-Dist: GitPython; extra == "all"
@@ -47,7 +49,7 @@ A Git-native coding agent that can run entirely on your Mac. No API keys, no clo
47
49
  ```
48
50
  Conversation tree (nodes = git commits with embedded chat history)
49
51
 
50
- main ──●──●──●──●──●──●──●──●──●──●
52
+ main ──●──●──●──●──●──●──●──●──●──●──●──●──●──●
51
53
  │ │
52
54
  │ └── branch-1 ──●──●──●
53
55
  │ │ ┌────────────┐
@@ -66,21 +68,21 @@ REPL tabs (each tab = a git branch + agent) │
66
68
  │ └──────┘ └────┬─────┘ └──────────┘ └────────────┘ │
67
69
  └─────────────────┼──────────────────────────────────────┘
68
70
 
69
- ├────────────────────────────────────► each tab is an independent Agent
71
+ ├─────────────────────────────────────────► Each tab is an independent Agent
70
72
 
71
- ┌────┴─────────────────────────────────┐
72
- │ Agent
73
- ┌──────────────┐ ┌──────────────┐
74
- │ │ API: │ │ Tools: │ │
75
- │ │ MLX (local) │ │ Read Write │ │
76
- │ │ Claude │ │ Edit Bash │ │
77
- │ │ Gemini │ │ Grep Find │ │
78
- │ │ OpenAI │ │ Ls Skill │ │
79
- └──────────────┘ │ Agent ───────┼──┼───► spawns child Agent
80
- └──────────────┘ │ (each with own tools + worktree + etc)
81
- │ Git worktree
82
- │ (isolation + session state)
83
- └──────────────────────────────────────┘
73
+ ┌────┴─────────────────────────────────────┐
74
+ │ Agent
75
+ ┌────────────────┐ ┌────────────────┐
76
+ │ │ API: │ │ Tools: │ │
77
+ │ │ Local (mlx-lm) │ │ Read Write │ │
78
+ │ │ Claude │ │ Edit Bash │ │
79
+ │ │ Gemini │ │ Grep Find │ │
80
+ │ │ OpenAI │ │ Ls Skill │ │
81
+ └────────────────┘ │ Agent ─────────┼──┼───► Spawns child Agent
82
+ └────────────────┘ │ (each with own tools + worktree + etc)
83
+ │ Git worktree
84
+ │ (isolation + session state)
85
+ └──────────────────────────────────────────┘
84
86
  ```
85
87
 
86
88
  Each layer is importable and composable on its own. A commit records state, a branch records an alternative path, and a tab is just a live view over an `Agent`.
@@ -104,9 +106,9 @@ uvx --from mlx-code mlc
104
106
  # or install into the current environment
105
107
  pip install mlx-code
106
108
 
107
- mlc # launch with local MLX model
109
+ # launch
110
+ mlc # with a local MLX model
108
111
  mlc-run --api gemini # or use a remote provider
109
- mlc-run --api deepseek --model deepseek-v4-flash
110
112
  ```
111
113
 
112
114
  That's it. The first run starts a local inference server and drops you into the REPL.
@@ -128,12 +130,12 @@ That's it. The first run starts a local inference server and drops you into the
128
130
 
129
131
  **Git is the database.** When the agent makes file changes, they’re committed to a git worktree with the full conversation embedded in the commit message. Resume any past session by hash, branch from any checkpoint, and inspect the agent timeline with `git log`. No proprietary state files, just Git.
130
132
 
131
- **Your working directory is never at risk.** The agent operates inside a `git worktree`, not your checkout. It can make a mess, and you can inspect or discard it without ever touching `main`.
132
-
133
- **Built-in safety nets.** Subprocess environment variables go through an explicit allowlist, so secrets in your shell are never leaked to agent-spawned processes.
133
+ **Built-in safety nets.** Your working directory is never at risk. The agent operates inside a `git worktree`, not your checkout. It can make a mess, and you can inspect or discard it without ever touching `main`. Subprocess environment variables go through an explicit allowlist, so secrets in your shell are never leaked to agent-spawned processes.
134
134
 
135
135
  **Batteries included.** Everything ships in one pip install: the MLX inference engine, the multi-protocol API server, the agent loop, the tools, and the TUI. No llama.cpp, no ollama, no vLLM bridge to find and configure. And the server natively speaks OpenAI, Anthropic, Gemini, and Codex wire formats simultaneously, so `claude`, `codex`, and `gemini` CLIs can all work against your local model without a translation layer.
136
136
 
137
+ **Continuous batching.** With `mlc --engine batch`, the local inference server runs a continuous batching engine that processes multiple sequences concurrently. When you spawn parallel agents (e.g., multiple tabs, `asyncio.gather` pipelines, or delegated sub-tasks), they all share the same GPU context and are stepped together on each tick. A prefix cache persists KV snapshots to disk, so repeated system prompts and conversation prefixes are prefilled once and reused across sessions. No request queueing, no waiting for the previous agent to finish.
138
+
137
139
  ---
138
140
 
139
141
  ## Agent primitive
@@ -171,12 +173,12 @@ agent.messages = messages
171
173
  await agent.run("now add unit tests")
172
174
  ```
173
175
 
174
- Branch from any point in the conversation each branch gets its own worktree:
176
+ Branch from any point in the conversation. Each branch gets its own worktree:
175
177
 
176
178
  ```
177
179
  /branch # branch from current state
178
180
  /branch --rev 2 # branch from the 2nd user turn
179
- /branch --rev 3 --as-worktree try different approach
181
+ /branch --rev 3 make it use httpx instead
180
182
  ```
181
183
 
182
184
  Since it's just git, you can inspect the timeline outside the REPL:
@@ -241,6 +243,43 @@ Reliability comes from specialization plus constraint. A read-only reviewer can'
241
243
 
242
244
  ---
243
245
 
246
+ ## Continuous batching
247
+
248
+ The local server can run multiple inference sequences concurrently inside a single batch step. Instead of a global lock that serialises one request at a time, the batching engine maintains a live set of active sequences and yields tokens for all of them on every step.
249
+
250
+ ```bash
251
+ mlc --engine batch # continuous batching + built-in REPL
252
+ ```
253
+
254
+ This unlocks true parallelism for multi-agent workloads:
255
+
256
+ ```python
257
+ import asyncio
258
+ from mlx_code.repl import Agent
259
+
260
+ async def main():
261
+ agents = [Agent() for _ in range(4)]
262
+ await asyncio.gather(*[
263
+ a.run(f"Research topic: {t}")
264
+ for a, t in zip(agents, ["consensus", "cryptography", "networking", "storage"])
265
+ ])
266
+
267
+ asyncio.run(main())
268
+ ```
269
+
270
+ All four agents generate simultaneously inside the same batch. No sequential blocking.
271
+
272
+ ### Health endpoint
273
+
274
+ ```bash
275
+ curl http://127.0.0.1:8000/health
276
+ # {"status":"ok","model":"mlx-community/Qwen3.5-4B-OptiQ-4bit","active_sequences":2,"prefix_cache_files":5}
277
+ ```
278
+
279
+ `active_sequences` shows how many agents are generating right now; `prefix_cache_files` shows how many prefix KV snapshots are stored on disk.
280
+
281
+ ---
282
+
244
283
  ## Command Line
245
284
 
246
285
  ### `mlc`: local server + harness
@@ -248,20 +287,20 @@ Reliability comes from specialization plus constraint. A read-only reviewer can'
248
287
  Starts the MLX inference server and launches the built-in TUI harness against it.
249
288
 
250
289
  ```bash
251
- # Default: local server + default TUI
290
+ # Default: local server + default harness
252
291
  mlc
253
292
 
254
- # Use a simple terminal REPL instead of the TUI
255
- mlc --notui
293
+ # Continuous batching mode (default is sequential caching mode)
294
+ mlc --engine batch
295
+
296
+ # Server only, no harness
297
+ mlc --leash none
256
298
 
257
299
  # Use a different harness (routes traffic through the local server)
258
300
  mlc --leash claude
259
301
  mlc --leash gemini
260
302
  mlc --leash codex
261
303
 
262
- # Server only, no harness
263
- mlc --leash none
264
-
265
304
  # Specify a model
266
305
  mlc --model mlx-community/Qwen3.5-4B-OptiQ-4bit
267
306
 
@@ -312,7 +351,7 @@ mlc-run --api codex
312
351
  echo "explain lsp.py" | mlc-run -a deepseek | cat - PLAN.md | mlc-run --url http://localhost:9000
313
352
 
314
353
  # Simple terminal REPL (no TUI)
315
- mlc-run --notui
354
+ mlc-run --bare
316
355
  ```
317
356
 
318
357
  ---
@@ -437,18 +476,19 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
437
476
 
438
477
  | Command | Description |
439
478
  |---|---|
440
- | `/help` | Show command reference |
479
+ | `/branch [--rev N] [prompt]` | Open a new branch tab from the current (or earlier) checkpoint |
480
+ | `/diff [--all]` | Show a side-by-side diff of changes in the worktree |
441
481
  | `/clear [--config F]` | Clear conversation; `--config` reloads agent from a JSON/YAML file |
482
+ | `/tab [N]` | Jump to tab N |
442
483
  | `/history [--raw]` | Show conversation transcript; `--raw` shows the raw API message log |
443
- | `/diff [--all]` | Show a side-by-side diff of changes in the worktree |
444
- | `/errors` | Show timestamped error log for the current tab |
445
484
  | `/tools` | List active tools |
446
- | `/branch [--rev N] [prompt]` | Open a new branch tab from the current (or earlier) checkpoint |
447
485
  | `/abort` | Abort the running agent |
486
+ | `/errors` | Show timestamped error log for the current tab |
448
487
  | `/export [path]` | Export session to JSON |
449
488
  | `/exit [--all]` | Close branch tab, or exit the app |
450
- | `!command` | Run a shell command; output captured in the TUI |
451
- | `$command` | Run an interactive command (TUI suspends, terminal handed to process) |
489
+ | `/help` | Show command reference |
490
+ | `!command` | Run a shell command; output captured in the TUI (e.g., `ls`, `cat hello.c`) |
491
+ | `$command` | Run an interactive command (e.g., `vim`, `yazi`, `less hello.c`) |
452
492
 
453
493
  ### Key bindings
454
494
 
@@ -458,7 +498,7 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
458
498
  | `Ctrl-J` | Insert newline |
459
499
  | `Ctrl-1` … `Ctrl-9` | Jump to tab N |
460
500
  | `Ctrl-,` / `Ctrl-.` | Cycle through tabs |
461
- | `Ctrl-C` | Abort running agent |
501
+ | `Ctrl-C` | Clear input, or abort running agent |
462
502
  | `Ctrl-D` | Close branch tab, or exit app |
463
503
  | `Ctrl-R` | Recall last prompt into editor |
464
504
 
@@ -476,7 +516,7 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
476
516
  | `Skill` | Retrieve named skill instructions from config |
477
517
  | `Agent` | Spawn an autonomous sub-agent for delegated work |
478
518
 
479
- All file tools enforce path sandboxing the agent cannot read or write outside the worktree.
519
+ All file tools enforce path sandboxing. The agent cannot read or write outside the worktree.
480
520
 
481
521
  ### Backends
482
522
 
@@ -11,7 +11,7 @@ A Git-native coding agent that can run entirely on your Mac. No API keys, no clo
11
11
  ```
12
12
  Conversation tree (nodes = git commits with embedded chat history)
13
13
 
14
- main ──●──●──●──●──●──●──●──●──●──●
14
+ main ──●──●──●──●──●──●──●──●──●──●──●──●──●──●
15
15
  │ │
16
16
  │ └── branch-1 ──●──●──●
17
17
  │ │ ┌────────────┐
@@ -30,21 +30,21 @@ REPL tabs (each tab = a git branch + agent) │
30
30
  │ └──────┘ └────┬─────┘ └──────────┘ └────────────┘ │
31
31
  └─────────────────┼──────────────────────────────────────┘
32
32
 
33
- ├────────────────────────────────────► each tab is an independent Agent
33
+ ├─────────────────────────────────────────► Each tab is an independent Agent
34
34
 
35
- ┌────┴─────────────────────────────────┐
36
- │ Agent
37
- ┌──────────────┐ ┌──────────────┐
38
- │ │ API: │ │ Tools: │ │
39
- │ │ MLX (local) │ │ Read Write │ │
40
- │ │ Claude │ │ Edit Bash │ │
41
- │ │ Gemini │ │ Grep Find │ │
42
- │ │ OpenAI │ │ Ls Skill │ │
43
- └──────────────┘ │ Agent ───────┼──┼───► spawns child Agent
44
- └──────────────┘ │ (each with own tools + worktree + etc)
45
- │ Git worktree
46
- │ (isolation + session state)
47
- └──────────────────────────────────────┘
35
+ ┌────┴─────────────────────────────────────┐
36
+ │ Agent
37
+ ┌────────────────┐ ┌────────────────┐
38
+ │ │ API: │ │ Tools: │ │
39
+ │ │ Local (mlx-lm) │ │ Read Write │ │
40
+ │ │ Claude │ │ Edit Bash │ │
41
+ │ │ Gemini │ │ Grep Find │ │
42
+ │ │ OpenAI │ │ Ls Skill │ │
43
+ └────────────────┘ │ Agent ─────────┼──┼───► Spawns child Agent
44
+ └────────────────┘ │ (each with own tools + worktree + etc)
45
+ │ Git worktree
46
+ │ (isolation + session state)
47
+ └──────────────────────────────────────────┘
48
48
  ```
49
49
 
50
50
  Each layer is importable and composable on its own. A commit records state, a branch records an alternative path, and a tab is just a live view over an `Agent`.
@@ -68,9 +68,9 @@ uvx --from mlx-code mlc
68
68
  # or install into the current environment
69
69
  pip install mlx-code
70
70
 
71
- mlc # launch with local MLX model
71
+ # launch
72
+ mlc # with a local MLX model
72
73
  mlc-run --api gemini # or use a remote provider
73
- mlc-run --api deepseek --model deepseek-v4-flash
74
74
  ```
75
75
 
76
76
  That's it. The first run starts a local inference server and drops you into the REPL.
@@ -92,12 +92,12 @@ That's it. The first run starts a local inference server and drops you into the
92
92
 
93
93
  **Git is the database.** When the agent makes file changes, they’re committed to a git worktree with the full conversation embedded in the commit message. Resume any past session by hash, branch from any checkpoint, and inspect the agent timeline with `git log`. No proprietary state files, just Git.
94
94
 
95
- **Your working directory is never at risk.** The agent operates inside a `git worktree`, not your checkout. It can make a mess, and you can inspect or discard it without ever touching `main`.
96
-
97
- **Built-in safety nets.** Subprocess environment variables go through an explicit allowlist, so secrets in your shell are never leaked to agent-spawned processes.
95
+ **Built-in safety nets.** Your working directory is never at risk. The agent operates inside a `git worktree`, not your checkout. It can make a mess, and you can inspect or discard it without ever touching `main`. Subprocess environment variables go through an explicit allowlist, so secrets in your shell are never leaked to agent-spawned processes.
98
96
 
99
97
  **Batteries included.** Everything ships in one pip install: the MLX inference engine, the multi-protocol API server, the agent loop, the tools, and the TUI. No llama.cpp, no ollama, no vLLM bridge to find and configure. And the server natively speaks OpenAI, Anthropic, Gemini, and Codex wire formats simultaneously, so `claude`, `codex`, and `gemini` CLIs can all work against your local model without a translation layer.
100
98
 
99
+ **Continuous batching.** With `mlc --engine batch`, the local inference server runs a continuous batching engine that processes multiple sequences concurrently. When you spawn parallel agents (e.g., multiple tabs, `asyncio.gather` pipelines, or delegated sub-tasks), they all share the same GPU context and are stepped together on each tick. A prefix cache persists KV snapshots to disk, so repeated system prompts and conversation prefixes are prefilled once and reused across sessions. No request queueing, no waiting for the previous agent to finish.
100
+
101
101
  ---
102
102
 
103
103
  ## Agent primitive
@@ -135,12 +135,12 @@ agent.messages = messages
135
135
  await agent.run("now add unit tests")
136
136
  ```
137
137
 
138
- Branch from any point in the conversation each branch gets its own worktree:
138
+ Branch from any point in the conversation. Each branch gets its own worktree:
139
139
 
140
140
  ```
141
141
  /branch # branch from current state
142
142
  /branch --rev 2 # branch from the 2nd user turn
143
- /branch --rev 3 --as-worktree try different approach
143
+ /branch --rev 3 make it use httpx instead
144
144
  ```
145
145
 
146
146
  Since it's just git, you can inspect the timeline outside the REPL:
@@ -205,6 +205,43 @@ Reliability comes from specialization plus constraint. A read-only reviewer can'
205
205
 
206
206
  ---
207
207
 
208
+ ## Continuous batching
209
+
210
+ The local server can run multiple inference sequences concurrently inside a single batch step. Instead of a global lock that serialises one request at a time, the batching engine maintains a live set of active sequences and yields tokens for all of them on every step.
211
+
212
+ ```bash
213
+ mlc --engine batch # continuous batching + built-in REPL
214
+ ```
215
+
216
+ This unlocks true parallelism for multi-agent workloads:
217
+
218
+ ```python
219
+ import asyncio
220
+ from mlx_code.repl import Agent
221
+
222
+ async def main():
223
+ agents = [Agent() for _ in range(4)]
224
+ await asyncio.gather(*[
225
+ a.run(f"Research topic: {t}")
226
+ for a, t in zip(agents, ["consensus", "cryptography", "networking", "storage"])
227
+ ])
228
+
229
+ asyncio.run(main())
230
+ ```
231
+
232
+ All four agents generate simultaneously inside the same batch. No sequential blocking.
233
+
234
+ ### Health endpoint
235
+
236
+ ```bash
237
+ curl http://127.0.0.1:8000/health
238
+ # {"status":"ok","model":"mlx-community/Qwen3.5-4B-OptiQ-4bit","active_sequences":2,"prefix_cache_files":5}
239
+ ```
240
+
241
+ `active_sequences` shows how many agents are generating right now; `prefix_cache_files` shows how many prefix KV snapshots are stored on disk.
242
+
243
+ ---
244
+
208
245
  ## Command Line
209
246
 
210
247
  ### `mlc`: local server + harness
@@ -212,20 +249,20 @@ Reliability comes from specialization plus constraint. A read-only reviewer can'
212
249
  Starts the MLX inference server and launches the built-in TUI harness against it.
213
250
 
214
251
  ```bash
215
- # Default: local server + default TUI
252
+ # Default: local server + default harness
216
253
  mlc
217
254
 
218
- # Use a simple terminal REPL instead of the TUI
219
- mlc --notui
255
+ # Continuous batching mode (default is sequential caching mode)
256
+ mlc --engine batch
257
+
258
+ # Server only, no harness
259
+ mlc --leash none
220
260
 
221
261
  # Use a different harness (routes traffic through the local server)
222
262
  mlc --leash claude
223
263
  mlc --leash gemini
224
264
  mlc --leash codex
225
265
 
226
- # Server only, no harness
227
- mlc --leash none
228
-
229
266
  # Specify a model
230
267
  mlc --model mlx-community/Qwen3.5-4B-OptiQ-4bit
231
268
 
@@ -276,7 +313,7 @@ mlc-run --api codex
276
313
  echo "explain lsp.py" | mlc-run -a deepseek | cat - PLAN.md | mlc-run --url http://localhost:9000
277
314
 
278
315
  # Simple terminal REPL (no TUI)
279
- mlc-run --notui
316
+ mlc-run --bare
280
317
  ```
281
318
 
282
319
  ---
@@ -401,18 +438,19 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
401
438
 
402
439
  | Command | Description |
403
440
  |---|---|
404
- | `/help` | Show command reference |
441
+ | `/branch [--rev N] [prompt]` | Open a new branch tab from the current (or earlier) checkpoint |
442
+ | `/diff [--all]` | Show a side-by-side diff of changes in the worktree |
405
443
  | `/clear [--config F]` | Clear conversation; `--config` reloads agent from a JSON/YAML file |
444
+ | `/tab [N]` | Jump to tab N |
406
445
  | `/history [--raw]` | Show conversation transcript; `--raw` shows the raw API message log |
407
- | `/diff [--all]` | Show a side-by-side diff of changes in the worktree |
408
- | `/errors` | Show timestamped error log for the current tab |
409
446
  | `/tools` | List active tools |
410
- | `/branch [--rev N] [prompt]` | Open a new branch tab from the current (or earlier) checkpoint |
411
447
  | `/abort` | Abort the running agent |
448
+ | `/errors` | Show timestamped error log for the current tab |
412
449
  | `/export [path]` | Export session to JSON |
413
450
  | `/exit [--all]` | Close branch tab, or exit the app |
414
- | `!command` | Run a shell command; output captured in the TUI |
415
- | `$command` | Run an interactive command (TUI suspends, terminal handed to process) |
451
+ | `/help` | Show command reference |
452
+ | `!command` | Run a shell command; output captured in the TUI (e.g., `ls`, `cat hello.c`) |
453
+ | `$command` | Run an interactive command (e.g., `vim`, `yazi`, `less hello.c`) |
416
454
 
417
455
  ### Key bindings
418
456
 
@@ -422,7 +460,7 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
422
460
  | `Ctrl-J` | Insert newline |
423
461
  | `Ctrl-1` … `Ctrl-9` | Jump to tab N |
424
462
  | `Ctrl-,` / `Ctrl-.` | Cycle through tabs |
425
- | `Ctrl-C` | Abort running agent |
463
+ | `Ctrl-C` | Clear input, or abort running agent |
426
464
  | `Ctrl-D` | Close branch tab, or exit app |
427
465
  | `Ctrl-R` | Recall last prompt into editor |
428
466
 
@@ -440,7 +478,7 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
440
478
  | `Skill` | Retrieve named skill instructions from config |
441
479
  | `Agent` | Spawn an autonomous sub-agent for delegated work |
442
480
 
443
- All file tools enforce path sandboxing the agent cannot read or write outside the worktree.
481
+ All file tools enforce path sandboxing. The agent cannot read or write outside the worktree.
444
482
 
445
483
  ### Backends
446
484
 
@@ -110,6 +110,7 @@ class SimpleRepl:
110
110
  if out_text:
111
111
  self._write_delta(prefix + out_text, 'tool_result')
112
112
  self._last_stream_type = t
113
+ print()
113
114
  elif t == 'commit':
114
115
  self._pending_nls = 0
115
116
  self._awaiting_content = False
@@ -0,0 +1,299 @@
1
+ import asyncio
2
+ import json
3
+ import queue as _queue
4
+ import time
5
+ import uuid
6
+ import threading
7
+ import hashlib
8
+ from array import array
9
+ from contextlib import asynccontextmanager
10
+ from pathlib import Path
11
+ import mlx.core as mx
12
+ from starlette.applications import Starlette
13
+ from starlette.requests import Request
14
+ from starlette.responses import StreamingResponse, JSONResponse
15
+ from starlette.routing import Route
16
+ import logging
17
+ logger = logging.getLogger(__name__)
18
+ MIN_PREFIX_TOKENS = 256
19
+
20
def _hash_tokens(tokens):
    """Return a short hex digest identifying a sequence of token ids.

    The ids are packed into an unsigned-int array (typecode ``'I'``) and the
    raw bytes are hashed with BLAKE2b using an 8-byte digest, so the result
    is always a 16-character hexadecimal string.
    """
    packed = array('I', tokens).tobytes()
    digest = hashlib.blake2b(packed, digest_size=8)
    return digest.hexdigest()
23
+
24
class PrefixCache:
    """On-disk store of prompt-prefix KV caches for one model.

    Snapshots are written with mlx_lm's ``save_prompt_cache`` and read back
    with ``load_prompt_cache``. Each file is named after the sanitised model
    name, the prefix length and a hash of the prefix tokens. Prefixes that
    are empty or shorter than ``MIN_PREFIX_TOKENS`` are never stored or
    looked up.
    """

    def __init__(self, model_name, cache_dir):
        """Remember ``model_name`` and create ``cache_dir`` if it is missing."""
        self.model_name = model_name
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _cacheable(prefix_tokens):
        """Return True when the prefix is non-empty and long enough to cache."""
        return bool(prefix_tokens) and len(prefix_tokens) >= MIN_PREFIX_TOKENS

    def _path(self, prefix_tokens):
        """Return the snapshot path for ``prefix_tokens``."""
        # Only alphanumeric characters of the model name are kept so that the
        # name is safe to embed in a file name.
        safe = ''.join((c for c in self.model_name if c.isalnum()))
        h = _hash_tokens(prefix_tokens)
        return self.cache_dir / f'{safe}_{len(prefix_tokens)}_{h}.safetensors'

    def lookup(self, prefix_tokens):
        """Load the stored cache for ``prefix_tokens``.

        Returns:
            The loaded prompt cache, or ``None`` when the prefix is too
            short, no snapshot exists, or the snapshot cannot be loaded.
        """
        if not self._cacheable(prefix_tokens):
            return None
        path = self._path(prefix_tokens)
        if not path.exists():
            return None
        try:
            from mlx_lm.models.cache import load_prompt_cache
        except ImportError as exc:
            # The loader itself is unavailable; the snapshot may still be fine,
            # so leave it on disk.
            logger.info(f'[batch] failed to load prefix cache {path.name}: {exc}')
            return None
        try:
            cache, _ = load_prompt_cache(str(path), return_metadata=True)
            # NOTE(review): this passes the cache objects themselves, whereas
            # _prefill_prefix evaluates ``[c.state for c in cache]`` - confirm
            # that this call actually schedules the loaded arrays.
            mx.async_eval(cache)
        except Exception as exc:
            logger.info(f'[batch] failed to load prefix cache {path.name}: {exc}')
            # store() refuses to overwrite an existing file, so an unreadable
            # snapshot would otherwise make this prefix miss forever. Remove
            # it so the next store() can write a fresh one.
            try:
                path.unlink(missing_ok=True)
            except OSError as unlink_exc:
                logger.info(f'[batch] could not remove unreadable prefix cache {path.name}: {unlink_exc}')
            return None
        return cache

    def store(self, prefix_tokens, kv_cache):
        """Persist ``kv_cache`` for ``prefix_tokens`` unless it already exists.

        The snapshot is written to a temporary file in the same directory and
        then renamed into place, so an interrupted write never leaves a
        partial file under the final name. Failures are logged, not raised.
        """
        if not self._cacheable(prefix_tokens):
            return
        path = self._path(prefix_tokens)
        if path.exists():
            return
        # The temporary name keeps the ``.safetensors`` suffix - presumably the
        # saver appends that suffix when it is missing (TODO confirm), which
        # would otherwise make the written file differ from ``tmp_path``.
        tmp_path = path.with_name(f'{path.stem}.{uuid.uuid4().hex}.tmp.safetensors')
        try:
            from mlx_lm.models.cache import save_prompt_cache
            save_prompt_cache(str(tmp_path), kv_cache)
            tmp_path.replace(path)
            logger.info(f'[batch] saved prefix cache len={len(prefix_tokens)} file={path.name}')
        except Exception as exc:
            logger.info(f'[batch] failed to save prefix cache: {exc}')
            # Best effort: do not leave a half-written temporary file behind.
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                pass
63
+
64
def _prefill_prefix(model, tokens, prefill_step_size=2048):
    """Feed ``tokens`` through ``model`` in chunks and return the filled cache.

    Args:
        model: Model passed to mlx_lm's ``make_prompt_cache`` and called as
            ``model(chunk[None], cache=prompt_cache)``.
        tokens: Token ids of the prefix to prefill.
        prefill_step_size: Maximum number of tokens fed per forward pass.

    Returns:
        The prompt cache created by ``make_prompt_cache`` after every token
        has been processed.

    Raises:
        ValueError: If ``prefill_step_size`` is smaller than 1. Such a step
            never shrinks the remaining prompt, so the loop would not end.
    """
    if prefill_step_size < 1:
        raise ValueError(f'prefill_step_size must be >= 1, got {prefill_step_size}')
    from mlx_lm.models.cache import make_prompt_cache
    prompt_cache = make_prompt_cache(model)
    prompt = mx.array(tokens)
    while prompt.shape[0] > 0:
        n = min(prefill_step_size, prompt.shape[0])
        # [None] adds a leading axis in front of the token chunk.
        model(prompt[:n][None], cache=prompt_cache)
        # Force evaluation of the cache state before slicing off the chunk.
        mx.eval([c.state for c in prompt_cache])
        prompt = prompt[n:]
        # NOTE(review): placed inside the loop (once per chunk); the original
        # indentation was lost, so confirm it was not meant to run once after
        # the loop.
        mx.clear_cache()
    return prompt_cache
75
+
76
def _get_prefix(tokens, ckpts):
    """Split off the cacheable prefix of ``tokens`` at the earliest checkpoint.

    Returns:
        ``(prefix_tokens, prefix_len)`` where ``prefix_len`` is the smallest
        value in ``ckpts`` and ``prefix_tokens`` is ``tokens[:prefix_len]``.
        ``(None, 0)`` is returned when ``ckpts`` is empty or the earliest
        checkpoint lies below ``MIN_PREFIX_TOKENS``.
    """
    if ckpts:
        cut = min(ckpts)
        if not cut < MIN_PREFIX_TOKENS:
            return (tokens[:cut], cut)
    return (None, 0)
83
+
84
+ def make_batch_app(model_name: str, cache_dir: str='.cache'):
85
+ state = {'model': None, 'tokenizer': None, 'batch_gen': None, 'request_queue': _queue.Queue(), 'active': {}, 'loop': None, 'prefix_cache': None}
86
+
87
+ def _engine():
88
+ rq = state['request_queue']
89
+ active = state['active']
90
+ bg = state['batch_gen']
91
+ tok = state['tokenizer']
92
+ loop = state['loop']
93
+ model = state['model']
94
+ pcache = state['prefix_cache']
95
+ while True:
96
+ while not rq.empty():
97
+ try:
98
+ tokens, max_tokens, token_queue, ckpts = rq.get_nowait()
99
+ _insert(bg, active, pcache, model, tok, loop, tokens, max_tokens, token_queue, ckpts)
100
+ except _queue.Empty:
101
+ break
102
+ if not active:
103
+ tokens, max_tokens, token_queue, ckpts = rq.get()
104
+ _insert(bg, active, pcache, model, tok, loop, tokens, max_tokens, token_queue, ckpts)
105
+ try:
106
+ results = bg.next_generated()
107
+ except Exception:
108
+ for uid, meta in list(active.items()):
109
+ loop.call_soon_threadsafe(meta['q'].put_nowait, None)
110
+ active.clear()
111
+ continue
112
+ for r in results:
113
+ meta = active.get(r.uid)
114
+ if meta is None:
115
+ continue
116
+ detok = meta['detok']
117
+ detok.add_token(r.token)
118
+ seg = detok.last_segment
119
+ if r.finish_reason is not None:
120
+ detok.finalize()
121
+ if (final := detok.last_segment):
122
+ loop.call_soon_threadsafe(meta['q'].put_nowait, final)
123
+ loop.call_soon_threadsafe(meta['q'].put_nowait, None)
124
+ del active[r.uid]
125
+ elif seg:
126
+ loop.call_soon_threadsafe(meta['q'].put_nowait, seg)
127
+
128
+ def _insert(bg, active, pcache, model, tok, loop, tokens, max_tokens, token_queue, ckpts):
129
+ prefix_tokens, prefix_len = _get_prefix(tokens, ckpts)
130
+ if prefix_tokens is not None:
131
+ cached_kv = pcache.lookup(prefix_tokens)
132
+ if cached_kv is not None:
133
+ suffix = tokens[prefix_len:]
134
+ try:
135
+ uids = bg.insert([suffix], [max_tokens], caches=[cached_kv])
136
+ except Exception as exc:
137
+ logger.info(f'[batch] cache insert failed ({exc}), falling back to full prompt')
138
+ uids = bg.insert([tokens], [max_tokens])
139
+ prefix_len = 0
140
+ else:
141
+ logger.info(f'[batch] cache HIT prefix={prefix_len} suffix={len(suffix)}')
142
+ del cached_kv
143
+ mx.clear_cache()
144
+ else:
145
+ logger.info(f'[batch] prefilling prefix prefix={prefix_len} suffix={len(tokens) - prefix_len}')
146
+ prefix_kv = _prefill_prefix(model, prefix_tokens)
147
+ pcache.store(prefix_tokens, prefix_kv)
148
+ suffix = tokens[prefix_len:]
149
+ try:
150
+ uids = bg.insert([suffix], [max_tokens], caches=[prefix_kv])
151
+ except Exception as exc:
152
+ logger.info(f'[batch] cache insert failed ({exc}), falling back to full prompt')
153
+ uids = bg.insert([tokens], [max_tokens])
154
+ prefix_len = 0
155
+ del prefix_kv
156
+ mx.clear_cache()
157
+ active[uids[0]] = {'q': token_queue, 'detok': tok.detokenizer}
158
+ else:
159
+ uids = bg.insert([tokens], [max_tokens])
160
+ logger.info(f'[batch] no cache prompt={len(tokens)}')
161
+ active[uids[0]] = {'q': token_queue, 'detok': tok.detokenizer}
162
+
163
+ @asynccontextmanager
164
+ async def lifespan(_app):
165
+ from mlx_lm import load
166
+ from mlx_lm.generate import BatchGenerator
167
+ from mlx_lm.tokenizer_utils import TokenizerWrapper
168
+ logger.info(f'[batch] Loading model {model_name!r} …')
169
+ model, tokenizer = load(model_name)
170
+ if not isinstance(tokenizer, TokenizerWrapper):
171
+ tokenizer = TokenizerWrapper(tokenizer)
172
+ eos = set(tokenizer.eos_token_ids) | {tokenizer.eos_token_id}
173
+ stop_tokens = [[t] for t in eos]
174
+ batch_gen = BatchGenerator(model, stop_tokens=stop_tokens)
175
+ state.update(model=model, tokenizer=tokenizer, batch_gen=batch_gen, loop=asyncio.get_running_loop(), prefix_cache=PrefixCache(model_name, cache_dir))
176
+ logger.info('[batch] Model ready. Starting engine thread.')
177
+ threading.Thread(target=_engine, daemon=True).start()
178
+ yield
179
+ batch_gen.close()
180
+
181
def _detect_api(path: str) -> str:
    """Map a request path to the name of the API wire format it belongs to.

    Args:
        path: URL path of the incoming request.

    Returns:
        ``'gemini'``, ``'claude'`` or ``'codex'`` when the path starts with
        the matching prefix, otherwise ``'noapi'``.
    """
    # This is a plain function that is called directly, so it must not be
    # wrapped in ``staticmethod``: before Python 3.10 a staticmethod object
    # is not callable outside a class and the call raises TypeError.
    prefixes = (
        ('/v1beta/models/', 'gemini'),
        ('/v1/messages', 'claude'),
        ('/v1/responses', 'codex'),
    )
    for prefix, api in prefixes:
        if path.startswith(prefix):
            return api
    return 'noapi'
190
+
191
async def _stream_sse(token_queue, api, msg_id, in_tokens):
    """Yield adapter-formatted stream chunks for one generation.

    Text pieces are read from ``token_queue`` until a ``None`` sentinel
    arrives. Each piece is split on ``<think>`` / ``</think>`` so that the
    adapter receives it labelled as either ``'thinking'`` or ``'text'``; the
    stream starts in the ``'thinking'`` state. After the sentinel, the
    accumulated output is scanned for tool calls.

    Args:
        token_queue: Awaitable queue of text pieces, terminated by ``None``.
        api: Dialect name used to pick the adapter class; unknown names fall
            back to the default adapter.
        msg_id: Message identifier passed to the adapter.
        in_tokens: Prompt token count passed to the adapter.

    Yields:
        Whatever the adapter's ``start``, ``text``, ``tool`` and ``end``
        methods return.
    """
    from . import main as _m

    adapters = {'claude': _m.ClaudeAdapter, 'codex': _m.CodexAdapter, 'gemini': _m.GeminiAdapter, 'noapi': _m.DefaultAdapter}
    adapter = adapters.get(api, _m.DefaultAdapter)(msg_id, in_tokens)
    yield adapter.start()
    open_tag, close_tag = '<think>', '</think>'
    st = 'thinking'
    pieces = []
    while True:
        text = await token_queue.get()
        if text is None:
            break
        pieces.append(text)
        seg = text
        while True:
            # Only the tag that ends the current state is meaningful. A tag
            # that does not apply (e.g. '<think>' while already thinking) is
            # left in the text instead of being retried forever, which would
            # block the event loop because this inner loop never awaits.
            tag = close_tag if st == 'thinking' else open_tag
            before, found, rest = seg.partition(tag)
            if not found:
                break
            if before:
                yield adapter.text(st, before)
            st = 'text' if st == 'thinking' else 'thinking'
            seg = rest
        if seg:
            yield adapter.text(st, seg)
    if (tools := _m._parse_tools_xml(''.join(pieces))):
        for tool in tools:
            yield adapter.tool(tool)
        yield adapter.end(True)
    else:
        yield adapter.end(False)
224
+
225
async def generate_endpoint(request: Request):
    """Queue a generation request and stream the result as server-sent events.

    Responses:
        503 when no batch generator has been loaded yet.
        400 when the body is not a JSON object, ``max_tokens`` is not an
            integer, or the encoded prompt is empty.
        500 when encoding the request raises.
        Otherwise a ``text/event-stream`` response fed from the request's
        token queue.
    """
    from . import main as _m

    if state['batch_gen'] is None:
        return JSONResponse({'error': 'model not loaded'}, status_code=503)
    path = request.url.path.split('?')[0].rstrip('/')
    api = _detect_api(path)
    if api == 'gemini':
        q = str(request.url.query) or ''
        # Gemini calls that are not streaming get a fixed canned reply and
        # never reach the model.
        if 'alt=sse' not in q and 'streamGenerateContent' not in path:
            return JSONResponse({'candidates': [{'content': {'role': 'model', 'parts': [{'text': '{"complexity_reasoning":"local","complexity_score":50}'}]}, 'finishReason': 'STOP'}], 'usageMetadata': {'promptTokenCount': 0, 'candidatesTokenCount': 0}})
    try:
        body = await request.json()
    except ValueError as exc:
        # JSON decoding and body decoding errors are ValueError subclasses.
        return JSONResponse({'error': f'invalid JSON body: {exc}'}, status_code=400)
    if not isinstance(body, dict):
        return JSONResponse({'error': 'request body must be a JSON object'}, status_code=400)
    try:
        max_tokens = int(body.get('max_tokens', body.get('max_completion_tokens', 8192)))
    except (TypeError, ValueError):
        return JSONResponse({'error': 'max_tokens must be an integer'}, status_code=400)
    try:
        prompt, ckpts = _m.encode(body, api, state['tokenizer'], None, None, None)
    except Exception as exc:
        return JSONResponse({'error': f'encode: {exc}'}, status_code=500)
    if ckpts is None or not prompt:
        return JSONResponse({'error': 'empty prompt'}, status_code=400)
    msg_id = f'msg_{uuid.uuid4().hex}'
    token_queue = asyncio.Queue()
    state['request_queue'].put((prompt, max_tokens, token_queue, ckpts))
    return StreamingResponse(_stream_sse(token_queue, api, msg_id, len(prompt)), media_type='text/event-stream')
251
+
252
async def simple_generate(request: Request):
    """Generate from a raw ``prompt`` or a chat ``messages`` list.

    The JSON body may contain ``messages`` (rendered through the tokenizer's
    chat template) or ``prompt``, plus optional ``max_tokens`` (default 256)
    and ``stream`` (default true).

    Responses:
        503 when no batch generator has been loaded yet.
        400 when the body is not a JSON object, ``max_tokens`` is not an
            integer, or the prompt encodes to no tokens.
        A ``text/plain`` stream when ``stream`` is truthy, otherwise a JSON
        object ``{'text': ...}`` holding the full output.
    """
    if state['batch_gen'] is None:
        return JSONResponse({'error': 'model not loaded'}, status_code=503)
    try:
        body = await request.json()
    except ValueError as exc:
        # JSON decoding and body decoding errors are ValueError subclasses.
        return JSONResponse({'error': f'invalid JSON body: {exc}'}, status_code=400)
    if not isinstance(body, dict):
        return JSONResponse({'error': 'request body must be a JSON object'}, status_code=400)
    try:
        # Coerced to int, matching what generate_endpoint puts on the queue.
        max_tokens = int(body.get('max_tokens', 256))
    except (TypeError, ValueError):
        return JSONResponse({'error': 'max_tokens must be an integer'}, status_code=400)
    tok = state['tokenizer']
    if 'messages' in body:
        text = tok.apply_chat_template(body['messages'], tokenize=False, add_generation_prompt=True)
    else:
        text = body.get('prompt', '')
    tokens = tok.encode(text)
    if not tokens:
        return JSONResponse({'error': 'empty prompt'}, status_code=400)
    token_queue = asyncio.Queue()
    state['request_queue'].put((tokens, max_tokens, token_queue, []))

    async def _chunks():
        # Drain the queue until the None sentinel marks the end of output.
        while (chunk := await token_queue.get()) is not None:
            yield chunk

    if body.get('stream', True):
        return StreamingResponse(_chunks(), media_type='text/plain')
    return JSONResponse({'text': ''.join([chunk async for chunk in _chunks()])})
283
+
284
async def list_models(_req):
    """Return a model listing with a single entry whose id is ``local``."""
    entry = {
        'id': 'local',
        'object': 'model',
        'created': int(time.time()),
        'owned_by': 'local',
    }
    return JSONResponse({'data': [entry]})
286
+
287
async def count_tokens(_req):
    """Return a fixed ``input_tokens`` count of 0; the request is not read."""
    payload = {'input_tokens': 0}
    return JSONResponse(payload)
289
+
290
async def health(_req):
    """Report server status, active sequence count and cached prefix files."""
    cache = state['prefix_cache']
    snapshots = 0
    if cache and cache.cache_dir.exists():
        # Count the *.safetensors files directly inside the cache directory.
        snapshots = len(list(cache.cache_dir.glob('*.safetensors')))
    report = {
        'status': 'ok',
        'model': model_name,
        'active_sequences': len(state['active']),
        'prefix_cache_files': snapshots,
    }
    return JSONResponse(report)
296
+ return Starlette(routes=[Route('/v1/models', list_models, methods=['GET']), Route('/v1/messages/count_tokens', count_tokens, methods=['POST']), Route('/v1/chat/completions', generate_endpoint, methods=['POST']), Route('/v1/messages', generate_endpoint, methods=['POST']), Route('/v1/responses', generate_endpoint, methods=['POST']), Route('/v1beta/models/{rest:path}', generate_endpoint, methods=['POST']), Route('/generate', simple_generate, methods=['POST']), Route('/health', health, methods=['GET'])], lifespan=lifespan)
297
if __name__ == '__main__':
    # Standalone entry point: serve the default model on port 8000,
    # listening on all interfaces.
    import uvicorn

    _app = make_batch_app('mlx-community/Qwen3.5-4B-OptiQ-4bit')
    uvicorn.run(_app, host='0.0.0.0', port=8000)
@@ -871,13 +871,13 @@ def make_handler(model_name, cache_dir, system, names, skips, gwt=None, parse_th
871
871
  raise
872
872
  return Handler
873
873
 
874
- def serve(host: str, port: int, model: str, cache: str, system: str | None, tools: list[str], skips: list[str], *, fixed_port: bool=False, gwt=None) -> tuple[HTTPServer, str]:
874
+ def _serve_cache(host, port, model, cache, system, tools, skips, *, fixed_port=False, gwt=None):
875
875
  handler = make_handler(model, cache, system, tools, skips, gwt)
876
876
  while True:
877
877
  try:
878
878
  server = HTTPServer((host, port), handler)
879
879
  url = f'http://{host}:{port}'
880
- logger.debug(f'Server bound to {url}')
880
+ logger.debug(f'Cache server bound to {url}')
881
881
  return (server, url)
882
882
  except OSError as e:
883
883
  if e.errno in (48, 98):
@@ -888,12 +888,52 @@ def serve(host: str, port: int, model: str, cache: str, system: str | None, tool
888
888
  else:
889
889
  raise
890
890
 
891
def _serve_batch(host, port, model, cache_dir='.cache', *, fixed_port=False):
    """Start the batch server on a daemon thread and wait until it accepts connections.

    Args:
        host: Interface to bind.
        port: First port to try.
        model: Model name or path passed to ``make_batch_app``.
        cache_dir: Cache directory passed to ``make_batch_app``.
        fixed_port: When true, exit instead of trying the next port if
            ``port`` is already in use.

    Returns:
        A ``(uv_server, url)`` tuple: the ``uvicorn.Server`` running on the
        background thread and its base URL.

    Exits the process with status 1 when a fixed port is taken, or when the
    server thread stops before it starts accepting connections.
    """
    import socket
    import time

    import uvicorn

    from .bats import make_batch_app

    app = make_batch_app(model, cache_dir=cache_dir)
    # Probe for a free port by binding a throwaway socket.
    while True:
        try:
            with socket.socket() as s:
                s.bind((host, port))
        except OSError as e:
            # 48 / 98 are handled as "address already in use"; anything
            # else is unexpected and propagates.
            if e.errno not in (48, 98):
                raise
            if fixed_port:
                logger.error(f'Port {port} is already in use.')
                sys.exit(1)
            port += 1
        else:
            break
    config = uvicorn.Config(app, host=host, port=port, loop='asyncio', log_level='warning')
    uv_server = uvicorn.Server(config)
    t = threading.Thread(target=uv_server.run, daemon=True)
    t.start()
    # A monotonic clock keeps the elapsed-time check immune to wall-clock jumps.
    start_time = time.monotonic()
    notified = False
    while True:
        try:
            with socket.create_connection((host, port), timeout=0.1):
                break
        except OSError:
            # If the server thread has already finished it will never start
            # listening, so stop instead of polling forever.
            if not t.is_alive():
                logger.error('Batch server exited before accepting connections.')
                sys.exit(1)
            if not notified and time.monotonic() - start_time > 3.0:
                logger.info('Waiting for batch server to start (model may be downloading)...')
                notified = True
            time.sleep(0.2)
    url = f'http://{host}:{port}'
    logger.debug(f'Batch server bound to {url}')
    return (uv_server, url)
929
+
891
930
  def main():
892
931
  parser = argparse.ArgumentParser(description='mlx-code MAIN')
893
932
  parser.add_argument('-p', '--prompt', default=None, help='Initial prompt sent automatically when the REPL starts')
894
933
  parser.add_argument('-r', '--resume', default=None, metavar='COMMIT', help='Resume a previous session from the given git commit hash')
895
934
  parser.add_argument('-m', '--model', default='mlx-community/Qwen3.5-4B-OptiQ-4bit', help='MLX model path or HuggingFace repo ID (default: Qwen3.5-4B-OptiQ-4bit)')
896
935
  parser.add_argument('-l', '--leash', choices=['claude', 'codex', 'gemini', 'noapi', 'none'], default='noapi', help="AI harness to launch against the server; 'noapi' starts the built-in REPL, 'none' runs the server only")
936
+ parser.add_argument('--engine', choices=['cache', 'batch'], default='cache', help="'cache' uses PromptCache + single-sequence (default); 'batch' uses BatchGenerator for concurrent sequences (only compatible with --leash none or noapi)")
897
937
  parser.add_argument('--skill', default=None, help='Directory to scan recursively for SKILL.md files')
898
938
  parser.add_argument('--tools', nargs='+', default=None, help='Whitelist of tool names to enable; allows all tools when omitted')
899
939
  parser.add_argument('--system', type=str, default=None, help='System prompt override passed to the model')
@@ -903,10 +943,14 @@ def main():
903
943
  parser.add_argument('--port', type=int, default=None, help='Port to listen on; auto-increments if already in use (default: 8000)')
904
944
  parser.add_argument('--skips', nargs='+', default=['(?m)^\\[SUGGESTION MODE[\\s\\S]*', '(?m)^<system-reminder>[\\s\\S]*?^</system-reminder>\\s*'], help='Regex patterns stripped from model output before it is returned to the client')
905
945
  parser.add_argument('--stream', default=None, help='File to stream log into')
906
- parser.add_argument('--notui', action='store_true', help='Use simple terminal REPL instead of TUI')
946
+ parser.add_argument('--bare', action='store_true', help='Use simple terminal REPL instead of TUI')
907
947
  args, leash_args = parser.parse_known_args()
908
948
  logger.debug(f'args={args!r} leash_args={leash_args!r}')
949
+ if args.engine == 'batch' and args.leash not in ('none', 'noapi'):
950
+ parser.error('--engine batch only supports --leash none or --leash noapi for now')
909
951
  cache = os.path.abspath(args.cache)
952
+ port = args.port if args.port is not None else 8000
953
+ fixed_port = args.port is not None
910
954
  with tempfile.TemporaryDirectory(dir='/tmp') as _home:
911
955
  env = os.environ.copy()
912
956
  home = Path(_home)
@@ -915,18 +959,28 @@ def main():
915
959
  env['HOME'] = _home
916
960
  env['SHELL'] = '/bin/bash'
917
961
  env['PWD'] = cwd
918
- server, url = serve(host=args.host, port=args.port if args.port is not None else 8000, model=args.model, cache=cache, system=None if args.leash in ('none', 'noapi') else args.system, tools=args.tools, skips=args.skips, fixed_port=args.port is not None, gwt=gwt)
962
+ if args.engine == 'batch':
963
+ server, url = _serve_batch(args.host, port, args.model, cache_dir=cache, fixed_port=fixed_port)
964
+ else:
965
+ server, url = _serve_cache(host=args.host, port=port, model=args.model, cache=cache, system=None if args.leash in ('none', 'noapi') else args.system, tools=args.tools, skips=args.skips, fixed_port=fixed_port, gwt=gwt)
919
966
  if args.leash == 'none':
920
- try:
921
- server.serve_forever()
922
- except KeyboardInterrupt:
923
- print('\nShutting down server...')
924
- server.server_close()
967
+ if args.engine == 'batch':
968
+ try:
969
+ threading.Event().wait()
970
+ except KeyboardInterrupt:
971
+ print('\nShutting down server...')
972
+ else:
973
+ try:
974
+ server.serve_forever()
975
+ except KeyboardInterrupt:
976
+ print('\nShutting down server...')
977
+ server.server_close()
925
978
  else:
926
- threading.Thread(target=server.serve_forever, daemon=True).start()
979
+ if args.engine == 'cache':
980
+ threading.Thread(target=server.serve_forever, daemon=True).start()
927
981
  if args.leash == 'noapi':
928
982
  from .repl import run_repl
929
- run_repl(base_url=url, api=args.leash, repo=cwd, env=env, system=args.system, tool_names=args.tools, sdir=args.skill, init_prompt=args.prompt, resume=args.resume, stream=args.stream, notui=args.notui)
983
+ run_repl(base_url=url, api=args.leash, repo=cwd, env=env, system=args.system, tool_names=args.tools, sdir=args.skill, init_prompt=args.prompt, resume=args.resume, stream=args.stream, bare=args.bare)
930
984
  else:
931
985
  env['GOOGLE_GEMINI_BASE_URL'] = url
932
986
  env['GEMINI_API_KEY'] = 'mc'
@@ -980,10 +980,10 @@ async def _stream_to_stdout(agent: Agent, user_input: str) -> None:
980
980
  if text:
981
981
  print(text)
982
982
 
983
- async def repl(agent, init_prompt=None, notui=False):
983
+ async def repl(agent, init_prompt=None, bare=False):
984
984
  is_tty = sys.stdin.isatty() and sys.stdout.isatty()
985
- if notui and is_tty:
986
- from .ntui import SimpleRepl
985
+ if bare and is_tty:
986
+ from .bare import SimpleRepl
987
987
  sr = SimpleRepl(agent, init_prompt=init_prompt)
988
988
  await sr.run()
989
989
  return None
@@ -1025,7 +1025,7 @@ _AGENT_ENV_ALLOWLIST: re.Pattern = re.compile('\n ^(\n # ── Execution
1025
1025
  def _make_agent_env(base: dict[str, str]) -> dict[str, str]:
1026
1026
  return {k: v for k, v in base.items() if _AGENT_ENV_ALLOWLIST.match(k)}
1027
1027
 
1028
- def run_repl(*, base_url=None, model=None, api: Literal['claude', 'codex', 'gemini', 'deepseek', 'noapi']='noapi', system='', sdir=None, skills=None, env=None, tool_names=None, extra_tool_classes=None, api_key=None, gwt=None, ctx=None, init_prompt=None, resume_messages=None, repo=None, resume=None, stream=None, verbose_transcript=False, notui=False):
1028
+ def run_repl(*, base_url=None, model=None, api: Literal['claude', 'codex', 'gemini', 'deepseek', 'noapi']='noapi', system='', sdir=None, skills=None, env=None, tool_names=None, extra_tool_classes=None, api_key=None, gwt=None, ctx=None, init_prompt=None, resume_messages=None, repo=None, resume=None, stream=None, verbose_transcript=False, bare=False):
1029
1029
  repo = os.path.abspath(repo or os.getcwd())
1030
1030
  with tempfile.TemporaryDirectory(dir=tempfile.gettempdir()) as _home:
1031
1031
  if gwt is None:
@@ -1064,7 +1064,7 @@ def run_repl(*, base_url=None, model=None, api: Literal['claude', 'codex', 'gemi
1064
1064
  print(f'[resumed {len(resume_messages)} messages from checkpoint]')
1065
1065
  app_instance = None
1066
1066
  try:
1067
- app_instance = asyncio.run(repl(agent, init_prompt=init_prompt, notui=notui))
1067
+ app_instance = asyncio.run(repl(agent, init_prompt=init_prompt, bare=bare))
1068
1068
  finally:
1069
1069
  if log_fp:
1070
1070
  log_fp.close()
@@ -1103,7 +1103,7 @@ def main():
1103
1103
  parser.add_argument('--key', default=None, help='API key')
1104
1104
  parser.add_argument('--stream', default=None, help='File to stream log into')
1105
1105
  parser.add_argument('--verbose-transcript', action='store_true', help='Reserved; not yet implemented')
1106
- parser.add_argument('--notui', action='store_true', help='Use simple terminal REPL instead of TUI')
1106
+ parser.add_argument('--bare', action='store_true', help='Use simple terminal REPL instead of TUI')
1107
1107
  args = parser.parse_args()
1108
1108
  logger.debug(args)
1109
1109
  url, model, tool_names, api_key = (args.url, args.model, args.tools, args.key)
@@ -1117,6 +1117,6 @@ def main():
1117
1117
  url = 'https://generativelanguage.googleapis.com' if api_key else url
1118
1118
  model = 'gemini-3.1-flash-lite' if model is None else model
1119
1119
  tool_names = [] if tool_names is None else tool_names
1120
- run_repl(api=args.api, system=args.system, repo=args.cwd, model=model, base_url=url, tool_names=tool_names, sdir=args.skill, api_key=api_key, init_prompt=args.prompt, resume=args.resume, stream=args.stream, notui=args.notui)
1120
+ run_repl(api=args.api, system=args.system, repo=args.cwd, model=model, base_url=url, tool_names=tool_names, sdir=args.skill, api_key=api_key, init_prompt=args.prompt, resume=args.resume, stream=args.stream, bare=args.bare)
1121
1121
  if __name__ == '__main__':
1122
1122
  main()
@@ -597,7 +597,7 @@ def tui(stdscr, entries, log_file, initial_filter='', initial_visible=None):
597
597
  def main():
598
598
  parser = argparse.ArgumentParser(description='TUI viewer for JSON log files')
599
599
  parser.add_argument('logfile', nargs='?', default='.log.json', help='Path to log file (default: .log.json)')
600
- parser.add_argument('-f', '--filter', default=f'lvl:10;file:main,repl,gits,apis,tools', help='Initial filter string (same syntax as in UI)')
600
+ parser.add_argument('-f', '--filter', default=f'lvl:10;file:main,bats,repl,bare,gits,apis,tools', help='Initial filter string (same syntax as in UI)')
601
601
  parser.add_argument('-o', '--out', dest='out', metavar='FILE', help='Write marked entries to FILE (JSON lines format) instead of stdout')
602
602
  args = parser.parse_args()
603
603
  log_path = args.logfile
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mlx-code
3
- Version: 0.0.25
3
+ Version: 0.0.26
4
4
  Summary: Coding Agent for Mac
5
5
  Home-page: https://josefalbers.github.io/mlx-code/
6
6
  Author: J Joe
@@ -17,6 +17,8 @@ Requires-Dist: httpx
17
17
  Requires-Dist: pydantic
18
18
  Requires-Dist: textual>=8.2.7
19
19
  Requires-Dist: rich>=15.0.0
20
+ Requires-Dist: starlette
21
+ Requires-Dist: uvicorn
20
22
  Provides-Extra: all
21
23
  Requires-Dist: python-lsp-server[all]; extra == "all"
22
24
  Requires-Dist: GitPython; extra == "all"
@@ -47,7 +49,7 @@ A Git-native coding agent that can run entirely on your Mac. No API keys, no clo
47
49
  ```
48
50
  Conversation tree (nodes = git commits with embedded chat history)
49
51
 
50
- main ──●──●──●──●──●──●──●──●──●──●
52
+ main ──●──●──●──●──●──●──●──●──●──●──●──●──●──●
51
53
  │ │
52
54
  │ └── branch-1 ──●──●──●
53
55
  │ │ ┌────────────┐
@@ -66,21 +68,21 @@ REPL tabs (each tab = a git branch + agent) │
66
68
  │ └──────┘ └────┬─────┘ └──────────┘ └────────────┘ │
67
69
  └─────────────────┼──────────────────────────────────────┘
68
70
 
69
- ├────────────────────────────────────► each tab is an independent Agent
71
+ ├─────────────────────────────────────────► Each tab is an independent Agent
70
72
 
71
- ┌────┴─────────────────────────────────┐
72
- │ Agent
73
- ┌──────────────┐ ┌──────────────┐
74
- │ │ API: │ │ Tools: │ │
75
- │ │ MLX (local) │ │ Read Write │ │
76
- │ │ Claude │ │ Edit Bash │ │
77
- │ │ Gemini │ │ Grep Find │ │
78
- │ │ OpenAI │ │ Ls Skill │ │
79
- └──────────────┘ │ Agent ───────┼──┼───► spawns child Agent
80
- └──────────────┘ │ (each with own tools + worktree + etc)
81
- │ Git worktree
82
- │ (isolation + session state)
83
- └──────────────────────────────────────┘
73
+ ┌────┴─────────────────────────────────────┐
74
+ │ Agent
75
+ ┌────────────────┐ ┌────────────────┐
76
+ │ │ API: │ │ Tools: │ │
77
+ │ │ Local (mlx-lm) │ │ Read Write │ │
78
+ │ │ Claude │ │ Edit Bash │ │
79
+ │ │ Gemini │ │ Grep Find │ │
80
+ │ │ OpenAI │ │ Ls Skill │ │
81
+ └────────────────┘ │ Agent ─────────┼──┼───► Spawns child Agent
82
+ └────────────────┘ │ (each with own tools + worktree + etc)
83
+ │ Git worktree
84
+ │ (isolation + session state)
85
+ └──────────────────────────────────────────┘
84
86
  ```
85
87
 
86
88
  Each layer is importable and composable on its own. A commit records state, a branch records an alternative path, and a tab is just a live view over an `Agent`.
@@ -104,9 +106,9 @@ uvx --from mlx-code mlc
104
106
  # or install into the current environment
105
107
  pip install mlx-code
106
108
 
107
- mlc # launch with local MLX model
109
+ # launch
110
+ mlc # with a local MLX model
108
111
  mlc-run --api gemini # or use a remote provider
109
- mlc-run --api deepseek --model deepseek-v4-flash
110
112
  ```
111
113
 
112
114
  That's it. The first run starts a local inference server and drops you into the REPL.
@@ -128,12 +130,12 @@ That's it. The first run starts a local inference server and drops you into the
128
130
 
129
131
  **Git is the database.** When the agent makes file changes, they’re committed to a git worktree with the full conversation embedded in the commit message. Resume any past session by hash, branch from any checkpoint, and inspect the agent timeline with `git log`. No proprietary state files, just Git.
130
132
 
131
- **Your working directory is never at risk.** The agent operates inside a `git worktree`, not your checkout. It can make a mess, and you can inspect or discard it without ever touching `main`.
132
-
133
- **Built-in safety nets.** Subprocess environment variables go through an explicit allowlist, so secrets in your shell are never leaked to agent-spawned processes.
133
+ **Built-in safety nets.** Your working directory is never at risk. The agent operates inside a `git worktree`, not your checkout. It can make a mess, and you can inspect or discard it without ever touching `main`. Subprocess environment variables go through an explicit allowlist, so secrets in your shell are never leaked to agent-spawned processes.
134
134
 
135
135
  **Batteries included.** Everything ships in one pip install: the MLX inference engine, the multi-protocol API server, the agent loop, the tools, and the TUI. No llama.cpp, no ollama, no vLLM bridge to find and configure. And the server natively speaks OpenAI, Anthropic, Gemini, and Codex wire formats simultaneously, so `claude`, `codex`, and `gemini` CLIs can all work against your local model without a translation layer.
136
136
 
137
+ **Continuous batching.** The local inference server runs a continuous batching engine that processes multiple sequences concurrently. When you spawn parallel agents (e.g., multiple tabs, `asyncio.gather` pipelines, or delegated sub-tasks) they all share the same GPU context and are stepped together each tick. A prefix cache persists KV snapshots to disk, so repeated system prompts and conversation prefixes are prefilled once and reused across sessions. No request queueing, no waiting for the previous agent to finish.
138
+
137
139
  ---
138
140
 
139
141
  ## Agent primitive
@@ -171,12 +173,12 @@ agent.messages = messages
171
173
  await agent.run("now add unit tests")
172
174
  ```
173
175
 
174
- Branch from any point in the conversation each branch gets its own worktree:
176
+ Branch from any point in the conversation. Each branch gets its own worktree:
175
177
 
176
178
  ```
177
179
  /branch # branch from current state
178
180
  /branch --rev 2 # branch from the 2nd user turn
179
- /branch --rev 3 --as-worktree try different approach
181
+ /branch --rev 3 make it use httpx instead
180
182
  ```
181
183
 
182
184
  Since it's just git, you can inspect the timeline outside the REPL:
@@ -241,6 +243,43 @@ Reliability comes from specialization plus constraint. A read-only reviewer can'
241
243
 
242
244
  ---
243
245
 
246
+ ## Continuous batching
247
+
248
+ The local server can run multiple inference sequences concurrently inside a single batch step. Instead of a global lock that serialises one request at a time, the batching engine maintains a live set of active sequences and yields tokens for all of them on every step.
249
+
250
+ ```bash
251
+ mlc --engine batch # continuous batching + built-in REPL
252
+ ```
253
+
254
+ This unlocks true parallelism for multi-agent workloads:
255
+
256
+ ```python
257
+ import asyncio
258
+ from mlx_code.repl import Agent
259
+
260
+ async def main():
261
+ agents = [Agent() for _ in range(4)]
262
+ await asyncio.gather(*[
263
+ a.run(f"Research topic: {t}")
264
+ for a, t in zip(agents, ["consensus", "cryptography", "networking", "storage"])
265
+ ])
266
+
267
+ asyncio.run(main())
268
+ ```
269
+
270
+ All four agents generate simultaneously inside the same batch. No sequential blocking.
271
+
272
+ ### Health endpoint
273
+
274
+ ```bash
275
+ curl http://127.0.0.1:8000/health
276
+ # {"status":"ok","model":"mlx-community/Qwen3.5-4B-OptiQ-4bit","active_sequences":2,"prefix_cache_files":5}
277
+ ```
278
+
279
+ `active_sequences` shows how many agents are generating right now; `prefix_cache_files` shows how many prefix KV snapshots are stored on disk.
280
+
281
+ ---
282
+
244
283
  ## Command Line
245
284
 
246
285
  ### `mlc`: local server + harness
@@ -248,20 +287,20 @@ Reliability comes from specialization plus constraint. A read-only reviewer can'
248
287
  Starts the MLX inference server and launches the built-in TUI harness against it.
249
288
 
250
289
  ```bash
251
- # Default: local server + default TUI
290
+ # Default: local server + default harness
252
291
  mlc
253
292
 
254
- # Use a simple terminal REPL instead of the TUI
255
- mlc --notui
293
+ # Continuous batching mode (default is sequential caching mode)
294
+ mlc --engine batch
295
+
296
+ # Server only, no harness
297
+ mlc --leash none
256
298
 
257
299
  # Use a different harness (routes traffic through the local server)
258
300
  mlc --leash claude
259
301
  mlc --leash gemini
260
302
  mlc --leash codex
261
303
 
262
- # Server only, no harness
263
- mlc --leash none
264
-
265
304
  # Specify a model
266
305
  mlc --model mlx-community/Qwen3.5-4B-OptiQ-4bit
267
306
 
@@ -312,7 +351,7 @@ mlc-run --api codex
312
351
  echo "explain lsp.py" | mlc-run -a deepseek | cat - PLAN.md | mlc-run --url http://localhost:9000
313
352
 
314
353
  # Simple terminal REPL (no TUI)
315
- mlc-run --notui
354
+ mlc-run --bare
316
355
  ```
317
356
 
318
357
  ---
@@ -437,18 +476,19 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
437
476
 
438
477
  | Command | Description |
439
478
  |---|---|
440
- | `/help` | Show command reference |
479
+ | `/branch [--rev N] [prompt]` | Open a new branch tab from the current (or earlier) checkpoint |
480
+ | `/diff [--all]` | Show a side-by-side diff of changes in the worktree |
441
481
  | `/clear [--config F]` | Clear conversation; `--config` reloads agent from a JSON/YAML file |
482
+ | `/tab [N]` | Jump to tab N |
442
483
  | `/history [--raw]` | Show conversation transcript; `--raw` shows the raw API message log |
443
- | `/diff [--all]` | Show a side-by-side diff of changes in the worktree |
444
- | `/errors` | Show timestamped error log for the current tab |
445
484
  | `/tools` | List active tools |
446
- | `/branch [--rev N] [prompt]` | Open a new branch tab from the current (or earlier) checkpoint |
447
485
  | `/abort` | Abort the running agent |
486
+ | `/errors` | Show timestamped error log for the current tab |
448
487
  | `/export [path]` | Export session to JSON |
449
488
  | `/exit [--all]` | Close branch tab, or exit the app |
450
- | `!command` | Run a shell command; output captured in the TUI |
451
- | `$command` | Run an interactive command (TUI suspends, terminal handed to process) |
489
+ | `/help` | Show command reference |
490
+ | `!command` | Run a shell command; output captured in the TUI (e.g., `ls`, `cat hello.c`) |
491
+ | `$command` | Run an interactive command (e.g., `vim`, `yazi`, `less hello.c`) |
452
492
 
453
493
  ### Key bindings
454
494
 
@@ -458,7 +498,7 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
458
498
  | `Ctrl-J` | Insert newline |
459
499
  | `Ctrl-1` … `Ctrl-9` | Jump to tab N |
460
500
  | `Ctrl-,` / `Ctrl-.` | Cycle through tabs |
461
- | `Ctrl-C` | Abort running agent |
501
+ | `Ctrl-C` | Clear input, or abort running agent |
462
502
  | `Ctrl-D` | Close branch tab, or exit app |
463
503
  | `Ctrl-R` | Recall last prompt into editor |
464
504
 
@@ -476,7 +516,7 @@ agent = Agent(extra_tool_classes=[LiveDBTool], tool_names=["QueryDB"])
476
516
  | `Skill` | Retrieve named skill instructions from config |
477
517
  | `Agent` | Spawn an autonomous sub-agent for delegated work |
478
518
 
479
- All file tools enforce path sandboxing the agent cannot read or write outside the worktree.
519
+ All file tools enforce path sandboxing. The agent cannot read or write outside the worktree.
480
520
 
481
521
  ### Backends
482
522
 
@@ -3,12 +3,13 @@ README.md
3
3
  setup.py
4
4
  mlx_code/__init__.py
5
5
  mlx_code/apis.py
6
+ mlx_code/bare.py
7
+ mlx_code/bats.py
6
8
  mlx_code/gits.py
7
9
  mlx_code/lsp_tool.py
8
10
  mlx_code/main.py
9
11
  mlx_code/mcb.py
10
12
  mlx_code/mcb_tool.py
11
- mlx_code/ntui.py
12
13
  mlx_code/repl.py
13
14
  mlx_code/stream_log.py
14
15
  mlx_code/tools.py
@@ -2,6 +2,8 @@ httpx
2
2
  pydantic
3
3
  textual>=8.2.7
4
4
  rich>=15.0.0
5
+ starlette
6
+ uvicorn
5
7
 
6
8
  [:platform_system == "Darwin"]
7
9
  mlx-lm>=0.31.3
@@ -11,7 +11,7 @@ setup(
11
11
  author_email="albersj66@gmail.com",
12
12
  author="J Joe",
13
13
  license="Apache-2.0",
14
- version="0.0.25",
14
+ version="0.0.26",
15
15
  readme="README.md",
16
16
  description="Coding Agent for Mac",
17
17
  long_description=open("README.md").read(),
@@ -24,6 +24,9 @@ setup(
24
24
 
25
25
  "textual>=8.2.7",
26
26
  "rich>=15.0.0",
27
+
28
+ "starlette",
29
+ "uvicorn",
27
30
  ],
28
31
  extras_require={"all": [
29
32
  "python-lsp-server[all]",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes