superlinear 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. apps/__init__.py +4 -0
  2. apps/cli/__init__.py +8 -0
  3. apps/cli/bm25_rag.py +471 -0
  4. apps/cli/chat_repl.py +1497 -0
  5. apps/cli/client.py +195 -0
  6. apps/cli/docs_repl.py +2275 -0
  7. apps/cli/light_rag.py +729 -0
  8. apps/cli/local_snapshots.py +139 -0
  9. apps/cli/locks.py +214 -0
  10. apps/cli/main.py +457 -0
  11. apps/cli/output.py +32 -0
  12. apps/cli/server_cmds.py +516 -0
  13. apps/cli/session_cmds.py +491 -0
  14. apps/cli/snapshot_cmds.py +303 -0
  15. apps/cli/state.py +265 -0
  16. apps/server/__init__.py +4 -0
  17. apps/server/app.py +1363 -0
  18. apps/server/main.py +313 -0
  19. superlinear/__init__.py +114 -0
  20. superlinear/_version.py +3 -0
  21. superlinear/engine/__init__.py +10 -0
  22. superlinear/engine/adapters/__init__.py +12 -0
  23. superlinear/engine/adapters/base.py +91 -0
  24. superlinear/engine/adapters/superlinear.py +1233 -0
  25. superlinear/engine/chat_engine.py +1173 -0
  26. superlinear/engine/chat_types.py +130 -0
  27. superlinear/engine/registry.py +51 -0
  28. superlinear/engine/repetition.py +203 -0
  29. superlinear/engine/session_snapshots.py +451 -0
  30. superlinear/engine/tool_parser.py +83 -0
  31. superlinear/engine/types.py +42 -0
  32. superlinear/kernels/__init__.py +2 -0
  33. superlinear/kernels/common/__init__.py +21 -0
  34. superlinear/kernels/common/adjustment.py +106 -0
  35. superlinear/kernels/common/power.py +154 -0
  36. superlinear/kernels/superlinear/__init__.py +10 -0
  37. superlinear/kernels/superlinear/attention/__init__.py +78 -0
  38. superlinear/kernels/superlinear/attention/_prefill.py +940 -0
  39. superlinear/kernels/superlinear/attention/_sliding_window.py +1167 -0
  40. superlinear/kernels/superlinear/attention/api.py +433 -0
  41. superlinear/kernels/superlinear/search/__init__.py +33 -0
  42. superlinear/kernels/superlinear/search/_reference.py +204 -0
  43. superlinear/kernels/superlinear/search/_triton.py +488 -0
  44. superlinear/kernels/superlinear/search/_triton_gqa.py +534 -0
  45. superlinear/kernels/superlinear/search/api.py +200 -0
  46. superlinear/kernels/superlinear/span/__init__.py +41 -0
  47. superlinear/kernels/superlinear/span/_triton_bucketed_gqa.py +1461 -0
  48. superlinear/kernels/superlinear/span/_triton_forward.py +22 -0
  49. superlinear/kernels/superlinear/span/_triton_gqa.py +1226 -0
  50. superlinear/kernels/superlinear/span/_triton_impl.py +928 -0
  51. superlinear/kernels/superlinear/span/_triton_precomputed_sw.py +460 -0
  52. superlinear/kernels/superlinear/span/_triton_precomputed_sw_gqa.py +598 -0
  53. superlinear/kernels/superlinear/span/api.py +296 -0
  54. superlinear/kernels/superlinear/span/masks.py +187 -0
  55. superlinear/py.typed +0 -0
  56. superlinear/runtime.py +71 -0
  57. superlinear-0.1.0.dist-info/METADATA +469 -0
  58. superlinear-0.1.0.dist-info/RECORD +62 -0
  59. superlinear-0.1.0.dist-info/WHEEL +5 -0
  60. superlinear-0.1.0.dist-info/entry_points.txt +2 -0
  61. superlinear-0.1.0.dist-info/licenses/LICENSE +202 -0
  62. superlinear-0.1.0.dist-info/top_level.txt +2 -0
apps/cli/chat_repl.py ADDED
@@ -0,0 +1,1497 @@
1
+ from __future__ import annotations
2
+
3
+ import atexit
4
+ import os
5
+ import select
6
+ import shutil
7
+ import shlex
8
+ import sys
9
+ import time
10
+ import textwrap
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ # Enable readline for arrow keys, history navigation, and line editing.
16
+ # This gives us: ↑/↓ (history), ←/→ (cursor), Ctrl+A/E (start/end), Ctrl+R (search), Ctrl+L (clear).
17
+ try:
18
+ import readline
19
+ except ImportError:
20
+ readline = None # type: ignore[assignment] # Windows fallback
21
+
22
+ from apps.cli.client import HttpError, SuperlinearClient
23
+ from apps.cli.local_snapshots import delete_local_snapshot, list_local_snapshots
24
+ from apps.cli.locks import AlreadyLockedError, SessionLock
25
+ from apps.cli.state import CliState, load_state, save_state
26
+
27
+
28
def _chat_history_file_path() -> Path:
    """Return the path of the REPL's persistent input-history file.

    The file lives at ``~/.config/spl/chat_history``; nothing is created here.
    """
    config_dir = Path.home() / ".config" / "spl"
    return config_dir / "chat_history"
30
+
31
+
32
def _setup_readline_history() -> None:
    """Load persistent REPL input history and arrange for it to be saved at exit.

    Does nothing when ``readline`` is unavailable. Input history is only a
    convenience, so filesystem problems (config directory cannot be created,
    history file missing, unreadable, or rejected by the readline backend) are
    tolerated instead of aborting REPL startup or raising during interpreter
    shutdown.
    """
    if readline is None:
        return

    history_file = _chat_history_file_path()
    try:
        history_file.parent.mkdir(parents=True, exist_ok=True)
        readline.read_history_file(history_file)
    except OSError:
        # Covers FileNotFoundError (first run) as well as permission errors and
        # backend-specific read failures; start with an empty history instead.
        pass
    readline.set_history_length(1000)

    def _save_history() -> None:
        # Best effort: an unwritable history file must not produce a traceback
        # while the interpreter is exiting.
        try:
            readline.write_history_file(history_file)
        except OSError:
            pass

    atexit.register(_save_history)
44
+
45
+
46
# Slash commands offered by tab completion.
_CHAT_COMMANDS = [
    "/help",
    "/exit",
    "/clear",
    "/history",
    "/new",
    "/rm",
    "/head",
    "/tail",
    "/show",
    "/ls",
    "/switch",
    "/save",
    "/load",
    "/stats",
]
51
+
52
+
53
def _setup_completer() -> None:
    """Install tab completion for slash commands on the readline prompt.

    Does nothing when ``readline`` is unavailable.
    """
    if readline is None:
        return

    def completer(text: str, state: int) -> str | None:
        # Only words starting with "/" are completed, against _CHAT_COMMANDS.
        # ``state`` selects which candidate to return; None means "no more".
        candidates = (
            [name for name in _CHAT_COMMANDS if name.startswith(text)]
            if text.startswith("/")
            else []
        )
        if state < len(candidates):
            return candidates[state]
        return None

    readline.set_completer(completer)
    # Split words on whitespace only, so "/" stays part of the completed word.
    readline.set_completer_delims(" \t\n")
    readline.parse_and_bind("tab: complete")
68
+
69
+
70
def _cmd_history(n: int = 20) -> None:
    """Print the last ``n`` readline history entries, each prefixed by its index.

    Prints a notice to stderr when readline is not loaded, and ``(no history)``
    when the history is empty.
    """
    if readline is None:
        print("history not available (readline not loaded)", file=sys.stderr)
        return

    total = readline.get_current_history_length()
    if total == 0:
        print("(no history)")
        return

    # History items are addressed starting at 1, hence the clamp.
    first = max(1, total - n + 1)
    for idx in range(first, total + 1):
        entry = readline.get_history_item(idx)
        if not entry:
            continue
        print(f"{idx:4d} {entry}")
84
+
85
+
86
def _cmd_history_clear() -> None:
    """Clear the readline input history in memory and try to persist the empty history.

    Prints a notice to stderr when readline is not loaded or when clearing
    fails. Failure to write the history file is ignored.
    """
    if readline is None:
        print("history not available (readline not loaded)", file=sys.stderr)
        return

    try:
        readline.clear_history()
    except Exception as exc:
        print(f"failed to clear history: {exc}", file=sys.stderr)
        return

    # Write the now-empty history straight away instead of waiting for exit.
    target = _chat_history_file_path()
    try:
        target.parent.mkdir(parents=True, exist_ok=True)
        readline.write_history_file(target)
    except Exception:
        # Deliberately best effort: the in-memory history is already gone.
        pass

    print("cleared input history")
106
+
107
+
108
# Default system prompt for Superlinear Chat.
# Emphasizes long-context review and instruction-following.
# The backslash right after the opening quotes keeps the prompt from starting
# with a blank line. The text below is sent verbatim, so edit it with care.
DEFAULT_SYSTEM_PROMPT = """\
You are Superlinear Chat, a helpful, harmless, and honest AI assistant developed by concavity.ai.

You are operating in a stateful chat session: you can see the prior messages in this conversation (they are provided as chat history). Do not claim you are “stateless” or that you “can’t access previous messages”. If asked to recall, summarize, or list prior turns, do so to the best of your ability. If a user requests exact verbatim quotes of very long messages, provide short excerpts and offer to continue with the full text.

## Core Principles

1. **Review conversation history thoroughly**: Before responding, carefully review the entire conversation history. This is a long-context assistant—important details, constraints, or prior decisions may appear much earlier in the conversation. Never assume you remember everything; re-read to ensure continuity and avoid contradicting or repeating yourself.

2. **Follow instructions precisely**: Prioritize the user's explicit instructions over assumptions. When instructions conflict with conventions or best practices, follow the instructions while noting any concerns. Ask clarifying questions only when truly necessary.

3. **Be helpful and thorough**: Provide comprehensive, accurate answers that fully address the user's needs. Anticipate follow-up questions and include relevant context proactively.

4. **Be honest about limitations**: If you don't know something, say so. Don't fabricate information. Distinguish clearly between facts and opinions or speculation.

## Response Style

- Be concise for simple questions; be thorough for complex ones
- Use markdown formatting effectively: headers, lists, code blocks, emphasis
- Match the user's tone and technical level
- Structure long responses with clear sections

## For Technical and Code Requests

- Write clean, well-documented, idiomatic code
- Follow language-specific best practices and conventions
- Include error handling and edge case considerations
- Explain your approach when helpful, but prioritize working code
- When modifying existing code, preserve the original style and conventions

## For Complex Problems

- Use <think>...</think> blocks for extended reasoning when helpful
- Break down complex tasks into clear steps
- Show your reasoning process for non-trivial decisions
- Consider multiple approaches before recommending one

## For Creative Tasks

- Be imaginative and explore possibilities
- Offer alternatives and variations when appropriate
- Respect the user's creative vision while offering constructive input

## Boundaries

- Politely decline requests that would cause harm
- Respect privacy and don't ask for unnecessary personal information
- For medical, legal, or financial matters, recommend consulting qualified professionals

## Multi-turn Conversations

- Maintain awareness of the full conversation context
- Reference previous messages and decisions appropriately
- Adapt smoothly when the user changes topics or refines their request
- When asked to revise or build on prior work, ensure changes are consistent with the established context
"""
166
+
167
+
168
class ChatReplError(RuntimeError):
    """Error raised by the chat REPL helpers to report a failed operation."""
170
+
171
+
172
@dataclass
class TurnStats:
    """Metrics describing a single chat turn; every field defaults to None (unknown).

    Field order is part of the interface (positional construction).
    """

    # Why generation stopped, as reported for the turn.
    finish_reason: str | None = None

    # Client-side timings in seconds (presumably time-to-first-token and total
    # wall time -- confirm against the code that fills them in).
    ttft_s: float | None = None
    total_s: float | None = None

    # Token accounting and derived throughput.
    prompt_tokens: int | None = None
    completion_tokens: int | None = None
    tok_per_s: float | None = None

    # Timings in seconds attributed to the server.
    server_prefill_s: float | None = None
    server_decode_s: float | None = None
    server_total_s: float | None = None
183
+
184
+
185
def _now_utc_compact() -> str:
    """Return the current UTC time formatted as ``YYYYMMDD_HHMMSS``."""
    utc_now = time.gmtime()
    return time.strftime("%Y%m%d_%H%M%S", utc_now)
187
+
188
+
189
def _new_session_id(*, prefix: str = "chat") -> str:
    """Build a fresh session id of the form ``<prefix>_<UTC timestamp>_<6 hex chars>``.

    The result is short, URL-safe, and easy to read (given a URL-safe prefix).
    """
    import secrets

    stamp = _now_utc_compact()
    suffix = secrets.token_hex(3)  # 3 random bytes -> 6 hex characters
    return f"{prefix}_{stamp}_{suffix}"
194
+
195
+
196
def _ensure_reachable(client: SuperlinearClient) -> None:
    """Call ``client.health()`` and turn an ``HttpError`` into a helpful ``ChatReplError``.

    Raises:
        ChatReplError: when the health call fails; the message names the
            server URL and suggests how to start a server or pass ``--url``.
    """
    try:
        client.health()
    except HttpError as exc:
        hint = (
            f"Server unreachable at {client.base_url}. Start it with `spl server start --model <model>` "
            f"or pass `--url`.\n{exc}"
        )
        raise ChatReplError(hint) from exc
204
+
205
+
206
def _get_session_counters(*, client: SuperlinearClient, session_id: str) -> tuple[int, int]:
    """Return ``(message_count, cache_position)`` for a session.

    The cache position is read from ``cache_position``, falling back to
    ``current_pos`` when that key is missing or null. Values that are missing
    or cannot be converted to ``int`` count as 0.

    Raises:
        ChatReplError: on an HTTP error or a non-dict response.
    """
    try:
        info = client.request_json("GET", f"/v1/sessions/{session_id}", timeout_s=5.0)
    except HttpError as exc:
        raise ChatReplError(str(exc)) from exc

    if not isinstance(info, dict):
        raise ChatReplError("Invalid response from server for /v1/sessions/<id>")

    def _as_int(value: Any) -> int:
        # Lenient conversion: anything falsy or unparsable becomes 0.
        try:
            return int(value or 0)
        except Exception:
            return 0

    position = info.get("cache_position")
    if position is None:
        position = info.get("current_pos")

    return _as_int(info.get("message_count")), _as_int(position)
231
+
232
+
233
def _session_exists(client: SuperlinearClient, session_id: str) -> bool:
    """Return True if the server knows ``session_id``, False on a 404.

    Any other ``HttpError`` is propagated unchanged.
    """
    try:
        client.request_json("GET", f"/v1/sessions/{session_id}", timeout_s=5.0)
    except HttpError as exc:
        if exc.status_code != 404:
            raise
        return False
    return True
241
+
242
+
243
def _create_session(client: SuperlinearClient, session_id: str, *, max_seq_len: int | None = None) -> None:
    """Create a session on the server, treating "already exists" (409) as success.

    ``max_seq_len`` is included in the request only when given. Any
    ``HttpError`` other than 409 is propagated unchanged.
    """
    body: dict[str, Any] = {"session_id": session_id}
    if max_seq_len is not None:
        body["max_seq_len"] = int(max_seq_len)

    try:
        client.request_json("POST", "/v1/sessions", payload=body, timeout_s=30.0)
    except HttpError as exc:
        # A 409 keeps "start"/resume flows idempotent; everything else is an error.
        if exc.status_code != 409:
            raise
254
+
255
+
256
def _maybe_resize_session(
    client: SuperlinearClient,
    session_id: str,
    *,
    min_max_seq_len: int | None,
    strategy: str = "auto",
) -> None:
    """Grow a session's ``max_seq_len`` to at least ``min_max_seq_len`` if needed.

    Silently does nothing when no minimum is requested, when the session info
    cannot be fetched or is not a dict, when the target is not positive, or
    when the session already reports a ``max_seq_len`` at least as large.

    Raises:
        ChatReplError: when the resize request itself fails.
    """
    if min_max_seq_len is None:
        return

    try:
        info = client.request_json("GET", f"/v1/sessions/{session_id}", timeout_s=10.0)
    except HttpError:
        return
    if not isinstance(info, dict):
        return

    try:
        current = int(info.get("max_seq_len") or 0)
    except Exception:
        current = 0  # unknown current size; fall through and attempt the resize

    target = int(min_max_seq_len)
    already_large_enough = current > 0 and target <= current
    if target <= 0 or already_large_enough:
        return

    resize_request = {"max_seq_len": target, "strategy": strategy}
    try:
        client.request_json(
            "POST",
            f"/v1/sessions/{session_id}/resize",
            payload=resize_request,
            timeout_s=300.0,
        )
    except HttpError as exc:
        raise ChatReplError(
            "Failed to resize session context length. "
            "This can happen if the target is too large for GPU memory. "
            f"(session_id={session_id} target_max_seq_len={target}): {exc}"
        ) from exc
292
+
293
+
294
def _banner(*, url: str, session_id: str, resumed: bool) -> None:
    """Print the REPL start-up banner: server URL, session id with mode, and a help hint."""
    if resumed:
        mode = "resumed"
    else:
        mode = "new"
    banner_lines = (
        f"server={url}",
        f"session_id={session_id} ({mode})",
        "type /help for commands",
    )
    print("\n".join(banner_lines))
299
+
300
+
301
def _format_metrics(stats: TurnStats) -> str:
    """Render the known metrics of a turn as one compact line.

    Unknown (None) metrics are left out; the token counts appear only when
    both prompt and completion counts are known. Returns "" when nothing is
    known.
    """
    pieces: list[str] = []
    add = pieces.append

    if stats.ttft_s is not None:
        add(f"ttft={stats.ttft_s:.3f}s")
    if stats.tok_per_s is not None:
        add(f"tok/s={stats.tok_per_s:.2f}")

    have_both_counts = stats.prompt_tokens is not None and stats.completion_tokens is not None
    if have_both_counts:
        add(f"tokens={stats.prompt_tokens}+{stats.completion_tokens}")

    if stats.finish_reason is not None:
        add(f"finish={stats.finish_reason}")
    if stats.total_s is not None:
        add(f"wall={stats.total_s:.3f}s")

    # Joining an empty list already yields "".
    return " ".join(pieces)
314
+
315
+
316
def _stats_detail(stats: TurnStats) -> str:
    """Render every known metric of a turn as ``name=value`` lines.

    Unknown (None) metrics are skipped. Timings and throughput are shown with
    six decimals. ``total_s`` is reported under the label ``wall_s``.
    """
    plain = "{}"
    micro = "{:.6f}"
    rows = (
        ("finish_reason", stats.finish_reason, plain),
        ("ttft_s", stats.ttft_s, micro),
        ("wall_s", stats.total_s, micro),
        ("prompt_tokens", stats.prompt_tokens, plain),
        ("completion_tokens", stats.completion_tokens, plain),
        ("tok_per_s", stats.tok_per_s, micro),
        ("server_prefill_s", stats.server_prefill_s, micro),
        ("server_decode_s", stats.server_decode_s, micro),
        ("server_total_s", stats.server_total_s, micro),
    )
    return "\n".join(
        f"{label}={fmt.format(value)}" for label, value, fmt in rows if value is not None
    )
337
+
338
+
339
def _set_active_session(state: CliState, session_id: str) -> None:
    """Mark ``session_id`` as the active chat session on ``state`` (in memory only).

    Also points the legacy ``chat_checkpoint_snapshot_id`` field at this
    session's checkpoint, or None when the session has none.
    """
    state.active_chat_session_id = session_id
    # Keep the legacy field aligned (save_state() also handles this).
    checkpoint = state.chat_checkpoints.get(session_id)
    state.chat_checkpoint_snapshot_id = checkpoint
343
+
344
+
345
def _cmd_help() -> None:
    """Print the list of REPL slash commands with a short description of each."""
    help_lines = (
        "commands:",
        " /help",
        " /exit [-c] exit (--clean/-c: delete session)",
        " /clear clear screen",
        " /history [n] show last n input commands (default 20)",
        " /history clear clear input command history",
        " /new start new session (keeps old)",
        " /rm delete current session, start new",
        " /rm <id...> delete session(s) or snapshot(s)",
        " /rm --all delete all chat sessions",
        " /head [n] show first n messages",
        " /tail [n] show last n messages",
        " /show <i> show message i in full (use /tail to find ids)",
        " /ls list sessions and snapshots",
        " /switch <id> switch to another session",
        " /save [title] save snapshot",
        " /load <snap> load snapshot into new session",
        " /stats show last turn metrics",
    )
    # One print call, one entry per line.
    print(*help_lines, sep="\n")
370
+
371
+
372
def _wrap_for_terminal(text: str, *, indent: str = "", width: int | None = None) -> str:
    """Wrap ``text`` to the terminal (or ``width``) and prefix every line with ``indent``.

    Carriage returns are dropped and existing newlines are kept as line
    breaks. The overall width is never below 20 columns and the wrapped text
    never below 10; over-long words are split. Empty lines become just the
    indent.
    """
    terminal_cols = shutil.get_terminal_size(fallback=(120, 24)).columns
    total_width = max(20, terminal_cols if width is None else int(width))

    wrapper = textwrap.TextWrapper(
        width=max(10, total_width - len(indent)),
        replace_whitespace=False,
        drop_whitespace=False,
        break_long_words=True,
        break_on_hyphens=False,
    )

    rendered: list[str] = []
    for line in text.replace("\r", "").split("\n"):
        pieces = wrapper.wrap(line) if line else []
        if pieces:
            rendered.extend(indent + piece for piece in pieces)
        else:
            rendered.append(indent)
    return "\n".join(rendered)
396
+
397
+
398
def _cmd_show(*, client: SuperlinearClient, session_id: str, index: int) -> None:
    """Print one message of the session history in full, selected by 1-based index.

    A message with no content but with tool calls is shown as a
    ``<tool_calls N>`` placeholder. An empty history prints ``(empty)``.

    Raises:
        ChatReplError: on an HTTP error, a malformed response or message, or
            an index outside ``1..len(history)``.
    """
    try:
        resp = client.request_json("GET", f"/v1/sessions/{session_id}/history", timeout_s=10.0)
    except HttpError as exc:
        raise ChatReplError(str(exc)) from exc

    history = resp.get("messages") if isinstance(resp, dict) else None
    if not isinstance(history, list):
        raise ChatReplError("Invalid response from server for /show")

    total = len(history)
    if total == 0:
        print("(empty)")
        return
    if not 1 <= index <= total:
        raise ChatReplError(f"Message index out of range: {index} (1..{total})")

    message = history[index - 1]
    if not isinstance(message, dict):
        raise ChatReplError("Invalid message format")

    role = str(message.get("role") or "")
    content = message.get("content")
    tool_calls = message.get("tool_calls")

    if content is not None:
        body = str(content)
    elif tool_calls is not None:
        # A non-list tool_calls value is counted as a single call.
        call_count = len(tool_calls) if isinstance(tool_calls, list) else 1
        body = f"<tool_calls {call_count}>"
    else:
        body = ""

    print(f"{index:>4} {role}:")
    print(_wrap_for_terminal(body, indent=" ") if body else " (empty)")
435
+
436
+
437
def _cmd_head(*, client: SuperlinearClient, session_id: str, limit: int = 10) -> None:
    """Print the first ``limit`` messages of the session, one abbreviated line each.

    ``limit`` is clamped to 1..200. Each line shows the 1-based index, the
    role and the content flattened to a single line and cut with an ellipsis
    when longer than 200 characters. Non-string content is shown as empty.

    Raises:
        ChatReplError: on an HTTP error or a malformed response.
    """
    try:
        resp = client.request_json("GET", f"/v1/sessions/{session_id}/history", timeout_s=10.0)
    except HttpError as exc:
        raise ChatReplError(str(exc)) from exc

    messages = resp.get("messages") if isinstance(resp, dict) else None
    if not isinstance(messages, list):
        raise ChatReplError("Invalid response from server for /head")

    count = max(1, min(int(limit), 200))
    shown = messages[:count]
    if not shown:
        print("(empty)")
        return

    for number, message in enumerate(shown, start=1):
        if not isinstance(message, dict):
            # Malformed entries are skipped but still consume their index.
            continue
        role = message.get("role")
        raw = message.get("content")
        flat = (raw if isinstance(raw, str) else "").replace("\r", "").replace("\n", " ").strip()
        if len(flat) > 200:
            flat = flat[:197] + "…"
        print(f"{number:>4} {role}: {flat}")
464
+
465
+
466
def _cmd_tail(*, client: SuperlinearClient, session_id: str, limit: int = 10) -> None:
    """Print the last ``limit`` messages of the session, one abbreviated line each.

    ``limit`` is clamped to 1..200. Lines are numbered with the messages'
    1-based positions in the full history. Content is flattened to a single
    line and cut with an ellipsis when longer than 200 characters; non-string
    content is shown as empty.

    Raises:
        ChatReplError: on an HTTP error or a malformed response.
    """
    try:
        resp = client.request_json("GET", f"/v1/sessions/{session_id}/history", timeout_s=10.0)
    except HttpError as exc:
        raise ChatReplError(str(exc)) from exc

    messages = resp.get("messages") if isinstance(resp, dict) else None
    if not isinstance(messages, list):
        raise ChatReplError("Invalid response from server for /tail")

    count = max(1, min(int(limit), 200))
    shown = messages[-count:]
    if not shown:
        print("(empty)")
        return

    # Position of the first shown message within the full history.
    first_number = max(1, len(messages) - len(shown) + 1)
    for number, message in enumerate(shown, start=first_number):
        if not isinstance(message, dict):
            # Malformed entries are skipped but still consume their index.
            continue
        role = message.get("role")
        raw = message.get("content")
        flat = (raw if isinstance(raw, str) else "").replace("\r", "").replace("\n", " ").strip()
        if len(flat) > 200:
            flat = flat[:197] + "…"
        print(f"{number:>4} {role}: {flat}")
493
+
494
+
495
def _cmd_ls(*, client: SuperlinearClient, current_session_id: str) -> None:
    """Print the server's sessions and the local snapshots.

    The current session is flagged with a trailing ``*``. Snapshots come from
    ``list_local_snapshots()`` and are shown with their title when they have
    one.

    Raises:
        ChatReplError: when listing sessions fails with an HTTP error.
    """
    try:
        listing = client.request_json("GET", "/v1/sessions", timeout_s=10.0)
    except HttpError as exc:
        raise ChatReplError(str(exc)) from exc

    raw = listing.get("sessions") if isinstance(listing, dict) else None
    if not isinstance(raw, list):
        raw = []
    # Only string entries are treated as session ids.
    ids = [entry for entry in raw if isinstance(entry, str)]

    print("sessions:")
    if ids:
        for sid in ids:
            suffix = " *" if sid == current_session_id else ""
            print(f" {sid}{suffix}")
    else:
        print(" (none)")

    snapshots = list_local_snapshots()
    print("\nsnapshots:")
    if snapshots:
        for snap in snapshots:
            snap_id = snap.get("snapshot_id") or ""
            title = snap.get("title") or ""
            print(f" {snap_id} {title}" if title else f" {snap_id}")
    else:
        print(" (none)")
529
+
530
+
531
def _cmd_rm(
    *,
    client: SuperlinearClient,
    target_ids: list[str],
    current_session_id: str,
) -> bool:
    """Remove session(s) and/or snapshot(s); report whether the current session went away.

    Each target is handled independently and failures are printed to stderr
    without stopping the loop:

    * A target that is 32 hex characters (optionally prefixed with ``snap-``)
      is treated as a local snapshot id and removed with
      ``delete_local_snapshot``.
    * Anything else is treated as a session id and deleted on the server.
      Ids that are empty or contain ``/``, ``?``, ``#`` or whitespace are
      rejected up front: they are interpolated into the request path and
      would otherwise address a different resource than the one named.

    Returns:
        True if ``current_session_id`` was among the sessions actually deleted.
    """
    removed_current = False
    for target_id in target_ids:
        # Snapshot ids are 32-char hex, optionally written as "snap-<hex>".
        raw_id = target_id[5:] if target_id.startswith("snap-") else target_id
        is_snapshot = len(raw_id) == 32 and all(c in "0123456789abcdef" for c in raw_id.lower())

        if is_snapshot:
            if delete_local_snapshot(raw_id):
                print(f"removed snapshot_id={raw_id}")
            else:
                print(f"error: snapshot not found: {raw_id}", file=sys.stderr)
            continue

        # Refuse ids that would change the meaning of the URL (extra path
        # segments, query string, fragment) or make it malformed.
        if not target_id or any(ch in "/?#" or ch.isspace() for ch in target_id):
            print(f"error: invalid session id: {target_id!r}", file=sys.stderr)
            continue

        try:
            client.request_json("DELETE", f"/v1/sessions/{target_id}", timeout_s=10.0)
        except HttpError as exc:
            if exc.status_code == 404:
                print(f"error: session not found: {target_id}", file=sys.stderr)
            else:
                print(f"error: failed to remove {target_id}: {exc}", file=sys.stderr)
        else:
            print(f"removed session_id={target_id}")
            if target_id == current_session_id:
                removed_current = True
    return removed_current
564
+
565
+
566
def _cmd_rm_all(
    *,
    client: SuperlinearClient,
    current_session_id: str,
) -> bool:
    """Remove every server session whose id starts with ``chat``.

    Returns:
        True if the current session was among those removed (as reported by
        ``_cmd_rm``); False when there was nothing to remove.

    Raises:
        ChatReplError: when listing sessions fails with an HTTP error.
    """
    try:
        listing = client.request_json("GET", "/v1/sessions", timeout_s=10.0)
    except HttpError as exc:
        raise ChatReplError(str(exc)) from exc

    sessions = listing.get("sessions") if isinstance(listing, dict) else []
    if not isinstance(sessions, list):
        sessions = []
    targets = [sid for sid in sessions if isinstance(sid, str) and sid.startswith("chat")]

    if targets:
        return _cmd_rm(client=client, target_ids=targets, current_session_id=current_session_id)

    print("(no chat sessions to remove)")
    return False
587
+
588
+
589
def _cmd_save(
    *,
    client: SuperlinearClient,
    state: CliState,
    session_id: str,
    title: str | None,
    archive: bool,
) -> None:
    """Save a snapshot of the session, either as an archive or as its checkpoint.

    With ``archive`` set, the snapshot is only reported. Otherwise it becomes
    the session's checkpoint in ``state`` (which is saved), and the previous
    checkpoint snapshot, if a different one existed, is deleted on the server
    on a best-effort basis.

    Raises:
        ChatReplError: when the save response carries no usable ``snapshot_id``.
        HttpError: propagated unchanged when the save request itself fails.
    """
    body: dict[str, Any] = {"title": title} if title else {}

    resp = client.request_json("POST", f"/v1/sessions/{session_id}/save", payload=body, timeout_s=300.0)
    snapshot_id = resp.get("snapshot_id") if isinstance(resp, dict) else None
    if not (isinstance(snapshot_id, str) and snapshot_id):
        raise ChatReplError("Invalid response from server for /save")

    if archive:
        print(f"saved (archive) snapshot_id={snapshot_id}")
        return

    # Record the new checkpoint before touching the old one.
    previous = state.chat_checkpoints.get(session_id)
    state.chat_checkpoints[session_id] = snapshot_id
    _set_active_session(state, session_id)
    save_state(state)

    if not previous or previous == snapshot_id:
        print(f"saved checkpoint snapshot_id={snapshot_id}")
        return

    try:
        client.request_json("DELETE", f"/v1/snapshots/{previous}", timeout_s=30.0)
    except HttpError:
        print(f"saved checkpoint snapshot_id={snapshot_id} (failed to delete previous {previous})")
    else:
        print(f"saved checkpoint snapshot_id={snapshot_id} (deleted previous {previous})")
623
+
624
+
625
def _cmd_load(
    *,
    client: SuperlinearClient,
    state: CliState,
    snapshot_id: str,
    as_session_id: str | None,
) -> str:
    """Load a snapshot into a new session and make that session active.

    The target session id is ``as_session_id`` when given, otherwise a freshly
    generated ``chat_...`` id. On success the new session becomes the active
    one in ``state``, any checkpoint recorded for it is dropped (the loaded
    snapshot is deliberately not treated as its checkpoint), and the state is
    saved.

    Returns:
        The session id reported by the server.

    Raises:
        ChatReplError: for an unusable snapshot id, a missing snapshot (404),
            an existing target session (409), a busy server (429), or a
            malformed response.
        HttpError: propagated unchanged for any other HTTP failure.
    """
    # The snapshot id is interpolated into the request path; refuse values
    # that would address a different resource or produce a malformed URL.
    if not snapshot_id or any(ch in "/?#" or ch.isspace() for ch in snapshot_id):
        raise ChatReplError(f"Invalid snapshot id: {snapshot_id!r}")

    target = as_session_id or _new_session_id(prefix="chat")
    try:
        resp = client.request_json(
            "POST",
            f"/v1/snapshots/{snapshot_id}/load",
            payload={"session_id": target},
            timeout_s=300.0,
        )
    except HttpError as exc:
        if exc.status_code == 404:
            raise ChatReplError(f"Snapshot not found: {snapshot_id} (use `spl snapshot ls`).") from exc
        if exc.status_code == 409:
            raise ChatReplError(
                f"Target session already exists: {target} (choose a different --as, or omit --as)."
            ) from exc
        if exc.status_code == 429:
            raise ChatReplError("Server is busy (429). Try again.") from exc
        raise
    if not isinstance(resp, dict) or not isinstance(resp.get("session_id"), str):
        raise ChatReplError("Invalid response from server for /load")

    new_session_id = resp["session_id"]
    _set_active_session(state, new_session_id)
    # Do not automatically treat the loaded snapshot as the checkpoint.
    state.chat_checkpoints.pop(new_session_id, None)
    state.chat_checkpoint_snapshot_id = None
    save_state(state)
    print(f"loaded snapshot_id={snapshot_id} session_id={new_session_id}")
    return new_session_id
661
+
662
+
663
def _cmd_switch(*, client: SuperlinearClient, state: CliState, session_id: str) -> str:
    """Make an existing server session the active one and persist that choice.

    The session is looked up on the server first, and only an existing session
    can be switched to.

    Returns:
        ``session_id``, unchanged.

    Raises:
        ChatReplError: for an unusable session id, an unknown session (404),
            or any other HTTP failure during the lookup.
    """
    # The id is interpolated into the request path. A value such as "foo?x"
    # would be looked up as "foo" and then stored as the active session, so
    # reject anything that alters the URL structure.
    if not session_id or any(ch in "/?#" or ch.isspace() for ch in session_id):
        raise ChatReplError(f"Invalid session id: {session_id!r}")

    try:
        client.request_json("GET", f"/v1/sessions/{session_id}", timeout_s=10.0)
    except HttpError as exc:
        if exc.status_code == 404:
            raise ChatReplError(f"Unknown session: {session_id} (use /new to create one)") from exc
        raise ChatReplError(str(exc)) from exc

    _set_active_session(state, session_id)
    save_state(state)
    print(f"switched session_id={session_id}")
    return session_id
675
+
676
+
677
+ def _stream_chat_turn(
678
+ *,
679
+ client: SuperlinearClient,
680
+ session_id: str,
681
+ user_text: str,
682
+ think_budget: int | None,
683
+ temperature: float = 0.1,
684
+ top_p: float = 0.95,
685
+ system_prompt: str | None = None,
686
+ ) -> TurnStats:
687
+ messages: list[dict[str, str]] = []
688
+ if system_prompt:
689
+ messages.append({"role": "system", "content": system_prompt})
690
+ messages.append({"role": "user", "content": user_text})
691
+
692
+ payload: dict[str, Any] = {
693
+ "stream": True,
694
+ "session_id": session_id,
695
+ "messages": messages,
696
+ "max_completion_tokens": 32768,
697
+ "temperature": temperature,
698
+ "top_p": top_p,
699
+ }
700
+
701
+ enable_thinking_ui = think_budget is not None and think_budget > 0
702
+ if enable_thinking_ui:
703
+ # Enable thinking mode (Superlinear-specific). The model will emit <think>...</think>.
704
+ payload["reasoning_budget"] = int(think_budget)
705
+ # Do not persist thinking into the server-side session transcript.
706
+ payload["discard_thinking"] = True
707
+ # Ask the server to stream thinking deltas separately (delta.thinking).
708
+ payload["stream_thinking"] = True
709
+
710
+ started = time.monotonic()
711
+ ttft_s: float | None = None
712
+ finish_reason: str | None = None
713
+ usage: dict[str, Any] | None = None
714
+ timing: dict[str, Any] | None = None
715
+
716
+ started_answer = False
717
+ in_think = False
718
+ thinking_accum: str = ""
719
+ thinking_panel_active = False
720
+ thinking_panel_lines = 0
721
+ content_buf = ""
722
+ saw_thinking_delta = False
723
+ thinking_start_time: float | None = None
724
+ thinking_end_time: float | None = None
725
+
726
+ def _thinking_panel_format(text: str) -> list[str]:
727
+ cols = shutil.get_terminal_size(fallback=(120, 24)).columns
728
+ prefix = "thinking: "
729
+ width = max(20, cols - len(prefix) - 1)
730
+
731
+ normalized = text.replace("\r", "")
732
+ wrapped: list[str] = []
733
+ for logical in normalized.split("\n"):
734
+ parts = textwrap.wrap(
735
+ logical,
736
+ width=width,
737
+ replace_whitespace=False,
738
+ drop_whitespace=False,
739
+ break_long_words=True,
740
+ break_on_hyphens=False,
741
+ )
742
+ if not parts:
743
+ wrapped.append("")
744
+ else:
745
+ wrapped.extend(parts)
746
+
747
+ tail = wrapped[-10:]
748
+ # Always show at least one line once thinking starts.
749
+ if not tail:
750
+ tail = [""]
751
+ return [prefix + ln for ln in tail]
752
+
753
+ def _thinking_panel_move_to_top() -> None:
754
+ nonlocal thinking_panel_lines
755
+ if thinking_panel_lines > 1:
756
+ sys.stdout.write(f"\x1b[{thinking_panel_lines - 1}A")
757
+
758
+ def _thinking_panel_render(text: str) -> None:
759
+ nonlocal thinking_panel_active, thinking_panel_lines
760
+ lines = _thinking_panel_format(text)
761
+
762
+ if not thinking_panel_active:
763
+ # Allocate space for the panel.
764
+ sys.stdout.write("\n")
765
+ thinking_panel_active = True
766
+ thinking_panel_lines = 1
767
+
768
+ _thinking_panel_move_to_top()
769
+
770
+ # Clear the old panel area.
771
+ for i in range(thinking_panel_lines):
772
+ sys.stdout.write("\r\x1b[2K")
773
+ if i < thinking_panel_lines - 1:
774
+ sys.stdout.write("\n")
775
+ _thinking_panel_move_to_top()
776
+
777
+ # Draw the new panel.
778
+ for i, ln in enumerate(lines):
779
+ sys.stdout.write("\r\x1b[2K" + ln)
780
+ if i < len(lines) - 1:
781
+ sys.stdout.write("\n")
782
+
783
+ thinking_panel_lines = len(lines)
784
+ sys.stdout.flush()
785
+
786
+ def _thinking_panel_clear() -> None:
787
+ nonlocal thinking_panel_active, thinking_panel_lines, thinking_start_time, thinking_end_time
788
+ if not thinking_panel_active:
789
+ return
790
+
791
+ _thinking_panel_move_to_top()
792
+ for i in range(thinking_panel_lines):
793
+ sys.stdout.write("\r\x1b[2K")
794
+ if i < thinking_panel_lines - 1:
795
+ sys.stdout.write("\n")
796
+ _thinking_panel_move_to_top()
797
+
798
+ thinking_panel_active = False
799
+ thinking_panel_lines = 0
800
+
801
+ # Show thinking duration summary if we have timing info
802
+ if thinking_start_time is not None and thinking_end_time is not None:
803
+ duration = thinking_end_time - thinking_start_time
804
+ if duration >= 60:
805
+ minutes = int(duration // 60)
806
+ seconds = duration % 60
807
+ sys.stdout.write(f"[thinking complete] duration: {minutes} minute{'s' if minutes != 1 else ''} {seconds:.1f} seconds\n")
808
+ else:
809
+ sys.stdout.write(f"[thinking complete] duration: {duration:.1f} seconds\n")
810
+
811
+ sys.stdout.flush()
812
+
813
+ def _answer_start_if_needed() -> None:
814
+ nonlocal started_answer
815
+ if not started_answer:
816
+ _thinking_panel_clear()
817
+ print("assistant: ", end="", flush=True)
818
+ started_answer = True
819
+
820
+ gen = client.request_sse("POST", "/v1/chat/completions", payload=payload, timeout_s=3600.0)
821
+ try:
822
+ for event in gen:
823
+ if isinstance(event, dict) and "error" in event:
824
+ err = event.get("error")
825
+ msg = err.get("message") if isinstance(err, dict) else str(err)
826
+ raise ChatReplError(str(msg))
827
+
828
+ if not isinstance(event, dict):
829
+ continue
830
+
831
+ choices = event.get("choices")
832
+ if isinstance(choices, list) and choices:
833
+ ch0 = choices[0]
834
+ if isinstance(ch0, dict):
835
+ delta = ch0.get("delta") if isinstance(ch0.get("delta"), dict) else {}
836
+ if isinstance(delta, dict):
837
+ thinking = delta.get("thinking")
838
+ if isinstance(thinking, str) and thinking:
839
+ saw_thinking_delta = True
840
+ if ttft_s is None:
841
+ ttft_s = time.monotonic() - started
842
+
843
+ buf = thinking
844
+ while buf:
845
+ if not in_think:
846
+ start_idx = buf.find("<think>")
847
+ if start_idx == -1:
848
+ # If server doesn't send tags, ignore stray thinking outside think.
849
+ break
850
+ buf = buf[start_idx + len("<think>") :]
851
+ in_think = True
852
+ thinking_accum = ""
853
+ if thinking_start_time is None:
854
+ thinking_start_time = time.monotonic()
855
+ _thinking_panel_render(thinking_accum)
856
+ continue
857
+
858
+ end_idx = buf.find("</think>")
859
+ if end_idx == -1:
860
+ thinking_accum += buf
861
+ buf = ""
862
+ _thinking_panel_render(thinking_accum)
863
+ break
864
+
865
+ thinking_accum += buf[:end_idx]
866
+ buf = buf[end_idx + len("</think>") :]
867
+ if thinking_start_time is not None:
868
+ thinking_end_time = time.monotonic()
869
+ _thinking_panel_clear()
870
+ in_think = False
871
+ break
872
+
873
+ content = delta.get("content")
874
+ if isinstance(content, str) and content:
875
+ if ttft_s is None:
876
+ ttft_s = time.monotonic() - started
877
+
878
+ # If the server is streaming thinking deltas separately, don't run the
879
+ # fallback <think>-tag parser on content.
880
+ if not enable_thinking_ui or saw_thinking_delta:
881
+ _answer_start_if_needed()
882
+ sys.stdout.write(content)
883
+ sys.stdout.flush()
884
+ else:
885
+ # Stream thinking live, then clear it once </think> arrives.
886
+ content_buf += content
887
+ while content_buf:
888
+ if in_think:
889
+ end_idx = content_buf.find("</think>")
890
+ if end_idx == -1:
891
+ thinking_accum += content_buf
892
+ content_buf = ""
893
+ _thinking_panel_render(thinking_accum)
894
+ break
895
+
896
+ thinking_accum += content_buf[:end_idx]
897
+ content_buf = content_buf[end_idx + len("</think>") :]
898
+ in_think = False
899
+ if thinking_start_time is not None:
900
+ thinking_end_time = time.monotonic()
901
+ _thinking_panel_clear()
902
+ continue
903
+
904
+ start_idx = content_buf.find("<think>")
905
+ if start_idx == -1:
906
+ _answer_start_if_needed()
907
+ sys.stdout.write(content_buf)
908
+ sys.stdout.flush()
909
+ content_buf = ""
910
+ break
911
+
912
+ # Emit any prelude before <think> as answer.
913
+ if start_idx > 0:
914
+ _answer_start_if_needed()
915
+ sys.stdout.write(content_buf[:start_idx])
916
+ sys.stdout.flush()
917
+
918
+ content_buf = content_buf[start_idx + len("<think>") :]
919
+ in_think = True
920
+ thinking_accum = ""
921
+ if thinking_start_time is None:
922
+ thinking_start_time = time.monotonic()
923
+ _thinking_panel_render(thinking_accum)
924
+ continue
925
+
926
+ tool_calls = delta.get("tool_calls")
927
+ if tool_calls is not None:
928
+ if ttft_s is None:
929
+ ttft_s = time.monotonic() - started
930
+ _thinking_panel_clear()
931
+ sys.stdout.write(
932
+ f"\n<tool_calls {len(tool_calls) if isinstance(tool_calls, list) else 1}>\n"
933
+ )
934
+ sys.stdout.flush()
935
+
936
+ fr = ch0.get("finish_reason")
937
+ if fr is not None:
938
+ finish_reason = str(fr)
939
+
940
+ # Terminal chunk may include usage/timing.
941
+ if event.get("usage") is not None and isinstance(event.get("usage"), dict):
942
+ usage = event["usage"]
943
+ if event.get("x_superlinear_timing") is not None and isinstance(event.get("x_superlinear_timing"), dict):
944
+ timing = event["x_superlinear_timing"]
945
+ except KeyboardInterrupt:
946
+ try:
947
+ gen.close()
948
+ except Exception:
949
+ pass
950
+ raise
951
+ finally:
952
+ try:
953
+ gen.close()
954
+ except Exception:
955
+ pass
956
+
957
+ # Always clear any on-screen thinking UI, even on errors.
958
+ _thinking_panel_clear()
959
+ if in_think and thinking_start_time is not None and thinking_end_time is None:
960
+ sys.stdout.write("[thinking incomplete] (no </think> received before stream ended)\n")
961
+ sys.stdout.flush()
962
+
963
+ ended = time.monotonic()
964
+ sys.stdout.write("\n")
965
+ sys.stdout.flush()
966
+
967
+ stats = TurnStats()
968
+ stats.finish_reason = finish_reason
969
+ stats.ttft_s = ttft_s
970
+ stats.total_s = max(ended - started, 0.0)
971
+
972
+ if usage is not None:
973
+ pt = usage.get("prompt_tokens")
974
+ ct = usage.get("completion_tokens")
975
+ if isinstance(pt, int):
976
+ stats.prompt_tokens = pt
977
+ if isinstance(ct, int):
978
+ stats.completion_tokens = ct
979
+
980
+ if timing is not None:
981
+ prefill_s = timing.get("prefill_s")
982
+ decode_s = timing.get("decode_s")
983
+ total_s = timing.get("total_s")
984
+ tok_per_s = timing.get("tok_per_s")
985
+ if isinstance(prefill_s, (float, int)):
986
+ stats.server_prefill_s = float(prefill_s)
987
+ if isinstance(decode_s, (float, int)):
988
+ stats.server_decode_s = float(decode_s)
989
+ if isinstance(total_s, (float, int)):
990
+ stats.server_total_s = float(total_s)
991
+ if isinstance(tok_per_s, (float, int)):
992
+ stats.tok_per_s = float(tok_per_s)
993
+
994
+ return stats
995
+
996
+
997
def chat_repl(
    *,
    url: str,
    new: bool = False,
    session: str | None = None,
    max_seq_len: int | None = 1_048_576,
    think_budget: int | None = 8192,
    temperature: float = 0.1,
    top_p: float = 0.95,
    system_prompt: str | None = DEFAULT_SYSTEM_PROMPT,
) -> int:
    """Run the interactive chat REPL until the user exits.

    Session selection:
      * ``new=True``  -> always create a fresh session.
      * ``session``   -> resume that session id if the server has it, else create it.
      * otherwise     -> resume the active session recorded in local state, or
                         create a fresh one when there is none / it no longer exists.

    Exactly one ``SessionLock`` is held at any time for the current session; it
    is swapped whenever a command (``/new``, ``/rm``, ``/switch``, ``/load``)
    changes the current session, and released on exit.

    Args:
        url: Base URL passed to ``SuperlinearClient``.
        new: Start a brand-new session instead of resuming one.
        session: Explicit session id to resume or create.
        max_seq_len: Forwarded to ``_create_session`` and, as the minimum, to
            ``_maybe_resize_session``.
        think_budget: Forwarded to ``_stream_chat_turn``.
        temperature: Forwarded to ``_stream_chat_turn``.
        top_p: Forwarded to ``_stream_chat_turn``.
        system_prompt: Sent with the first user turn only, and only when the
            session is verified to be empty.

    Returns:
        0 on a normal exit (EOF or ``/exit``), 1 when the server is unreachable
        or an unrecoverable error occurs, 2 when the selected session is already
        locked by another REPL.
    """
    _setup_readline_history()
    _setup_completer()
    client = SuperlinearClient(base_url=url, timeout_s=3600.0)
    try:
        _ensure_reachable(client)
    except ChatReplError as exc:
        print(str(exc), file=sys.stderr)
        return 1

    state = load_state()

    resumed = False
    last_stats: TurnStats | None = None
    should_send_system_prompt = False

    # True only when the id was read from persisted state (as opposed to being
    # freshly generated). Only such ids can meaningfully be "missing" on the server.
    from_saved_state = False
    if new:
        session_id = _new_session_id(prefix="chat")
    elif session:
        session_id = session
    else:
        session_id = state.active_chat_session_id
        from_saved_state = bool(session_id)
        if not session_id:
            session_id = _new_session_id(prefix="chat")
    lock = SessionLock(session_id=session_id, kind="chat", label="spl chat")

    def _needs_system_prompt() -> bool:
        """Return True only when the current session is verifiably empty."""
        if not system_prompt:
            return False
        try:
            msg_count, cache_pos = _get_session_counters(client=client, session_id=session_id)
        except ChatReplError:
            # If we can't verify, default to not sending to avoid KV corruption.
            return False
        return msg_count <= 0 and cache_pos <= 0

    def _parse_count(args: list[str], default: int, usage: str) -> int | None:
        """Parse an optional single integer argument; print ``usage`` and return None if invalid."""
        if not args:
            return default
        if len(args) == 1:
            try:
                return int(args[0])
            except ValueError:
                pass
        print(usage, file=sys.stderr)
        return None

    def _start_fresh_session() -> None:
        """Create and lock a brand-new session and make it current; on failure report and keep the old one."""
        nonlocal lock, session_id, last_stats, should_send_system_prompt
        next_id = _new_session_id(prefix="chat")
        next_lock = SessionLock(session_id=next_id, kind="chat", label="spl chat")
        try:
            next_lock.acquire()
        except AlreadyLockedError:
            print(f"error: session is already open: {next_id}", file=sys.stderr)
            return
        try:
            _create_session(client, next_id, max_seq_len=max_seq_len)
            _maybe_resize_session(client, next_id, min_max_seq_len=max_seq_len)
        except Exception as exc:
            next_lock.release()
            print(str(exc), file=sys.stderr)
            return
        # Only give up the old lock once the replacement session really exists.
        lock.release()
        lock = next_lock
        session_id = next_id
        _set_active_session(state, session_id)
        state.chat_checkpoints.pop(session_id, None)
        state.chat_checkpoint_snapshot_id = None
        save_state(state)
        last_stats = None
        _banner(url=client.base_url, session_id=session_id, resumed=False)
        should_send_system_prompt = bool(system_prompt)

    try:
        lock.acquire()
    except AlreadyLockedError as exc:
        pid = exc.info.pid
        label = exc.info.label or exc.info.kind or "spl"
        print(
            f"error: session is already open in another REPL (session_id={session_id} pid={pid} label={label}).",
            file=sys.stderr,
        )
        if not new:
            print("next steps: `spl chat --new` or choose a different `--session`.", file=sys.stderr)
        return 2

    try:
        if new:
            _create_session(client, session_id, max_seq_len=max_seq_len)
            _maybe_resize_session(client, session_id, min_max_seq_len=max_seq_len)
            _set_active_session(state, session_id)
            state.chat_checkpoints.pop(session_id, None)
            state.chat_checkpoint_snapshot_id = None
            save_state(state)
        elif session:
            if _session_exists(client, session_id):
                resumed = True
                _maybe_resize_session(client, session_id, min_max_seq_len=max_seq_len)
            else:
                _create_session(client, session_id, max_seq_len=max_seq_len)
                _maybe_resize_session(client, session_id, min_max_seq_len=max_seq_len)
            _set_active_session(state, session_id)
            save_state(state)
        elif from_saved_state and _session_exists(client, session_id):
            resumed = True
            _maybe_resize_session(client, session_id, min_max_seq_len=max_seq_len)
        else:
            if from_saved_state:
                # The remembered session is gone: tell the user, drop its lock,
                # and move on to a freshly generated id. A freshly generated id
                # (no saved session) is used as-is, without a misleading note.
                print(f"note: session not found on server: {session_id} (starting a new one)")
                lock.release()
                session_id = _new_session_id(prefix="chat")
                lock = SessionLock(session_id=session_id, kind="chat", label="spl chat")
                lock.acquire()
            _create_session(client, session_id, max_seq_len=max_seq_len)
            _maybe_resize_session(client, session_id, min_max_seq_len=max_seq_len)
            _set_active_session(state, session_id)
            state.chat_checkpoints.pop(session_id, None)
            state.chat_checkpoint_snapshot_id = None
            save_state(state)

        _banner(url=client.base_url, session_id=session_id, resumed=resumed)

        # Only send the system prompt when the session is truly empty.
        # Sending a system prompt on an already-prefilled KV cache would be a prefix edit and can
        # corrupt session append-from behavior.
        should_send_system_prompt = _needs_system_prompt()

        while True:
            prompt = f"spl(chat:{session_id})> "
            try:
                raw = input(prompt)
            except EOFError:
                print()
                return 0
            except KeyboardInterrupt:
                print("^C")
                continue

            raw = _coalesce_pasted_lines(raw)

            line = raw.strip()
            if not line:
                continue

            # Guard against accidental paste leftovers from transcripts.
            # These frequently show up as a single role label on its own line
            # (e.g. the user copied/pasted a block that included "assistant:").
            non_empty_lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
            if len(non_empty_lines) == 1:
                lone = non_empty_lines[0]
                # Normalize aggressively: drop punctuation, spaces, and control chars.
                alpha = "".join(ch for ch in lone.lower() if "a" <= ch <= "z")
                if alpha in {"assistant", "user", "system"}:
                    continue

            # Commands are single-line only. If the user pasted a block that starts
            # with '/', treat it as a normal message.
            if line.startswith("/") and "\n" not in raw:
                cmdline = line[1:].strip()
                try:
                    parts = shlex.split(cmdline)
                except ValueError as exc:
                    print(f"parse error: {exc}", file=sys.stderr)
                    continue
                if not parts:
                    continue
                cmd, args = parts[0], parts[1:]

                if cmd in {"exit", "quit"}:
                    if args in [["--clean"], ["-c"]]:
                        try:
                            client.request_json("DELETE", f"/v1/sessions/{session_id}", timeout_s=10.0)
                            print(f"removed session_id={session_id}")
                        except HttpError:
                            pass  # Best effort
                    return 0
                if cmd == "help":
                    _cmd_help()
                    continue
                if cmd == "clear":
                    print("\033[2J\033[H", end="", flush=True)
                    continue
                if cmd == "head":
                    n = _parse_count(args, 10, "usage: /head [n]")
                    if n is not None:
                        try:
                            _cmd_head(client=client, session_id=session_id, limit=n)
                        except ChatReplError as exc:
                            print(str(exc), file=sys.stderr)
                    continue
                if cmd == "tail":
                    n = _parse_count(args, 10, "usage: /tail [n]")
                    if n is not None:
                        try:
                            _cmd_tail(client=client, session_id=session_id, limit=n)
                        except ChatReplError as exc:
                            print(str(exc), file=sys.stderr)
                    continue
                if cmd == "show":
                    if len(args) != 1:
                        print("usage: /show <i>", file=sys.stderr)
                        continue
                    try:
                        i = int(args[0])
                    except ValueError:
                        print("usage: /show <i>", file=sys.stderr)
                        continue
                    try:
                        _cmd_show(client=client, session_id=session_id, index=i)
                    except ChatReplError as exc:
                        print(str(exc), file=sys.stderr)
                    continue
                if cmd == "history":
                    if args in [["clear"], ["--clear"], ["-c"]]:
                        _cmd_history_clear()
                        continue
                    n = _parse_count(args, 20, "usage: /history [n] | /history clear")
                    if n is not None:
                        _cmd_history(n)
                    continue
                if cmd == "ls":
                    try:
                        _cmd_ls(client=client, current_session_id=session_id)
                    except ChatReplError as exc:
                        print(str(exc), file=sys.stderr)
                    continue
                if cmd == "rm":
                    try:
                        if not args:
                            # /rm with no args = delete current session
                            removed_current = _cmd_rm(
                                client=client,
                                target_ids=[session_id],
                                current_session_id=session_id,
                            )
                        elif args == ["--all"]:
                            removed_current = _cmd_rm_all(
                                client=client,
                                current_session_id=session_id,
                            )
                        else:
                            removed_current = _cmd_rm(
                                client=client,
                                target_ids=args,
                                current_session_id=session_id,
                            )
                    except ChatReplError as exc:
                        print(str(exc), file=sys.stderr)
                        continue
                    if removed_current:
                        # Current session removed; create a new one
                        _start_fresh_session()
                    continue
                if cmd == "stats":
                    if last_stats is None:
                        print("no stats yet")
                    else:
                        print(_stats_detail(last_stats))
                    continue
                if cmd == "new":
                    clean = args == ["--clean"]
                    if args and not clean:
                        print("usage: /new [--clean]", file=sys.stderr)
                        continue
                    # If --clean, delete current session first
                    if clean:
                        try:
                            client.request_json("DELETE", f"/v1/sessions/{session_id}", timeout_s=10.0)
                            print(f"removed session_id={session_id}")
                        except HttpError as exc:
                            if exc.status_code != 404:
                                print(f"warning: failed to delete current session: {exc}", file=sys.stderr)
                    _start_fresh_session()
                    continue
                if cmd == "switch":
                    if len(args) != 1:
                        print("usage: /switch <session_id>", file=sys.stderr)
                        continue
                    target_id = args[0]
                    next_lock = SessionLock(session_id=target_id, kind="chat", label="spl chat")
                    try:
                        next_lock.acquire()
                    except AlreadyLockedError as exc:
                        print(
                            f"error: session is already open in another REPL (session_id={target_id} pid={exc.info.pid}).",
                            file=sys.stderr,
                        )
                        continue
                    try:
                        target_id = _cmd_switch(client=client, state=state, session_id=target_id)
                    except ChatReplError as exc:
                        next_lock.release()
                        print(str(exc), file=sys.stderr)
                        continue
                    lock.release()
                    lock = next_lock
                    session_id = target_id
                    last_stats = None
                    should_send_system_prompt = _needs_system_prompt()
                    continue
                if cmd == "load":
                    if len(args) != 1:
                        print("usage: /load <snapshot_id>", file=sys.stderr)
                        continue
                    snap = args[0]

                    target_id = _new_session_id(prefix="chat")
                    next_lock = SessionLock(session_id=target_id, kind="chat", label="spl chat")
                    try:
                        next_lock.acquire()
                    except AlreadyLockedError as exc:
                        print(
                            f"error: session is already open in another REPL (session_id={target_id} pid={exc.info.pid}).",
                            file=sys.stderr,
                        )
                        continue

                    try:
                        loaded_id = _cmd_load(
                            client=client,
                            state=state,
                            snapshot_id=snap,
                            as_session_id=target_id,
                        )
                    except (ChatReplError, HttpError) as exc:
                        next_lock.release()
                        print(str(exc), file=sys.stderr)
                        continue

                    lock.release()
                    lock = next_lock
                    session_id = loaded_id
                    last_stats = None
                    _banner(url=client.base_url, session_id=session_id, resumed=False)
                    # Re-evaluate for the loaded session: a flag left over from an
                    # empty previous session must not cause the system prompt to be
                    # sent into a session restored with existing content.
                    should_send_system_prompt = _needs_system_prompt()
                    continue
                if cmd == "save":
                    title: str | None = None
                    if args:
                        title = " ".join(args).strip() or None
                    try:
                        _cmd_save(
                            client=client,
                            state=state,
                            session_id=session_id,
                            title=title,
                            archive=True,  # Always keep snapshots (no auto-delete)
                        )
                    except (ChatReplError, HttpError) as exc:
                        print(str(exc), file=sys.stderr)
                    continue

                print(f"unknown command: /{cmd}", file=sys.stderr)
                continue

            # Regular user message.
            try:
                stats = _stream_chat_turn(
                    client=client,
                    session_id=session_id,
                    user_text=line,
                    think_budget=think_budget,
                    temperature=temperature,
                    top_p=top_p,
                    system_prompt=system_prompt if should_send_system_prompt else None,
                )
                should_send_system_prompt = False
                last_stats = stats
                footer = _format_metrics(stats)
                if footer:
                    print(footer)
            except KeyboardInterrupt:
                # NOTE(review): should_send_system_prompt is left untouched here;
                # confirm the server discards a cancelled turn, otherwise the next
                # turn would re-send the system prompt.
                print("\n(cancelled)")
                continue
            except ChatReplError as exc:
                print(str(exc), file=sys.stderr)
                continue
            except HttpError as exc:
                if exc.status_code == 429:
                    print("Server is busy (429). Try again.", file=sys.stderr)
                else:
                    print(str(exc), file=sys.stderr)
                continue
    except (ChatReplError, HttpError) as exc:
        print(str(exc), file=sys.stderr)
        return 1
    except Exception as exc:  # pragma: no cover
        print(f"error: {exc}", file=sys.stderr)
        return 1
    finally:
        # Best-effort: whichever lock is current at exit must not outlive the REPL.
        try:
            lock.release()
        except Exception:
            pass
1440
+
1441
+
1442
+ def _coalesce_pasted_lines(first_line: str) -> str:
1443
+ """Best-effort: if the user pasted multiple lines, treat them as one message.
1444
+
1445
+ Python's built-in `input()` reads only one line. When a user pastes a block
1446
+ containing newlines, the remaining lines are immediately available on stdin
1447
+ and would otherwise be consumed as separate turns.
1448
+ """
1449
+
1450
+ if not sys.stdin.isatty():
1451
+ return first_line
1452
+
1453
+ def _strip_paste_markers(text: str) -> str:
1454
+ # Some terminals use bracketed paste mode. If those markers leak into
1455
+ # stdin, remove them.
1456
+ return text.replace("\x1b[200~", "").replace("\x1b[201~", "")
1457
+
1458
+ lines = [_strip_paste_markers(first_line.rstrip("\r\n"))]
1459
+
1460
+ # Hard limits avoid accidental runaway reads.
1461
+ max_lines = 128
1462
+ max_chars = 64_000
1463
+ total_chars = len(lines[0])
1464
+
1465
+ # Fast path: if nothing else is immediately queued, don't add latency.
1466
+ try:
1467
+ ready, _, _ = select.select([sys.stdin], [], [], 0.0)
1468
+ except Exception:
1469
+ return lines[0]
1470
+ if not ready:
1471
+ return lines[0]
1472
+
1473
+ # Pasted blocks can arrive over a few scheduling ticks. Once we detect that
1474
+ # stdin is readable, keep draining lines until a short quiet period elapses
1475
+ # (or we hit limits).
1476
+ deadline = time.monotonic() + 0.10
1477
+ quiet_s = 0.02
1478
+
1479
+ while len(lines) < max_lines and total_chars < max_chars:
1480
+ timeout = max(0.0, min(quiet_s, deadline - time.monotonic()))
1481
+ if timeout <= 0.0:
1482
+ break
1483
+ try:
1484
+ ready, _, _ = select.select([sys.stdin], [], [], timeout)
1485
+ except Exception:
1486
+ break
1487
+ if not ready:
1488
+ break
1489
+
1490
+ extra = sys.stdin.readline()
1491
+ if not extra:
1492
+ break
1493
+ extra = _strip_paste_markers(extra.rstrip("\r\n"))
1494
+ lines.append(extra)
1495
+ total_chars += len(extra) + 1
1496
+
1497
+ return "\n".join(lines)