zwarm-3.10.1-py3-none-any.whl → zwarm-3.10.3-py3-none-any.whl
This diff compares the contents of two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- zwarm/cli/interactive.py +2 -2
- zwarm/cli/main.py +3 -5
- zwarm/cli/pilot.py +5 -13
- zwarm/core/registry.py +2 -20
- zwarm/orchestrator.py +29 -0
- zwarm/prompts/orchestrator.py +95 -136
- zwarm/prompts/pilot.py +18 -8
- zwarm/sessions/manager.py +2 -2
- {zwarm-3.10.1.dist-info → zwarm-3.10.3.dist-info}/METADATA +22 -15
- {zwarm-3.10.1.dist-info → zwarm-3.10.3.dist-info}/RECORD +12 -12
- {zwarm-3.10.1.dist-info → zwarm-3.10.3.dist-info}/WHEEL +0 -0
- {zwarm-3.10.1.dist-info → zwarm-3.10.3.dist-info}/entry_points.txt +0 -0
zwarm/cli/interactive.py
CHANGED
|
@@ -269,10 +269,10 @@ def cmd_ls(manager):
|
|
|
269
269
|
task_preview = s.task[:23] + "..." if len(s.task) > 26 else s.task
|
|
270
270
|
updated = time_ago(s.updated_at)
|
|
271
271
|
|
|
272
|
-
# Short model name (e.g., "gpt-5.
|
|
272
|
+
# Short model name (e.g., "gpt-5.2-codex" -> "5.2-codex")
|
|
273
273
|
model_short = s.model or "?"
|
|
274
274
|
if "codex" in model_short.lower():
|
|
275
|
-
# Extract codex variant: gpt-5.
|
|
275
|
+
# Extract codex variant: gpt-5.2-codex -> 5.2-codex
|
|
276
276
|
parts = model_short.split("-")
|
|
277
277
|
codex_idx = next((i for i, p in enumerate(parts) if "codex" in p.lower()), -1)
|
|
278
278
|
if codex_idx >= 0:
|
zwarm/cli/main.py
CHANGED
|
@@ -838,19 +838,17 @@ def init(
|
|
|
838
838
|
console.print(" [dim]These control the underlying Codex CLI that runs executor sessions[/]\n")
|
|
839
839
|
|
|
840
840
|
console.print(" Available models:")
|
|
841
|
-
console.print(" [cyan]1[/] gpt-5.2-codex [dim]- GPT-5.2 Codex, balanced (Recommended)[/]")
|
|
841
|
+
console.print(" [cyan]1[/] gpt-5.2-codex [dim]- GPT-5.2 Codex, fast and balanced (Recommended)[/]")
|
|
842
842
|
console.print(" [cyan]2[/] gpt-5.2 [dim]- GPT-5.2 with extended reasoning[/]")
|
|
843
|
-
console.print(" [cyan]3[/] gpt-5.1-codex [dim]- GPT-5.1 Codex (legacy)[/]")
|
|
844
843
|
|
|
845
844
|
model_choice = typer.prompt(
|
|
846
|
-
" Select model (1-
|
|
845
|
+
" Select model (1-2)",
|
|
847
846
|
default="1",
|
|
848
847
|
type=str,
|
|
849
848
|
)
|
|
850
849
|
model_map = {
|
|
851
850
|
"1": "gpt-5.2-codex",
|
|
852
851
|
"2": "gpt-5.2",
|
|
853
|
-
"3": "gpt-5.1-codex",
|
|
854
852
|
}
|
|
855
853
|
codex_model = model_map.get(model_choice, model_choice)
|
|
856
854
|
if model_choice not in model_map:
|
|
@@ -1668,7 +1666,7 @@ def session_start(
|
|
|
1668
1666
|
$ zwarm session start "Fix the bug in auth.py"
|
|
1669
1667
|
|
|
1670
1668
|
[dim]# With specific model[/]
|
|
1671
|
-
$ zwarm session start "Refactor the API" --model gpt-5.
|
|
1669
|
+
$ zwarm session start "Refactor the API" --model gpt-5.2-codex
|
|
1672
1670
|
|
|
1673
1671
|
[dim]# Web search is always available[/]
|
|
1674
1672
|
$ zwarm session start "Research latest OAuth2 best practices"
|
zwarm/cli/pilot.py
CHANGED
|
@@ -83,22 +83,14 @@ class ChoogingSpinner:
|
|
|
83
83
|
# Context window sizes for different models (in tokens)
|
|
84
84
|
# These are for the ORCHESTRATOR LLM, not the executors
|
|
85
85
|
MODEL_CONTEXT_WINDOWS = {
|
|
86
|
-
# OpenAI models
|
|
86
|
+
# OpenAI models (via Codex CLI)
|
|
87
87
|
"gpt-5.2-codex": 200_000,
|
|
88
88
|
"gpt-5.2": 200_000,
|
|
89
|
-
|
|
90
|
-
"gpt-5.1-codex-mini": 200_000,
|
|
91
|
-
"gpt-5": 200_000,
|
|
92
|
-
"gpt-5-mini": 200_000,
|
|
93
|
-
"o3": 200_000,
|
|
94
|
-
"o3-mini": 200_000,
|
|
95
|
-
# Claude models (if used as orchestrator)
|
|
96
|
-
"claude-sonnet": 200_000,
|
|
97
|
-
"claude-opus": 200_000,
|
|
98
|
-
"claude-haiku": 200_000,
|
|
89
|
+
# Claude models (via Claude CLI)
|
|
99
90
|
"sonnet": 200_000,
|
|
100
91
|
"opus": 200_000,
|
|
101
|
-
"
|
|
92
|
+
"claude-sonnet": 200_000,
|
|
93
|
+
"claude-opus": 200_000,
|
|
102
94
|
# Fallback
|
|
103
95
|
"default": 128_000,
|
|
104
96
|
}
|
|
@@ -1080,7 +1072,7 @@ def _run_pilot_repl(
|
|
|
1080
1072
|
renderer.status("")
|
|
1081
1073
|
|
|
1082
1074
|
# Get model from orchestrator if available
|
|
1083
|
-
model = "gpt-5.
|
|
1075
|
+
model = "gpt-5.2-codex" # Default
|
|
1084
1076
|
if hasattr(orchestrator, "lm") and hasattr(orchestrator.lm, "model"):
|
|
1085
1077
|
model = orchestrator.lm.model
|
|
1086
1078
|
elif hasattr(orchestrator, "config"):
|
zwarm/core/registry.py
CHANGED
|
@@ -23,7 +23,7 @@ class ModelInfo:
|
|
|
23
23
|
"""Complete information about an LLM model."""
|
|
24
24
|
|
|
25
25
|
# Identity
|
|
26
|
-
canonical: str # Full model name (e.g., "gpt-5.
|
|
26
|
+
canonical: str # Full model name (e.g., "gpt-5.2-codex")
|
|
27
27
|
adapter: str # "codex" or "claude"
|
|
28
28
|
aliases: list[str] = field(default_factory=list) # Short names
|
|
29
29
|
|
|
@@ -80,24 +80,6 @@ MODELS: list[ModelInfo] = [
|
|
|
80
80
|
cached_input_per_million=0.20,
|
|
81
81
|
description="GPT-5.2 with extended reasoning (xhigh)",
|
|
82
82
|
),
|
|
83
|
-
ModelInfo(
|
|
84
|
-
canonical="gpt-5.1-codex-mini",
|
|
85
|
-
adapter="codex",
|
|
86
|
-
aliases=["codex-mini", "mini", "5.1-mini"],
|
|
87
|
-
input_per_million=0.25,
|
|
88
|
-
output_per_million=2.00,
|
|
89
|
-
cached_input_per_million=0.025,
|
|
90
|
-
description="Fast, cost-effective coding model",
|
|
91
|
-
),
|
|
92
|
-
ModelInfo(
|
|
93
|
-
canonical="gpt-5.1-codex",
|
|
94
|
-
adapter="codex",
|
|
95
|
-
aliases=["codex", "codex-full", "5.1"],
|
|
96
|
-
input_per_million=1.25,
|
|
97
|
-
output_per_million=10.00,
|
|
98
|
-
cached_input_per_million=0.125,
|
|
99
|
-
description="Full Codex model with extended reasoning",
|
|
100
|
-
),
|
|
101
83
|
# -------------------------------------------------------------------------
|
|
102
84
|
# Anthropic Claude Models (via `claude` CLI)
|
|
103
85
|
# -------------------------------------------------------------------------
|
|
@@ -159,7 +141,7 @@ def resolve_model(name: str) -> ModelInfo | None:
|
|
|
159
141
|
if name_lower in _BY_ALIAS:
|
|
160
142
|
return _BY_ALIAS[name_lower]
|
|
161
143
|
|
|
162
|
-
# Prefix match (e.g., "gpt-5.
|
|
144
|
+
# Prefix match (e.g., "gpt-5.2-codex-2026-01" -> "gpt-5.2-codex")
|
|
163
145
|
for canonical, model in _BY_CANONICAL.items():
|
|
164
146
|
if name_lower.startswith(canonical):
|
|
165
147
|
return model
|
zwarm/orchestrator.py
CHANGED
|
@@ -151,6 +151,35 @@ class Orchestrator(YamlAgent):
|
|
|
151
151
|
"""Access state manager."""
|
|
152
152
|
return self._state
|
|
153
153
|
|
|
154
|
+
def getToolDefinitions(self) -> tuple[list[dict], dict]:
|
|
155
|
+
"""
|
|
156
|
+
Override to filter out unwanted tools from YamlAgent.
|
|
157
|
+
|
|
158
|
+
Removes:
|
|
159
|
+
- list_agents: No subagents in zwarm
|
|
160
|
+
- run_agent: No subagents in zwarm
|
|
161
|
+
|
|
162
|
+
Keeps exit() since orchestrator needs to signal completion.
|
|
163
|
+
"""
|
|
164
|
+
definitions, callables = super().getToolDefinitions()
|
|
165
|
+
|
|
166
|
+
unwanted = {"list_agents", "run_agent"}
|
|
167
|
+
|
|
168
|
+
# Filter definitions - handle both OpenAI formats
|
|
169
|
+
filtered_defs = []
|
|
170
|
+
for td in definitions:
|
|
171
|
+
name = td.get("name") or td.get("function", {}).get("name")
|
|
172
|
+
if name not in unwanted:
|
|
173
|
+
filtered_defs.append(td)
|
|
174
|
+
|
|
175
|
+
# Filter callables
|
|
176
|
+
filtered_callables = {
|
|
177
|
+
k: v for k, v in callables.items()
|
|
178
|
+
if k not in unwanted
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
return filtered_defs, filtered_callables
|
|
182
|
+
|
|
154
183
|
def get_executor_usage(self) -> dict[str, int]:
|
|
155
184
|
"""Get aggregated token usage from executor sessions."""
|
|
156
185
|
return self._executor_usage
|
zwarm/prompts/orchestrator.py
CHANGED
|
@@ -1,213 +1,172 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Orchestrator system prompt.
|
|
3
3
|
|
|
4
|
-
This prompt defines the behavior of the zwarm orchestrator -
|
|
5
|
-
|
|
4
|
+
This prompt defines the behavior of the zwarm orchestrator - an autonomous
|
|
5
|
+
principal engineer that coordinates executor agents to complete complex tasks
|
|
6
6
|
with minimal user intervention.
|
|
7
|
+
|
|
8
|
+
Unlike the pilot (interactive), the orchestrator:
|
|
9
|
+
- Runs autonomously to completion
|
|
10
|
+
- Has bash for verification (tests, linters)
|
|
11
|
+
- Has exit() to signal completion
|
|
12
|
+
- Is monitored by watchers
|
|
7
13
|
"""
|
|
8
14
|
|
|
9
15
|
ORCHESTRATOR_SYSTEM_PROMPT = """
|
|
10
|
-
You are
|
|
16
|
+
You are an autonomous orchestrator - a principal engineer who coordinates a team of coding agents to complete complex software projects.
|
|
11
17
|
|
|
12
|
-
|
|
18
|
+
You do NOT write code directly. Ever. You delegate to executor agents, verify their work, and ensure quality. Your role is strategic: planning, delegating, supervising, quality assurance. The executors handle tactical work.
|
|
13
19
|
|
|
14
20
|
---
|
|
15
21
|
|
|
16
|
-
#
|
|
22
|
+
# Your Team
|
|
17
23
|
|
|
18
|
-
You
|
|
24
|
+
You command executor agents - capable coding agents that handle specific tasks. Think of them as skilled but focused developers: you give clear direction, they execute, you verify results.
|
|
19
25
|
|
|
20
|
-
|
|
26
|
+
**Good tasks for executors:**
|
|
27
|
+
- "Implement function X with signature Y in path/to/file.py"
|
|
28
|
+
- "Write tests for module X covering cases A, B, C"
|
|
29
|
+
- "Refactor this function to use {pattern}"
|
|
30
|
+
- "Look up how X works in this codebase"
|
|
21
31
|
|
|
22
|
-
|
|
32
|
+
**Bad tasks:**
|
|
33
|
+
- Vague: "improve the code" (improve how?)
|
|
34
|
+
- Unbounded: "add features" (which features?)
|
|
35
|
+
- Architectural: "redesign the system" (too big, break it down)
|
|
23
36
|
|
|
24
37
|
---
|
|
25
38
|
|
|
26
|
-
#
|
|
27
|
-
|
|
28
|
-
Your primary tools are for delegation and verification:
|
|
29
|
-
|
|
30
|
-
**delegate(task, adapter="codex", model=None, working_dir=None)** - Start a new executor session. Returns immediately with session_id - all sessions run async.
|
|
31
|
-
- `task`: Clear, specific description of what you want done
|
|
32
|
-
- `adapter`: "codex" (default, fast) or "claude" (powerful, complex reasoning)
|
|
33
|
-
- `model`: Override model (e.g., "gpt-5.1-codex-mini", "sonnet")
|
|
34
|
-
- `working_dir`: Directory for executor to work in
|
|
35
|
-
|
|
36
|
-
**converse(session_id, message)** - Continue an existing conversation. Provide feedback, ask for changes, or guide complex work. Returns immediately - poll for response.
|
|
37
|
-
|
|
38
|
-
**peek_session(session_id)** - FAST polling. Returns {status, is_running, latest_message (truncated)}. Use this in polling loops to check if sessions are done.
|
|
39
|
+
# Your Tools
|
|
39
40
|
|
|
40
|
-
**
|
|
41
|
+
**delegate(task, model=None, working_dir=None)** - Start an executor. Returns immediately with session_id.
|
|
42
|
+
- `model`: Just use the name - adapter is auto-detected:
|
|
43
|
+
- `"5.2"` - GPT-5.2 Codex (default, fast, great for code)
|
|
44
|
+
- `"5.2-think"` - GPT-5.2 with extended reasoning
|
|
45
|
+
- `"opus"` - Claude Opus (most capable, complex reasoning)
|
|
46
|
+
- `"sonnet"` - Claude Sonnet (balanced)
|
|
47
|
+
- Use 5.2 for most tasks. Use opus for complex reasoning.
|
|
41
48
|
|
|
42
|
-
**
|
|
49
|
+
**converse(session_id, message)** - Send follow-up to an executor. Returns immediately.
|
|
43
50
|
|
|
44
|
-
**
|
|
51
|
+
**peek_session(session_id)** - Quick poll: {is_running, status}. Use in polling loops.
|
|
45
52
|
|
|
46
|
-
**
|
|
53
|
+
**check_session(session_id)** - Get FULL result. Complete response, tokens, runtime.
|
|
47
54
|
|
|
48
|
-
**
|
|
55
|
+
**get_trajectory(session_id, full=False)** - See what steps the agent took.
|
|
56
|
+
- `full=True`: Complete untruncated details (debugging)
|
|
57
|
+
- `full=False`: Concise summaries (default)
|
|
49
58
|
|
|
50
|
-
**
|
|
59
|
+
**list_sessions(status=None)** - See all executors. `needs_attention=True` = ready for review.
|
|
60
|
+
- `status`: Filter by "running", "completed", "failed", or None for all
|
|
51
61
|
|
|
52
|
-
**
|
|
62
|
+
**end_session(session_id, reason=None, delete=False)** - End an executor.
|
|
63
|
+
- `delete=True`: Remove from list entirely
|
|
53
64
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
# Watchers
|
|
65
|
+
**sleep(seconds)** - Wait before checking. Give executors time (15-60s typical).
|
|
57
66
|
|
|
58
|
-
|
|
67
|
+
**bash(command)** - Run shell commands for VERIFICATION only: tests, type checkers, linters, builds.
|
|
68
|
+
- Do NOT use bash to write code - that's what executors are for.
|
|
59
69
|
|
|
60
|
-
|
|
70
|
+
**exit(message=None)** - Signal task completion. Call when work is done and verified.
|
|
61
71
|
|
|
62
|
-
|
|
63
|
-
- You're spinning or repeating the same actions without making progress
|
|
64
|
-
- You're approaching resource limits (steps, sessions)
|
|
65
|
-
- You're drifting from the original task scope
|
|
66
|
-
- You're making changes without corresponding tests
|
|
67
|
-
|
|
68
|
-
Watcher guidance is not optional advice - treat it as an important course correction. If a watcher tells you to delegate instead of doing work directly, delegate. If a watcher says you're stuck, step back and try a different approach. If a watcher warns about budget limits, prioritize and wrap up.
|
|
69
|
-
|
|
70
|
-
The watchers are on your side. They exist to help you succeed, not to criticize. Heed their guidance promptly.
|
|
72
|
+
NOTE: Do NOT use `list_agents` or `run_agent` - they are not available.
|
|
71
73
|
|
|
72
74
|
---
|
|
73
75
|
|
|
74
|
-
# Async Workflow
|
|
75
|
-
|
|
76
|
-
All executor sessions run asynchronously. delegate() and converse() return immediately - executors work in the background.
|
|
76
|
+
# Async Workflow
|
|
77
77
|
|
|
78
|
-
|
|
78
|
+
All executor sessions run in the background. delegate() and converse() return immediately.
|
|
79
79
|
|
|
80
|
+
**Core pattern:**
|
|
80
81
|
```
|
|
81
|
-
1. delegate(task="
|
|
82
|
+
1. delegate(task, model="5.2") → session_id
|
|
82
83
|
2. sleep(30)
|
|
83
|
-
3. peek_session(
|
|
84
|
-
4. If
|
|
85
|
-
5. check_session(
|
|
84
|
+
3. peek_session(id) → done?
|
|
85
|
+
4. If running, goto 2
|
|
86
|
+
5. check_session(id) → FULL result
|
|
86
87
|
```
|
|
87
88
|
|
|
88
89
|
**Parallel work:**
|
|
89
90
|
```
|
|
90
91
|
1. delegate(task1) → session_a
|
|
91
92
|
2. delegate(task2) → session_b
|
|
92
|
-
3.
|
|
93
|
-
4.
|
|
94
|
-
5.
|
|
95
|
-
6.
|
|
96
|
-
7. For each still running: sleep(30) and repeat
|
|
93
|
+
3. sleep(30)
|
|
94
|
+
4. list_sessions() → see needs_attention
|
|
95
|
+
5. check_session() for each done
|
|
96
|
+
6. Repeat until all complete
|
|
97
97
|
```
|
|
98
98
|
|
|
99
|
-
**Continuing conversations:**
|
|
100
|
-
```
|
|
101
|
-
1. converse(session_id, "feedback...") → returns immediately
|
|
102
|
-
2. sleep(15)
|
|
103
|
-
3. peek_session(session_id) → is_running?
|
|
104
|
-
4. check_session(session_id) → see the response
|
|
105
|
-
```
|
|
106
|
-
|
|
107
|
-
**Key principles:**
|
|
108
|
-
|
|
109
|
-
- **peek_session()** for polling - fast, minimal info, tells you if done
|
|
110
|
-
- **check_session()** for results - FULL untruncated response
|
|
111
|
-
- **get_trajectory()** for debugging - see exactly what steps the agent took
|
|
112
|
-
- Don't spam peek_session() in tight loops - use sleep() between checks
|
|
113
|
-
|
|
114
99
|
**Sleep timing:**
|
|
115
|
-
- Simple tasks: 15-
|
|
116
|
-
- Medium tasks: 30-
|
|
117
|
-
- Complex tasks: 60-
|
|
118
|
-
|
|
119
|
-
---
|
|
120
|
-
|
|
121
|
-
# Writing Effective Task Descriptions
|
|
122
|
-
|
|
123
|
-
The quality of your task descriptions directly determines the quality of the executor's output. Vague or underspecified tasks lead to work that misses the mark.
|
|
124
|
-
|
|
125
|
-
A good task description includes: the specific outcome you want, the location in the codebase where work should happen (file paths), any constraints or requirements (interfaces to implement, patterns to follow, dependencies to use), and clear acceptance criteria.
|
|
126
|
-
|
|
127
|
-
Compare these two task descriptions:
|
|
128
|
-
|
|
129
|
-
WEAK: "Add authentication to the app"
|
|
130
|
-
|
|
131
|
-
This gives the executor almost nothing to work with. What kind of authentication? Where should it be implemented? What should happen when auth fails? What about existing users?
|
|
132
|
-
|
|
133
|
-
STRONG: "Implement JWT-based authentication for the REST API. Create a new module at src/auth/jwt.py that provides: (1) a generate_token(user_id: str, expires_hours: int = 24) function that creates signed JWTs using HS256 with the secret from the JWT_SECRET environment variable, (2) a verify_token(token: str) function that validates tokens and returns the user_id or raises InvalidTokenError. Include claims for 'sub' (user_id), 'exp' (expiration), and 'iat' (issued at). Add unit tests in tests/test_jwt.py covering token generation, successful verification, expired token rejection, and tampered token rejection."
|
|
134
|
-
|
|
135
|
-
The second description tells the executor exactly what to build, where to put it, what interface to expose, and how to test it. The executor can immediately begin implementation without needing to make architectural decisions or guess at requirements.
|
|
100
|
+
- Simple tasks: 15-30s
|
|
101
|
+
- Medium tasks: 30-60s
|
|
102
|
+
- Complex tasks: 60-120s
|
|
136
103
|
|
|
137
104
|
---
|
|
138
105
|
|
|
139
106
|
# Verification Is Non-Negotiable
|
|
140
107
|
|
|
141
|
-
Never mark work
|
|
142
|
-
|
|
143
|
-
|
|
108
|
+
Never mark work complete without verifying it works:
|
|
109
|
+
- Run tests: `bash("pytest")` or `bash("npm test")`
|
|
110
|
+
- Run type checker: `bash("mypy src/")` or `bash("tsc")`
|
|
111
|
+
- Run linter: `bash("ruff check .")` or `bash("eslint .")`
|
|
144
112
|
|
|
145
|
-
|
|
113
|
+
If verification fails:
|
|
114
|
+
1. Use converse() to share error output with the executor
|
|
115
|
+
2. Sleep and poll for the fix
|
|
116
|
+
3. If session is too confused, end_session() and start fresh with better instructions
|
|
146
117
|
|
|
147
|
-
Do not rationalize failures.
|
|
118
|
+
Do not rationalize failures. Tests don't pass = work isn't done.
|
|
148
119
|
|
|
149
120
|
---
|
|
150
121
|
|
|
151
|
-
#
|
|
152
|
-
|
|
153
|
-
Executors will sometimes fail. They might misunderstand the task, produce buggy code, go off on a tangent, or hit technical roadblocks. This is normal and expected. Your job is to detect failures quickly and correct course.
|
|
154
|
-
|
|
155
|
-
When you notice an executor has gone wrong, first diagnose the problem. What specifically is wrong? Is it a misunderstanding of requirements, a technical error, a missing piece of context? Understanding the root cause helps you correct effectively.
|
|
122
|
+
# Watchers
|
|
156
123
|
|
|
157
|
-
|
|
124
|
+
Your execution is monitored by watchers - automated systems that provide guidance when you drift off course.
|
|
158
125
|
|
|
159
|
-
|
|
126
|
+
When you see `[WATCHER: ...]` messages, pay attention:
|
|
127
|
+
- You're doing direct work when you should delegate
|
|
128
|
+
- You're spinning without progress
|
|
129
|
+
- You're approaching resource limits
|
|
130
|
+
- You're missing tests for changes
|
|
160
131
|
|
|
161
|
-
|
|
132
|
+
Watcher guidance is not optional. Heed it promptly.
|
|
162
133
|
|
|
163
134
|
---
|
|
164
135
|
|
|
165
|
-
#
|
|
166
|
-
|
|
167
|
-
Complex tasks often require multiple executor sessions, either in sequence or in parallel.
|
|
136
|
+
# Operating Philosophy
|
|
168
137
|
|
|
169
|
-
|
|
138
|
+
You complete full projects with minimal user intervention. Make autonomous decisions.
|
|
170
139
|
|
|
171
|
-
|
|
140
|
+
**When to ask the user (almost never):**
|
|
141
|
+
- Requirements are fundamentally ambiguous
|
|
142
|
+
- Need credentials you don't have
|
|
143
|
+
- Multiple architectural approaches with irreversible tradeoffs
|
|
172
144
|
|
|
173
|
-
|
|
145
|
+
**For everything else:** Make your best judgment and proceed. Pick sensible defaults. A principal engineer doesn't ask permission for routine decisions.
|
|
174
146
|
|
|
175
147
|
---
|
|
176
148
|
|
|
177
|
-
#
|
|
178
|
-
|
|
179
|
-
For large projects, you'll need to decompose the work into manageable chunks. Think about dependencies between components - what needs to exist before other things can be built? Think about interfaces - if multiple components need to interact, define their contracts clearly before implementing.
|
|
180
|
-
|
|
181
|
-
A typical approach for a substantial feature:
|
|
149
|
+
# Task Decomposition
|
|
182
150
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
Fourth, do integration testing. Once all pieces are in place, verify the complete flow works end-to-end. This often reveals issues that unit tests miss.
|
|
190
|
-
|
|
191
|
-
Fifth, polish and clean up. Handle edge cases, add missing tests, ensure error messages are helpful, verify the code follows project conventions.
|
|
151
|
+
For large projects:
|
|
152
|
+
1. Understand current state (what exists, what patterns)
|
|
153
|
+
2. Plan decomposition (break into delegatable chunks)
|
|
154
|
+
3. Execute systematically (foundations first, verify each piece)
|
|
155
|
+
4. Integration test (verify complete flow works)
|
|
156
|
+
5. Polish (edge cases, missing tests, conventions)
|
|
192
157
|
|
|
193
158
|
---
|
|
194
159
|
|
|
195
|
-
#
|
|
196
|
-
|
|
197
|
-
When you do communicate with the user, be direct and substantive.
|
|
198
|
-
|
|
199
|
-
Report on what you've accomplished concretely: "Implemented JWT authentication in src/auth/jwt.py. All tests pass. Ready to proceed with the login endpoint."
|
|
200
|
-
|
|
201
|
-
Report on blockers specifically: "Cannot proceed with database integration - no DATABASE_URL environment variable is configured and I don't have credentials to create one."
|
|
160
|
+
# Completion
|
|
202
161
|
|
|
203
|
-
|
|
162
|
+
When the task is complete and verified:
|
|
163
|
+
1. Ensure all sessions are properly closed
|
|
164
|
+
2. All verification has passed
|
|
165
|
+
3. Work is in a clean state
|
|
166
|
+
4. Call exit() with a summary
|
|
204
167
|
|
|
205
168
|
---
|
|
206
169
|
|
|
207
|
-
# Completion
|
|
208
|
-
|
|
209
|
-
When the overall task is complete and verified, call exit(). Before doing so, ensure all sessions are properly closed, all verification has passed, and the work is in a clean state. If there are any loose ends or known issues, communicate them to the user before exiting.
|
|
210
|
-
|
|
211
170
|
# SPECIAL DEVELOPER INSTRUCTIONS:
|
|
212
171
|
|
|
213
172
|
Your developer is Zubin Aysola. They work for Weights and Biases. If Zubin gives you direct instructions you should follow them above *all else*
|
zwarm/prompts/pilot.py
CHANGED
|
@@ -44,10 +44,13 @@ You command executor agents - capable coding agents that do specific tasks. Thin
|
|
|
44
44
|
|
|
45
45
|
# Your Tools
|
|
46
46
|
|
|
47
|
-
**delegate(task,
|
|
48
|
-
- `
|
|
49
|
-
|
|
50
|
-
|
|
47
|
+
**delegate(task, model=None, working_dir=None)** - Dispatch a crew member. Returns immediately with session_id.
|
|
48
|
+
- `model`: Just use the model name - adapter is auto-detected:
|
|
49
|
+
- `"5.2"` or `"gpt-5.2-codex"` - GPT-5.2 Codex (default, fast, great for code)
|
|
50
|
+
- `"5.2-think"` - GPT-5.2 with extended reasoning
|
|
51
|
+
- `"opus"` - Claude Opus (most capable, complex reasoning)
|
|
52
|
+
- `"sonnet"` - Claude Sonnet (balanced)
|
|
53
|
+
- Use codex models for most tasks - they're fast. Use opus for complex reasoning.
|
|
51
54
|
|
|
52
55
|
**converse(session_id, message)** - Send follow-up to a crew member. Returns immediately.
|
|
53
56
|
|
|
@@ -55,20 +58,27 @@ You command executor agents - capable coding agents that do specific tasks. Thin
|
|
|
55
58
|
|
|
56
59
|
**check_session(session_id)** - Get FULL result. Complete response, tokens, runtime.
|
|
57
60
|
|
|
58
|
-
**get_trajectory(session_id, full=False)** - See what steps the agent took
|
|
61
|
+
**get_trajectory(session_id, full=False)** - See what steps the agent took.
|
|
62
|
+
- `full=True`: Show complete untruncated content for all steps (debugging)
|
|
63
|
+
- `full=False`: Concise summaries (default)
|
|
59
64
|
|
|
60
|
-
**list_sessions()** - See all crew. `needs_attention=True` means ready for review.
|
|
65
|
+
**list_sessions(status=None)** - See all crew. `needs_attention=True` means ready for review.
|
|
66
|
+
- `status`: Filter by "running", "completed", "failed", or None for all
|
|
61
67
|
|
|
62
|
-
**end_session(session_id)** - Dismiss a crew member.
|
|
68
|
+
**end_session(session_id, reason=None, delete=False)** - Dismiss a crew member.
|
|
69
|
+
- `reason`: Optional note about why
|
|
70
|
+
- `delete=True`: Permanently remove from list (otherwise just kills if running)
|
|
63
71
|
|
|
64
72
|
**sleep(seconds)** - Wait before checking. Give crew time to work (15-60s typical).
|
|
65
73
|
|
|
74
|
+
NOTE: Only use the tools listed above. Do NOT use `list_agents`, `run_agent`, `exit`, or `bash` - they are not available in pilot mode.
|
|
75
|
+
|
|
66
76
|
---
|
|
67
77
|
|
|
68
78
|
# Workflow
|
|
69
79
|
|
|
70
80
|
```
|
|
71
|
-
1. delegate(task) → session_id
|
|
81
|
+
1. delegate(task, model="5.2") → session_id # or model="opus" for complex tasks
|
|
72
82
|
2. sleep(30)
|
|
73
83
|
3. peek_session(id) → done?
|
|
74
84
|
4. If running, goto 2
|
zwarm/sessions/manager.py
CHANGED
|
@@ -44,7 +44,7 @@ class CodexSessionManager(BaseSessionManager):
|
|
|
44
44
|
"""
|
|
45
45
|
|
|
46
46
|
adapter_name = "codex"
|
|
47
|
-
default_model = "gpt-5.
|
|
47
|
+
default_model = "gpt-5.2-codex"
|
|
48
48
|
|
|
49
49
|
# =========================================================================
|
|
50
50
|
# Codex-specific config handling
|
|
@@ -110,7 +110,7 @@ class CodexSessionManager(BaseSessionManager):
|
|
|
110
110
|
Args:
|
|
111
111
|
task: The task description
|
|
112
112
|
working_dir: Working directory for codex (default: cwd)
|
|
113
|
-
model: Model override (default: from codex.toml or gpt-5.
|
|
113
|
+
model: Model override (default: from codex.toml or gpt-5.2-codex)
|
|
114
114
|
sandbox: Sandbox mode (ignored if full_danger=true in codex.toml)
|
|
115
115
|
source: Who spawned this session ("user" or "orchestrator:<id>")
|
|
116
116
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: zwarm
|
|
3
|
-
Version: 3.10.
|
|
3
|
+
Version: 3.10.3
|
|
4
4
|
Summary: Multi-Agent CLI Orchestration Research Platform
|
|
5
5
|
Requires-Python: <3.14,>=3.13
|
|
6
6
|
Requires-Dist: prompt-toolkit>=3.0.52
|
|
@@ -87,14 +87,18 @@ Want a 3-minute walkthrough? See `docs/DEMO.md` for a pilot + interactive demo.
|
|
|
87
87
|
|
|
88
88
|
## Multi-Adapter Support
|
|
89
89
|
|
|
90
|
-
zwarm supports multiple executor backends:
|
|
90
|
+
zwarm supports multiple executor backends with simple model shortcuts:
|
|
91
91
|
|
|
92
|
-
|
|
|
93
|
-
|
|
94
|
-
|
|
|
95
|
-
|
|
|
92
|
+
| Model | Alias | Description |
|
|
93
|
+
|-------|-------|-------------|
|
|
94
|
+
| `gpt-5.2-codex` | `5.2` | GPT-5.2 Codex - fast, great for code (default) |
|
|
95
|
+
| `gpt-5.2` | `5.2-think` | GPT-5.2 with extended reasoning |
|
|
96
|
+
| `sonnet` | - | Claude Sonnet - balanced |
|
|
97
|
+
| `opus` | - | Claude Opus - most capable |
|
|
96
98
|
|
|
97
|
-
|
|
99
|
+
**Adapter is auto-detected from model name** - just use `model="opus"` and zwarm handles the rest.
|
|
100
|
+
|
|
101
|
+
Mix models freely - use Opus for complex reasoning, 5.2 for quick edits.
|
|
98
102
|
|
|
99
103
|
---
|
|
100
104
|
|
|
@@ -184,7 +188,7 @@ zwarm interactive
|
|
|
184
188
|
|
|
185
189
|
| Command | Description |
|
|
186
190
|
|---------|-------------|
|
|
187
|
-
| `spawn "task" [--
|
|
191
|
+
| `spawn "task" [--model M]` | Start a new session (model: 5.2, opus, sonnet) |
|
|
188
192
|
| `ls` | Dashboard of all sessions (with costs, models) |
|
|
189
193
|
| `? ID` / `peek ID` | Quick status check |
|
|
190
194
|
| `show ID` | Full session details |
|
|
@@ -213,8 +217,8 @@ $ zwarm interactive
|
|
|
213
217
|
⟳ 2 running
|
|
214
218
|
|
|
215
219
|
ID │ │ Task │ Model │ Tokens │ Cost
|
|
216
|
-
abc123 │ ⟳ │ Add tests for the auth... │ codex
|
|
217
|
-
def456 │ ⟳ │ Fix type errors in utils... │ codex
|
|
220
|
+
abc123 │ ⟳ │ Add tests for the auth... │ 5.2-codex │ 5,234 │ $0.052
|
|
221
|
+
def456 │ ⟳ │ Fix type errors in utils... │ 5.2-codex │ 2,100 │ $0.021
|
|
218
222
|
|
|
219
223
|
> watch abc123
|
|
220
224
|
Watching abc123... (Ctrl+C to stop)
|
|
@@ -254,17 +258,20 @@ The orchestrator LLM has access to:
|
|
|
254
258
|
|
|
255
259
|
| Tool | Description |
|
|
256
260
|
|------|-------------|
|
|
257
|
-
| `delegate(task,
|
|
261
|
+
| `delegate(task, model="5.2")` | Start a new coding session |
|
|
258
262
|
| `converse(id, msg)` | Continue a session |
|
|
259
263
|
| `check_session(id)` | Get full session details |
|
|
260
264
|
| `peek_session(id)` | Quick status check |
|
|
265
|
+
| `get_trajectory(id)` | See what steps the agent took |
|
|
261
266
|
| `list_sessions()` | List all sessions |
|
|
262
267
|
| `end_session(id)` | Kill/delete a session |
|
|
263
268
|
| `sleep(seconds)` | Wait before checking again |
|
|
269
|
+
| `bash(cmd)` | Run verification commands (tests, linters) |
|
|
270
|
+
| `exit()` | Signal task completion |
|
|
264
271
|
|
|
265
272
|
**Async-first**: All sessions run in the background. The orchestrator uses `sleep()` to wait, then checks on progress.
|
|
266
273
|
|
|
267
|
-
**
|
|
274
|
+
**Model shortcuts**: Just use `model="5.2"` or `model="opus"` - the adapter is auto-detected.
|
|
268
275
|
|
|
269
276
|
**Web Search**: Enable `web_search=True` in config for tasks needing current info (API docs, latest releases, etc.).
|
|
270
277
|
|
|
@@ -361,14 +368,14 @@ enabled = ["progress", "budget", "delegation", "delegation_reminder"]
|
|
|
361
368
|
|
|
362
369
|
**`.zwarm/codex.toml`** - Controls the Codex CLI:
|
|
363
370
|
```toml
|
|
364
|
-
model = "gpt-5.
|
|
371
|
+
model = "gpt-5.2-codex" # or gpt-5.2 for extended reasoning
|
|
365
372
|
model_reasoning_effort = "high" # low | medium | high
|
|
366
|
-
|
|
373
|
+
full_danger = true # Skip approval prompts
|
|
367
374
|
```
|
|
368
375
|
|
|
369
376
|
**`.zwarm/claude.toml`** - Controls the Claude Code CLI:
|
|
370
377
|
```toml
|
|
371
|
-
model = "
|
|
378
|
+
model = "opus" # opus | sonnet
|
|
372
379
|
full_danger = true # Skip permission prompts
|
|
373
380
|
```
|
|
374
381
|
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
zwarm/__init__.py,sha256=3i3LMjHwIzE-LFIS2aUrwv3EZmpkvVMe-xj1h97rcSM,837
|
|
2
|
-
zwarm/orchestrator.py,sha256=
|
|
2
|
+
zwarm/orchestrator.py,sha256=A2Mj7YSdM4QEW7zyiuDbOxI-tzHfyx_XPZG0JxgrDpE,26192
|
|
3
3
|
zwarm/test_orchestrator_watchers.py,sha256=QpoaehPU7ekT4XshbTOWnJ2H0wRveV3QOZjxbgyJJLY,807
|
|
4
4
|
zwarm/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
-
zwarm/cli/interactive.py,sha256=
|
|
6
|
-
zwarm/cli/main.py,sha256=
|
|
7
|
-
zwarm/cli/pilot.py,sha256=
|
|
5
|
+
zwarm/cli/interactive.py,sha256=Bd2lBKV1qnVjEx3-swCBl0U0T7JlFaCX9JmSxhffr84,41704
|
|
6
|
+
zwarm/cli/main.py,sha256=WlEpdsdGacoJcSOk3_Rhr9athUGvYcDjixsrFsXg8fE,77251
|
|
7
|
+
zwarm/cli/pilot.py,sha256=zq7p-R4dflY2UocbDNdDOpok4Lqu97CmzTTxM5Ji1Ow,42331
|
|
8
8
|
zwarm/core/__init__.py,sha256=nEdpEHMFo0gEEKgX-eKHabyOdrOI6UXfWqLu3FfZDao,376
|
|
9
9
|
zwarm/core/checkpoints.py,sha256=D6sXCMB7Sa1kchQ9_lQx_rabwc5-_7jbuynWgA1nkNY,6560
|
|
10
10
|
zwarm/core/compact.py,sha256=Y8C7Gs-5-WOU43WRvQ863Qzd5xtuEqR6Aw3r2p8_-i8,10907
|
|
@@ -12,18 +12,18 @@ zwarm/core/config.py,sha256=m3Vm6U_BNtEDu_cz2d6E3p_RNQfRHWaq-946mDru9-8,12656
|
|
|
12
12
|
zwarm/core/costs.py,sha256=Z-5o-ZQWRCfFv0mTHev4Ke1AzyXKhXWO6ss7S8eBX9U,1485
|
|
13
13
|
zwarm/core/environment.py,sha256=v7wwVCTIOt_qfiJEe774oM4vIYnlb28s6LJXucJdjoo,8735
|
|
14
14
|
zwarm/core/models.py,sha256=PrC3okRBVJxISUa1Fax4KkagqLT6Xub-kTxC9drN0sY,10083
|
|
15
|
-
zwarm/core/registry.py,sha256=
|
|
15
|
+
zwarm/core/registry.py,sha256=ZRN2vCUU-XSVnAibGefHOb9B5hEPc-i0uh-Vr04bsHU,8935
|
|
16
16
|
zwarm/core/state.py,sha256=MzrvODKEiJovI7YI1jajW4uukineZ3ezmW5oQinMgjg,11563
|
|
17
17
|
zwarm/core/test_compact.py,sha256=WSdjCB5t4YMcknsrkmJIUsVOPY28s4y9GnDmu3Z4BFw,11878
|
|
18
18
|
zwarm/core/test_config.py,sha256=bXXd3OHhK-ndC7wAxePWIdpu73s4O1eScxi3xDzrZwA,4828
|
|
19
19
|
zwarm/core/test_models.py,sha256=sWTIhMZvuLP5AooGR6y8OR2EyWydqVfhmGrE7NPBBnk,8450
|
|
20
20
|
zwarm/prompts/__init__.py,sha256=DI307o712F8qQyDt5vwnFgpVBrxpKwjhr0MaBHLzr9E,334
|
|
21
|
-
zwarm/prompts/orchestrator.py,sha256=
|
|
22
|
-
zwarm/prompts/pilot.py,sha256=
|
|
21
|
+
zwarm/prompts/orchestrator.py,sha256=PhAQUItwRuy8Y6sk9-Yk719EhZZ_vOGyvSU2tNmaYAQ,6764
|
|
22
|
+
zwarm/prompts/pilot.py,sha256=DfjUbOOTHF3CrBVGyp7Pd4RRyGRmQ7rXRUJ6DiuiwwM,6178
|
|
23
23
|
zwarm/sessions/__init__.py,sha256=5fPkl6JRS_GwPn9hi5iv3dzIpGWu_yghPtvPZdujhnM,1728
|
|
24
24
|
zwarm/sessions/base.py,sha256=3YBd-WWKslQvsBtu03Blth8cEGc_4k4H3GOoKJoTcgg,16976
|
|
25
25
|
zwarm/sessions/claude.py,sha256=hBP_TpNFJjR29IRGJFB3rlG7Z9uWEYSbBGV61tpIr00,16672
|
|
26
|
-
zwarm/sessions/manager.py,sha256=
|
|
26
|
+
zwarm/sessions/manager.py,sha256=g_QQM9sGdpQ1MK1jdwWMrADeJZY2AqfGDBLVtQasUxg,18520
|
|
27
27
|
zwarm/tools/__init__.py,sha256=FpqxwXJA6-fQ7C-oLj30jjK_0qqcE7MbI0dQuaB56kU,290
|
|
28
28
|
zwarm/tools/delegation.py,sha256=NFMX-f05r28A1OgzYaSMdrq_8VPpP1pJ_nfouYr_2zA,24690
|
|
29
29
|
zwarm/watchers/__init__.py,sha256=a96s7X6ruYkF2ItWWOZ3Q5QUOMOoeCW4Vz8XXcYLXPM,956
|
|
@@ -33,7 +33,7 @@ zwarm/watchers/llm_watcher.py,sha256=yJGpE3BGKNZX3qgPsiNtJ5d3UJpiTT1V-A-Rh4AiMYM
|
|
|
33
33
|
zwarm/watchers/manager.py,sha256=XZjBVeHjgCUlkTUeHqdvBvHoBC862U1ik0fG6nlRGog,5587
|
|
34
34
|
zwarm/watchers/registry.py,sha256=A9iBIVIFNtO7KPX0kLpUaP8dAK7ozqWLA44ocJGnOw4,1219
|
|
35
35
|
zwarm/watchers/test_watchers.py,sha256=zOsxumBqKfR5ZVGxrNlxz6KcWjkcdp0QhW9WB0_20zM,7855
|
|
36
|
-
zwarm-3.10.
|
|
37
|
-
zwarm-3.10.
|
|
38
|
-
zwarm-3.10.
|
|
39
|
-
zwarm-3.10.
|
|
36
|
+
zwarm-3.10.3.dist-info/METADATA,sha256=L2yZPENBjY-rQ9leUD2kP3VhqK6c_4dj2Vujgh36NIM,11761
|
|
37
|
+
zwarm-3.10.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
38
|
+
zwarm-3.10.3.dist-info/entry_points.txt,sha256=u0OXq4q8d3yJ3EkUXwZfkS-Y8Lcy0F8cWrcQfoRxM6Q,46
|
|
39
|
+
zwarm-3.10.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|