zwarm 2.3.5__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zwarm/cli/interactive.py +1065 -0
- zwarm/cli/main.py +525 -934
- zwarm/cli/pilot.py +1240 -0
- zwarm/core/__init__.py +20 -0
- zwarm/core/checkpoints.py +216 -0
- zwarm/core/config.py +26 -9
- zwarm/core/costs.py +71 -0
- zwarm/core/registry.py +329 -0
- zwarm/core/test_config.py +2 -3
- zwarm/orchestrator.py +17 -43
- zwarm/prompts/__init__.py +3 -0
- zwarm/prompts/orchestrator.py +36 -29
- zwarm/prompts/pilot.py +147 -0
- zwarm/sessions/__init__.py +48 -9
- zwarm/sessions/base.py +501 -0
- zwarm/sessions/claude.py +481 -0
- zwarm/sessions/manager.py +233 -486
- zwarm/tools/delegation.py +150 -187
- zwarm-3.6.0.dist-info/METADATA +445 -0
- zwarm-3.6.0.dist-info/RECORD +39 -0
- zwarm/adapters/__init__.py +0 -21
- zwarm/adapters/base.py +0 -109
- zwarm/adapters/claude_code.py +0 -357
- zwarm/adapters/codex_mcp.py +0 -1262
- zwarm/adapters/registry.py +0 -69
- zwarm/adapters/test_codex_mcp.py +0 -274
- zwarm/adapters/test_registry.py +0 -68
- zwarm-2.3.5.dist-info/METADATA +0 -309
- zwarm-2.3.5.dist-info/RECORD +0 -38
- {zwarm-2.3.5.dist-info → zwarm-3.6.0.dist-info}/WHEEL +0 -0
- {zwarm-2.3.5.dist-info → zwarm-3.6.0.dist-info}/entry_points.txt +0 -0
zwarm/core/registry.py
ADDED
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Model Registry - Centralized LLM model definitions for zwarm.
|
|
3
|
+
|
|
4
|
+
This registry defines all supported models with:
|
|
5
|
+
- Canonical names and aliases
|
|
6
|
+
- Adapter mapping (which CLI handles the model)
|
|
7
|
+
- Pricing information
|
|
8
|
+
|
|
9
|
+
Add new models here and they'll automatically appear in:
|
|
10
|
+
- `zwarm interactive` help and `models` command
|
|
11
|
+
- Cost estimation
|
|
12
|
+
- Adapter auto-detection from model name
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ModelInfo:
    """Identity, adapter routing, and pricing for one LLM model.

    NOTE(review): ``estimate_cost`` adds the cached-token cost on top of the
    full input-token cost, which assumes ``input_tokens`` excludes cached
    tokens. Confirm against how callers populate token usage.
    """

    # Identity
    canonical: str  # Full model name (e.g., "gpt-5.1-codex-mini")
    adapter: str  # "codex" or "claude"
    aliases: list[str] = field(default_factory=list)  # Short names

    # Pricing ($ per million tokens)
    input_per_million: float = 0.0
    output_per_million: float = 0.0
    cached_input_per_million: float | None = None

    # Metadata
    description: str = ""
    is_default: bool = False  # Default model for this adapter

    def estimate_cost(
        self,
        input_tokens: int,
        output_tokens: int,
        cached_tokens: int = 0,
    ) -> float:
        """Estimate cost in dollars for the given token counts."""
        # Accumulate in the same order as the per-component sum: input,
        # output, then cached (cached only when both count and rate are set).
        total = (input_tokens / 1_000_000) * self.input_per_million
        total += (output_tokens / 1_000_000) * self.output_per_million
        if cached_tokens and self.cached_input_per_million:
            total += (cached_tokens / 1_000_000) * self.cached_input_per_million
        return total
|
55
|
+
|
|
56
|
+
# =============================================================================
|
|
57
|
+
# Model Registry - ADD NEW MODELS HERE
|
|
58
|
+
# =============================================================================
|
|
59
|
+
|
|
60
|
+
# All prices are USD per million tokens.
# NOTE(review): pricing is snapshotted here — verify against current provider
# price lists when updating. Exactly one entry per adapter should carry
# is_default=True (get_default_model returns the first such match).
MODELS: list[ModelInfo] = [
    # -------------------------------------------------------------------------
    # OpenAI Codex Models (via `codex` CLI)
    # -------------------------------------------------------------------------
    ModelInfo(
        canonical="gpt-5.1-codex-mini",
        adapter="codex",
        aliases=["codex-mini", "mini"],
        input_per_million=0.25,
        output_per_million=2.00,
        cached_input_per_million=0.025,
        description="Fast, cost-effective coding model",
        is_default=True,
    ),
    ModelInfo(
        canonical="gpt-5.1-codex",
        adapter="codex",
        aliases=["codex", "codex-full"],
        input_per_million=1.25,
        output_per_million=10.00,
        cached_input_per_million=0.125,
        description="Full Codex model with extended reasoning",
    ),
    ModelInfo(
        canonical="gpt-5.1-codex-max",
        adapter="codex",
        aliases=["codex-max", "max"],
        input_per_million=1.25,
        output_per_million=10.00,
        cached_input_per_million=0.125,
        description="Maximum context Codex model",
    ),
    # -------------------------------------------------------------------------
    # Anthropic Claude Models (via `claude` CLI)
    # -------------------------------------------------------------------------
    ModelInfo(
        canonical="sonnet",
        adapter="claude",
        aliases=["claude-sonnet", "claude-4-sonnet"],
        input_per_million=3.00,
        output_per_million=15.00,
        description="Balanced Claude model for most tasks",
        is_default=True,
    ),
    ModelInfo(
        canonical="opus",
        adapter="claude",
        aliases=["claude-opus", "claude-4-opus"],
        input_per_million=15.00,
        output_per_million=75.00,
        description="Most capable Claude model",
    ),
    ModelInfo(
        canonical="haiku",
        adapter="claude",
        aliases=["claude-haiku", "claude-4-haiku"],
        input_per_million=0.25,
        output_per_million=1.25,
        description="Fast, lightweight Claude model",
    ),
]
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# =============================================================================
|
|
124
|
+
# Registry Lookups
|
|
125
|
+
# =============================================================================
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _build_lookup_tables() -> tuple[dict[str, ModelInfo], dict[str, ModelInfo]]:
    """Construct case-insensitive lookup maps over MODELS.

    Returns a (canonical -> model, name-or-alias -> model) pair. The alias
    table also contains every canonical name, so a single dict resolves both
    kinds of identifier.
    """
    canonical_map: dict[str, ModelInfo] = {}
    name_map: dict[str, ModelInfo] = {}

    for info in MODELS:
        key = info.canonical.lower()
        canonical_map[key] = info
        name_map[key] = info
        for short_name in info.aliases:
            name_map[short_name.lower()] = info

    return canonical_map, name_map
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# Built once at import time; MODELS is a static module-level list, so the
# tables never need invalidation.
_BY_CANONICAL, _BY_ALIAS = _build_lookup_tables()
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def resolve_model(name: str) -> ModelInfo | None:
    """
    Resolve a model name or alias to its ModelInfo.

    Args:
        name: Model name, alias, or a suffixed variant of a canonical name
            (e.g. a dated release like "gpt-5.1-codex-mini-2026-01")

    Returns:
        ModelInfo or None if not found
    """
    name_lower = name.lower()

    # Exact match on alias or canonical
    if name_lower in _BY_ALIAS:
        return _BY_ALIAS[name_lower]

    # Prefix match (e.g., "gpt-5.1-codex-mini-2026-01" -> "gpt-5.1-codex-mini").
    # Prefer the LONGEST matching canonical: otherwise a shorter canonical that
    # is itself a prefix of a longer one (e.g. "gpt-5.1-codex" vs
    # "gpt-5.1-codex-max") would win purely by registry insertion order, and
    # "gpt-5.1-codex-max-2026-01" would mis-resolve to "gpt-5.1-codex".
    best_match: ModelInfo | None = None
    best_length = -1
    for canonical, model in _BY_CANONICAL.items():
        if name_lower.startswith(canonical) and len(canonical) > best_length:
            best_match = model
            best_length = len(canonical)

    return best_match
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def get_adapter_for_model(name: str) -> str | None:
    """
    Get the adapter name for a model.

    Args:
        name: Model name or alias

    Returns:
        Adapter name ("codex" or "claude") or None if unknown
    """
    info = resolve_model(name)
    if info is None:
        return None
    return info.adapter
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def get_default_model(adapter: str) -> str | None:
    """
    Get the default model for an adapter.

    Args:
        adapter: Adapter name ("codex" or "claude")

    Returns:
        Default model canonical name or None
    """
    # First registry entry flagged as default for this adapter, if any.
    match = next(
        (m for m in MODELS if m.adapter == adapter and m.is_default),
        None,
    )
    return match.canonical if match is not None else None
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def list_models(adapter: str | None = None) -> list[ModelInfo]:
    """
    List available models.

    Args:
        adapter: Filter by adapter, or None for all

    Returns:
        List of ModelInfo objects (always a fresh list; mutating it does not
        affect the registry)
    """
    # Truthiness check kept deliberately: a falsy adapter means "no filter".
    if not adapter:
        return MODELS.copy()
    return [info for info in MODELS if info.adapter == adapter]
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def list_adapters() -> list[str]:
    """Return the sorted, de-duplicated adapter names used by any model."""
    return sorted({info.adapter for info in MODELS})
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def get_models_help_text() -> str:
    """
    Generate help text showing all available models.

    Returns formatted string for display in help messages; defaults are
    marked with an asterisk.
    """
    out: list[str] = ["", "Available models:"]

    for adapter_name in list_adapters():
        out.append(f"\n {adapter_name.upper()}:")
        for info in list_models(adapter_name):
            marker = " *" if info.is_default else ""
            joined = ", ".join(info.aliases) if info.aliases else ""
            alias_part = f" ({joined})" if joined else ""
            out.append(f" {info.canonical}{alias_part}{marker}")

    out.append("\n * = default for adapter")
    return "\n".join(out)
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def get_models_table_data() -> list[dict[str, Any]]:
    """
    Get model data formatted for table display.

    Returns list of dicts with keys: adapter, model, aliases, default,
    input_price, output_price, description.
    """
    return [
        {
            "adapter": info.adapter,
            "model": info.canonical,
            "aliases": ", ".join(info.aliases),
            "default": info.is_default,
            "input_price": info.input_per_million,
            "output_price": info.output_per_million,
            "description": info.description,
        }
        for info in MODELS
    ]
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# =============================================================================
|
|
261
|
+
# Cost Estimation
|
|
262
|
+
# =============================================================================
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def estimate_cost(
    model: str,
    input_tokens: int,
    output_tokens: int,
    cached_tokens: int = 0,
) -> float | None:
    """
    Estimate cost for a model run.

    Args:
        model: Model name or alias
        input_tokens: Number of input tokens
        output_tokens: Number of output tokens
        cached_tokens: Number of cached input tokens

    Returns:
        Cost in USD, or None if model unknown
    """
    info = resolve_model(model)
    if info is None:
        return None
    return info.estimate_cost(input_tokens, output_tokens, cached_tokens)
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def format_cost(cost: float | None) -> str:
    """Format a dollar amount for display.

    Unknown costs render as "?". Precision scales with magnitude: four
    decimals under a cent, three under a dollar, otherwise two.
    """
    if cost is None:
        return "?"

    if cost < 0.01:
        precision = 4
    elif cost < 1.00:
        precision = 3
    else:
        precision = 2
    return f"${cost:.{precision}f}"
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def estimate_session_cost(
    model: str,
    token_usage: dict[str, Any],
) -> dict[str, Any]:
    """
    Estimate cost for a session given its token usage.

    Args:
        model: Model used
        token_usage: Dict with input_tokens, output_tokens, cached_tokens
            keys; missing keys are treated as 0

    Returns:
        Dict with cost info: {cost, cost_formatted, pricing_known, model,
        input_tokens, output_tokens}
    """
    tokens_in = token_usage.get("input_tokens", 0)
    tokens_out = token_usage.get("output_tokens", 0)
    tokens_cached = token_usage.get("cached_tokens", 0)

    dollars = estimate_cost(model, tokens_in, tokens_out, tokens_cached)

    return {
        "cost": dollars,
        "cost_formatted": format_cost(dollars),
        # None cost means the model was not found in the registry.
        "pricing_known": dollars is not None,
        "model": model,
        "input_tokens": tokens_in,
        "output_tokens": tokens_out,
    }
|
zwarm/core/test_config.py
CHANGED
|
@@ -20,7 +20,6 @@ def test_default_config():
|
|
|
20
20
|
assert config.executor.adapter == "codex_mcp"
|
|
21
21
|
assert config.executor.sandbox == "workspace-write"
|
|
22
22
|
assert config.orchestrator.lm == "gpt-5-mini"
|
|
23
|
-
assert config.orchestrator.sync_first is True
|
|
24
23
|
assert config.state_dir == ".zwarm"
|
|
25
24
|
|
|
26
25
|
|
|
@@ -68,8 +67,8 @@ def test_apply_overrides():
|
|
|
68
67
|
assert result["executor"]["adapter"] == "claude_code"
|
|
69
68
|
|
|
70
69
|
# Override with boolean
|
|
71
|
-
result = apply_overrides(config, ["
|
|
72
|
-
assert result["
|
|
70
|
+
result = apply_overrides(config, ["executor.web_search=true"])
|
|
71
|
+
assert result["executor"]["web_search"] is True
|
|
73
72
|
|
|
74
73
|
# Create new nested path
|
|
75
74
|
result = apply_overrides(config, ["weave.project=my-project"])
|
zwarm/orchestrator.py
CHANGED
|
@@ -23,7 +23,6 @@ from wbal.helper import TOOL_CALL_TYPE, format_openai_tool_response
|
|
|
23
23
|
from wbal.lm import LM as wbalLMGeneric
|
|
24
24
|
from wbal.lm import GPT5LargeVerbose
|
|
25
25
|
|
|
26
|
-
from zwarm.adapters import ExecutorAdapter, get_adapter
|
|
27
26
|
from zwarm.core.compact import compact_messages, should_compact
|
|
28
27
|
from zwarm.core.config import ZwarmConfig, load_config
|
|
29
28
|
from zwarm.core.environment import OrchestratorEnv
|
|
@@ -72,7 +71,6 @@ class Orchestrator(YamlAgent):
|
|
|
72
71
|
# State management
|
|
73
72
|
_state: StateManager = PrivateAttr()
|
|
74
73
|
_sessions: dict[str, ConversationSession] = PrivateAttr(default_factory=dict)
|
|
75
|
-
_adapters: dict[str, ExecutorAdapter] = PrivateAttr(default_factory=dict)
|
|
76
74
|
_watcher_manager: WatcherManager | None = PrivateAttr(default=None)
|
|
77
75
|
_resumed: bool = PrivateAttr(default=False)
|
|
78
76
|
_total_tokens: int = PrivateAttr(default=0) # Cumulative orchestrator tokens
|
|
@@ -83,9 +81,11 @@ class Orchestrator(YamlAgent):
|
|
|
83
81
|
"total_tokens": 0,
|
|
84
82
|
}
|
|
85
83
|
)
|
|
84
|
+
# Callback for step progress (used by CLI to print tool calls)
|
|
85
|
+
_step_callback: Callable[[int, list[tuple[dict[str, Any], Any]]], None] | None = PrivateAttr(default=None)
|
|
86
86
|
|
|
87
87
|
def model_post_init(self, __context: Any) -> None:
|
|
88
|
-
"""Initialize state
|
|
88
|
+
"""Initialize state after model creation."""
|
|
89
89
|
super().model_post_init(__context)
|
|
90
90
|
|
|
91
91
|
# Initialize state manager with instance isolation
|
|
@@ -151,40 +151,9 @@ class Orchestrator(YamlAgent):
|
|
|
151
151
|
"""Access state manager."""
|
|
152
152
|
return self._state
|
|
153
153
|
|
|
154
|
-
def _get_adapter(self, name: str) -> ExecutorAdapter:
|
|
155
|
-
"""Get or create an adapter by name using the adapter registry."""
|
|
156
|
-
if name not in self._adapters:
|
|
157
|
-
# Get model from config (adapters have their own defaults if None)
|
|
158
|
-
model = self.config.executor.model
|
|
159
|
-
|
|
160
|
-
# Use isolated codex config if available
|
|
161
|
-
config_path = self.working_dir / self.config.state_dir / "codex.toml"
|
|
162
|
-
if not config_path.exists():
|
|
163
|
-
config_path = None # Fallback to adapter defaults
|
|
164
|
-
|
|
165
|
-
self._adapters[name] = get_adapter(
|
|
166
|
-
name, model=model, config_path=config_path
|
|
167
|
-
)
|
|
168
|
-
return self._adapters[name]
|
|
169
|
-
|
|
170
154
|
def get_executor_usage(self) -> dict[str, int]:
|
|
171
|
-
"""Get aggregated token usage
|
|
172
|
-
|
|
173
|
-
"input_tokens": 0,
|
|
174
|
-
"output_tokens": 0,
|
|
175
|
-
"total_tokens": 0,
|
|
176
|
-
}
|
|
177
|
-
for adapter in self._adapters.values():
|
|
178
|
-
if hasattr(adapter, "total_usage"):
|
|
179
|
-
usage = adapter.total_usage
|
|
180
|
-
for key in total:
|
|
181
|
-
total[key] += usage.get(key, 0)
|
|
182
|
-
return total
|
|
183
|
-
|
|
184
|
-
@property
|
|
185
|
-
def executor_usage(self) -> dict[str, int]:
|
|
186
|
-
"""Aggregated executor token usage (for Weave tracking)."""
|
|
187
|
-
return self.get_executor_usage()
|
|
155
|
+
"""Get aggregated token usage from executor sessions."""
|
|
156
|
+
return self._executor_usage
|
|
188
157
|
|
|
189
158
|
def save_state(self) -> None:
|
|
190
159
|
"""Save orchestrator state for resume."""
|
|
@@ -587,7 +556,11 @@ Review what was accomplished in the previous session and delegate new tasks as n
|
|
|
587
556
|
}
|
|
588
557
|
# NUDGE and CONTINUE just continue
|
|
589
558
|
|
|
590
|
-
self.step()
|
|
559
|
+
tool_results = self.step()
|
|
560
|
+
|
|
561
|
+
# Call step callback if registered (for CLI progress display)
|
|
562
|
+
if self._step_callback:
|
|
563
|
+
self._step_callback(self._step_count, tool_results)
|
|
591
564
|
|
|
592
565
|
if self.stopCondition:
|
|
593
566
|
break
|
|
@@ -599,8 +572,7 @@ Review what was accomplished in the previous session and delegate new tasks as n
|
|
|
599
572
|
|
|
600
573
|
async def cleanup(self) -> None:
|
|
601
574
|
"""Clean up resources."""
|
|
602
|
-
|
|
603
|
-
await adapter.cleanup()
|
|
575
|
+
pass # Session cleanup handled by CodexSessionManager
|
|
604
576
|
|
|
605
577
|
|
|
606
578
|
def build_orchestrator(
|
|
@@ -631,15 +603,17 @@ def build_orchestrator(
|
|
|
631
603
|
"""
|
|
632
604
|
from uuid import uuid4
|
|
633
605
|
|
|
634
|
-
#
|
|
606
|
+
# Resolve working directory first (needed for config loading)
|
|
607
|
+
working_dir = working_dir or Path.cwd()
|
|
608
|
+
|
|
609
|
+
# Load configuration from working_dir (not cwd!)
|
|
610
|
+
# This ensures config.toml and .env are loaded from the project being worked on
|
|
635
611
|
config = load_config(
|
|
636
612
|
config_path=config_path,
|
|
637
613
|
overrides=overrides,
|
|
614
|
+
working_dir=working_dir,
|
|
638
615
|
)
|
|
639
616
|
|
|
640
|
-
# Resolve working directory
|
|
641
|
-
working_dir = working_dir or Path.cwd()
|
|
642
|
-
|
|
643
617
|
# Generate instance ID if not provided (enables isolation by default for new runs)
|
|
644
618
|
# For resume, instance_id should be provided explicitly
|
|
645
619
|
if instance_id is None and not resume:
|
zwarm/prompts/__init__.py
CHANGED
|
@@ -3,8 +3,11 @@ System prompts for zwarm agents.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from zwarm.prompts.orchestrator import ORCHESTRATOR_SYSTEM_PROMPT, get_orchestrator_prompt
|
|
6
|
+
from zwarm.prompts.pilot import PILOT_SYSTEM_PROMPT, get_pilot_prompt
|
|
6
7
|
|
|
7
8
|
__all__ = [
|
|
8
9
|
"ORCHESTRATOR_SYSTEM_PROMPT",
|
|
9
10
|
"get_orchestrator_prompt",
|
|
11
|
+
"PILOT_SYSTEM_PROMPT",
|
|
12
|
+
"get_pilot_prompt",
|
|
10
13
|
]
|
zwarm/prompts/orchestrator.py
CHANGED
|
@@ -27,18 +27,20 @@ For everything else, make your best judgment and proceed. If you're unsure wheth
|
|
|
27
27
|
|
|
28
28
|
Your primary tools are for delegation and verification:
|
|
29
29
|
|
|
30
|
-
**delegate(task, working_dir=None, model=None
|
|
30
|
+
**delegate(task, working_dir=None, model=None)** - Start a new executor session. The `task` should be a clear, specific description of what you want done. All sessions run asynchronously - you'll get a session_id back immediately and can poll for results. The `working_dir` parameter lets you run the executor in a specific directory.
|
|
31
31
|
|
|
32
|
-
**converse(session_id, message
|
|
32
|
+
**converse(session_id, message)** - Continue an existing conversation. Use this to provide feedback, ask for changes, or guide the executor through complex work. The executor maintains full context. Returns immediately - use polling to check for the response.
|
|
33
33
|
|
|
34
|
-
**peek_session(session_id)** - Quick status check. Returns just the session status and latest message. Use this for fast polling
|
|
34
|
+
**peek_session(session_id)** - Quick status check. Returns just the session status and latest message. Use this for fast polling.
|
|
35
35
|
|
|
36
36
|
**check_session(session_id)** - Full session details including all messages, token usage, runtime. Use this when you need the complete picture.
|
|
37
37
|
|
|
38
|
-
**list_sessions(status=None)** - List all sessions. Returns a `needs_attention` flag for each session indicating if it recently completed or failed. Use this to monitor multiple
|
|
38
|
+
**list_sessions(status=None)** - List all sessions. Returns a `needs_attention` flag for each session indicating if it recently completed or failed. Use this to monitor multiple sessions and see which ones have new responses ready for review.
|
|
39
39
|
|
|
40
40
|
**end_session(session_id, reason=None, delete=False)** - Kill a running session or clean up a completed one. Use `delete=True` to remove the session entirely (won't show in list_sessions anymore).
|
|
41
41
|
|
|
42
|
+
**sleep(seconds)** - Pause execution for specified seconds (max 300). Use this when you've started sessions and want to give them time to complete before polling. Essential for the async workflow pattern.
|
|
43
|
+
|
|
42
44
|
**bash(command)** - Run shell commands directly. Use this primarily for verification: running tests, type checkers, linters, build commands, or inspecting the filesystem. Do NOT use bash to write code yourself - that's what executors are for.
|
|
43
45
|
|
|
44
46
|
**chat(message, wait_for_user_input)** - Communicate with the human user. Use this sparingly. Most of the time you should be working autonomously without bothering the user.
|
|
@@ -63,35 +65,40 @@ The watchers are on your side. They exist to help you succeed, not to criticize.
|
|
|
63
65
|
|
|
64
66
|
---
|
|
65
67
|
|
|
66
|
-
#
|
|
67
|
-
|
|
68
|
-
The `wait` parameter controls whether you block waiting for a response or continue immediately.
|
|
68
|
+
# Async Workflow Pattern
|
|
69
69
|
|
|
70
|
-
|
|
70
|
+
All executor sessions run asynchronously. When you call delegate() or converse(), you get a session_id back immediately and the executor works in the background. This lets you parallelize work efficiently.
|
|
71
71
|
|
|
72
|
-
|
|
72
|
+
The core workflow pattern is: **delegate → sleep → poll → respond**
|
|
73
73
|
|
|
74
|
-
|
|
75
|
-
1.
|
|
76
|
-
2.
|
|
77
|
-
3.
|
|
78
|
-
4.
|
|
79
|
-
5.
|
|
74
|
+
```
|
|
75
|
+
1. delegate(task1) → session_a
|
|
76
|
+
2. delegate(task2) → session_b
|
|
77
|
+
3. delegate(task3) → session_c
|
|
78
|
+
4. sleep(30) → give them time to work
|
|
79
|
+
5. list_sessions() → check which have needs_attention=True
|
|
80
|
+
6. peek_session(a) → quick status check
|
|
81
|
+
7. If still running, sleep(30) and repeat
|
|
82
|
+
8. check_session(a) → full results when done
|
|
83
|
+
9. converse(a, "feedback...") → continue the conversation
|
|
84
|
+
10. sleep(15) → wait for response
|
|
85
|
+
11. check_session(a) → see the response
|
|
86
|
+
```
|
|
80
87
|
|
|
81
|
-
**
|
|
88
|
+
**Key principles:**
|
|
82
89
|
|
|
83
|
-
|
|
90
|
+
- Use **sleep()** to give executors time to work before polling. Don't spam peek_session() in a tight loop.
|
|
91
|
+
- Use **list_sessions()** to see which sessions have `needs_attention=True` (recently completed or failed).
|
|
92
|
+
- Use **peek_session()** for quick status checks during polling.
|
|
93
|
+
- Use **check_session()** to get full details including all messages when you need to review the actual work.
|
|
94
|
+
- After **converse()**, always sleep() and poll - you won't get the response immediately.
|
|
84
95
|
|
|
85
|
-
|
|
86
|
-
1. `delegate(task1, wait=False)` → session a
|
|
87
|
-
2. `delegate(task2, wait=False)` → session b
|
|
88
|
-
3. `delegate(task3, wait=False)` → session c
|
|
89
|
-
4. `list_sessions()` → check `needs_attention` flags
|
|
90
|
-
5. `peek_session(a)` → quick status check
|
|
91
|
-
6. `check_session(b)` → full details when ready
|
|
92
|
-
7. `converse(a, "now do X", wait=False)` → continue without blocking
|
|
96
|
+
**Sleep timing guidance:**
|
|
93
97
|
|
|
94
|
-
|
|
98
|
+
- Simple tasks (single file edits, small fixes): 15-30 seconds
|
|
99
|
+
- Medium tasks (multiple files, tests): 30-60 seconds
|
|
100
|
+
- Complex tasks (new features, refactoring): 60-120 seconds
|
|
101
|
+
- If a session is still running after polling, sleep again rather than waiting forever
|
|
95
102
|
|
|
96
103
|
---
|
|
97
104
|
|
|
@@ -119,7 +126,7 @@ Never mark work as complete without verifying it actually works. This is the mos
|
|
|
119
126
|
|
|
120
127
|
After an executor completes work, run the relevant verification commands. For Python projects, this typically means: pytest for tests, mypy or pyright for type checking, ruff or flake8 for linting. For JavaScript/TypeScript: npm test, tsc for type checking, eslint for linting. For compiled languages: ensure the build succeeds without errors.
|
|
121
128
|
|
|
122
|
-
When verification fails,
|
|
129
|
+
When verification fails, use converse() to share the error output and ask the executor to fix it. Be specific about what failed - paste the actual error message. Remember to sleep() and poll for the response. If the session has become too confused or gone too far down the wrong path, end it with verdict="failed" and start a fresh session with a clearer task description that incorporates what you learned.
|
|
123
130
|
|
|
124
131
|
Do not rationalize failures. If the tests don't pass, the work isn't done. If the type checker complains, the work isn't done. If the linter shows errors, the work isn't done. Your job is to ensure quality, and that means holding firm on verification.
|
|
125
132
|
|
|
@@ -131,7 +138,7 @@ Executors will sometimes fail. They might misunderstand the task, produce buggy
|
|
|
131
138
|
|
|
132
139
|
When you notice an executor has gone wrong, first diagnose the problem. What specifically is wrong? Is it a misunderstanding of requirements, a technical error, a missing piece of context? Understanding the root cause helps you correct effectively.
|
|
133
140
|
|
|
134
|
-
|
|
141
|
+
You can often recover through conversation using converse(). Explain what's wrong clearly and specifically. Don't just say "this is wrong" - explain why and what you expected instead. Provide the error messages, the failing test output, or a clear description of the incorrect behavior. Give the executor the information they need to fix the issue. Then sleep() and poll for their response.
|
|
135
142
|
|
|
136
143
|
Sometimes a session becomes too confused or goes too far down the wrong path. In these cases, it's better to cut your losses: call end_session() with verdict="failed" and a summary of what went wrong, then start fresh with a new session that has a better task description informed by what you learned.
|
|
137
144
|
|
|
@@ -145,7 +152,7 @@ Complex tasks often require multiple executor sessions, either in sequence or in
|
|
|
145
152
|
|
|
146
153
|
For sequential work with dependencies, complete each session fully before starting the next. Don't leave sessions hanging in an ambiguous state while you start new work. This creates confusion and makes it hard to track what's actually done.
|
|
147
154
|
|
|
148
|
-
For parallel work on independent tasks,
|
|
155
|
+
For parallel work on independent tasks, start multiple sessions and use the sleep-poll pattern to monitor them. Use list_sessions() to see which have needs_attention=True, check_session() for full details, and end each session properly when complete. Keep mental track of what's running - don't lose track of sessions.
|
|
149
156
|
|
|
150
157
|
Prioritize completing in-progress work before starting new work. A half-finished feature is worth less than nothing - it's technical debt that will confuse future work. Better to have fewer things fully done than many things partially done.
|
|
151
158
|
|