swarmkit 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bridge/__init__.py +5 -0
- bridge/dist/bridge.bundle.cjs +8 -0
- swarmkit/__init__.py +152 -0
- swarmkit/agent.py +480 -0
- swarmkit/bridge.py +475 -0
- swarmkit/config.py +92 -0
- swarmkit/pipeline/__init__.py +59 -0
- swarmkit/pipeline/pipeline.py +487 -0
- swarmkit/pipeline/types.py +272 -0
- swarmkit/prompts/__init__.py +126 -0
- swarmkit/prompts/agent_md/judge.md +30 -0
- swarmkit/prompts/agent_md/reduce.md +7 -0
- swarmkit/prompts/agent_md/verify.md +33 -0
- swarmkit/prompts/user/judge.md +1 -0
- swarmkit/prompts/user/retry_feedback.md +9 -0
- swarmkit/prompts/user/verify.md +1 -0
- swarmkit/results.py +45 -0
- swarmkit/retry.py +133 -0
- swarmkit/schema.py +107 -0
- swarmkit/swarm/__init__.py +75 -0
- swarmkit/swarm/results.py +140 -0
- swarmkit/swarm/swarm.py +1751 -0
- swarmkit/swarm/types.py +193 -0
- swarmkit/utils.py +82 -0
- swarmkit-0.1.34.dist-info/METADATA +80 -0
- swarmkit-0.1.34.dist-info/RECORD +29 -0
- swarmkit-0.1.34.dist-info/WHEEL +5 -0
- swarmkit-0.1.34.dist-info/licenses/LICENSE +24 -0
- swarmkit-0.1.34.dist-info/top_level.txt +2 -0
swarmkit/swarm/swarm.py
ADDED
@@ -0,0 +1,1751 @@
"""Swarm - Functional programming abstractions for AI agents.

Provides map, filter, reduce, and bestOf operations for parallel AI agent execution.

Example:
    ```python
    from swarmkit import Swarm

    # Minimal usage - uses SWARMKIT_API_KEY and E2B_API_KEY env vars
    swarm = Swarm()

    # Or with explicit config
    from swarmkit import SwarmConfig, AgentConfig, E2BProvider
    swarm = Swarm(SwarmConfig(
        agent=AgentConfig(type="claude", api_key="..."),
        sandbox=E2BProvider(api_key="..."),
    ))

    # Map: apply agent to each item
    results = await swarm.map(
        items=[{"doc.txt": "content1"}, {"doc.txt": "content2"}],
        prompt="Analyze this document",
    )

    # Filter: evaluate and filter items
    critical = await swarm.filter(
        items=results.success,
        prompt="Evaluate severity",
        schema=SeveritySchema,
        condition=lambda x: x.severity == "critical",
    )

    # Reduce: synthesize many into one
    report = await swarm.reduce(
        items=critical.success,
        prompt="Create summary report",
    )
    ```
"""

import asyncio
import base64
import json
import secrets
import warnings
from typing import Any, Callable, Dict, List, Optional, Type, Union

from ..bridge import BridgeManager
from ..schema import is_pydantic_model, is_dataclass, to_json_schema, validate_and_parse
from ..config import AgentConfig
from ..utils import _encode_files_for_transport, _filter_none
from ..prompts import JUDGE_PROMPT, JUDGE_USER_PROMPT, VERIFY_PROMPT, VERIFY_USER_PROMPT, REDUCE_PROMPT, RETRY_FEEDBACK_PROMPT, apply_template, build_file_tree
from ..retry import RetryConfig, execute_with_retry
from .types import (
    FileMap,
    SwarmConfig,
    BestOfConfig,
    VerifyConfig,
    IndexedMeta,
    ReduceMeta,
    JudgeMeta,
    VerifyMeta,
    Prompt,
    ItemInput,
    SchemaType,
)
from .results import (
    SwarmResult,
    SwarmResultList,
    ReduceResult,
    BestOfResult,
    BestOfInfo,
    VerifyInfo,
    is_swarm_result,
)


# =============================================================================
# CONSTANTS
# =============================================================================

MAX_CONCURRENCY = 100  # Cap to prevent resource exhaustion


# =============================================================================
# SWARM CLASS
# =============================================================================

class Swarm:
    """Functional programming abstractions for AI agents.

    Provides map, filter, reduce, and bestOf operations for parallel AI agent execution.
    Uses a shared bridge process with multiple SwarmKit instances for efficiency.
    """

    def __init__(self, config: Optional[SwarmConfig] = None):
        """Initialize Swarm with configuration.

        Args:
            config: SwarmConfig with agent, sandbox, concurrency settings
                (optional - defaults to SWARMKIT_API_KEY and E2B_API_KEY env vars)

        Raises:
            ValueError: If concurrency exceeds MAX_CONCURRENCY
        """
        config = config or SwarmConfig()
        if config.concurrency > MAX_CONCURRENCY:
            raise ValueError(
                f"concurrency={config.concurrency} exceeds max {MAX_CONCURRENCY}. "
                f"For higher parallelism, scale horizontally with multiple processes."
            )

        self.config = config
        self.semaphore = asyncio.Semaphore(config.concurrency)
        self.bridge = BridgeManager()
        self._bridge_started = False

    async def _ensure_bridge(self):
        """Ensure bridge is started."""
        if not self._bridge_started:
            await self.bridge.start()
            self._bridge_started = True
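
    # Illustrative sketch (not part of this file): the MAX_CONCURRENCY guard
    # rejects oversized configs at construction time; anything under the cap is
    # throttled through the asyncio.Semaphore created above.
    #
    #     Swarm(SwarmConfig(concurrency=250))   # raises ValueError
    #     Swarm(SwarmConfig(concurrency=50))    # ok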

    # =========================================================================
    # PUBLIC API
    # =========================================================================

    async def map(
        self,
        items: List[ItemInput],
        prompt: Prompt,
        system_prompt: Optional[str] = None,
        schema: Optional[SchemaType] = None,
        schema_options: Optional[Dict[str, Any]] = None,
        agent: Optional[AgentConfig] = None,
        mcp_servers: Optional[Dict[str, Any]] = None,
        best_of: Optional[BestOfConfig] = None,
        verify: Optional[VerifyConfig] = None,
        retry: Optional[RetryConfig] = None,
        timeout_ms: Optional[int] = None,
    ) -> SwarmResultList:
        """Apply an agent to each item in parallel.

        Args:
            items: List of items (FileMaps or SwarmResults from previous operation)
            prompt: Task prompt (string or function(files, index) -> string)
            system_prompt: Optional system prompt
            schema: Optional Pydantic model or JSON Schema for structured output
            schema_options: Optional validation options
            agent: Optional agent override
            mcp_servers: Optional MCP servers override (replaces swarm default)
            best_of: Optional bestOf configuration for N candidates + judge per item
                (mutually exclusive with verify)
            verify: Optional verify configuration for LLM-as-judge quality verification
                with retry (mutually exclusive with best_of)
            retry: Optional retry configuration for failed items
            timeout_ms: Optional timeout in ms

        Returns:
            SwarmResultList with results for each item
        """
        await self._ensure_bridge()
        run_id = self._generate_run_id()
        timeout = timeout_ms or self.config.timeout_ms
        retry = retry or self.config.retry
        resolved_mcp_servers = mcp_servers or self.config.mcp_servers

        # best_of and verify are mutually exclusive
        if best_of and verify:
            raise ValueError("map() cannot use both best_of and verify options simultaneously")

        async def process_item(item: ItemInput, index: int) -> SwarmResult:
            # bestOf has internal per-candidate and judge retry - don't double-wrap
            if best_of:
                return await self._execute_map_item_with_best_of(
                    item, prompt, index, run_id, system_prompt, schema,
                    schema_options, agent, resolved_mcp_servers, best_of, retry, timeout
                )

            # verify has internal retry loop with feedback - don't double-wrap with retry
            if verify:
                return await self._execute_map_item_with_verify(
                    item, prompt, index, run_id, system_prompt, schema,
                    schema_options, agent, resolved_mcp_servers, verify, timeout, retry
                )

            # Wrap with retry if configured (simple map only)
            if retry:
                return await execute_with_retry(
                    lambda attempt: self._execute_map_item(
                        item, prompt, index, run_id, system_prompt, schema,
                        schema_options, agent, resolved_mcp_servers, timeout, attempt
                    ),
                    retry,
                    item_index=index,
                )
            return await self._execute_map_item(
                item, prompt, index, run_id, system_prompt, schema,
                schema_options, agent, resolved_mcp_servers, timeout
            )

        results = await asyncio.gather(*[
            process_item(item, i) for i, item in enumerate(items)
        ])

        return SwarmResultList.from_results(list(results))
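
    # Illustrative sketch (not part of this file): map with structured output and
    # error retry; `Report` is a hypothetical Pydantic model.
    #
    #     results = await swarm.map(
    #         items=[{"doc.txt": "..."}, {"doc.txt": "..."}],
    #         prompt="Extract a structured report",
    #         schema=Report,
    #         retry=RetryConfig(max_attempts=3),
    #     )
    #     ok = results.success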

    async def filter(
        self,
        items: List[ItemInput],
        prompt: str,
        schema: SchemaType,
        condition: Callable[[Any], bool],
        schema_options: Optional[Dict[str, Any]] = None,
        system_prompt: Optional[str] = None,
        agent: Optional[AgentConfig] = None,
        mcp_servers: Optional[Dict[str, Any]] = None,
        verify: Optional[VerifyConfig] = None,
        retry: Optional[RetryConfig] = None,
        timeout_ms: Optional[int] = None,
    ) -> SwarmResultList:
        """Two-step evaluation: agent assesses each item, then a local condition applies the threshold.

        1. Agent sees context files, evaluates per prompt, outputs result.json matching schema
        2. Condition function receives parsed data, returns True (success) or False (filtered)

        Returns ALL items with status:
        - "success": passed condition
        - "filtered": evaluated but didn't pass condition
        - "error": agent error

        Use `.success` for passing items, `.filtered` for non-passing.

        Args:
            items: List of items to filter
            prompt: Evaluation prompt
            schema: Pydantic model or JSON Schema (required for filter)
            condition: Function(data) -> bool to determine pass/fail
            schema_options: Optional validation options
            system_prompt: Optional system prompt
            agent: Optional agent override
            mcp_servers: Optional MCP servers override (replaces swarm default)
            verify: Optional verify configuration for LLM-as-judge quality verification with retry
            retry: Optional retry configuration for failed items
            timeout_ms: Optional timeout in ms

        Returns:
            SwarmResultList with all items (success, filtered, or error status)
        """
        await self._ensure_bridge()
        run_id = self._generate_run_id()
        timeout = timeout_ms or self.config.timeout_ms
        retry = retry or self.config.retry
        resolved_mcp_servers = mcp_servers or self.config.mcp_servers

        async def process_item(item: ItemInput, index: int) -> SwarmResult:
            # verify has internal retry loop with feedback - don't double-wrap with retry
            if verify:
                return await self._execute_filter_item_with_verify(
                    item, prompt, index, run_id, system_prompt, schema,
                    schema_options, agent, resolved_mcp_servers, verify, timeout, retry
                )

            # Wrap with retry if configured
            if retry:
                return await execute_with_retry(
                    lambda attempt: self._execute_filter_item(
                        item, prompt, index, run_id, system_prompt, schema,
                        schema_options, agent, resolved_mcp_servers, timeout, attempt
                    ),
                    retry,
                    item_index=index,
                )
            return await self._execute_filter_item(
                item, prompt, index, run_id, system_prompt, schema,
                schema_options, agent, resolved_mcp_servers, timeout
            )

        evaluated = await asyncio.gather(*[
            process_item(item, i) for i, item in enumerate(items)
        ])

        # Apply condition and set status accordingly
        results: List[SwarmResult] = []
        for r in evaluated:
            if r.status == "error":
                results.append(r)
            elif r.data is not None:
                try:
                    if condition(r.data):
                        results.append(r)  # success
                    else:
                        # Didn't pass condition → filtered
                        results.append(SwarmResult(
                            status="filtered",
                            data=r.data,
                            files=r.files,
                            meta=r.meta,
                            verify=r.verify,
                        ))
                except Exception as e:
                    # Condition threw → error (preserve raw_data if present)
                    results.append(SwarmResult(
                        status="error",
                        data=None,
                        files=r.files,
                        meta=r.meta,
                        error=f"Condition function threw: {e}",
                        raw_data=getattr(r, 'raw_data', None),
                    ))
            else:
                results.append(r)

        return SwarmResultList.from_results(results)
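
    # Illustrative sketch (not part of this file): a threshold filter; `Severity`
    # is a hypothetical Pydantic model with a numeric `score` field.
    #
    #     checked = await swarm.filter(
    #         items=findings,
    #         prompt="Rate the severity of this finding",
    #         schema=Severity,
    #         condition=lambda s: s.score >= 8,
    #     )
    #     high = checked.success    # passed the condition
    #     low = checked.filtered    # evaluated, but did not pass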

    async def reduce(
        self,
        items: List[ItemInput],
        prompt: str,
        system_prompt: Optional[str] = None,
        schema: Optional[SchemaType] = None,
        schema_options: Optional[Dict[str, Any]] = None,
        agent: Optional[AgentConfig] = None,
        mcp_servers: Optional[Dict[str, Any]] = None,
        verify: Optional[VerifyConfig] = None,
        retry: Optional[RetryConfig] = None,
        timeout_ms: Optional[int] = None,
    ) -> ReduceResult:
        """Synthesize many items into one.

        Args:
            items: List of items to reduce
            prompt: Synthesis prompt
            system_prompt: Optional system prompt
            schema: Optional Pydantic model or JSON Schema
            schema_options: Optional validation options
            agent: Optional agent override
            mcp_servers: Optional MCP servers override (replaces swarm default)
            verify: Optional verify configuration for LLM-as-judge quality verification with retry
            retry: Optional retry configuration
            timeout_ms: Optional timeout in ms

        Returns:
            ReduceResult with synthesized output
        """
        await self._ensure_bridge()
        run_id = self._generate_run_id()
        timeout = timeout_ms or self.config.timeout_ms
        retry = retry or self.config.retry
        resolved_mcp_servers = mcp_servers or self.config.mcp_servers

        # Collect files and track original indices
        all_files: List[FileMap] = []
        indices: List[int] = []

        for i, item in enumerate(items):
            all_files.append(self._get_files(item))
            indices.append(self._get_index(item, i))

        # Build context: item_0/, item_1/, etc.
        context: FileMap = {}
        for i, files in enumerate(all_files):
            for name, content in files.items():
                context[f"item_{indices[i]}/{name}"] = content

        # Build reduce system prompt (context structure + user's system_prompt)
        file_tree = build_file_tree(context)
        reduce_context_prompt = apply_template(REDUCE_PROMPT, {"fileTree": file_tree})
        final_system_prompt = (
            f"{reduce_context_prompt}\n\n{system_prompt}"
            if system_prompt
            else reduce_context_prompt
        )

        # Build meta (sandbox_id/tag updated after execution)
        def build_meta(result: Dict[str, Any]) -> ReduceMeta:
            return ReduceMeta(
                run_id=run_id,
                operation="reduce",
                tag=result["tag"],
                sandbox_id=result["sandbox_id"],
                input_count=len(items),
                input_indices=indices,
            )

        # Shared execution logic
        async def execute_once(prompt_to_use: str, tag_prefix: str) -> ReduceResult:
            async with self.semaphore:
                result = await self._execute(
                    context=context,
                    prompt=prompt_to_use,
                    system_prompt=final_system_prompt,
                    schema=schema,
                    schema_options=schema_options,
                    agent=agent,
                    mcp_servers=resolved_mcp_servers,
                    tag_prefix=tag_prefix,
                    timeout=timeout,
                )

            meta = build_meta(result)

            if result.get("error"):
                return ReduceResult(
                    status="error",
                    data=None,
                    files=result["files"],
                    meta=meta,
                    error=result["error"],
                    raw_data=result.get("raw_data"),
                )

            return ReduceResult(
                status="success",
                data=result["data"],
                files=result["files"],
                meta=meta,
            )

        base_tag = f"{self.config.tag}-reduce"

        # verify has internal retry loop with feedback - don't double-wrap with retry
        if verify:
            return await self._run_with_verification(
                worker_fn=lambda current_prompt, tag_prefix: execute_once(current_prompt, tag_prefix),
                original_prompt=prompt,
                input_files=context,
                verify_config=verify,
                mcp_servers=resolved_mcp_servers,
                timeout=timeout,
                system_prompt=final_system_prompt,
                schema=schema,
                run_id=run_id,
                base_tag=base_tag,
                retry=retry,
            )

        # Wrap with retry if configured
        if retry:
            async def execute_fn(attempt: int = 1) -> ReduceResult:
                tag_prefix = f"{base_tag}-er{attempt - 1}" if attempt > 1 else base_tag
                return await execute_once(prompt, tag_prefix)
            return await execute_with_retry(execute_fn, retry)

        return await execute_once(prompt, base_tag)
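
    # Illustrative sketch (not part of this file): with two input items, the
    # reduce context handed to the agent is laid out as:
    #
    #     item_0/doc.txt
    #     item_1/doc.txt
    #
    # Indices come from _get_index(item, i), so the numbering may reflect items'
    # original positions from a previous operation rather than list order.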

    async def best_of(
        self,
        item: ItemInput,
        prompt: str,
        config: BestOfConfig,
        system_prompt: Optional[str] = None,
        schema: Optional[SchemaType] = None,
        schema_options: Optional[Dict[str, Any]] = None,
        retry: Optional[RetryConfig] = None,
        timeout_ms: Optional[int] = None,
    ) -> BestOfResult:
        """Run N candidates on the same task; a judge picks the best.

        Args:
            item: Single item to process
            prompt: Task prompt
            config: BestOf configuration (n, judge_criteria, mcp_servers, etc.)
            system_prompt: Optional system prompt
            schema: Optional Pydantic model or JSON Schema
            schema_options: Optional validation options
            retry: Optional retry configuration for candidates and judge
            timeout_ms: Optional timeout in ms

        Returns:
            BestOfResult with winner, candidates, and judge info
        """
        await self._ensure_bridge()
        retry = retry or self.config.retry

        # Resolve n
        n = config.n or (len(config.task_agents) if config.task_agents else None)
        if n is None:
            raise ValueError("bestOf requires n or task_agents")
        if n < 2:
            raise ValueError("bestOf requires n >= 2")

        run_id = self._generate_run_id()
        timeout = timeout_ms or self.config.timeout_ms
        input_files = self._get_files(item)

        # Resolve MCP servers for candidates and judge
        candidate_mcp_servers = config.mcp_servers or self.config.mcp_servers
        judge_mcp_servers = config.judge_mcp_servers or config.mcp_servers or self.config.mcp_servers

        # Run candidates (semaphore acquired inside _execute_best_of_candidate)
        async def run_candidate(candidate_index: int) -> SwarmResult:
            if retry:
                result = await execute_with_retry(
                    lambda attempt: self._execute_best_of_candidate(
                        input_files=input_files,
                        prompt=prompt,
                        candidate_index=candidate_index,
                        run_id=run_id,
                        config=config,
                        mcp_servers=candidate_mcp_servers,
                        system_prompt=system_prompt,
                        schema=schema,
                        schema_options=schema_options,
                        timeout=timeout,
                        attempt=attempt,
                    ),
                    retry,
                    item_index=0,  # standalone bestOf uses item_index=0
                )
            else:
                result = await self._execute_best_of_candidate(
                    input_files=input_files,
                    prompt=prompt,
                    candidate_index=candidate_index,
                    run_id=run_id,
                    config=config,
                    mcp_servers=candidate_mcp_servers,
                    system_prompt=system_prompt,
                    schema=schema,
                    schema_options=schema_options,
                    timeout=timeout,
                )
            # Call callback after candidate completes
            if config.on_candidate_complete:
                config.on_candidate_complete(0, candidate_index, result.status if result.status != "filtered" else "success")
            return result

        candidates = await asyncio.gather(*[
            run_candidate(i) for i in range(n)
        ])
        candidates = list(candidates)

        # Run judge (semaphore acquired inside _execute_best_of_judge)
        # Judge uses default retry (status == "error"), not custom retry_on
        if retry:
            # Create a copy of retry config without custom retry_on for judge
            judge_retry = RetryConfig(
                max_attempts=retry.max_attempts,
                backoff_ms=retry.backoff_ms,
                backoff_multiplier=retry.backoff_multiplier,
                retry_on=None,  # Use default (status == "error")
            )
            judge = await execute_with_retry(
                lambda attempt: self._execute_best_of_judge(
                    input_files=input_files,
                    task_prompt=prompt,
                    candidates=candidates,
                    config=config,
                    mcp_servers=judge_mcp_servers,
                    timeout=timeout,
                    system_prompt=system_prompt,
                    schema=schema,
                    attempt=attempt,
                ),
                judge_retry
            )
        else:
            judge = await self._execute_best_of_judge(
                input_files=input_files,
                task_prompt=prompt,
                candidates=candidates,
                config=config,
                mcp_servers=judge_mcp_servers,
                timeout=timeout,
                system_prompt=system_prompt,
                schema=schema,
            )

        first_success = next((i for i, c in enumerate(candidates) if c.status == "success"), -1)
        winner_index = judge["winner"] if judge["winner"] is not None else (first_success if first_success >= 0 else 0)

        # Call judge callback
        if config.on_judge_complete:
            config.on_judge_complete(0, winner_index, judge.get("reasoning", ""))

        judge_meta = JudgeMeta(
            run_id=run_id,
            operation="bestof-judge",
            tag=judge["tag"],
            sandbox_id=judge["sandbox_id"],
            candidate_count=n,
        )

        return BestOfResult(
            winner=candidates[winner_index] if winner_index < len(candidates) else candidates[0],
            winner_index=winner_index,
            judge_reasoning=judge.get("reasoning", "Judge failed to provide reasoning"),
            judge_meta=judge_meta,
            candidates=candidates,
        )
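
    # Illustrative sketch (not part of this file): three candidates on one task;
    # `judge_criteria` is assumed from the docstring's BestOfConfig description.
    #
    #     best = await swarm.best_of(
    #         item={"brief.md": "..."},
    #         prompt="Draft a landing page headline",
    #         config=BestOfConfig(n=3, judge_criteria="Pick the clearest headline"),
    #     )
    #     print(best.winner_index, best.judge_reasoning)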

    async def close(self):
        """Close the bridge connection."""
        if self._bridge_started:
            await self.bridge.stop()
            self._bridge_started = False

    async def __aenter__(self):
        """Async context manager entry - ensures bridge is started."""
        await self._ensure_bridge()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit - closes the bridge."""
        await self.close()
        return False
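
    # Illustrative sketch (not part of this file): the context manager guarantees
    # bridge shutdown even if an operation raises (`docs` is a hypothetical list
    # of FileMaps).
    #
    #     async with Swarm() as swarm:
    #         results = await swarm.map(items=docs, prompt="Summarize")
    #     # bridge stopped here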

    # =========================================================================
    # PRIVATE: EXECUTION
    # =========================================================================

    async def _execute(
        self,
        context: FileMap,
        prompt: str,
        system_prompt: Optional[str],
        schema: Optional[SchemaType],
        schema_options: Optional[Dict[str, Any]],
        agent: Optional[AgentConfig],
        mcp_servers: Optional[Dict[str, Any]],
        tag_prefix: str,
        timeout: int,
    ) -> Dict[str, Any]:
        """Execute a single agent task."""
        instance_id = f"{tag_prefix}-{secrets.token_hex(4)}"

        # Build agent config (merges override with base config)
        agent_config = self._build_agent_config(agent)

        # Convert schema to JSON Schema
        json_schema = to_json_schema(schema)

        # Build init params with _filter_none to exclude None values
        # TS SDK resolves defaults from env vars when not provided
        init_params = _filter_none({
            # Agent config (optional - TS SDK resolves from SWARMKIT_API_KEY)
            'agent_type': agent_config.type if agent_config else None,
            'api_key': agent_config.api_key if agent_config else None,
            'model': agent_config.model if agent_config else None,
            'reasoning_effort': agent_config.reasoning_effort if agent_config else None,
            'betas': agent_config.betas if agent_config else None,
            # Sandbox (optional - TS SDK resolves from E2B_API_KEY)
            'sandbox_provider': {'type': self.config.sandbox.type, 'config': self.config.sandbox.config} if self.config.sandbox else None,
            # Other settings
            'workspace_mode': self.config.workspace_mode,
            'session_tag_prefix': tag_prefix,
            'system_prompt': system_prompt,
            'schema': json_schema,
            'schema_options': schema_options,
            'context': _encode_files_for_transport(context) if context else None,
            'mcp_servers': mcp_servers,
        })

        files: FileMap = {}
        data: Any = None
        error: Optional[str] = None
        raw_data: Optional[str] = None
        sandbox_id = ""
        tag = tag_prefix

        try:
            # Create instance
            await self.bridge.create_instance(instance_id, init_params)

            # Run prompt
            run_result = await self.bridge.run_on_instance(
                instance_id,
                prompt,
                timeout_ms=timeout,
                call_timeout_s=(timeout / 1000) + 60,  # Add buffer for RPC overhead
            )
            sandbox_id = run_result.get('sandbox_id', '')

            # Get output
            output = await self.bridge.get_output_on_instance(instance_id, recursive=True)
            files = self._decode_files(output.get('files', {}))

            if run_result.get('exit_code', 0) != 0:
                error = f"Agent exited with code {run_result.get('exit_code')}"
            elif json_schema:
                # Validate result.json against schema
                raw_json = files.get('result.json')
                if raw_json is not None:
                    if isinstance(raw_json, bytes):
                        raw_json = raw_json.decode('utf-8')

                    if is_pydantic_model(schema) or is_dataclass(schema):
                        # Pydantic model or dataclass - validate and return instance
                        try:
                            strict = schema_options.get('mode') == 'strict' if schema_options else False
                            data = validate_and_parse(raw_json, schema, strict=strict)
                        except Exception as e:
                            error = f"Schema validation failed: {e}"
                            raw_data = raw_json
                    else:
                        # JSON Schema dict - use TS validation result
                        data = output.get('data')
                        if output.get('error'):
                            error = output['error']
                        if output.get('raw_data'):
                            raw_data = output['raw_data']
                else:
                    error = "Schema provided but agent did not create output/result.json"
            else:
                data = files

        except Exception as e:
            error = str(e)
            # Try to capture partial output even on failure (e.g., timeout)
            try:
                output = await self.bridge.get_output_on_instance(instance_id, recursive=True)
                files = self._decode_files(output.get('files', {}))
            except Exception:
                pass  # Sandbox may already be gone
        finally:
            # Always cleanup
            try:
                await self.bridge.kill_instance(instance_id)
            except Exception:
                pass

        return {
            "files": files,
            "data": data,
            "tag": tag,
            "sandbox_id": sandbox_id,
            "error": error,
            "raw_data": raw_data,
        }
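
    # Illustrative sketch (not part of this file): each _execute() call follows a
    # create -> run -> collect -> kill lifecycle against the bridge and always
    # returns a plain dict, even on failure:
    #
    #     {"files": {...}, "data": ..., "tag": "...", "sandbox_id": "...",
    #      "error": None, "raw_data": None}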

    # =========================================================================
    # PRIVATE: MAP
    # =========================================================================

    async def _execute_map_item(
        self,
        item: ItemInput,
        prompt: Prompt,
        index: int,
        run_id: str,
        system_prompt: Optional[str],
        schema: Optional[SchemaType],
        schema_options: Optional[Dict[str, Any]],
        agent: Optional[AgentConfig],
        mcp_servers: Optional[Dict[str, Any]],
        timeout: int,
        attempt: int = 1,
    ) -> SwarmResult:
        """Execute a single map item."""
        files = self._get_files(item)
        tag_prefix = (
            f"{self.config.tag}-map-{index}-er{attempt - 1}"
            if attempt > 1
            else f"{self.config.tag}-map-{index}"
        )

        try:
            prompt_str = self._resolve_prompt(prompt, files, index)
        except Exception as e:
            return self._build_error_result(
                f"Prompt function threw: {e}",
                IndexedMeta(run_id=run_id, operation="map", tag=tag_prefix, sandbox_id="", index=index)
            )

        async with self.semaphore:
            result = await self._execute(
                context=files,
                prompt=prompt_str,
                system_prompt=system_prompt,
                schema=schema,
                schema_options=schema_options,
                agent=agent,
                mcp_servers=mcp_servers,
                tag_prefix=tag_prefix,
                timeout=timeout,
            )

        meta = IndexedMeta(
            run_id=run_id,
            operation="map",
            tag=result["tag"],
            sandbox_id=result["sandbox_id"],
            index=index,
        )

        return self._build_result(result, meta)
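
    # Illustrative sketch (not part of this file): sandbox tags encode operation,
    # item index, and error-retry attempt (assuming config.tag == "swarm"):
    #
    #     swarm-map-3        # first attempt for item 3
    #     swarm-map-3-er1    # attempt 2 (first error retry)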

    async def _execute_map_item_with_best_of(
        self,
        item: ItemInput,
        prompt: Prompt,
        index: int,
        run_id: str,
        system_prompt: Optional[str],
        schema: Optional[SchemaType],
        schema_options: Optional[Dict[str, Any]],
        agent: Optional[AgentConfig],
        mcp_servers: Optional[Dict[str, Any]],
        best_of_config: BestOfConfig,
        retry: Optional[RetryConfig],
        timeout: int,
    ) -> SwarmResult:
        """Execute a single map item with bestOf."""
        files = self._get_files(item)
        tag_prefix = f"{self.config.tag}-map-{index}"

        try:
            prompt_str = self._resolve_prompt(prompt, files, index)
        except Exception as e:
            return self._build_error_result(
                f"Prompt function threw: {e}",
                IndexedMeta(run_id=run_id, operation="map", tag=tag_prefix, sandbox_id="", index=index)
            )

        n = best_of_config.n or (len(best_of_config.task_agents) if best_of_config.task_agents else None)
        if n is None or n < 2:
            return self._build_error_result(
                "bestOf requires n >= 2 or task_agents with at least 2 elements",
                IndexedMeta(run_id=run_id, operation="map", tag=tag_prefix, sandbox_id="", index=index)
            )

        # Resolve MCP servers: bestOf.mcp_servers overrides operation-level
        candidate_mcp_servers = best_of_config.mcp_servers or mcp_servers
        judge_mcp_servers = best_of_config.judge_mcp_servers or best_of_config.mcp_servers or mcp_servers

        # Run candidates in parallel (semaphore acquired inside _execute_best_of_candidate)
        async def run_candidate(candidate_index: int) -> SwarmResult:
            if retry:
                result = await execute_with_retry(
                    lambda attempt: self._execute_best_of_candidate(
                        input_files=files,
                        prompt=prompt_str,
                        candidate_index=candidate_index,
                        run_id=run_id,
                        config=best_of_config,
                        mcp_servers=candidate_mcp_servers,
                        system_prompt=system_prompt,
                        schema=schema,
                        schema_options=schema_options,
                        timeout=timeout,
                        parent_index=index,
                        attempt=attempt,
                    ),
                    retry,
                    item_index=index,  # map item index
                )
            else:
                result = await self._execute_best_of_candidate(
                    input_files=files,
                    prompt=prompt_str,
                    candidate_index=candidate_index,
                    run_id=run_id,
                    config=best_of_config,
                    mcp_servers=candidate_mcp_servers,
                    system_prompt=system_prompt,
                    schema=schema,
                    schema_options=schema_options,
                    timeout=timeout,
                    parent_index=index,
                )
            # Call callback after candidate completes
            if best_of_config.on_candidate_complete:
                best_of_config.on_candidate_complete(index, candidate_index, result.status if result.status != "filtered" else "success")
            return result

        candidates = list(await asyncio.gather(*[
            run_candidate(i) for i in range(n)
        ]))

        # Run judge (semaphore acquired inside _execute_best_of_judge)
        # Judge uses default retry (status == "error"), not custom retry_on
        if retry:
            judge_retry = RetryConfig(
                max_attempts=retry.max_attempts,
                backoff_ms=retry.backoff_ms,
                backoff_multiplier=retry.backoff_multiplier,
                retry_on=None,
            )
            judge = await execute_with_retry(
                lambda attempt: self._execute_best_of_judge(
                    input_files=files,
                    task_prompt=prompt_str,
                    candidates=candidates,
                    config=best_of_config,
                    mcp_servers=judge_mcp_servers,
                    timeout=timeout,
                    system_prompt=system_prompt,
                    schema=schema,
                    parent_index=index,
                    attempt=attempt,
                ),
                judge_retry
            )
        else:
            judge = await self._execute_best_of_judge(
                input_files=files,
                task_prompt=prompt_str,
                candidates=candidates,
                config=best_of_config,
                mcp_servers=judge_mcp_servers,
                timeout=timeout,
                system_prompt=system_prompt,
                schema=schema,
                parent_index=index,
            )

        first_success = next((i for i, c in enumerate(candidates) if c.status == "success"), -1)
        winner_index = judge["winner"] if judge["winner"] is not None else (first_success if first_success >= 0 else 0)
        winner = candidates[winner_index] if winner_index < len(candidates) else candidates[0]

        # Call judge callback with map item index
        if best_of_config.on_judge_complete:
            best_of_config.on_judge_complete(index, winner_index, judge.get("reasoning", ""))

        judge_meta = JudgeMeta(
            run_id=run_id,
            operation="bestof-judge",
            tag=judge["tag"],
            sandbox_id=judge["sandbox_id"],
            candidate_count=n,
        )

        # Return winner with bestOf info
        return SwarmResult(
            status=winner.status,
            data=winner.data,
            files=winner.files,
            meta=IndexedMeta(
                run_id=run_id,
                operation="map",
                tag=winner.meta.tag,
                sandbox_id=winner.meta.sandbox_id,
                index=index,
            ),
            error=winner.error,
            raw_data=winner.raw_data,
            best_of=BestOfInfo(
                winner_index=winner_index,
                judge_reasoning=judge.get("reasoning", "Judge failed to provide reasoning"),
                judge_meta=judge_meta,
                candidates=candidates,
            ),
        )
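
    # Illustrative sketch (not part of this file): winner selection degrades
    # gracefully when the judge abstains.
    #
    #     judge["winner"] == 2       -> winner_index = 2
    #     judge["winner"] is None    -> first candidate with status == "success"
    #     no successful candidate    -> candidate 0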

    async def _execute_map_item_with_verify(
        self,
        item: ItemInput,
        prompt: Prompt,
        index: int,
        run_id: str,
        system_prompt: Optional[str],
        schema: Optional[SchemaType],
        schema_options: Optional[Dict[str, Any]],
        agent: Optional[AgentConfig],
        mcp_servers: Optional[Dict[str, Any]],
        verify_config: VerifyConfig,
        timeout: int,
        retry: Optional[RetryConfig] = None,
    ) -> SwarmResult:
        """Execute a single map item with verification."""
        files = self._get_files(item)
        base_tag = f"{self.config.tag}-map-{index}"

        try:
            prompt_str = self._resolve_prompt(prompt, files, index)
        except Exception as e:
            return self._build_error_result(
                f"Prompt function threw: {e}",
                IndexedMeta(run_id=run_id, operation="map", tag=base_tag, sandbox_id="", index=index)
            )

        # Worker function that executes map item (tag_prefix managed by _run_with_verification)
        async def worker_fn(current_prompt: str, tag_prefix: str) -> SwarmResult:
            async with self.semaphore:
                result = await self._execute(
                    context=files,
                    prompt=current_prompt,
                    system_prompt=system_prompt,
                    schema=schema,
                    schema_options=schema_options,
                    agent=agent,
                    mcp_servers=mcp_servers,
                    tag_prefix=tag_prefix,
                    timeout=timeout,
                )

            meta = IndexedMeta(
                run_id=run_id,
                operation="map",
                tag=result["tag"],
                sandbox_id=result["sandbox_id"],
                index=index,
            )

            return self._build_result(result, meta)

        # Run with verification loop
        return await self._run_with_verification(
            worker_fn=worker_fn,
            original_prompt=prompt_str,
            input_files=files,
            verify_config=verify_config,
            mcp_servers=mcp_servers,
            timeout=timeout,
            system_prompt=system_prompt,
            schema=schema,
            run_id=run_id,
            base_tag=base_tag,
            retry=retry,
            item_index=index,
        )

    # =========================================================================
    # PRIVATE: FILTER
    # =========================================================================

    async def _execute_filter_item(
        self,
        item: ItemInput,
        prompt: str,
        index: int,
        run_id: str,
        system_prompt: Optional[str],
        schema: SchemaType,
        schema_options: Optional[Dict[str, Any]],
        agent: Optional[AgentConfig],
        mcp_servers: Optional[Dict[str, Any]],
        timeout: int,
        attempt: int = 1,
    ) -> SwarmResult:
        """Execute a single filter item."""
        original_files = self._get_files(item)
        tag_prefix = (
            f"{self.config.tag}-filter-{index}-er{attempt - 1}"
            if attempt > 1
            else f"{self.config.tag}-filter-{index}"
        )

        async with self.semaphore:
            result = await self._execute(
                context=original_files,
                prompt=prompt,
                system_prompt=system_prompt,
                schema=schema,
                schema_options=schema_options,
                agent=agent,
                mcp_servers=mcp_servers,
                tag_prefix=tag_prefix,
                timeout=timeout,
            )

        meta = IndexedMeta(
            run_id=run_id,
            operation="filter",
            tag=result["tag"],
            sandbox_id=result["sandbox_id"],
            index=index,
        )

        # Filter passes through ORIGINAL files, not output
        return self._build_result(result, meta, files_override=original_files)

    async def _execute_filter_item_with_verify(
        self,
        item: ItemInput,
        prompt: str,
        index: int,
        run_id: str,
        system_prompt: Optional[str],
        schema: SchemaType,
        schema_options: Optional[Dict[str, Any]],
        agent: Optional[AgentConfig],
        mcp_servers: Optional[Dict[str, Any]],
        verify_config: VerifyConfig,
        timeout: int,
        retry: Optional[RetryConfig] = None,
    ) -> SwarmResult:
        """Execute a single filter item with verification."""
        original_files = self._get_files(item)
        base_tag = f"{self.config.tag}-filter-{index}"

        # Worker function that executes filter item (tag_prefix managed by _run_with_verification)
        async def worker_fn(current_prompt: str, tag_prefix: str) -> SwarmResult:
            async with self.semaphore:
                result = await self._execute(
                    context=original_files,
                    prompt=current_prompt,
                    system_prompt=system_prompt,
                    schema=schema,
                    schema_options=schema_options,
                    agent=agent,
                    mcp_servers=mcp_servers,
                    tag_prefix=tag_prefix,
                    timeout=timeout,
                )

            meta = IndexedMeta(
                run_id=run_id,
                operation="filter",
                tag=result["tag"],
                sandbox_id=result["sandbox_id"],
                index=index,
            )

            # Filter passes through ORIGINAL files, not output
            return self._build_result(result, meta, files_override=original_files)

        # Run with verification loop
        return await self._run_with_verification(
            worker_fn=worker_fn,
            original_prompt=prompt,
            input_files=original_files,
            verify_config=verify_config,
            mcp_servers=mcp_servers,
            timeout=timeout,
            system_prompt=system_prompt,
            schema=schema,
            run_id=run_id,
            base_tag=base_tag,
            retry=retry,
            item_index=index,
        )

    # =========================================================================
    # PRIVATE: VERIFY
    # =========================================================================

    async def _run_with_verification(
        self,
        worker_fn: Callable[[str, str], Any],  # async function(prompt, tag_prefix) -> result with status and files
        original_prompt: str,
        input_files: FileMap,
        verify_config: VerifyConfig,
        mcp_servers: Optional[Dict[str, Any]],
        timeout: int,
        system_prompt: Optional[str],
        schema: Optional[SchemaType],
        run_id: str,
        base_tag: str,
        retry: Optional[RetryConfig] = None,
        item_index: int = 0,
    ) -> Any:
        """Shared verification loop for map, filter, and reduce.

        Runs worker function, verifies output, retries with feedback if needed.

        Args:
            worker_fn: Async function that executes the worker with a given prompt and tag prefix
            original_prompt: The original user prompt
            input_files: Input files for the worker
            verify_config: Verification configuration
            mcp_servers: MCP servers for verifier (resolved from operation or swarm)
            timeout: Timeout in ms
            system_prompt: Optional system prompt
            schema: Optional schema
            run_id: Run ID for metadata
            base_tag: Base tag for worker/verifier
            retry: Optional retry config for verifier error retry
            item_index: Item index for callbacks (default: 0 for reduce)

        Returns:
            Result with verify info attached
        """
        # Resolve verifier MCP servers
        verifier_mcp_servers = verify_config.verifier_mcp_servers or mcp_servers
        max_attempts = verify_config.max_attempts

        current_prompt = original_prompt
        last_result = None
        verify_attempts = 0

        while verify_attempts < max_attempts:
            verify_attempts += 1

            # Build worker tag: base_tag, base_tag-vr1, base_tag-vr2, etc. (vr = verify retry)
            worker_tag = f"{base_tag}-vr{verify_attempts - 1}" if verify_attempts > 1 else base_tag

            # Run worker (with error retry if configured)
            # Worker keeps retry_on (user-specified condition) and gets -er{n} tag suffix for error retries
            if retry:
                async def worker_with_retry(retry_attempt: int = 1):
                    tag = f"{worker_tag}-er{retry_attempt - 1}" if retry_attempt > 1 else worker_tag
                    return await worker_fn(current_prompt, tag)
                worker_result = await execute_with_retry(worker_with_retry, retry)
            else:
                worker_result = await worker_fn(current_prompt, worker_tag)

            # If worker failed even after retries, return immediately
            if worker_result.status == "error":
                # Call worker callback with error status
                if verify_config.on_worker_complete:
                    verify_config.on_worker_complete(item_index, verify_attempts, "error")
                return worker_result

            # Call worker callback with success status
            if verify_config.on_worker_complete:
                verify_config.on_worker_complete(item_index, verify_attempts, "success")

            last_result = worker_result

            # Run verification (verifier tag = worker_tag-verify, with error retry like judge)
            if retry:
                async def verify_with_retry(retry_attempt: int = 1):
                    return await self._execute_verify(
                        input_files=input_files,
                        output_files=worker_result.files,
                        task_prompt=current_prompt,
                        config=verify_config,
                        mcp_servers=verifier_mcp_servers,
                        timeout=timeout,
                        system_prompt=system_prompt,
                        schema=schema,
                        run_id=run_id,
                        worker_tag=worker_tag,
                        retry_attempt=retry_attempt,
                    )
                # Use retry but ignore custom retry_on (like judge)
                retry_config = RetryConfig(
                    max_attempts=retry.max_attempts,
                    backoff_ms=retry.backoff_ms,
                    backoff_multiplier=retry.backoff_multiplier,
                )
                verification = await execute_with_retry(verify_with_retry, retry_config)
            else:
                verification = await self._execute_verify(
                    input_files=input_files,
                    output_files=worker_result.files,
                    task_prompt=current_prompt,
                    config=verify_config,
                    mcp_servers=verifier_mcp_servers,
                    timeout=timeout,
                    system_prompt=system_prompt,
                    schema=schema,
                    run_id=run_id,
                    worker_tag=worker_tag,
                )

            # Call verifier callback
            if verify_config.on_verifier_complete:
                verify_config.on_verifier_complete(
                    item_index,
                    verify_attempts,
                    bool(verification.get("passed")),
                    verification.get("feedback"),
                )

            # Build verify meta
            verify_meta = VerifyMeta(
                run_id=run_id,
                operation="verify",
                tag=verification["tag"],
                sandbox_id=verification["sandbox_id"],
                attempts=verify_attempts,
            )

            # If verification passed, return result with verify info
            if verification.get("passed"):
                # Create a new result with verify info attached
                return self._attach_verify_info(
                    worker_result,
                    VerifyInfo(
                        passed=True,
                        reasoning=verification.get("reasoning", ""),
                        verify_meta=verify_meta,
                        attempts=verify_attempts,
                    )
                )

            # If verification failed and we have attempts left, rebuild prompt with feedback
            if verify_attempts < max_attempts:
                feedback = verification.get("feedback") or verification.get("reasoning") or "Output did not meet criteria"
                current_prompt = self._build_retry_prompt_with_feedback(original_prompt, feedback)

        # Max retries exceeded - return last result with error status and verify info
        # Use last worker tag for consistency
        last_worker_tag = f"{base_tag}-vr{verify_attempts - 1}" if verify_attempts > 1 else base_tag
        verify_meta = VerifyMeta(
            run_id=run_id,
            operation="verify",
            tag=f"{last_worker_tag}-verifier",
            sandbox_id="",
            attempts=verify_attempts,
        )

        return self._attach_verify_info(
            last_result,
            VerifyInfo(
                passed=False,
                reasoning="Max verification retries exceeded",
                verify_meta=verify_meta,
                attempts=verify_attempts,
            ),
            force_error=True
        )
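
    # Illustrative sketch (not part of this file): tag suffixes compose across the
    # verification loop (vr = verify retry, er = error retry); for a map item with
    # config.tag == "swarm":
    #
    #     swarm-map-0                   # first worker attempt
    #     swarm-map-0-verifier          # its verifier
    #     swarm-map-0-vr1               # worker rerun after failed verification
    #     swarm-map-0-vr1-er1           # error retry of that rerun
    #     swarm-map-0-vr1-verifier      # verifier for the rerun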

    async def _execute_verify(
        self,
        input_files: FileMap,
        output_files: FileMap,
        task_prompt: str,
        config: VerifyConfig,
        mcp_servers: Optional[Dict[str, Any]],
        timeout: int,
        system_prompt: Optional[str],
        schema: Optional[SchemaType],
        run_id: str,
        worker_tag: str,
        retry_attempt: int = 1,
    ) -> Dict[str, Any]:
        """Execute verifier to check if output meets criteria."""
        # Verifier tag = worker_tag-verifier, with -er{n} suffix for error retries
        tag_prefix = (
            f"{worker_tag}-verifier-er{retry_attempt - 1}"
            if retry_attempt > 1
            else f"{worker_tag}-verifier"
        )

        # Build verify context
        context = self._build_verify_context(
            input_files=input_files,
            task_prompt=task_prompt,
            output_files=output_files,
            system_prompt=system_prompt,
            schema=schema,
        )

        # Build verify system prompt
        file_tree = build_file_tree(context)
        verify_system_prompt = apply_template(VERIFY_PROMPT, {
            "criteria": config.criteria,
            "fileTree": file_tree,
        })

        # Verify schema (always JSON Schema dict for simplicity)
        verify_schema = {
            "type": "object",
            "properties": {
                "passed": {"type": "boolean"},
                "reasoning": {"type": "string"},
                "feedback": {"type": "string"},
            },
            "required": ["passed", "reasoning"],
        }

        async with self.semaphore:
            result = await self._execute(
                context=context,
                prompt=VERIFY_USER_PROMPT,
                system_prompt=verify_system_prompt,
                schema=verify_schema,
                schema_options=None,
                agent=config.verifier_agent,
                mcp_servers=mcp_servers,
                tag_prefix=tag_prefix,
                timeout=timeout,
            )

        passed = None
        reasoning = "Verification completed"
        feedback = None

        if result.get("data") and not result.get("error"):
            data = result["data"]
            if isinstance(data, dict):
                passed = data.get("passed")
                reasoning = data.get("reasoning", reasoning)
                feedback = data.get("feedback")
        elif result.get("raw_data"):
            # Validation failed but we have raw data - try to extract
            try:
                raw = json.loads(result["raw_data"])
                passed = bool(raw.get("passed"))
                reasoning = raw.get("reasoning", reasoning)
                feedback = raw.get("feedback")
            except Exception:
                warnings.warn(
                    f"Verify validation failed: {result.get('error')}",
                    stacklevel=2
                )

        return {
            "status": "success" if passed is not None else "error",
            "passed": passed,
            "reasoning": reasoning,
            "feedback": feedback,
            "tag": result["tag"],
            "sandbox_id": result["sandbox_id"],
            "error": None if passed is not None else "Verifier failed to produce valid decision",
        }
|
|
1389
|
+
|
|
1390
|
+
def _build_verify_context(
|
|
1391
|
+
self,
|
|
1392
|
+
input_files: FileMap,
|
|
1393
|
+
task_prompt: str,
|
|
1394
|
+
output_files: FileMap,
|
|
1395
|
+
system_prompt: Optional[str],
|
|
1396
|
+
schema: Optional[SchemaType],
|
|
1397
|
+
) -> FileMap:
|
|
1398
|
+
"""Build verify context containing worker task info and output to verify."""
        # Start with shared worker_task structure
        context = self._build_evaluator_context(
            input_files=input_files,
            task_prompt=task_prompt,
            system_prompt=system_prompt,
            schema=schema,
        )

        # Add output files to verify
        for name, content in output_files.items():
            context[f"worker_output/{name}"] = content

        return context

    def _build_evaluator_context(
        self,
        input_files: FileMap,
        task_prompt: str,
        system_prompt: Optional[str],
        schema: Optional[SchemaType],
    ) -> FileMap:
        """Build evaluator context (shared by judge and verify).

        Creates worker_task/ structure with input files, prompts, schema.
        """
        context: FileMap = {}

        if system_prompt:
            context["worker_task/system_prompt.txt"] = system_prompt
        context["worker_task/user_prompt.txt"] = task_prompt

        json_schema = to_json_schema(schema)
        if json_schema:
            context["worker_task/schema.json"] = json.dumps(json_schema, indent=2)

        for name, content in input_files.items():
            context[f"worker_task/input/{name}"] = content

        return context

    @staticmethod
    def _build_retry_prompt_with_feedback(original_prompt: str, feedback: str) -> str:
        """Build a retry prompt with verifier feedback."""
        return apply_template(RETRY_FEEDBACK_PROMPT, {
            "originalPrompt": original_prompt,
            "feedback": feedback,
        })

    def _attach_verify_info(
        self,
        result: Any,
        verify_info: VerifyInfo,
        force_error: bool = False,
    ) -> Any:
        """Attach verify info to a result, creating a new result object."""
        if isinstance(result, SwarmResult):
            return SwarmResult(
                status="error" if force_error else result.status,
                data=result.data,
                files=result.files,
                meta=result.meta,
                error=result.error,
                raw_data=result.raw_data,
                best_of=result.best_of,
                verify=verify_info,
            )
        elif isinstance(result, ReduceResult):
            return ReduceResult(
                status="error" if force_error else result.status,
                data=result.data,
                files=result.files,
                meta=result.meta,
                error=result.error,
                raw_data=result.raw_data,
                verify=verify_info,
            )
        else:
            # Fallback - just set verify attribute
            result.verify = verify_info
            if force_error:
                result.status = "error"
            return result

    # =========================================================================
    # PRIVATE: BESTOF
    # =========================================================================

    async def _execute_best_of_candidate(
        self,
        input_files: FileMap,
        prompt: str,
        candidate_index: int,
        run_id: str,
        config: BestOfConfig,
        mcp_servers: Optional[Dict[str, Any]],
        system_prompt: Optional[str],
        schema: Optional[SchemaType],
        schema_options: Optional[Dict[str, Any]],
        timeout: int,
        parent_index: Optional[int] = None,
        attempt: int = 1,
    ) -> SwarmResult:
        """Execute a single bestOf candidate."""
        base_tag = (
            f"{self.config.tag}-map-{parent_index}-bestof-cand-{candidate_index}"
            if parent_index is not None
            else f"{self.config.tag}-bestof-cand-{candidate_index}"
        )
        tag_prefix = f"{base_tag}-er{attempt - 1}" if attempt > 1 else base_tag
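        # Illustrative tags, assuming a hypothetical config.tag "run1": candidate 1
        # under map item 2 runs as "run1-map-2-bestof-cand-1"; its first error
        # retry appends "-er1".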

        # Get agent override for this candidate
        candidate_agent = (
            config.task_agents[candidate_index]
            if config.task_agents and candidate_index < len(config.task_agents)
            else None
        )
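        # If task_agents has fewer entries than there are candidates, the
        # remaining candidates fall back to the swarm's default agent config.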

        # Acquire semaphore here (inside retry loop) so it's released during backoff
        async with self.semaphore:
            result = await self._execute(
                context=input_files,
                prompt=prompt,
                system_prompt=system_prompt,
                schema=schema,
                schema_options=schema_options,
                agent=candidate_agent,
                mcp_servers=mcp_servers,
                tag_prefix=tag_prefix,
                timeout=timeout,
            )

        meta = IndexedMeta(
            run_id=run_id,
            operation="bestof-cand",
            tag=result["tag"],
            sandbox_id=result["sandbox_id"],
            index=candidate_index,
        )

        return self._build_result(result, meta)

    async def _execute_best_of_judge(
        self,
        input_files: FileMap,
        task_prompt: str,
        candidates: List[SwarmResult],
        config: BestOfConfig,
        mcp_servers: Optional[Dict[str, Any]],
        timeout: int,
        system_prompt: Optional[str],
        schema: Optional[SchemaType],
        parent_index: Optional[int] = None,
        attempt: int = 1,
    ) -> Dict[str, Any]:
        """Execute bestOf judge.

        Returns a dict with status field for retry compatibility.
        """
        base_tag = (
            f"{self.config.tag}-map-{parent_index}-bestof-judge"
            if parent_index is not None
            else f"{self.config.tag}-bestof-judge"
        )
        tag_prefix = f"{base_tag}-er{attempt - 1}" if attempt > 1 else base_tag

        # Build judge context
        context = self._build_judge_context(
            input_files=input_files,
            task_prompt=task_prompt,
            candidates=candidates,
            system_prompt=system_prompt,
            schema=schema,
        )

        # Build judge system prompt
        file_tree = build_file_tree(context)
        judge_system_prompt = apply_template(JUDGE_PROMPT, {
            "candidateCount": str(len(candidates)),
            "criteria": config.judge_criteria,
            "fileTree": file_tree,
        })

        # Judge schema (always JSON Schema dict for simplicity)
        judge_schema = {
            "type": "object",
            "properties": {
                "winner": {"type": "integer", "minimum": 0, "maximum": len(candidates) - 1},
                "reasoning": {"type": "string"},
            },
            "required": ["winner", "reasoning"],
        }
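        # A conforming judge response might look like (hypothetical values):
        #   {"winner": 1, "reasoning": "Candidate 1 satisfies all criteria."}
        # "winner" is a zero-based index bounded by the candidate count.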

        # Acquire semaphore here (inside retry loop) so it's released during backoff
        async with self.semaphore:
            result = await self._execute(
                context=context,
                prompt=JUDGE_USER_PROMPT,
                system_prompt=judge_system_prompt,
                schema=judge_schema,
                schema_options=None,
                agent=config.judge_agent,
                mcp_servers=mcp_servers,
                tag_prefix=tag_prefix,
                timeout=timeout,
            )

        winner = None
        reasoning = "Judge failed to provide reasoning"

        if result.get("data") and not result.get("error"):
            data = result["data"]
            if isinstance(data, dict):
                winner = data.get("winner")
                reasoning = data.get("reasoning", reasoning)
        elif result.get("raw_data"):
            # Validation failed but we have raw data - extract reasoning and default winner to 0
            try:
                raw = json.loads(result["raw_data"])
                warnings.warn(
                    f"Judge returned invalid winner {raw.get('winner')}, defaulting to candidate 0",
                    stacklevel=2,
                )
                winner = 0
                reasoning = raw.get("reasoning", reasoning)
            except Exception:
                warnings.warn(
                    f"Judge validation failed: {result.get('error')}",
                    stacklevel=2,
                )

        return {
            "status": "success" if winner is not None else "error",
            "winner": winner,
            "reasoning": reasoning,
            "tag": result["tag"],
            "sandbox_id": result["sandbox_id"],
            "error": None if winner is not None else "Judge failed to produce valid decision",
        }

    def _build_judge_context(
        self,
        input_files: FileMap,
        task_prompt: str,
        candidates: List[SwarmResult],
        system_prompt: Optional[str],
        schema: Optional[SchemaType],
    ) -> FileMap:
        """Build judge context containing worker task info and candidate outputs."""
        # Start with shared worker_task structure
        context = self._build_evaluator_context(
            input_files=input_files,
            task_prompt=task_prompt,
            system_prompt=system_prompt,
            schema=schema,
        )

        # Add candidate outputs
        for i, c in enumerate(candidates):
            if c.status == "error":
                context[f"candidate_{i}/_failed.txt"] = f"STATUS: FAILED\n\nError: {c.error or 'Unknown error'}"
            for name, content in c.files.items():
                context[f"candidate_{i}/{name}"] = content

        return context

    # =========================================================================
    # PRIVATE: UTILITIES
    # =========================================================================

    def _generate_run_id(self) -> str:
        """Generate a unique run ID."""
        return secrets.token_hex(8)

    def _get_files(self, item: ItemInput) -> FileMap:
        """Extract files from an item (FileMap or SwarmResult)."""
        if is_swarm_result(item):
            files = dict(item.files)
            # Rename result.json → data.json for clarity when used as input
            if "result.json" in files:
                files["data.json"] = files.pop("result.json")
            return files
        return item

    def _get_index(self, item: ItemInput, fallback: int) -> int:
        """Get index from item (for SwarmResult) or use fallback."""
        if is_swarm_result(item):
            return item.meta.index
        return fallback

    def _resolve_prompt(self, prompt: Prompt, files: FileMap, index: int) -> str:
        """Resolve prompt (string or callable) to string."""
        return prompt(files, index) if callable(prompt) else prompt

    def _build_agent_config(self, override: Optional[AgentConfig]) -> Optional[AgentConfig]:
        """Build agent config with optional override.

        If an override is provided, merge it with the base config (api_key is
        inherited from the base). If there is no override and no base config,
        return None (the TS SDK resolves credentials from env).
        """
        base = self.config.agent
        if override:
            return AgentConfig(
                type=override.type,
                api_key=base.api_key if base else None,
                model=override.model,
                reasoning_effort=override.reasoning_effort,
                betas=override.betas,
            )
        return base

    def _build_result(
        self,
        result: Dict[str, Any],
        meta: IndexedMeta,
        files_override: Optional[FileMap] = None,
    ) -> SwarmResult:
        """Build SwarmResult from execution result."""
        files = files_override if files_override is not None else result["files"]

        if result.get("error"):
            return SwarmResult(
                status="error",
                data=None,
                files=files,
                meta=meta,
                error=result["error"],
                raw_data=result.get("raw_data"),
            )

        return SwarmResult(
            status="success",
            data=result["data"],
            files=files,
            meta=meta,
        )

    def _build_error_result(self, error: str, meta: IndexedMeta) -> SwarmResult:
        """Build error SwarmResult."""
        return SwarmResult(
            status="error",
            data=None,
            files={},
            meta=meta,
            error=error,
        )

    def _decode_files(self, encoded: Dict[str, Any]) -> FileMap:
        """Decode files from bridge response."""
        files: FileMap = {}
        for name, file_data in encoded.items():
            content = file_data.get('content', '')
            encoding = file_data.get('encoding', 'text')
            if encoding == 'base64':
                files[name] = base64.b64decode(content)
            else:
                files[name] = content
        return files