fleet-python 0.2.91__tar.gz → 0.2.93__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fleet_python-0.2.91/fleet_python.egg-info → fleet_python-0.2.93}/PKG-INFO +1 -1
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/__init__.py +1 -1
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/__init__.py +1 -1
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/base.py +1 -1
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/agent/gemini_cua/agent.py +75 -21
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/agent/gemini_cua/mcp_server.py +28 -4
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/agent/orchestrator.py +91 -10
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/agent/types.py +1 -1
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/base.py +1 -1
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/cli.py +76 -2
- {fleet_python-0.2.91 → fleet_python-0.2.93/fleet_python.egg-info}/PKG-INFO +1 -1
- {fleet_python-0.2.91 → fleet_python-0.2.93}/pyproject.toml +1 -1
- {fleet_python-0.2.91 → fleet_python-0.2.93}/LICENSE +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/README.md +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/diff_example.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/dsl_example.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/example.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/exampleResume.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/example_account.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/example_action_log.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/example_client.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/example_mcp_anthropic.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/example_mcp_openai.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/example_sync.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/example_task.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/example_tasks.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/example_verifier.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/export_tasks.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/fetch_tasks.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/gemini_example.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/import_tasks.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/iterate_verifiers.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/json_tasks_example.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/nova_act_example.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/openai_example.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/openai_simple_example.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/query_builder_example.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/quickstart.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/examples/test_cdp_logging.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/client.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/env/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/env/client.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/exceptions.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/global_client.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/instance/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/instance/base.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/instance/client.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/models.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/resources/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/resources/base.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/resources/browser.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/resources/mcp.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/resources/sqlite.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/tasks.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/verifiers/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/verifiers/bundler.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/_async/verifiers/verifier.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/agent/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/agent/gemini_cua/Dockerfile +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/agent/gemini_cua/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/agent/gemini_cua/playwright_utils.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/agent/gemini_cua/requirements.txt +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/agent/gemini_cua/start.sh +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/agent/utils.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/client.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/config.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/env/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/env/client.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/eval/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/eval/uploader.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/exceptions.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/global_client.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/instance/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/instance/base.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/instance/client.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/instance/models.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/models.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/proxy/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/proxy/proxy.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/proxy/whitelist.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/resources/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/resources/base.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/resources/browser.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/resources/mcp.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/resources/sqlite.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/tasks.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/types.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/utils/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/utils/http_logging.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/utils/logging.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/utils/playwright.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/verifiers/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/verifiers/bundler.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/verifiers/code.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/verifiers/db.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/verifiers/decorator.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/verifiers/parse.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/verifiers/sql_differ.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet/verifiers/verifier.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet_python.egg-info/SOURCES.txt +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet_python.egg-info/dependency_links.txt +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet_python.egg-info/entry_points.txt +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet_python.egg-info/requires.txt +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/fleet_python.egg-info/top_level.txt +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/scripts/fix_sync_imports.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/scripts/unasync.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/setup.cfg +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/tests/__init__.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/tests/test_app_method.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/tests/test_expect_only.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/tests/test_instance_dispatch.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/tests/test_sqlite_resource_dual_mode.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/tests/test_sqlite_shared_memory_behavior.py +0 -0
- {fleet_python-0.2.91 → fleet_python-0.2.93}/tests/test_verifier_from_string.py +0 -0
|
@@ -8,7 +8,7 @@ Env vars:
|
|
|
8
8
|
FLEET_TASK_PROMPT: Task prompt
|
|
9
9
|
FLEET_TASK_KEY: Task key
|
|
10
10
|
FLEET_MODEL: Model (default: gemini-2.5-pro)
|
|
11
|
-
FLEET_MAX_STEPS: Max steps (default:
|
|
11
|
+
FLEET_MAX_STEPS: Max steps (default: 200)
|
|
12
12
|
FLEET_VERBOSE: Enable verbose logging (default: false)
|
|
13
13
|
USE_OAUTH: Use gcloud OAuth instead of API key (default: false)
|
|
14
14
|
GOOG_PROJECT: Google Cloud project for OAuth (default: gemini-agents-area)
|
|
@@ -95,22 +95,33 @@ class MCP:
|
|
|
95
95
|
|
|
96
96
|
async def __aenter__(self):
|
|
97
97
|
# Connect using streamable-http transport
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
98
|
+
print(f"MCP: Connecting to {self.url}...")
|
|
99
|
+
try:
|
|
100
|
+
self._client = streamable_http_client(self.url)
|
|
101
|
+
read, write, _ = await self._client.__aenter__()
|
|
102
|
+
self._session = ClientSession(read, write)
|
|
103
|
+
await self._session.__aenter__()
|
|
104
|
+
await self._session.initialize()
|
|
105
|
+
print(f"MCP: Connected successfully")
|
|
106
|
+
except Exception as e:
|
|
107
|
+
print(f"MCP: Connection failed: {type(e).__name__}: {e}")
|
|
108
|
+
raise
|
|
103
109
|
|
|
104
110
|
# Fetch available tools from server
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
111
|
+
try:
|
|
112
|
+
result = await self._session.list_tools()
|
|
113
|
+
self._tools = [
|
|
114
|
+
{
|
|
115
|
+
"name": tool.name,
|
|
116
|
+
"description": tool.description or "",
|
|
117
|
+
"inputSchema": tool.inputSchema,
|
|
118
|
+
}
|
|
119
|
+
for tool in result.tools
|
|
120
|
+
]
|
|
121
|
+
print(f"MCP: Loaded {len(self._tools)} tools")
|
|
122
|
+
except Exception as e:
|
|
123
|
+
print(f"MCP: Failed to list tools: {type(e).__name__}: {e}")
|
|
124
|
+
raise
|
|
114
125
|
return self
|
|
115
126
|
|
|
116
127
|
async def __aexit__(self, *args):
|
|
@@ -212,6 +223,8 @@ class GeminiAgent:
|
|
|
212
223
|
self.client = get_gemini_client()
|
|
213
224
|
self.transcript: List[Dict] = []
|
|
214
225
|
self.session = session # Fleet session for live logging
|
|
226
|
+
self._consecutive_errors = 0
|
|
227
|
+
self._max_consecutive_errors = 5
|
|
215
228
|
|
|
216
229
|
async def _execute_tool(self, name: str, args: Dict) -> Dict:
|
|
217
230
|
return await self.mcp.call(name, args)
|
|
@@ -287,9 +300,27 @@ STRICT RULES:
|
|
|
287
300
|
contents=history,
|
|
288
301
|
config=config,
|
|
289
302
|
)
|
|
303
|
+
self._consecutive_errors = 0 # Reset on success
|
|
290
304
|
except Exception as e:
|
|
291
|
-
|
|
292
|
-
|
|
305
|
+
self._consecutive_errors += 1
|
|
306
|
+
error_type = type(e).__name__
|
|
307
|
+
print(f"API error ({error_type}): {e}")
|
|
308
|
+
print(f" Consecutive errors: {self._consecutive_errors}/{self._max_consecutive_errors}")
|
|
309
|
+
|
|
310
|
+
if self._consecutive_errors >= self._max_consecutive_errors:
|
|
311
|
+
return self._result(False, f"Too many consecutive API errors: {error_type}: {e}", step, start_time)
|
|
312
|
+
|
|
313
|
+
# Check for retryable errors
|
|
314
|
+
if "429" in str(e) or "quota" in str(e).lower() or "rate" in str(e).lower():
|
|
315
|
+
print(f" Rate limited, waiting 10s...")
|
|
316
|
+
await asyncio.sleep(10)
|
|
317
|
+
continue
|
|
318
|
+
elif "503" in str(e) or "500" in str(e) or "overloaded" in str(e).lower():
|
|
319
|
+
print(f" Server error, waiting 5s...")
|
|
320
|
+
await asyncio.sleep(5)
|
|
321
|
+
continue
|
|
322
|
+
else:
|
|
323
|
+
return self._result(False, f"{error_type}: {e}", step, start_time)
|
|
293
324
|
|
|
294
325
|
if not response.candidates:
|
|
295
326
|
print("[WARN] No candidates, retrying...")
|
|
@@ -309,6 +340,7 @@ STRICT RULES:
|
|
|
309
340
|
if step == 1 and self.session.session_id:
|
|
310
341
|
print(f"Session: https://fleetai.com/dashboard/sessions/{self.session.session_id}")
|
|
311
342
|
except Exception as e:
|
|
343
|
+
print(f" [WARN] Session log failed: {type(e).__name__}: {e}")
|
|
312
344
|
log_verbose(f" [WARN] Session log failed: {e}")
|
|
313
345
|
|
|
314
346
|
# Log all parts for debugging
|
|
@@ -370,9 +402,28 @@ STRICT RULES:
|
|
|
370
402
|
try:
|
|
371
403
|
result = await self._execute_tool(name, args)
|
|
372
404
|
log_verbose(f" Result: isError={result.get('isError', False)}, content_types={[c.get('type') for c in result.get('content', [])]}")
|
|
405
|
+
|
|
406
|
+
if result.get("isError"):
|
|
407
|
+
self._consecutive_errors += 1
|
|
408
|
+
error_text = ""
|
|
409
|
+
for c in result.get("content", []):
|
|
410
|
+
if c.get("type") == "text":
|
|
411
|
+
error_text = c.get("text", "")[:200]
|
|
412
|
+
print(f" Tool error: {error_text}")
|
|
413
|
+
else:
|
|
414
|
+
self._consecutive_errors = 0
|
|
373
415
|
except Exception as e:
|
|
374
|
-
|
|
375
|
-
|
|
416
|
+
self._consecutive_errors += 1
|
|
417
|
+
error_type = type(e).__name__
|
|
418
|
+
print(f" Tool exception ({error_type}): {e}")
|
|
419
|
+
print(f" Consecutive errors: {self._consecutive_errors}/{self._max_consecutive_errors}")
|
|
420
|
+
log_verbose(f" Exception: {error_type}: {e}")
|
|
421
|
+
|
|
422
|
+
# Check if this is a connection/MCP error that we should fail fast on
|
|
423
|
+
if "connection" in str(e).lower() or "closed" in str(e).lower():
|
|
424
|
+
print(f" MCP connection lost, failing task")
|
|
425
|
+
return self._result(False, f"MCP connection error: {e}", step, start_time)
|
|
426
|
+
|
|
376
427
|
result = {"content": [{"type": "text", "text": str(e)}], "isError": True}
|
|
377
428
|
|
|
378
429
|
# Build function response with image embedded (per reference format)
|
|
@@ -414,7 +465,10 @@ STRICT RULES:
|
|
|
414
465
|
history.append(types.Content(role="model", parts=response_parts))
|
|
415
466
|
log_verbose(f" Added {len(response_parts)} function response(s) to history")
|
|
416
467
|
|
|
417
|
-
|
|
468
|
+
# Max steps reached - still mark as completed so verification runs
|
|
469
|
+
# The agent may have done the task but just didn't say "DONE"
|
|
470
|
+
print(f"\n⚠ Max steps ({max_steps}) reached - will still run verification")
|
|
471
|
+
return self._result(True, "Max steps reached", max_steps, start_time, "Max steps reached - task may be complete")
|
|
418
472
|
|
|
419
473
|
def _result(self, completed: bool, error: Optional[str], steps: int, start_time: float, answer: str = None) -> Dict:
|
|
420
474
|
"""Build result dict."""
|
|
@@ -437,7 +491,7 @@ async def main():
|
|
|
437
491
|
"job_id": os.environ.get("FLEET_JOB_ID"),
|
|
438
492
|
"instance_id": os.environ.get("FLEET_INSTANCE_ID"),
|
|
439
493
|
"model": os.environ.get("FLEET_MODEL", "gemini-2.5-pro"),
|
|
440
|
-
"max_steps": int(os.environ.get("FLEET_MAX_STEPS", "
|
|
494
|
+
"max_steps": int(os.environ.get("FLEET_MAX_STEPS", "200")),
|
|
441
495
|
}
|
|
442
496
|
|
|
443
497
|
print(f"Gemini CUA Agent")
|
|
@@ -57,10 +57,20 @@ async def lifespan(app):
|
|
|
57
57
|
)
|
|
58
58
|
|
|
59
59
|
try:
|
|
60
|
+
logger.info("Starting Playwright browser...")
|
|
60
61
|
await computer.start()
|
|
62
|
+
logger.info(f"Browser started, navigated to: {computer.current_url}")
|
|
61
63
|
yield
|
|
64
|
+
except Exception as e:
|
|
65
|
+
logger.error(f"Browser startup FAILED: {type(e).__name__}: {e}")
|
|
66
|
+
raise
|
|
62
67
|
finally:
|
|
63
|
-
|
|
68
|
+
logger.info("Stopping Playwright browser...")
|
|
69
|
+
try:
|
|
70
|
+
await computer.stop()
|
|
71
|
+
logger.info("Browser stopped")
|
|
72
|
+
except Exception as e:
|
|
73
|
+
logger.error(f"Browser stop error: {type(e).__name__}: {e}")
|
|
64
74
|
|
|
65
75
|
|
|
66
76
|
mcp = FastMCP("cua-server", lifespan=lifespan, host="0.0.0.0", port=PORT)
|
|
@@ -74,7 +84,13 @@ mcp = FastMCP("cua-server", lifespan=lifespan, host="0.0.0.0", port=PORT)
|
|
|
74
84
|
async def computer_screenshot() -> list:
|
|
75
85
|
"""Takes a screenshot of the computer screen. Use this to see what's on screen."""
|
|
76
86
|
logger.info("computer_screenshot()")
|
|
77
|
-
|
|
87
|
+
try:
|
|
88
|
+
result = await computer.screenshot()
|
|
89
|
+
logger.info(f"computer_screenshot() -> {len(result)} bytes")
|
|
90
|
+
return _screenshot_response(result)
|
|
91
|
+
except Exception as e:
|
|
92
|
+
logger.error(f"computer_screenshot() FAILED: {type(e).__name__}: {e}")
|
|
93
|
+
raise
|
|
78
94
|
|
|
79
95
|
|
|
80
96
|
@mcp.tool()
|
|
@@ -88,7 +104,11 @@ async def mouse_click(x: int, y: int, button: str, repeats: int = 1) -> None:
|
|
|
88
104
|
repeats: The number of times to click. Default is 1.
|
|
89
105
|
"""
|
|
90
106
|
logger.info(f"mouse_click({x}, {y}, {button}, {repeats})")
|
|
91
|
-
|
|
107
|
+
try:
|
|
108
|
+
await computer.mouse_click(_dx(x), _dy(y), button, repeats)
|
|
109
|
+
except Exception as e:
|
|
110
|
+
logger.error(f"mouse_click FAILED: {type(e).__name__}: {e}")
|
|
111
|
+
raise
|
|
92
112
|
|
|
93
113
|
|
|
94
114
|
@mcp.tool()
|
|
@@ -172,7 +192,11 @@ async def type_text(input_text: str, press_enter: bool) -> None:
|
|
|
172
192
|
press_enter: Whether to press enter after typing.
|
|
173
193
|
"""
|
|
174
194
|
logger.info(f"type_text({input_text[:50]}{'...' if len(input_text) > 50 else ''}, enter={press_enter})")
|
|
175
|
-
|
|
195
|
+
try:
|
|
196
|
+
await computer.type_text(input_text, press_enter)
|
|
197
|
+
except Exception as e:
|
|
198
|
+
logger.error(f"type_text FAILED: {type(e).__name__}: {e}")
|
|
199
|
+
raise
|
|
176
200
|
|
|
177
201
|
|
|
178
202
|
@mcp.tool()
|
|
@@ -168,6 +168,50 @@ class AgentOrchestrator:
|
|
|
168
168
|
self._available_ports: List[Tuple[int, int]] = []
|
|
169
169
|
# Register global cleanup handlers
|
|
170
170
|
_register_cleanup()
|
|
171
|
+
# Stats tracking
|
|
172
|
+
self._stats = {"started": 0, "completed": 0, "failed": 0, "errors": {}}
|
|
173
|
+
|
|
174
|
+
def _track_error(self, category: str, message: str):
|
|
175
|
+
"""Track an error for summary statistics."""
|
|
176
|
+
if category not in self._stats["errors"]:
|
|
177
|
+
self._stats["errors"][category] = []
|
|
178
|
+
# Keep up to 5 examples per category
|
|
179
|
+
if len(self._stats["errors"][category]) < 5:
|
|
180
|
+
self._stats["errors"][category].append(message[:200])
|
|
181
|
+
|
|
182
|
+
def _print_stats(self):
|
|
183
|
+
"""Print summary statistics."""
|
|
184
|
+
from rich.console import Console
|
|
185
|
+
from rich.table import Table
|
|
186
|
+
|
|
187
|
+
console = Console()
|
|
188
|
+
|
|
189
|
+
total = self._stats["started"]
|
|
190
|
+
completed = self._stats["completed"]
|
|
191
|
+
failed = self._stats["failed"]
|
|
192
|
+
|
|
193
|
+
console.print()
|
|
194
|
+
console.print("[bold]Run Summary:[/bold]")
|
|
195
|
+
console.print(f" Started: {total}")
|
|
196
|
+
console.print(f" Completed: [green]{completed}[/green] ({100*completed/total:.1f}%)" if total > 0 else " Completed: 0")
|
|
197
|
+
console.print(f" Failed: [red]{failed}[/red] ({100*failed/total:.1f}%)" if total > 0 else " Failed: 0")
|
|
198
|
+
|
|
199
|
+
if self._stats["errors"]:
|
|
200
|
+
console.print()
|
|
201
|
+
console.print("[bold]Error Breakdown:[/bold]")
|
|
202
|
+
table = Table(show_header=True, header_style="bold")
|
|
203
|
+
table.add_column("Category")
|
|
204
|
+
table.add_column("Count")
|
|
205
|
+
table.add_column("Example")
|
|
206
|
+
|
|
207
|
+
for category, examples in sorted(self._stats["errors"].items(), key=lambda x: -len(x[1])):
|
|
208
|
+
table.add_row(
|
|
209
|
+
category,
|
|
210
|
+
str(len(examples)),
|
|
211
|
+
examples[0][:80] + "..." if len(examples[0]) > 80 else examples[0]
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
console.print(table)
|
|
171
215
|
|
|
172
216
|
async def _get_next_ports(self) -> Tuple[int, int]:
|
|
173
217
|
"""Get next available MCP port and VNC port."""
|
|
@@ -282,7 +326,10 @@ class AgentOrchestrator:
|
|
|
282
326
|
session_logs = list(self._log_dir.glob("*.jsonl"))
|
|
283
327
|
console.print(f"Logs: {self._log_dir}/ ({len(session_logs)} sessions)")
|
|
284
328
|
|
|
285
|
-
|
|
329
|
+
# Print summary statistics
|
|
330
|
+
self._print_stats()
|
|
331
|
+
|
|
332
|
+
return final, self._job_id
|
|
286
333
|
|
|
287
334
|
async def _build_docker_image(self, agent_path: Path):
|
|
288
335
|
"""Build Docker image for CUA server."""
|
|
@@ -334,15 +381,18 @@ class AgentOrchestrator:
|
|
|
334
381
|
task_prompt = task.prompt
|
|
335
382
|
short_key = task_key[:20]
|
|
336
383
|
|
|
337
|
-
|
|
384
|
+
self._stats["started"] += 1
|
|
385
|
+
logger.debug(f"[{short_key}] Starting (total started: {self._stats['started']})")
|
|
338
386
|
|
|
339
387
|
env = None
|
|
340
388
|
container_id = None
|
|
341
389
|
port = None
|
|
342
390
|
vnc_port = None
|
|
391
|
+
current_phase = "init"
|
|
343
392
|
|
|
344
393
|
try:
|
|
345
394
|
# 1. Create Fleet environment
|
|
395
|
+
current_phase = "create_env"
|
|
346
396
|
logger.debug(f"[{short_key}] Creating env...")
|
|
347
397
|
env = await make_async(
|
|
348
398
|
env_key=task.env_key,
|
|
@@ -356,6 +406,7 @@ class AgentOrchestrator:
|
|
|
356
406
|
await asyncio.sleep(3) # Wait for env to be ready
|
|
357
407
|
|
|
358
408
|
# 2. Start Docker container with CUA server
|
|
409
|
+
current_phase = "start_container"
|
|
359
410
|
port, vnc_port = await self._get_next_ports()
|
|
360
411
|
logger.debug(f"[{short_key}] Starting container on port {port}...")
|
|
361
412
|
container_id = await self._start_container(
|
|
@@ -373,11 +424,13 @@ class AgentOrchestrator:
|
|
|
373
424
|
print(f"[{short_key}] Browser: http://localhost:{vnc_port}/vnc.html")
|
|
374
425
|
|
|
375
426
|
# Wait for server to be ready
|
|
427
|
+
current_phase = "wait_for_server"
|
|
376
428
|
logger.debug(f"[{short_key}] Waiting for CUA server...")
|
|
377
429
|
await self._wait_for_server(port)
|
|
378
430
|
logger.debug(f"[{short_key}] CUA server ready")
|
|
379
431
|
|
|
380
432
|
# 3. Run agent
|
|
433
|
+
current_phase = "run_agent"
|
|
381
434
|
logger.debug(f"[{short_key}] Running agent...")
|
|
382
435
|
agent_result = await self._run_agent(
|
|
383
436
|
port=port,
|
|
@@ -388,14 +441,17 @@ class AgentOrchestrator:
|
|
|
388
441
|
logger.debug(
|
|
389
442
|
f"[{short_key}] Agent done: completed={agent_result.completed}"
|
|
390
443
|
)
|
|
444
|
+
if agent_result.error and agent_result.error != "Max steps reached":
|
|
445
|
+
print(f"[{short_key}] Agent error: {agent_result.error[:200]}")
|
|
391
446
|
|
|
392
447
|
# 4. Run verification
|
|
448
|
+
current_phase = "verification"
|
|
393
449
|
verification_success = None
|
|
394
450
|
verification_score = None
|
|
395
451
|
verifier_execution_id = None
|
|
396
452
|
|
|
397
453
|
if agent_result.completed and task.verifier:
|
|
398
|
-
logger.info(f"[{
|
|
454
|
+
logger.info(f"[{short_key}] Running verification...")
|
|
399
455
|
try:
|
|
400
456
|
v = await task.verify_detailed_async(
|
|
401
457
|
env=env,
|
|
@@ -407,9 +463,21 @@ class AgentOrchestrator:
|
|
|
407
463
|
verification_score = (
|
|
408
464
|
v.result if isinstance(v.result, (int, float)) else None
|
|
409
465
|
)
|
|
410
|
-
logger.info(f"[{
|
|
466
|
+
logger.info(f"[{short_key}] Verification: {verification_success}")
|
|
467
|
+
if verification_success:
|
|
468
|
+
self._stats["completed"] += 1
|
|
469
|
+
else:
|
|
470
|
+
self._stats["failed"] += 1
|
|
471
|
+
print(f"[{short_key}] Verification FAILED: score={verification_score}")
|
|
411
472
|
except Exception as e:
|
|
412
|
-
logger.error(f"[{
|
|
473
|
+
logger.error(f"[{short_key}] Verification error: {e}")
|
|
474
|
+
self._stats["failed"] += 1
|
|
475
|
+
self._track_error("verification_error", str(e))
|
|
476
|
+
elif not agent_result.completed:
|
|
477
|
+
self._stats["failed"] += 1
|
|
478
|
+
error_msg = agent_result.error or "unknown"
|
|
479
|
+
self._track_error("agent_not_completed", error_msg)
|
|
480
|
+
print(f"[{short_key}] Agent did not complete: {error_msg}")
|
|
413
481
|
|
|
414
482
|
# 5. Complete/fail session (session was created by agent, we just complete it)
|
|
415
483
|
session_id = getattr(agent_result, "session_id", None)
|
|
@@ -439,11 +507,24 @@ class AgentOrchestrator:
|
|
|
439
507
|
)
|
|
440
508
|
|
|
441
509
|
except Exception as e:
|
|
442
|
-
|
|
510
|
+
import traceback
|
|
511
|
+
error_type = type(e).__name__
|
|
512
|
+
error_msg = str(e)
|
|
513
|
+
tb = traceback.format_exc()
|
|
514
|
+
|
|
515
|
+
# Categorize the error
|
|
516
|
+
error_category = f"{current_phase}:{error_type}"
|
|
517
|
+
self._track_error(error_category, error_msg)
|
|
518
|
+
self._stats["failed"] += 1
|
|
519
|
+
|
|
520
|
+
# Always print errors for visibility
|
|
521
|
+
print(f"[{short_key}] EXCEPTION in {current_phase}: {error_type}: {error_msg[:200]}")
|
|
522
|
+
logger.error(f"[{short_key}] Traceback:\n{tb}")
|
|
523
|
+
|
|
443
524
|
return TaskResult(
|
|
444
525
|
task_key=task_key,
|
|
445
526
|
task_prompt=task_prompt,
|
|
446
|
-
error=
|
|
527
|
+
error=f"[{current_phase}] {error_type}: {error_msg}",
|
|
447
528
|
execution_time_ms=int((time.time() - start) * 1000),
|
|
448
529
|
)
|
|
449
530
|
|
|
@@ -687,12 +768,12 @@ async def run_agent(
|
|
|
687
768
|
agent: str = "gemini_cua",
|
|
688
769
|
model: str = "gemini-2.5-pro",
|
|
689
770
|
max_concurrent: int = 4,
|
|
690
|
-
max_steps: int =
|
|
771
|
+
max_steps: int = 200,
|
|
691
772
|
timeout_seconds: int = 600,
|
|
692
773
|
api_keys: Optional[Dict[str, str]] = None,
|
|
693
774
|
headful: bool = False,
|
|
694
775
|
verbose: bool = False,
|
|
695
|
-
) -> List[TaskResult]:
|
|
776
|
+
) -> Tuple[List[TaskResult], str]:
|
|
696
777
|
"""Run agent on Fleet tasks.
|
|
697
778
|
|
|
698
779
|
Args:
|
|
@@ -708,7 +789,7 @@ async def run_agent(
|
|
|
708
789
|
verbose: Enable verbose agent logging
|
|
709
790
|
|
|
710
791
|
Returns:
|
|
711
|
-
List of TaskResult
|
|
792
|
+
Tuple of (List of TaskResult, job_id)
|
|
712
793
|
"""
|
|
713
794
|
config = AgentConfig(
|
|
714
795
|
project_key=project_key,
|
|
@@ -94,6 +94,53 @@ def get_client() -> Fleet:
|
|
|
94
94
|
return Fleet(api_key=api_key, base_url=base_url)
|
|
95
95
|
|
|
96
96
|
|
|
97
|
+
def _run_oversight(job_id: str, model: str = "anthropic/claude-sonnet-4"):
|
|
98
|
+
"""Run oversight summarization on a completed job."""
|
|
99
|
+
import httpx
|
|
100
|
+
|
|
101
|
+
api_key = os.getenv("FLEET_API_KEY")
|
|
102
|
+
if not api_key:
|
|
103
|
+
console.print("[yellow]Warning:[/yellow] FLEET_API_KEY not set, skipping oversight")
|
|
104
|
+
return
|
|
105
|
+
|
|
106
|
+
base_url = os.getenv("FLEET_BASE_URL", CLI_DEFAULT_BASE_URL)
|
|
107
|
+
oversight_url = f"{base_url}/v1/summarize/job"
|
|
108
|
+
|
|
109
|
+
console.print()
|
|
110
|
+
console.print("[bold]Running Oversight Analysis...[/bold]")
|
|
111
|
+
|
|
112
|
+
try:
|
|
113
|
+
with httpx.Client(timeout=300) as client:
|
|
114
|
+
response = client.post(
|
|
115
|
+
oversight_url,
|
|
116
|
+
headers={
|
|
117
|
+
"accept": "application/json",
|
|
118
|
+
"Authorization": f"Bearer {api_key}",
|
|
119
|
+
"Content-Type": "application/json",
|
|
120
|
+
},
|
|
121
|
+
json={
|
|
122
|
+
"job_id": job_id,
|
|
123
|
+
"model": model,
|
|
124
|
+
"max_context_tokens": 180000,
|
|
125
|
+
"force_new_summary": False,
|
|
126
|
+
"max_concurrent": 20,
|
|
127
|
+
},
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
if response.status_code == 200:
|
|
131
|
+
result = response.json()
|
|
132
|
+
console.print(f"[green]✓[/green] Oversight analysis started")
|
|
133
|
+
if "summary_id" in result:
|
|
134
|
+
console.print(f" Summary ID: [cyan]{result['summary_id']}[/cyan]")
|
|
135
|
+
# Show link to dashboard
|
|
136
|
+
console.print(f" View: [cyan]https://fleetai.com/dashboard/jobs/{job_id}[/cyan]")
|
|
137
|
+
else:
|
|
138
|
+
console.print(f"[yellow]Warning:[/yellow] Oversight API returned {response.status_code}")
|
|
139
|
+
console.print(f" {response.text[:200]}")
|
|
140
|
+
except Exception as e:
|
|
141
|
+
console.print(f"[yellow]Warning:[/yellow] Oversight request failed: {e}")
|
|
142
|
+
|
|
143
|
+
|
|
97
144
|
# Jobs commands
|
|
98
145
|
|
|
99
146
|
|
|
@@ -326,6 +373,15 @@ def list_job_sessions(
|
|
|
326
373
|
console.print(f"[dim] Session transcript: flt sessions transcript {first_session_id}[/dim]")
|
|
327
374
|
|
|
328
375
|
|
|
376
|
+
@jobs_app.command("oversight")
|
|
377
|
+
def run_job_oversight(
|
|
378
|
+
job_id: str = typer.Argument(..., help="Job ID to analyze"),
|
|
379
|
+
model: str = typer.Option("anthropic/claude-sonnet-4", "--model", "-m", help="Model for oversight analysis"),
|
|
380
|
+
):
|
|
381
|
+
"""Run AI oversight analysis on a job."""
|
|
382
|
+
_run_oversight(job_id, model)
|
|
383
|
+
|
|
384
|
+
|
|
329
385
|
# Sessions commands
|
|
330
386
|
|
|
331
387
|
|
|
@@ -488,6 +544,8 @@ def _run_local_agent(
|
|
|
488
544
|
output_json: bool,
|
|
489
545
|
verbose: bool = False,
|
|
490
546
|
headful: bool = False,
|
|
547
|
+
oversight: bool = False,
|
|
548
|
+
oversight_model: str = "anthropic/claude-sonnet-4",
|
|
491
549
|
):
|
|
492
550
|
"""Run agent locally with Docker-based browser control."""
|
|
493
551
|
import asyncio
|
|
@@ -563,8 +621,9 @@ def _run_local_agent(
|
|
|
563
621
|
console.print("[dim]Starting agent...[/dim]")
|
|
564
622
|
console.print()
|
|
565
623
|
|
|
624
|
+
job_id = None
|
|
566
625
|
try:
|
|
567
|
-
results = asyncio.run(run())
|
|
626
|
+
results, job_id = asyncio.run(run())
|
|
568
627
|
except KeyboardInterrupt:
|
|
569
628
|
console.print()
|
|
570
629
|
console.print("[yellow]Cancelled.[/yellow]")
|
|
@@ -633,6 +692,10 @@ def _run_local_agent(
|
|
|
633
692
|
console.print(f"[bold]Pass Rate:[/bold] [{color}]{passed}/{total} ({rate:.1f}%)[/{color}]")
|
|
634
693
|
if errors:
|
|
635
694
|
console.print(f"[bold]Errors:[/bold] [red]{errors}[/red]")
|
|
695
|
+
|
|
696
|
+
# Run oversight if requested
|
|
697
|
+
if oversight and job_id:
|
|
698
|
+
_run_oversight(job_id, oversight_model)
|
|
636
699
|
|
|
637
700
|
|
|
638
701
|
def _listen_for_detach_key(stop_event: threading.Event):
|
|
@@ -688,6 +751,9 @@ def eval_run(
|
|
|
688
751
|
local: Optional[str] = typer.Option(None, "--local", "-l", help="Run locally. Use 'gemini_cua' for built-in or path for custom agent"),
|
|
689
752
|
headful: bool = typer.Option(False, "--headful", help="Show browser via noVNC (local mode)"),
|
|
690
753
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="Show debug output"),
|
|
754
|
+
# Oversight
|
|
755
|
+
oversight: bool = typer.Option(False, "--oversight", help="Run AI oversight analysis on job completion"),
|
|
756
|
+
oversight_model: str = typer.Option("anthropic/claude-sonnet-4", "--oversight-model", help="Model for oversight analysis"),
|
|
691
757
|
):
|
|
692
758
|
"""
|
|
693
759
|
Run an evaluation on a project or specific tasks.
|
|
@@ -721,13 +787,15 @@ def eval_run(
|
|
|
721
787
|
task_keys=task_keys,
|
|
722
788
|
model=model[0] if model else "gemini-2.5-pro",
|
|
723
789
|
agent=local if local else "gemini_cua",
|
|
724
|
-
max_steps=max_steps or
|
|
790
|
+
max_steps=max_steps or 200,
|
|
725
791
|
max_duration=max_duration,
|
|
726
792
|
max_concurrent=max_concurrent,
|
|
727
793
|
byok=byok,
|
|
728
794
|
output_json=output_json,
|
|
729
795
|
verbose=verbose,
|
|
730
796
|
headful=headful,
|
|
797
|
+
oversight=oversight,
|
|
798
|
+
oversight_model=oversight_model,
|
|
731
799
|
)
|
|
732
800
|
return
|
|
733
801
|
|
|
@@ -938,6 +1006,10 @@ def eval_run(
|
|
|
938
1006
|
console.print(f" {task_name}: {tg.passed_sessions}/{tg.total_sessions} ({task_rate:.0f}%)")
|
|
939
1007
|
except:
|
|
940
1008
|
pass
|
|
1009
|
+
|
|
1010
|
+
# Run oversight if requested and job completed (not detached)
|
|
1011
|
+
if oversight and not detached:
|
|
1012
|
+
_run_oversight(job_id, oversight_model)
|
|
941
1013
|
|
|
942
1014
|
finally:
|
|
943
1015
|
# Signal the keyboard listener thread to stop
|
|
@@ -948,6 +1020,8 @@ def eval_run(
|
|
|
948
1020
|
console.print()
|
|
949
1021
|
console.print("[yellow]Detached. Eval continues running in background.[/yellow]")
|
|
950
1022
|
console.print(f"[dim]Check status: flt jobs get {job_id}[/dim]")
|
|
1023
|
+
if oversight:
|
|
1024
|
+
console.print(f"[dim]Run oversight manually: flt jobs oversight {job_id}[/dim]")
|
|
951
1025
|
|
|
952
1026
|
|
|
953
1027
|
def main():
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|