fleet-python 0.2.66b2__py3-none-any.whl → 0.2.105__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. examples/export_tasks.py +16 -5
  2. examples/export_tasks_filtered.py +245 -0
  3. examples/fetch_tasks.py +230 -0
  4. examples/import_tasks.py +140 -8
  5. examples/iterate_verifiers.py +725 -0
  6. fleet/__init__.py +128 -5
  7. fleet/_async/__init__.py +27 -3
  8. fleet/_async/base.py +24 -9
  9. fleet/_async/client.py +938 -41
  10. fleet/_async/env/client.py +60 -3
  11. fleet/_async/instance/client.py +52 -7
  12. fleet/_async/models.py +15 -0
  13. fleet/_async/resources/api.py +200 -0
  14. fleet/_async/resources/sqlite.py +1801 -46
  15. fleet/_async/tasks.py +122 -25
  16. fleet/_async/verifiers/bundler.py +22 -21
  17. fleet/_async/verifiers/verifier.py +25 -19
  18. fleet/agent/__init__.py +32 -0
  19. fleet/agent/gemini_cua/Dockerfile +45 -0
  20. fleet/agent/gemini_cua/__init__.py +10 -0
  21. fleet/agent/gemini_cua/agent.py +759 -0
  22. fleet/agent/gemini_cua/mcp/main.py +108 -0
  23. fleet/agent/gemini_cua/mcp_server/__init__.py +5 -0
  24. fleet/agent/gemini_cua/mcp_server/main.py +105 -0
  25. fleet/agent/gemini_cua/mcp_server/tools.py +178 -0
  26. fleet/agent/gemini_cua/requirements.txt +5 -0
  27. fleet/agent/gemini_cua/start.sh +30 -0
  28. fleet/agent/orchestrator.py +854 -0
  29. fleet/agent/types.py +49 -0
  30. fleet/agent/utils.py +34 -0
  31. fleet/base.py +34 -9
  32. fleet/cli.py +1061 -0
  33. fleet/client.py +1060 -48
  34. fleet/config.py +1 -1
  35. fleet/env/__init__.py +16 -0
  36. fleet/env/client.py +60 -3
  37. fleet/eval/__init__.py +15 -0
  38. fleet/eval/uploader.py +231 -0
  39. fleet/exceptions.py +8 -0
  40. fleet/instance/client.py +53 -8
  41. fleet/instance/models.py +1 -0
  42. fleet/models.py +303 -0
  43. fleet/proxy/__init__.py +25 -0
  44. fleet/proxy/proxy.py +453 -0
  45. fleet/proxy/whitelist.py +244 -0
  46. fleet/resources/api.py +200 -0
  47. fleet/resources/sqlite.py +1845 -46
  48. fleet/tasks.py +113 -20
  49. fleet/utils/__init__.py +7 -0
  50. fleet/utils/http_logging.py +178 -0
  51. fleet/utils/logging.py +13 -0
  52. fleet/utils/playwright.py +440 -0
  53. fleet/verifiers/bundler.py +22 -21
  54. fleet/verifiers/db.py +985 -1
  55. fleet/verifiers/decorator.py +1 -1
  56. fleet/verifiers/verifier.py +25 -19
  57. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/METADATA +28 -1
  58. fleet_python-0.2.105.dist-info/RECORD +115 -0
  59. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/WHEEL +1 -1
  60. fleet_python-0.2.105.dist-info/entry_points.txt +2 -0
  61. tests/test_app_method.py +85 -0
  62. tests/test_expect_exactly.py +4148 -0
  63. tests/test_expect_only.py +2593 -0
  64. tests/test_instance_dispatch.py +607 -0
  65. tests/test_sqlite_resource_dual_mode.py +263 -0
  66. tests/test_sqlite_shared_memory_behavior.py +117 -0
  67. fleet_python-0.2.66b2.dist-info/RECORD +0 -81
  68. tests/test_verifier_security.py +0 -427
  69. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/licenses/LICENSE +0 -0
  70. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,759 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Gemini CUA Agent (Standalone)
4
+
5
+ Env vars:
6
+ GEMINI_API_KEY: API key
7
+ FLEET_MCP_URL: CUA server URL (http://localhost:PORT)
8
+ FLEET_TASK_PROMPT: Task prompt
9
+ FLEET_TASK_KEY: Task key
10
+ FLEET_MODEL: Model (default: gemini-3-pro-preview)
11
+ FLEET_MAX_STEPS: Max steps (default: 200)
12
+ FLEET_VERBOSE: Enable verbose logging (default: false)
13
+ USE_OAUTH: Use gcloud OAuth instead of API key (default: false)
14
+ GOOG_PROJECT: Google Cloud project for OAuth (default: gemini-agents-area)
15
+ """
16
+
17
+ import asyncio
18
+ import json
19
+ import os
20
+ import subprocess
21
+ import sys
22
+ import time
23
+ from typing import Any, Dict, List, Optional
24
+
25
+ from google.genai.types import Content, Part
26
+ from google import genai
27
+ from google.genai import types
28
+ from mcp import ClientSession
29
+ from mcp.client.streamable_http import streamable_http_client
30
+
31
+ import fleet
32
+ from fleet.utils.logging import log_verbose, VERBOSE
33
+
34
# Whitelist hooks for auto-detecting model endpoints (optional).
# Default is a no-op; the real hook is swapped in below when the Fleet
# proxy is enabled. (A def is used instead of a lambda assignment, per PEP 8.)
def _register_endpoint(url):
    """No-op endpoint registration; overridden when FLEET_PROXY_ENABLED is set."""
    return None


if os.environ.get("FLEET_PROXY_ENABLED"):
    from fleet.proxy.whitelist import install_hooks, register_endpoint as _register_endpoint
    install_hooks()

# OAuth configuration
GOOG_PROJECT = os.environ.get("GOOG_PROJECT", "gemini-agents-area")
USE_OAUTH = os.environ.get("USE_OAUTH", "false").lower() in ("true", "1", "yes")

# Screen dimensions for coordinate denormalization (matches MCP browser)
SCREEN_WIDTH = 1366
SCREEN_HEIGHT = 768
47
+
48
# Gemini 3 tool definitions (0-1000 normalized coordinates).
# JSON-schema specs handed to the model via get_gemini_3_tools(); the
# coordinate arguments are denormalized to SCREEN_WIDTH/SCREEN_HEIGHT by
# convert_gemini_3_to_mcp() before being sent to the MCP computer tool.
GEMINI_3_TOOL_DEFINITIONS: List[Dict[str, Any]] = [
    {
        "name": "click_at",
        "description": "Click at the specified screen coordinates. Coordinates are normalized 0-1000.",
        "parameters": {
            "type": "object",
            "properties": {
                "x": {
                    "type": "integer",
                    "description": "X coordinate (0-1000, where 0 is left edge, 1000 is right edge)",
                },
                "y": {
                    "type": "integer",
                    "description": "Y coordinate (0-1000, where 0 is top edge, 1000 is bottom edge)",
                },
            },
            "required": ["x", "y"],
        },
    },
    {
        "name": "type_text",
        "description": "Type text at the current cursor position. Use click_at first to focus the input field.",
        "parameters": {
            "type": "object",
            "properties": {
                "text": {
                    "type": "string",
                    "description": "The text to type",
                },
                "press_enter": {
                    "type": "boolean",
                    "description": "Whether to press Enter after typing (default: false)",
                },
            },
            "required": ["text"],
        },
    },
    {
        "name": "key_press",
        "description": "Press a key or key combination (e.g., 'Enter', 'Tab', 'Meta+A', 'Ctrl+C', 'Backspace').",
        "parameters": {
            "type": "object",
            "properties": {
                "keys": {
                    "type": "string",
                    "description": "Key or key combination to press",
                },
            },
            "required": ["keys"],
        },
    },
    {
        "name": "scroll",
        "description": "Scroll the page up or down.",
        "parameters": {
            "type": "object",
            "properties": {
                "direction": {
                    "type": "string",
                    "description": "Direction to scroll: 'up' or 'down'",
                    "enum": ["up", "down"],
                },
            },
            "required": ["direction"],
        },
    },
    {
        "name": "wait",
        "description": "Wait for a few seconds to allow page to load.",
        "parameters": {
            "type": "object",
            "properties": {
                "seconds": {
                    "type": "integer",
                    "description": "Number of seconds to wait (1-10)",
                },
            },
            "required": ["seconds"],
        },
    },
]
130
+
131
# Key name normalization for xdotool/X11 keysym compatibility.
# Keys are matched case-insensitively; values are the exact keysym spellings
# xdotool expects (note "space" and modifier names are intentionally lowercase).
_KEY_NAME_MAP_LOWER = {
    "backspace": "BackSpace",
    "arrowleft": "Left", "arrowright": "Right", "arrowup": "Up", "arrowdown": "Down",
    "left": "Left", "right": "Right", "up": "Up", "down": "Down",
    "esc": "Escape", "escape": "Escape",
    "del": "Delete", "delete": "Delete",
    "pgup": "Page_Up", "pageup": "Page_Up",
    "pgdown": "Page_Down", "pgdn": "Page_Down", "pagedown": "Page_Down",
    "enter": "Return", "return": "Return",
    "tab": "Tab", "space": "space",
    "meta": "super", "command": "super", "cmd": "super", "super": "super",
    "ctrl": "ctrl", "control": "ctrl",
    "alt": "alt", "shift": "shift",
    "f1": "F1", "f2": "F2", "f3": "F3", "f4": "F4", "f5": "F5", "f6": "F6",
    "f7": "F7", "f8": "F8", "f9": "F9", "f10": "F10", "f11": "F11", "f12": "F12",
    "home": "Home", "end": "End", "insert": "Insert",
}


def normalize_key_name(key: str) -> str:
    """Map a key name (or a '+'-joined combination) onto xdotool/X11 keysyms.

    Unknown names pass through unchanged; empty input is returned as-is.
    """
    if not key:
        return key

    def lookup(name: str) -> str:
        # Fall back to the original spelling when no mapping exists.
        return _KEY_NAME_MAP_LOWER.get(name.lower(), name)

    if "+" not in key:
        return lookup(key)
    # Normalize each segment of a combination independently (e.g. "Ctrl+C").
    return "+".join(lookup(segment) for segment in key.split("+"))
160
+
161
+
162
def get_oauth_token() -> str:
    """Return a fresh access token from gcloud application-default credentials.

    Raises:
        subprocess.CalledProcessError: if the gcloud invocation fails.
    """
    cmd = ["gcloud", "auth", "application-default", "print-access-token"]
    completed = subprocess.run(cmd, capture_output=True, check=True)
    return completed.stdout.decode().strip()
170
+
171
+
172
def get_gemini_client() -> genai.Client:
    """Create a Gemini client with the appropriate authentication.

    Uses GEMINI_API_KEY by default; layers on a custom base URL when
    FLEET_MODEL_ENDPOINT is set and gcloud OAuth headers when USE_OAUTH is on.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    custom_endpoint = os.environ.get("FLEET_MODEL_ENDPOINT")

    # Let the (optional) proxy whitelist know which host we will talk to.
    _register_endpoint(custom_endpoint or "generativelanguage.googleapis.com")

    # Plain API-key auth needs no HTTP options at all.
    if not (USE_OAUTH or custom_endpoint):
        return genai.Client(api_key=api_key, http_options=None)

    opts: Dict[str, Any] = {}
    if custom_endpoint:
        opts["base_url"] = custom_endpoint
        log_verbose(f"Using custom endpoint: {custom_endpoint}")
    if USE_OAUTH:
        # OAuth requests carry a bearer token plus an explicit billing project.
        opts["headers"] = {
            "Authorization": f"Bearer {get_oauth_token()}",
            "X-Goog-User-Project": GOOG_PROJECT,
        }
        opts["api_version"] = "v1alpha"
        log_verbose(f"Using OAuth (project: {GOOG_PROJECT})")

    return genai.Client(api_key=api_key, http_options=types.HttpOptions(**opts))
195
+
196
+
197
def convert_gemini_3_to_mcp(function_name: str, args: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Convert Gemini 3 custom function calls to MCP computer tool format.

    Coordinates are normalized 0-1000, denormalized to screen dimensions.
    Returns a list of MCP actions since some functions expand to multiple steps.
    Unknown function names fall back to a single screenshot action.
    """
    def denormalize_x(x: int) -> int:
        # Clamp into the 0-1000 normalized range before scaling so that
        # out-of-range model output cannot produce off-screen coordinates.
        return int(max(0, min(x, 1000)) / 1000 * SCREEN_WIDTH)

    def denormalize_y(y: int) -> int:
        return int(max(0, min(y, 1000)) / 1000 * SCREEN_HEIGHT)

    mcp_actions = []

    if function_name == "click_at":
        # Default to screen center when a coordinate is missing.
        x = denormalize_x(args.get("x", 500))
        y = denormalize_y(args.get("y", 500))
        mcp_actions.append({"action": "left_click", "coordinate": [x, y]})

    elif function_name == "type_text":
        text = args.get("text", "")
        press_enter = args.get("press_enter", False)
        mcp_actions.append({"action": "type", "text": text})
        if press_enter:
            # Expand into a second action: press Enter after typing.
            mcp_actions.append({"action": "key", "text": "Return"})

    elif function_name == "key_press":
        keys = args.get("keys", "Return")
        mcp_actions.append({"action": "key", "text": normalize_key_name(keys)})

    elif function_name == "scroll":
        direction = args.get("direction", "down")
        # Scroll from the screen center with a fixed amount.
        mcp_actions.append({
            "action": "scroll",
            "coordinate": [SCREEN_WIDTH // 2, SCREEN_HEIGHT // 2],
            "scroll_direction": direction,
            "scroll_amount": 5,
        })

    elif function_name == "wait":
        # The tool schema documents 1-10 seconds; clamp both bounds so a
        # zero/negative model value cannot reach the MCP server.
        seconds = max(1, min(args.get("seconds", 3), 10))
        mcp_actions.append({"action": "wait", "duration": seconds})

    else:
        # Unknown function, fallback to screenshot
        mcp_actions.append({"action": "screenshot"})

    return mcp_actions
245
+
246
+
247
class MCP:
    """MCP client using streamable-http transport.

    Wraps an mcp ClientSession over streamable_http_client, exposing a single
    call() helper that normalizes tool results into plain dicts. Optionally
    appends one JSON line per tool call to a traffic log file.
    """

    def __init__(self, url: str, log_file: Optional[str] = None):
        # Server base URL; the MCP endpoint lives under /mcp/.
        self.url = url.rstrip("/") + "/mcp/"
        self._session: Optional[ClientSession] = None
        self._client = None
        # Traffic log path: explicit argument wins over FLEET_SESSION_LOG.
        self._log_file = log_file or os.environ.get("FLEET_SESSION_LOG")
        self._log_handle = None
        if self._log_file:
            from pathlib import Path
            Path(self._log_file).parent.mkdir(parents=True, exist_ok=True)
            # Append mode so multiple runs accumulate in one log file.
            self._log_handle = open(self._log_file, "a")

    async def __aenter__(self):
        # NOTE: the transport and session are entered manually (not via
        # `async with`) so __aexit__ can tear them down in reverse order.
        print(f"MCP: Connecting to {self.url}...")
        try:
            self._client = streamable_http_client(self.url)
            read, write, _ = await self._client.__aenter__()
            self._session = ClientSession(read, write)
            await self._session.__aenter__()
            await self._session.initialize()
            print(f"MCP: Connected successfully")
        except Exception as e:
            print(f"MCP: Connection failed: {type(e).__name__}: {e}")
            raise

        # Fetch available tools from server
        try:
            result = await self._session.list_tools()
            self._tools = [
                {
                    "name": tool.name,
                    "description": tool.description or "",
                    "inputSchema": tool.inputSchema,
                }
                for tool in result.tools
            ]
            print(f"MCP: Loaded {len(self._tools)} tools")
        except Exception as e:
            print(f"MCP: Failed to list tools: {type(e).__name__}: {e}")
            raise
        return self

    async def __aexit__(self, *args):
        # Tear down in reverse of acquisition: session, transport, log file.
        if self._session:
            await self._session.__aexit__(*args)
        if self._client:
            await self._client.__aexit__(*args)
        if self._log_handle:
            self._log_handle.close()

    def _log(self, entry: dict):
        """Log an entry to the traffic file."""
        # No-op unless a log file was configured in __init__.
        if self._log_handle:
            from datetime import datetime
            entry["timestamp"] = datetime.now().isoformat()
            entry["url"] = self.url
            self._log_handle.write(json.dumps(entry) + "\n")
            # Flush so the log survives an abrupt process exit.
            self._log_handle.flush()

    async def call(self, name: str, args: Dict = None) -> Dict:
        """Call a tool and return the result.

        Returns a dict of the form {"content": [...], "isError": bool} where
        content items are {"type": "image"|"text", ...} dicts only; any other
        content types from the server are dropped.
        """
        start_time = time.time()
        result = await self._session.call_tool(name, args or {})
        duration_ms = int((time.time() - start_time) * 1000)

        # Debug: log raw MCP result structure
        log_verbose(f"  MCP result.content ({len(result.content)} items):")
        for i, item in enumerate(result.content):
            log_verbose(f"    [{i}] type={type(item).__name__}, attrs={dir(item)[:10]}...")
            if hasattr(item, "type"):
                log_verbose(f"        .type = {repr(item.type)}")
            if hasattr(item, "data"):
                data_preview = str(item.data)[:50] if item.data else "None"
                log_verbose(f"        .data = {data_preview}...")

        # Helper to get attribute or dict key
        # (content items may arrive as pydantic objects or plain dicts).
        def _get(item, key, default=None):
            if isinstance(item, dict):
                return item.get(key, default)
            return getattr(item, key, default)

        content = []
        for item in result.content:
            item_type = _get(item, "type")
            if item_type == "image":
                content.append({
                    "type": "image",
                    "data": _get(item, "data", ""),
                    "mimeType": _get(item, "mimeType", "image/png"),
                })
            elif item_type == "text":
                content.append({"type": "text", "text": _get(item, "text", "")})

        # Record the call (args, timing, result shape) in the traffic log.
        self._log({
            "type": "mcp_call",
            "tool": name,
            "args": args or {},
            "duration_ms": duration_ms,
            "response_content_types": [c.get("type") for c in content],
            "is_error": result.isError if hasattr(result, "isError") else False,
        })
        return {"content": content, "isError": result.isError if hasattr(result, "isError") else False}
352
+
353
def get_gemini_3_tools() -> List[types.FunctionDeclaration]:
    """Return Gemini 3 custom tools as FunctionDeclarations."""
    declarations = []
    for spec in GEMINI_3_TOOL_DEFINITIONS:
        declarations.append(
            types.FunctionDeclaration(
                name=spec["name"],
                description=spec["description"],
                parameters=spec["parameters"],
            )
        )
    return declarations
363
+
364
+
365
def get_image_data(result: Dict) -> Optional[str]:
    """Return the base64 payload of the first image entry in an MCP result.

    Returns None when the result carries no image content.
    """
    image_entries = (
        entry for entry in result.get("content", [])
        if entry.get("type") == "image"
    )
    first = next(image_entries, None)
    return None if first is None else first.get("data")
371
+
372
+
373
def extract_reasoning_from_candidate(candidate) -> Optional[str]:
    """Extract the reasoning trace from a Gemini candidate response.

    Collects thought parts (string thoughts, or text on parts flagged
    thought=True), plus plain text that accompanies function calls.
    Returns None when the candidate is empty or carries no reasoning.
    """
    if not candidate or not candidate.content or not candidate.content.parts:
        return None

    parts = candidate.content.parts
    accompanies_tool_use = any(getattr(p, "function_call", None) for p in parts)

    collected: List[str] = []
    for part in parts:
        thought = getattr(part, "thought", None)
        text = getattr(part, "text", None)
        if thought:
            # Some SDK versions carry the thought text directly; others flag
            # the part with thought=True and put the content in .text.
            if isinstance(thought, str):
                collected.append(thought)
            elif thought is True and text:
                collected.append(text)
        elif text and accompanies_tool_use:
            collected.append(text)

    return "\n\n".join(collected) if collected else None
396
+
397
+
398
class GeminiAgent:
    """Gemini Computer Use Agent.

    Drives a generate_content loop: the model emits custom function calls,
    each is translated to MCP computer actions and executed, and the
    resulting screenshot is fed back as the next turn. Terminates when the
    model stops calling tools, an unrecoverable error occurs, or max_steps
    is reached.
    """

    def __init__(self, mcp: MCP, model: str, session=None):
        self.mcp = mcp
        # Accept fully-qualified model names ("models/gemini-...") and keep
        # only the final segment.
        self.model = model.split("/")[-1] if "/" in model else model
        self.client = get_gemini_client()
        # Flat record of user/assistant/tool_call turns, returned in _result.
        self.transcript: List[Dict] = []
        # Optional Fleet session used for live logging — presumably exposes
        # .log(), .config and .session_id; verify against fleet.session_async.
        self.session = session
        # Abort after this many back-to-back API/tool failures.
        self._consecutive_errors = 0
        self._max_consecutive_errors = 5

    async def _take_screenshot(self) -> Optional[str]:
        """Take a screenshot and return base64 data (None on failure)."""
        try:
            result = await self.mcp.call("computer", {"action": "screenshot"})
            return get_image_data(result)
        except Exception as e:
            print(f"Screenshot failed: {e}")
            return None

    async def _execute_gemini_function(self, name: str, args: Dict) -> Dict:
        """Execute a Gemini function by converting to MCP actions.

        Stops at the first failing action; otherwise returns a fresh
        screenshot result so the model sees the post-action state.
        """
        mcp_actions = convert_gemini_3_to_mcp(name, args)
        log_verbose(f"  Converting {name} -> {len(mcp_actions)} MCP action(s)")

        last_result = None
        for i, action in enumerate(mcp_actions):
            log_verbose(f"    Action {i+1}: {action}")
            last_result = await self.mcp.call("computer", action)
            if last_result.get("isError"):
                # Surface the error immediately; skip remaining actions.
                return last_result

        # After executing actions, take a screenshot
        screenshot_result = await self.mcp.call("computer", {"action": "screenshot"})
        return screenshot_result

    async def run(self, prompt: str, max_steps: int) -> Dict[str, Any]:
        """Run the agent on a task.

        Returns the dict built by _result (completed flag, error, final
        answer, step count, timing, transcript).
        """
        start_time = time.time()

        system_prompt = """You are a helpful agent. Complete the task by interacting with the browser.

Use the available tools to click, type, scroll, and interact with the page.
Coordinates are normalized 0-1000 (0,0 is top-left, 1000,1000 is bottom-right).

When done, stop calling tools and provide your final response."""

        # Get Gemini 3 tools
        gemini_tools = get_gemini_3_tools()

        log_verbose("\n" + "="*60)
        log_verbose("SYSTEM PROMPT:")
        log_verbose("="*60)
        log_verbose(system_prompt)

        log_verbose(f"\nTOOLS ({len(gemini_tools)} total):")
        for tool in GEMINI_3_TOOL_DEFINITIONS:
            log_verbose(f"  {tool['name']}: {tool['description'][:80]}...")

        # Configure Gemini with thinking enabled
        config = types.GenerateContentConfig(
            max_output_tokens=65536,
            system_instruction=system_prompt,
            tools=[types.Tool(function_declarations=gemini_tools)],
            thinking_config=types.ThinkingConfig(include_thoughts=True),
        )

        # Set config on session for logging (if session exists)
        if self.session:
            self.session.config = config

        # Take initial screenshot
        print("Taking initial screenshot...")
        initial_screenshot = await self._take_screenshot()

        # Build initial user message with task + screenshot
        user_parts = [Part(text=f"Task: {prompt}")]
        if initial_screenshot:
            user_parts.append(Part(inline_data={
                "mime_type": "image/png",
                "data": initial_screenshot,
            }))
            print("✓ Initial screenshot captured")
        else:
            # Proceed anyway; the model gets its first screenshot after the
            # first tool call.
            print("⚠ Could not capture initial screenshot")

        history: List[Content] = [Content(role="user", parts=user_parts)]
        self.transcript.append({"role": "user", "content": prompt})

        log_verbose("\n" + "="*60)
        log_verbose("USER PROMPT:")
        log_verbose("="*60)
        log_verbose(prompt)

        for step in range(1, max_steps + 1):
            print(f"\n{'='*50}")
            print(f"Step {step}/{max_steps}")

            # Log history size
            log_verbose(f"  History: {len(history)} messages")

            try:
                response = self.client.models.generate_content(
                    model=self.model,
                    contents=history,
                    config=config,
                )
                self._consecutive_errors = 0
            except Exception as e:
                self._consecutive_errors += 1
                error_type = type(e).__name__
                print(f"API error ({error_type}): {e}")
                print(f"  Consecutive errors: {self._consecutive_errors}/{self._max_consecutive_errors}")

                if self._consecutive_errors >= self._max_consecutive_errors:
                    return self._result(False, f"Too many consecutive API errors: {error_type}: {e}", step, start_time)

                # Check for retryable errors
                # (string matching on the message — error classes vary by
                # transport, so status codes are detected textually).
                if "429" in str(e) or "quota" in str(e).lower() or "rate" in str(e).lower():
                    print("  Rate limited, waiting 10s...")
                    await asyncio.sleep(10)
                    continue
                elif "503" in str(e) or "500" in str(e) or "overloaded" in str(e).lower():
                    print("  Server error, waiting 5s...")
                    await asyncio.sleep(5)
                    continue
                else:
                    return self._result(False, f"{error_type}: {e}", step, start_time)

            if not response.candidates:
                # Consumes a step but does not count as an error.
                print("[WARN] No candidates, retrying...")
                log_verbose(f"  Response: {response}")
                continue

            candidate = response.candidates[0]
            if not candidate.content or not candidate.content.parts:
                print("[WARN] Empty response, retrying...")
                continue

            # Extract reasoning trace
            reasoning = extract_reasoning_from_candidate(candidate)
            if reasoning:
                preview = reasoning[:100] + "..." if len(reasoning) > 100 else reasoning
                print(f"🧠 Thinking: {preview}")

            # Log to Fleet session if available; logging failures never
            # interrupt the run.
            if self.session:
                try:
                    await self.session.log(history, response)
                    if step == 1 and self.session.session_id:
                        print(f"Session: https://fleetai.com/dashboard/sessions/{self.session.session_id}")
                except Exception as e:
                    print(f"  [WARN] Session log failed: {type(e).__name__}: {e}")
                    log_verbose(f"  [WARN] Session log failed: {e}")

            # Log all parts for debugging
            log_verbose(f"\n  Response parts ({len(candidate.content.parts)}):")
            for i, part in enumerate(candidate.content.parts):
                if part.text:
                    log_verbose(f"    [{i}] TEXT: {part.text[:300]}{'...' if len(part.text) > 300 else ''}")
                elif part.function_call:
                    fc = part.function_call
                    args_str = json.dumps(dict(fc.args) if fc.args else {})
                    log_verbose(f"    [{i}] FUNCTION_CALL: {fc.name}({args_str})")
                elif hasattr(part, 'thought') and part.thought:
                    log_verbose(f"    [{i}] THOUGHT: {part.thought[:300]}{'...' if len(part.thought) > 300 else ''}")
                else:
                    log_verbose(f"    [{i}] OTHER: {type(part).__name__}")

            # Extract function calls and text (thought-flagged text excluded).
            function_calls = [p.function_call for p in candidate.content.parts if p.function_call]
            text_parts = [p.text for p in candidate.content.parts if p.text and not getattr(p, "thought", False)]

            # Print model output
            if text_parts:
                for text in text_parts:
                    display = text[:200] + "..." if len(text) > 200 else text
                    print(f"Model: {display}")

            # Check for completion (no function calls)
            if text_parts and not function_calls:
                final_text = " ".join(text_parts)
                self.transcript.append({"role": "assistant", "content": final_text})

                # Recognize explicit DONE:/FAILED: verdicts; anything else is
                # treated as a successful final answer.
                if final_text.strip().upper().startswith("DONE:"):
                    answer = final_text.strip()[5:].strip()
                    print(f"\n✓ Agent completed: {answer[:100]}")
                    return self._result(True, None, step, start_time, answer)
                elif final_text.strip().upper().startswith("FAILED:"):
                    error = final_text.strip()[7:].strip()
                    print(f"\n✗ Agent failed: {error[:100]}")
                    return self._result(False, error, step, start_time)
                else:
                    print(f"\n✓ Agent finished with response")
                    return self._result(True, None, step, start_time, final_text)

            # Check for thinking-only response (no function calls, no text)
            if not function_calls and not text_parts:
                print("🧠 Thinking-only response, continuing...")
                # Add thinking to history so model has context
                history.append(candidate.content)
                continue

            if function_calls:
                # Add model's response to history
                history.append(candidate.content)

                log_verbose(f"\n  Executing {len(function_calls)} function call(s):")

                # Execute each function call
                response_parts = []
                for i, fc in enumerate(function_calls):
                    name = fc.name
                    args = dict(fc.args) if fc.args else {}
                    print(f"  Tool {i+1}/{len(function_calls)}: {name}({json.dumps(args)})")
                    self.transcript.append({"role": "tool_call", "name": name, "args": args})

                    try:
                        result = await self._execute_gemini_function(name, args)

                        if result.get("isError"):
                            self._consecutive_errors += 1
                            error_text = ""
                            for c in result.get("content", []):
                                if c.get("type") == "text":
                                    error_text = c.get("text", "")[:200]
                            print(f"  Tool error: {error_text}")

                            # Return error to model
                            response_parts.append(Part(
                                function_response={
                                    "name": name,
                                    "response": {"status": "error", "error": error_text},
                                }
                            ))
                        else:
                            self._consecutive_errors = 0
                            img_data = get_image_data(result)

                            if img_data:
                                # Function response with screenshot
                                response_parts.append(Part(
                                    function_response={
                                        "name": name,
                                        "response": {"status": "success"},
                                    }
                                ))
                                # Add screenshot as inline_data
                                response_parts.append(Part(
                                    inline_data={
                                        "mime_type": "image/png",
                                        "data": img_data,
                                    }
                                ))
                                log_verbose("      Response: screenshot captured")
                            else:
                                response_parts.append(Part(
                                    function_response={
                                        "name": name,
                                        "response": {"status": "success"},
                                    }
                                ))
                                log_verbose("      Response: no screenshot")

                    except Exception as e:
                        self._consecutive_errors += 1
                        error_type = type(e).__name__
                        print(f"  Tool exception ({error_type}): {e}")

                        # A dead MCP connection is unrecoverable — fail fast.
                        if "connection" in str(e).lower() or "closed" in str(e).lower():
                            print("  MCP connection lost, failing task")
                            return self._result(False, f"MCP connection error: {e}", step, start_time)

                        response_parts.append(Part(
                            function_response={
                                "name": name,
                                "response": {"status": "error", "error": str(e)},
                            }
                        ))

                    # Small delay between tool calls
                    if i < len(function_calls) - 1:
                        await asyncio.sleep(0.1)

                # Add function responses to history as user role
                # (Gemini expects function_response in user messages)
                history.append(Content(role="user", parts=response_parts))
                log_verbose(f"  Added {len(response_parts)} response part(s) to history")

        # Max steps reached — reported as completed=True with a note, since
        # the task may have finished without the model saying so.
        print(f"\n⚠ Max steps ({max_steps}) reached")
        return self._result(True, "Max steps reached", max_steps, start_time, "Max steps reached - task may be complete")

    def _result(self, completed: bool, error: Optional[str], steps: int, start_time: float, answer: str = None) -> Dict:
        """Build result dict."""
        return {
            "completed": completed,
            "error": error,
            "final_answer": answer,
            "steps_taken": steps,
            "execution_time_ms": int((time.time() - start_time) * 1000),
            "transcript": self.transcript,
        }
702
+
703
+
704
async def main():
    """Main entry point.

    Reads configuration from environment variables, connects to the MCP
    server, runs the agent, and prints a single JSON result line. Always
    returns a result dict (never raises).
    """
    config = {
        "url": os.environ.get("FLEET_MCP_URL", "http://localhost:8765"),
        "prompt": os.environ.get("FLEET_TASK_PROMPT", ""),
        "task_key": os.environ.get("FLEET_TASK_KEY", ""),
        "job_id": os.environ.get("FLEET_JOB_ID"),
        "instance_id": os.environ.get("FLEET_INSTANCE_ID"),
        "model": os.environ.get("FLEET_MODEL", "gemini-3-pro-preview"),
        "max_steps": int(os.environ.get("FLEET_MAX_STEPS", "200")),
    }

    print("Gemini CUA Agent")
    print(f"  Model: {config['model']}")
    print(f"  MCP: {config['url']}")
    print(f"  Verbose: {VERBOSE}")
    print(f"  Task: {config['prompt'][:80]}...")

    # An API key is only required when not authenticating via gcloud OAuth;
    # the module docstring documents USE_OAUTH as an API-key alternative and
    # get_gemini_client() supports it.
    if not os.environ.get("GEMINI_API_KEY") and not USE_OAUTH:
        result = {"task_key": config["task_key"], "completed": False, "error": "No GEMINI_API_KEY"}
        print(json.dumps(result))
        return result

    try:
        # Create Fleet session for live logging
        session = None
        if os.environ.get("FLEET_API_KEY"):
            session = fleet.session_async(
                job_id=config["job_id"],
                model=config["model"],
                task_key=config["task_key"],
                instance_id=config["instance_id"],
            )

        async with MCP(config["url"]) as mcp:
            agent = GeminiAgent(mcp, config["model"], session=session)
            result = await agent.run(config["prompt"], config["max_steps"])
            result["task_key"] = config["task_key"]
            if session and session.session_id:
                result["session_id"] = session.session_id

            # Single JSON line on stdout is the machine-readable contract.
            print(json.dumps(result))
            return result
    except Exception as e:
        import traceback
        error_msg = f"{type(e).__name__}: {e}"
        print(f"Agent exception: {error_msg}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        result = {"task_key": config["task_key"], "completed": False, "error": error_msg}
        print(json.dumps(result))
        return result
755
+
756
+
757
if __name__ == "__main__":
    # Exit status mirrors task success so container orchestrators can
    # distinguish completed from failed runs.
    result = asyncio.run(main())
    sys.exit(0 if result.get("completed") else 1)