fleet-python 0.2.66b2__py3-none-any.whl → 0.2.105__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/export_tasks.py +16 -5
- examples/export_tasks_filtered.py +245 -0
- examples/fetch_tasks.py +230 -0
- examples/import_tasks.py +140 -8
- examples/iterate_verifiers.py +725 -0
- fleet/__init__.py +128 -5
- fleet/_async/__init__.py +27 -3
- fleet/_async/base.py +24 -9
- fleet/_async/client.py +938 -41
- fleet/_async/env/client.py +60 -3
- fleet/_async/instance/client.py +52 -7
- fleet/_async/models.py +15 -0
- fleet/_async/resources/api.py +200 -0
- fleet/_async/resources/sqlite.py +1801 -46
- fleet/_async/tasks.py +122 -25
- fleet/_async/verifiers/bundler.py +22 -21
- fleet/_async/verifiers/verifier.py +25 -19
- fleet/agent/__init__.py +32 -0
- fleet/agent/gemini_cua/Dockerfile +45 -0
- fleet/agent/gemini_cua/__init__.py +10 -0
- fleet/agent/gemini_cua/agent.py +759 -0
- fleet/agent/gemini_cua/mcp/main.py +108 -0
- fleet/agent/gemini_cua/mcp_server/__init__.py +5 -0
- fleet/agent/gemini_cua/mcp_server/main.py +105 -0
- fleet/agent/gemini_cua/mcp_server/tools.py +178 -0
- fleet/agent/gemini_cua/requirements.txt +5 -0
- fleet/agent/gemini_cua/start.sh +30 -0
- fleet/agent/orchestrator.py +854 -0
- fleet/agent/types.py +49 -0
- fleet/agent/utils.py +34 -0
- fleet/base.py +34 -9
- fleet/cli.py +1061 -0
- fleet/client.py +1060 -48
- fleet/config.py +1 -1
- fleet/env/__init__.py +16 -0
- fleet/env/client.py +60 -3
- fleet/eval/__init__.py +15 -0
- fleet/eval/uploader.py +231 -0
- fleet/exceptions.py +8 -0
- fleet/instance/client.py +53 -8
- fleet/instance/models.py +1 -0
- fleet/models.py +303 -0
- fleet/proxy/__init__.py +25 -0
- fleet/proxy/proxy.py +453 -0
- fleet/proxy/whitelist.py +244 -0
- fleet/resources/api.py +200 -0
- fleet/resources/sqlite.py +1845 -46
- fleet/tasks.py +113 -20
- fleet/utils/__init__.py +7 -0
- fleet/utils/http_logging.py +178 -0
- fleet/utils/logging.py +13 -0
- fleet/utils/playwright.py +440 -0
- fleet/verifiers/bundler.py +22 -21
- fleet/verifiers/db.py +985 -1
- fleet/verifiers/decorator.py +1 -1
- fleet/verifiers/verifier.py +25 -19
- {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/METADATA +28 -1
- fleet_python-0.2.105.dist-info/RECORD +115 -0
- {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/WHEEL +1 -1
- fleet_python-0.2.105.dist-info/entry_points.txt +2 -0
- tests/test_app_method.py +85 -0
- tests/test_expect_exactly.py +4148 -0
- tests/test_expect_only.py +2593 -0
- tests/test_instance_dispatch.py +607 -0
- tests/test_sqlite_resource_dual_mode.py +263 -0
- tests/test_sqlite_shared_memory_behavior.py +117 -0
- fleet_python-0.2.66b2.dist-info/RECORD +0 -81
- tests/test_verifier_security.py +0 -427
- {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/licenses/LICENSE +0 -0
- {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,759 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Gemini CUA Agent (Standalone)
|
|
4
|
+
|
|
5
|
+
Env vars:
|
|
6
|
+
GEMINI_API_KEY: API key
|
|
7
|
+
FLEET_MCP_URL: CUA server URL (http://localhost:PORT)
|
|
8
|
+
FLEET_TASK_PROMPT: Task prompt
|
|
9
|
+
FLEET_TASK_KEY: Task key
|
|
10
|
+
FLEET_MODEL: Model (default: gemini-3-pro-preview)
|
|
11
|
+
FLEET_MAX_STEPS: Max steps (default: 200)
|
|
12
|
+
FLEET_VERBOSE: Enable verbose logging (default: false)
|
|
13
|
+
USE_OAUTH: Use gcloud OAuth instead of API key (default: false)
|
|
14
|
+
GOOG_PROJECT: Google Cloud project for OAuth (default: gemini-agents-area)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
import subprocess
|
|
21
|
+
import sys
|
|
22
|
+
import time
|
|
23
|
+
from typing import Any, Dict, List, Optional
|
|
24
|
+
|
|
25
|
+
from google.genai.types import Content, Part
|
|
26
|
+
from google import genai
|
|
27
|
+
from google.genai import types
|
|
28
|
+
from mcp import ClientSession
|
|
29
|
+
from mcp.client.streamable_http import streamable_http_client
|
|
30
|
+
|
|
31
|
+
import fleet
|
|
32
|
+
from fleet.utils.logging import log_verbose, VERBOSE
|
|
33
|
+
|
|
34
|
+
# Whitelist hooks for auto-detecting model endpoints (optional)
# Default is a no-op; when FLEET_PROXY_ENABLED is set it is replaced by the
# real register_endpoint from fleet.proxy.whitelist and the proxy hooks are
# installed at import time.
_register_endpoint = lambda url: None
if os.environ.get("FLEET_PROXY_ENABLED"):
    from fleet.proxy.whitelist import install_hooks, register_endpoint as _register_endpoint
    install_hooks()

# OAuth configuration
# Google Cloud project billed for OAuth requests (see module docstring).
GOOG_PROJECT = os.environ.get("GOOG_PROJECT", "gemini-agents-area")
# When truthy ("true"/"1"/"yes"), authenticate via gcloud application-default
# credentials instead of GEMINI_API_KEY.
USE_OAUTH = os.environ.get("USE_OAUTH", "false").lower() in ("true", "1", "yes")

# Screen dimensions for coordinate denormalization (matches MCP browser)
SCREEN_WIDTH = 1366
SCREEN_HEIGHT = 768
|
|
47
|
+
|
|
48
|
+
# Gemini 3 tool definitions (0-1000 normalized coordinates)
# JSON-schema style declarations surfaced to the model via
# types.FunctionDeclaration in get_gemini_3_tools(). Each call is later
# translated to MCP "computer" actions by convert_gemini_3_to_mcp().
GEMINI_3_TOOL_DEFINITIONS = [
    # Single left click at a normalized coordinate.
    {
        "name": "click_at",
        "description": "Click at the specified screen coordinates. Coordinates are normalized 0-1000.",
        "parameters": {
            "type": "object",
            "properties": {
                "x": {
                    "type": "integer",
                    "description": "X coordinate (0-1000, where 0 is left edge, 1000 is right edge)",
                },
                "y": {
                    "type": "integer",
                    "description": "Y coordinate (0-1000, where 0 is top edge, 1000 is bottom edge)",
                },
            },
            "required": ["x", "y"],
        },
    },
    # Type into the currently focused element, optionally pressing Enter.
    {
        "name": "type_text",
        "description": "Type text at the current cursor position. Use click_at first to focus the input field.",
        "parameters": {
            "type": "object",
            "properties": {
                "text": {
                    "type": "string",
                    "description": "The text to type",
                },
                "press_enter": {
                    "type": "boolean",
                    "description": "Whether to press Enter after typing (default: false)",
                },
            },
            "required": ["text"],
        },
    },
    # Press a single key or combo; names are normalized by normalize_key_name.
    {
        "name": "key_press",
        "description": "Press a key or key combination (e.g., 'Enter', 'Tab', 'Meta+A', 'Ctrl+C', 'Backspace').",
        "parameters": {
            "type": "object",
            "properties": {
                "keys": {
                    "type": "string",
                    "description": "Key or key combination to press",
                },
            },
            "required": ["keys"],
        },
    },
    # Scroll the page vertically from the screen center.
    {
        "name": "scroll",
        "description": "Scroll the page up or down.",
        "parameters": {
            "type": "object",
            "properties": {
                "direction": {
                    "type": "string",
                    "description": "Direction to scroll: 'up' or 'down'",
                    "enum": ["up", "down"],
                },
            },
            "required": ["direction"],
        },
    },
    # Pause to let the page settle (e.g. after navigation).
    {
        "name": "wait",
        "description": "Wait for a few seconds to allow page to load.",
        "parameters": {
            "type": "object",
            "properties": {
                "seconds": {
                    "type": "integer",
                    "description": "Number of seconds to wait (1-10)",
                },
            },
            "required": ["seconds"],
        },
    },
]
|
|
130
|
+
|
|
131
|
+
# Key name normalization for xdotool/X11 keysym compatibility.
# Maps lower-cased key aliases (as a model might emit them) to the keysym
# spellings xdotool understands; anything not listed passes through as-is.
_KEY_NAME_MAP_LOWER = {
    # Editing / whitespace
    "backspace": "BackSpace",
    "enter": "Return", "return": "Return",
    "tab": "Tab", "space": "space",
    "esc": "Escape", "escape": "Escape",
    "del": "Delete", "delete": "Delete",
    # Navigation
    "pgup": "Page_Up", "pageup": "Page_Up",
    "pgdown": "Page_Down", "pgdn": "Page_Down", "pagedown": "Page_Down",
    "home": "Home", "end": "End", "insert": "Insert",
    # Arrow keys (DOM-style and short spellings)
    "arrowleft": "Left", "arrowright": "Right", "arrowup": "Up", "arrowdown": "Down",
    "left": "Left", "right": "Right", "up": "Up", "down": "Down",
    # Modifiers
    "meta": "super", "command": "super", "cmd": "super", "super": "super",
    "ctrl": "ctrl", "control": "ctrl",
    "alt": "alt", "shift": "shift",
    # Function keys F1..F12
    **{f"f{n}": f"F{n}" for n in range(1, 13)},
}


def normalize_key_name(key: str) -> str:
    """Translate *key* (or a '+'-joined combination) to xdotool keysym form.

    Each token is looked up case-insensitively in _KEY_NAME_MAP_LOWER;
    unknown tokens and the empty string are returned unchanged.
    """
    if not key:
        return key

    def translate(token: str) -> str:
        return _KEY_NAME_MAP_LOWER.get(token.lower(), token)

    if "+" not in key:
        return translate(key)
    return "+".join(translate(token) for token in key.split("+"))
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def get_oauth_token() -> str:
    """Fetch a short-lived access token from the gcloud CLI.

    Runs ``gcloud auth application-default print-access-token`` and returns
    its stdout stripped of whitespace. Raises subprocess.CalledProcessError
    when the command exits non-zero (check=True).
    """
    command = ["gcloud", "auth", "application-default", "print-access-token"]
    completed = subprocess.run(command, capture_output=True, check=True)
    return completed.stdout.decode().strip()
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def get_gemini_client() -> genai.Client:
    """Build a Gemini client honoring OAuth and custom-endpoint settings.

    Reads GEMINI_API_KEY and FLEET_MODEL_ENDPOINT from the environment and
    registers the endpoint that will actually be used with the proxy
    whitelist. When OAuth or a custom endpoint is configured, matching
    HttpOptions are attached; otherwise plain API-key auth is used.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    custom_endpoint = os.environ.get("FLEET_MODEL_ENDPOINT")

    # Tell the (optional) proxy whitelist which host we will talk to.
    _register_endpoint(custom_endpoint or "generativelanguage.googleapis.com")

    http_opts = None
    if custom_endpoint or USE_OAUTH:
        option_kwargs = {}
        if custom_endpoint:
            option_kwargs["base_url"] = custom_endpoint
            log_verbose(f"Using custom endpoint: {custom_endpoint}")
        if USE_OAUTH:
            # OAuth needs a bearer token, a billing-project header, and the
            # v1alpha API surface.
            option_kwargs["headers"] = {
                "Authorization": f"Bearer {get_oauth_token()}",
                "X-Goog-User-Project": GOOG_PROJECT,
            }
            option_kwargs["api_version"] = "v1alpha"
            log_verbose(f"Using OAuth (project: {GOOG_PROJECT})")
        http_opts = types.HttpOptions(**option_kwargs)

    return genai.Client(api_key=api_key, http_options=http_opts)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def convert_gemini_3_to_mcp(
    function_name: str,
    args: Dict[str, Any],
    screen_width: Optional[int] = None,
    screen_height: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """Convert a Gemini 3 custom function call to MCP computer-tool actions.

    Coordinates arrive normalized to 0-1000 and are denormalized to pixel
    positions. ``screen_width`` / ``screen_height`` default to the module's
    SCREEN_WIDTH / SCREEN_HEIGHT but may be overridden for other displays
    (backward-compatible generalization of the previously hard-coded dims).

    Returns a list of MCP actions since some functions expand to multiple
    steps (e.g. ``type_text`` with ``press_enter``). Unknown function names
    fall back to a single screenshot action so the model can recover.
    """
    width = SCREEN_WIDTH if screen_width is None else screen_width
    height = SCREEN_HEIGHT if screen_height is None else screen_height

    def denormalize_x(x: int) -> int:
        return int(x / 1000 * width)

    def denormalize_y(y: int) -> int:
        return int(y / 1000 * height)

    mcp_actions: List[Dict[str, Any]] = []

    if function_name == "click_at":
        # Missing coordinates default to the screen center (500 normalized).
        x = denormalize_x(args.get("x", 500))
        y = denormalize_y(args.get("y", 500))
        mcp_actions.append({"action": "left_click", "coordinate": [x, y]})

    elif function_name == "type_text":
        text = args.get("text", "")
        press_enter = args.get("press_enter", False)
        mcp_actions.append({"action": "type", "text": text})
        if press_enter:
            mcp_actions.append({"action": "key", "text": "Return"})

    elif function_name == "key_press":
        keys = args.get("keys", "Return")
        mcp_actions.append({"action": "key", "text": normalize_key_name(keys)})

    elif function_name == "scroll":
        direction = args.get("direction", "down")
        mcp_actions.append({
            "action": "scroll",
            "coordinate": [width // 2, height // 2],
            "scroll_direction": direction,
            "scroll_amount": 5,
        })

    elif function_name == "wait":
        # Clamp to the 1-10s range advertised in the tool schema; previously
        # only the upper bound was enforced, letting 0/negative through.
        seconds = max(1, min(args.get("seconds", 3), 10))
        mcp_actions.append({"action": "wait", "duration": seconds})

    else:
        # Unknown function, fallback to screenshot
        mcp_actions.append({"action": "screenshot"})

    return mcp_actions
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class MCP:
    """MCP client using streamable-http transport.

    Async context manager that connects to a Fleet CUA server, initializes
    an MCP session, caches the server's tool list, and optionally appends a
    JSONL traffic log of every tool call to FLEET_SESSION_LOG.
    """

    def __init__(self, url: str, log_file: Optional[str] = None):
        # Normalize the base URL to end in "/mcp/" as the server expects.
        self.url = url.rstrip("/") + "/mcp/"
        self._session: Optional[ClientSession] = None
        self._client = None
        # Traffic log path: explicit argument wins over the env var.
        self._log_file = log_file or os.environ.get("FLEET_SESSION_LOG")
        self._log_handle = None
        if self._log_file:
            from pathlib import Path
            Path(self._log_file).parent.mkdir(parents=True, exist_ok=True)
            # Append mode so repeated runs accumulate in one log file.
            self._log_handle = open(self._log_file, "a")

    async def __aenter__(self):
        # Connect transport -> open session -> MCP initialize handshake.
        # Failures are printed and re-raised so the caller can abort.
        print(f"MCP: Connecting to {self.url}...")
        try:
            self._client = streamable_http_client(self.url)
            read, write, _ = await self._client.__aenter__()
            self._session = ClientSession(read, write)
            await self._session.__aenter__()
            await self._session.initialize()
            print(f"MCP: Connected successfully")
        except Exception as e:
            print(f"MCP: Connection failed: {type(e).__name__}: {e}")
            raise

        # Fetch available tools from server
        try:
            result = await self._session.list_tools()
            # Cache a plain-dict view of the server's tool declarations.
            self._tools = [
                {
                    "name": tool.name,
                    "description": tool.description or "",
                    "inputSchema": tool.inputSchema,
                }
                for tool in result.tools
            ]
            print(f"MCP: Loaded {len(self._tools)} tools")
        except Exception as e:
            print(f"MCP: Failed to list tools: {type(e).__name__}: {e}")
            raise
        return self

    async def __aexit__(self, *args):
        # Tear down in reverse order of construction: session, transport, log.
        if self._session:
            await self._session.__aexit__(*args)
        if self._client:
            await self._client.__aexit__(*args)
        if self._log_handle:
            self._log_handle.close()

    def _log(self, entry: dict):
        """Log an entry to the traffic file."""
        # No-op unless a log file was configured in __init__.
        if self._log_handle:
            from datetime import datetime
            entry["timestamp"] = datetime.now().isoformat()
            entry["url"] = self.url
            # One JSON object per line; flush so logs survive crashes.
            self._log_handle.write(json.dumps(entry) + "\n")
            self._log_handle.flush()

    async def call(self, name: str, args: Dict = None) -> Dict:
        """Call a tool and return the result.

        Returns a dict with "content" (list of normalized text/image items)
        and "isError". Timing and result shape are recorded via _log().
        """
        start_time = time.time()
        result = await self._session.call_tool(name, args or {})
        duration_ms = int((time.time() - start_time) * 1000)

        # Debug: log raw MCP result structure
        log_verbose(f" MCP result.content ({len(result.content)} items):")
        for i, item in enumerate(result.content):
            log_verbose(f" [{i}] type={type(item).__name__}, attrs={dir(item)[:10]}...")
            if hasattr(item, "type"):
                log_verbose(f" .type = {repr(item.type)}")
            if hasattr(item, "data"):
                data_preview = str(item.data)[:50] if item.data else "None"
                log_verbose(f" .data = {data_preview}...")

        # Helper to get attribute or dict key
        # (MCP SDK versions differ on whether content items are objects or dicts.)
        def _get(item, key, default=None):
            if isinstance(item, dict):
                return item.get(key, default)
            return getattr(item, key, default)

        # Normalize content items into plain dicts; other types are dropped.
        content = []
        for item in result.content:
            item_type = _get(item, "type")
            if item_type == "image":
                content.append({
                    "type": "image",
                    "data": _get(item, "data", ""),
                    "mimeType": _get(item, "mimeType", "image/png"),
                })
            elif item_type == "text":
                content.append({"type": "text", "text": _get(item, "text", "")})

        self._log({
            "type": "mcp_call",
            "tool": name,
            "args": args or {},
            "duration_ms": duration_ms,
            "response_content_types": [c.get("type") for c in content],
            "is_error": result.isError if hasattr(result, "isError") else False,
        })
        return {"content": content, "isError": result.isError if hasattr(result, "isError") else False}
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def get_gemini_3_tools() -> List[types.FunctionDeclaration]:
    """Materialize GEMINI_3_TOOL_DEFINITIONS as FunctionDeclaration objects."""
    declarations: List[types.FunctionDeclaration] = []
    for definition in GEMINI_3_TOOL_DEFINITIONS:
        declarations.append(
            types.FunctionDeclaration(
                name=definition["name"],
                description=definition["description"],
                parameters=definition["parameters"],
            )
        )
    return declarations
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def get_image_data(result: Dict) -> Optional[str]:
    """Return the first base64 image payload in an MCP result, if any.

    Scans ``result["content"]`` in order and returns the ``data`` field of
    the first entry typed ``"image"``; ``None`` when no image is present.
    """
    entries = result.get("content", [])
    first_image = next((e for e in entries if e.get("type") == "image"), None)
    return first_image.get("data") if first_image is not None else None
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def extract_reasoning_from_candidate(candidate) -> Optional[str]:
    """Collect the model's reasoning trace from a Gemini response candidate.

    A part contributes when it is an explicit thought (``part.thought`` is a
    string, or is ``True`` with the text carried in ``part.text``), or —
    only when the candidate also contains function calls — when it is plain
    text accompanying those calls. Contributions are joined with blank
    lines; returns ``None`` for empty candidates or when nothing qualifies.
    """
    if not (candidate and candidate.content and candidate.content.parts):
        return None

    parts = candidate.content.parts
    accompanies_tool_use = any(getattr(p, "function_call", None) for p in parts)

    collected = []
    for part in parts:
        thought = getattr(part, "thought", None)
        text = getattr(part, "text", None)
        if thought:
            if isinstance(thought, str):
                collected.append(thought)
            elif thought is True and text:
                collected.append(text)
        elif text and accompanies_tool_use:
            collected.append(text)

    return "\n\n".join(collected) if collected else None
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
class GeminiAgent:
    """Gemini Computer Use Agent.

    Drives a screenshot -> model -> tool-call loop: each step sends the
    conversation history (with screenshots) to Gemini, executes any
    function calls through the MCP client, and feeds results back until the
    model stops calling tools or max_steps is reached.
    """

    def __init__(self, mcp: MCP, model: str, session=None):
        self.mcp = mcp
        # Accept fully-qualified model IDs ("models/gemini-...") as well as
        # bare names; only the final path segment is sent to the API.
        self.model = model.split("/")[-1] if "/" in model else model
        self.client = get_gemini_client()
        # Flat record of the conversation (user/assistant/tool_call entries).
        self.transcript: List[Dict] = []
        # Optional Fleet session used for live dashboard logging.
        self.session = session
        # Consecutive API/tool failures; the run aborts at the max.
        self._consecutive_errors = 0
        self._max_consecutive_errors = 5

    async def _take_screenshot(self) -> Optional[str]:
        """Take a screenshot and return base64 data.

        Returns None (and logs) on any failure rather than raising.
        """
        try:
            result = await self.mcp.call("computer", {"action": "screenshot"})
            return get_image_data(result)
        except Exception as e:
            print(f"Screenshot failed: {e}")
            return None

    async def _execute_gemini_function(self, name: str, args: Dict) -> Dict:
        """Execute a Gemini function by converting to MCP actions.

        Runs each converted action in order; stops early and returns the
        failing result if any action errors. On success, returns a fresh
        screenshot so the model sees the post-action state.
        """
        mcp_actions = convert_gemini_3_to_mcp(name, args)
        log_verbose(f" Converting {name} -> {len(mcp_actions)} MCP action(s)")

        last_result = None
        for i, action in enumerate(mcp_actions):
            log_verbose(f" Action {i+1}: {action}")
            last_result = await self.mcp.call("computer", action)
            if last_result.get("isError"):
                return last_result

        # After executing actions, take a screenshot
        screenshot_result = await self.mcp.call("computer", {"action": "screenshot"})
        return screenshot_result

    async def run(self, prompt: str, max_steps: int) -> Dict[str, Any]:
        """Run the agent on a task.

        Args:
            prompt: Natural-language task description.
            max_steps: Hard cap on model turns before giving up.

        Returns:
            Result dict from _result(): completed flag, error, final answer,
            step count, elapsed time, and the transcript.
        """
        start_time = time.time()

        system_prompt = """You are a helpful agent. Complete the task by interacting with the browser.

Use the available tools to click, type, scroll, and interact with the page.
Coordinates are normalized 0-1000 (0,0 is top-left, 1000,1000 is bottom-right).

When done, stop calling tools and provide your final response."""

        # Get Gemini 3 tools
        gemini_tools = get_gemini_3_tools()

        log_verbose("\n" + "="*60)
        log_verbose("SYSTEM PROMPT:")
        log_verbose("="*60)
        log_verbose(system_prompt)

        log_verbose(f"\nTOOLS ({len(gemini_tools)} total):")
        for tool in GEMINI_3_TOOL_DEFINITIONS:
            log_verbose(f" {tool['name']}: {tool['description'][:80]}...")

        # Configure Gemini with thinking enabled
        config = types.GenerateContentConfig(
            max_output_tokens=65536,
            system_instruction=system_prompt,
            tools=[types.Tool(function_declarations=gemini_tools)],
            thinking_config=types.ThinkingConfig(include_thoughts=True),
        )

        # Set config on session for logging (if session exists)
        if self.session:
            self.session.config = config

        # Take initial screenshot
        print("Taking initial screenshot...")
        initial_screenshot = await self._take_screenshot()

        # Build initial user message with task + screenshot
        user_parts = [Part(text=f"Task: {prompt}")]
        if initial_screenshot:
            user_parts.append(Part(inline_data={
                "mime_type": "image/png",
                "data": initial_screenshot,
            }))
            print("✓ Initial screenshot captured")
        else:
            # Proceed anyway; the model can request a screenshot via a tool.
            print("⚠ Could not capture initial screenshot")

        history: List[Content] = [Content(role="user", parts=user_parts)]
        self.transcript.append({"role": "user", "content": prompt})

        log_verbose("\n" + "="*60)
        log_verbose("USER PROMPT:")
        log_verbose("="*60)
        log_verbose(prompt)

        for step in range(1, max_steps + 1):
            print(f"\n{'='*50}")
            print(f"Step {step}/{max_steps}")

            # Log history size
            log_verbose(f" History: {len(history)} messages")

            try:
                response = self.client.models.generate_content(
                    model=self.model,
                    contents=history,
                    config=config,
                )
                # Any successful call resets the failure streak.
                self._consecutive_errors = 0
            except Exception as e:
                self._consecutive_errors += 1
                error_type = type(e).__name__
                print(f"API error ({error_type}): {e}")
                print(f" Consecutive errors: {self._consecutive_errors}/{self._max_consecutive_errors}")

                if self._consecutive_errors >= self._max_consecutive_errors:
                    return self._result(False, f"Too many consecutive API errors: {error_type}: {e}", step, start_time)

                # Check for retryable errors
                # Rate limits back off longer than transient server errors;
                # anything else is treated as fatal for this run.
                if "429" in str(e) or "quota" in str(e).lower() or "rate" in str(e).lower():
                    print(" Rate limited, waiting 10s...")
                    await asyncio.sleep(10)
                    continue
                elif "503" in str(e) or "500" in str(e) or "overloaded" in str(e).lower():
                    print(" Server error, waiting 5s...")
                    await asyncio.sleep(5)
                    continue
                else:
                    return self._result(False, f"{error_type}: {e}", step, start_time)

            # An empty response burns a step but is retried silently.
            if not response.candidates:
                print("[WARN] No candidates, retrying...")
                log_verbose(f" Response: {response}")
                continue

            candidate = response.candidates[0]
            if not candidate.content or not candidate.content.parts:
                print("[WARN] Empty response, retrying...")
                continue

            # Extract reasoning trace
            reasoning = extract_reasoning_from_candidate(candidate)
            if reasoning:
                preview = reasoning[:100] + "..." if len(reasoning) > 100 else reasoning
                print(f"🧠 Thinking: {preview}")

            # Log to Fleet session if available
            # Logging failures are reported but never interrupt the run.
            if self.session:
                try:
                    await self.session.log(history, response)
                    if step == 1 and self.session.session_id:
                        print(f"Session: https://fleetai.com/dashboard/sessions/{self.session.session_id}")
                except Exception as e:
                    print(f" [WARN] Session log failed: {type(e).__name__}: {e}")
                    log_verbose(f" [WARN] Session log failed: {e}")

            # Log all parts for debugging
            log_verbose(f"\n Response parts ({len(candidate.content.parts)}):")
            for i, part in enumerate(candidate.content.parts):
                if part.text:
                    log_verbose(f" [{i}] TEXT: {part.text[:300]}{'...' if len(part.text) > 300 else ''}")
                elif part.function_call:
                    fc = part.function_call
                    args_str = json.dumps(dict(fc.args) if fc.args else {})
                    log_verbose(f" [{i}] FUNCTION_CALL: {fc.name}({args_str})")
                elif hasattr(part, 'thought') and part.thought:
                    log_verbose(f" [{i}] THOUGHT: {part.thought[:300]}{'...' if len(part.thought) > 300 else ''}")
                else:
                    log_verbose(f" [{i}] OTHER: {type(part).__name__}")

            # Extract function calls and text
            # Thought-flagged text is excluded from the user-visible output.
            function_calls = [p.function_call for p in candidate.content.parts if p.function_call]
            text_parts = [p.text for p in candidate.content.parts if p.text and not getattr(p, "thought", False)]

            # Print model output
            if text_parts:
                for text in text_parts:
                    display = text[:200] + "..." if len(text) > 200 else text
                    print(f"Model: {display}")

            # Check for completion (no function calls)
            # "DONE:" / "FAILED:" prefixes carry an explicit verdict; any
            # other tool-free text is treated as a successful final answer.
            if text_parts and not function_calls:
                final_text = " ".join(text_parts)
                self.transcript.append({"role": "assistant", "content": final_text})

                if final_text.strip().upper().startswith("DONE:"):
                    answer = final_text.strip()[5:].strip()
                    print(f"\n✓ Agent completed: {answer[:100]}")
                    return self._result(True, None, step, start_time, answer)
                elif final_text.strip().upper().startswith("FAILED:"):
                    error = final_text.strip()[7:].strip()
                    print(f"\n✗ Agent failed: {error[:100]}")
                    return self._result(False, error, step, start_time)
                else:
                    print(f"\n✓ Agent finished with response")
                    return self._result(True, None, step, start_time, final_text)

            # Check for thinking-only response (no function calls, no text)
            if not function_calls and not text_parts:
                print("🧠 Thinking-only response, continuing...")
                # Add thinking to history so model has context
                history.append(candidate.content)
                continue

            if function_calls:
                # Add model's response to history
                history.append(candidate.content)

                log_verbose(f"\n Executing {len(function_calls)} function call(s):")

                # Execute each function call
                response_parts = []
                for i, fc in enumerate(function_calls):
                    name = fc.name
                    args = dict(fc.args) if fc.args else {}
                    print(f" Tool {i+1}/{len(function_calls)}: {name}({json.dumps(args)})")
                    self.transcript.append({"role": "tool_call", "name": name, "args": args})

                    try:
                        result = await self._execute_gemini_function(name, args)

                        if result.get("isError"):
                            self._consecutive_errors += 1
                            # Surface the last text item of the error payload
                            # (truncated) back to the model.
                            error_text = ""
                            for c in result.get("content", []):
                                if c.get("type") == "text":
                                    error_text = c.get("text", "")[:200]
                            print(f" Tool error: {error_text}")

                            # Return error to model
                            response_parts.append(Part(
                                function_response={
                                    "name": name,
                                    "response": {"status": "error", "error": error_text},
                                }
                            ))
                        else:
                            self._consecutive_errors = 0
                            img_data = get_image_data(result)

                            if img_data:
                                # Function response with screenshot
                                response_parts.append(Part(
                                    function_response={
                                        "name": name,
                                        "response": {"status": "success"},
                                    }
                                ))
                                # Add screenshot as inline_data
                                response_parts.append(Part(
                                    inline_data={
                                        "mime_type": "image/png",
                                        "data": img_data,
                                    }
                                ))
                                log_verbose(" Response: screenshot captured")
                            else:
                                response_parts.append(Part(
                                    function_response={
                                        "name": name,
                                        "response": {"status": "success"},
                                    }
                                ))
                                log_verbose(" Response: no screenshot")

                    except Exception as e:
                        self._consecutive_errors += 1
                        error_type = type(e).__name__
                        print(f" Tool exception ({error_type}): {e}")

                        # A dead MCP transport cannot recover; fail the task.
                        if "connection" in str(e).lower() or "closed" in str(e).lower():
                            print(" MCP connection lost, failing task")
                            return self._result(False, f"MCP connection error: {e}", step, start_time)

                        response_parts.append(Part(
                            function_response={
                                "name": name,
                                "response": {"status": "error", "error": str(e)},
                            }
                        ))

                    # Small delay between tool calls
                    if i < len(function_calls) - 1:
                        await asyncio.sleep(0.1)

                # Add function responses to history as user role
                # (Gemini expects function_response in user messages)
                history.append(Content(role="user", parts=response_parts))
                log_verbose(f" Added {len(response_parts)} response part(s) to history")

        # Max steps reached
        # Reported as completed=True with a note, since the task state is
        # unknown rather than known-failed.
        print(f"\n⚠ Max steps ({max_steps}) reached")
        return self._result(True, "Max steps reached", max_steps, start_time, "Max steps reached - task may be complete")

    def _result(self, completed: bool, error: Optional[str], steps: int, start_time: float, answer: str = None) -> Dict:
        """Build result dict.

        Args:
            completed: Whether the agent considers the task done.
            error: Error description, or None.
            steps: Number of model turns consumed.
            start_time: time.time() at run start, for elapsed-ms computation.
            answer: Final answer text, if any.
        """
        return {
            "completed": completed,
            "error": error,
            "final_answer": answer,
            "steps_taken": steps,
            "execution_time_ms": int((time.time() - start_time) * 1000),
            "transcript": self.transcript,
        }
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
async def main():
    """Main entry point.

    Reads all configuration from FLEET_*/GEMINI_* environment variables,
    runs the agent against the MCP server, prints the result as a single
    JSON line on stdout, and returns the result dict. Never raises: every
    failure path is converted into a {"completed": False, ...} result.
    """
    config = {
        "url": os.environ.get("FLEET_MCP_URL", "http://localhost:8765"),
        "prompt": os.environ.get("FLEET_TASK_PROMPT", ""),
        "task_key": os.environ.get("FLEET_TASK_KEY", ""),
        "job_id": os.environ.get("FLEET_JOB_ID"),
        "instance_id": os.environ.get("FLEET_INSTANCE_ID"),
        "model": os.environ.get("FLEET_MODEL", "gemini-3-pro-preview"),
        "max_steps": int(os.environ.get("FLEET_MAX_STEPS", "200")),
    }

    print("Gemini CUA Agent")
    print(f" Model: {config['model']}")
    print(f" MCP: {config['url']}")
    print(f" Verbose: {VERBOSE}")
    print(f" Task: {config['prompt'][:80]}...")

    # Fail fast (with a machine-readable result) when no API key is set.
    if not os.environ.get("GEMINI_API_KEY"):
        result = {"task_key": config["task_key"], "completed": False, "error": "No GEMINI_API_KEY"}
        print(json.dumps(result))
        return result

    try:
        # Create Fleet session for live logging
        # (only when a Fleet API key is available).
        session = None
        if os.environ.get("FLEET_API_KEY"):
            session = fleet.session_async(
                job_id=config["job_id"],
                model=config["model"],
                task_key=config["task_key"],
                instance_id=config["instance_id"],
            )

        async with MCP(config["url"]) as mcp:
            agent = GeminiAgent(mcp, config["model"], session=session)
            result = await agent.run(config["prompt"], config["max_steps"])
            result["task_key"] = config["task_key"]
            if session and session.session_id:
                result["session_id"] = session.session_id

            # Final JSON result on stdout for the orchestrator to parse.
            print(json.dumps(result))
            return result
    except Exception as e:
        # Any unexpected failure: trace to stderr, JSON result to stdout.
        import traceback
        error_msg = f"{type(e).__name__}: {e}"
        print(f"Agent exception: {error_msg}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        result = {"task_key": config["task_key"], "completed": False, "error": error_msg}
        print(json.dumps(result))
        return result
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
if __name__ == "__main__":
    # Exit status mirrors the agent's verdict: 0 on completion, 1 otherwise.
    result = asyncio.run(main())
    sys.exit(0 if result.get("completed") else 1)
|