fleet-python 0.2.8__tar.gz → 0.2.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fleet-python might be problematic. Click here for more details.
- {fleet_python-0.2.8 → fleet_python-0.2.10}/PKG-INFO +1 -1
- {fleet_python-0.2.8 → fleet_python-0.2.10}/examples/example_sync.py +1 -1
- fleet_python-0.2.10/examples/gemini_example.py +432 -0
- fleet_python-0.2.10/examples/json_tasks_example.py +164 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/__init__.py +2 -0
- fleet_python-0.2.10/fleet/_async/base.py +132 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/client.py +11 -5
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/env/client.py +6 -4
- {fleet_python-0.2.8/fleet → fleet_python-0.2.10/fleet/_async}/exceptions.py +9 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/instance/client.py +1 -1
- fleet_python-0.2.10/fleet/base.py +132 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/client.py +11 -5
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/env/client.py +6 -4
- {fleet_python-0.2.8/fleet/_async → fleet_python-0.2.10/fleet}/exceptions.py +9 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/instance/client.py +1 -1
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet_python.egg-info/PKG-INFO +1 -1
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet_python.egg-info/SOURCES.txt +1 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/pyproject.toml +1 -1
- fleet_python-0.2.8/examples/json_tasks_example.py +0 -85
- fleet_python-0.2.8/fleet/_async/base.py +0 -51
- fleet_python-0.2.8/fleet/base.py +0 -51
- {fleet_python-0.2.8 → fleet_python-0.2.10}/LICENSE +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/README.md +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/examples/dsl_example.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/examples/example.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/examples/example_client.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/examples/nova_act_example.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/examples/openai_example.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/examples/openai_simple_example.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/examples/quickstart.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/__init__.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/env/__init__.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/instance/__init__.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/instance/base.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/instance/models.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/models.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/playwright.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/resources/__init__.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/resources/base.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/resources/browser.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/_async/resources/sqlite.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/env/__init__.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/instance/__init__.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/instance/base.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/instance/models.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/models.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/playwright.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/resources/__init__.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/resources/base.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/resources/browser.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/resources/sqlite.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/verifiers/__init__.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/verifiers/code.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/verifiers/db.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet/verifiers/sql_differ.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet_python.egg-info/dependency_links.txt +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet_python.egg-info/requires.txt +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/fleet_python.egg-info/top_level.txt +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/scripts/fix_sync_imports.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/scripts/unasync.py +0 -0
- {fleet_python-0.2.8 → fleet_python-0.2.10}/setup.cfg +0 -0
|
@@ -11,7 +11,7 @@ def main():
|
|
|
11
11
|
environments = flt.env.list_envs()
|
|
12
12
|
print("Environments:", len(environments))
|
|
13
13
|
|
|
14
|
-
instances = flt.env.list_instances()
|
|
14
|
+
instances = flt.env.list_instances(status="running")
|
|
15
15
|
print("Instances:", len(instances))
|
|
16
16
|
|
|
17
17
|
# Create a new instance
|
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import argparse
|
|
4
|
+
from typing import List, Dict, Any, Optional, Tuple, TypedDict
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from google import genai
|
|
7
|
+
from google.genai import types
|
|
8
|
+
import fleet as flt
|
|
9
|
+
from dotenv import load_dotenv
|
|
10
|
+
import base64
|
|
11
|
+
import re
|
|
12
|
+
import time
|
|
13
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
14
|
+
|
|
15
|
+
load_dotenv()
|
|
16
|
+
|
|
17
|
+
# Initialize Gemini client
|
|
18
|
+
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
|
|
19
|
+
MODEL = "gemini-2.5-pro"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Problem(TypedDict):
    """Shape of one entry in the ``"problems"`` list of an evaluation JSON file."""

    # Identifier used in progress logs and in the final results table.
    id: str
    # Natural-language task text handed to the agent's ``solve_task``.
    problem: str
    # Not read by the code in this example.
    category: str
    # Not read by the code in this example.
    difficulty: str
    # Source code of the verifier function; its name is extracted with a
    # regex and both are passed to ``env.verify_raw``.
    verifier_func: str
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class GeminiAgent:
    """Screenshot-driven browser agent that asks a Gemini model for one action per step.

    Each step takes a screenshot through the browser wrapper, sends it with a
    textual prompt to the module-level ``client``, parses the JSON reply and
    executes the requested action on the browser.
    """

    # Scroll direction -> (horizontal, vertical) sign applied to ``amount * 100``.
    _SCROLL_VECTORS = {
        "down": (0, 1),
        "up": (0, -1),
        "right": (1, 0),
        "left": (-1, 0),
    }

    def __init__(
        self,
        browser: flt.FleetPlaywrightWrapper,
        model: str = MODEL,
        print_steps: bool = True,
        debug: bool = False,
    ):
        """Store the browser wrapper and the output settings.

        Args:
            browser: Wrapper exposing ``click``, ``type``, ``keypress``,
                ``scroll``, ``goto`` and ``screenshot``.
            model: Model name passed to ``client.models.generate_content``.
            print_steps: Print the model's reasoning and each action.
            debug: Additionally print ``[DEBUG]`` diagnostics.
        """
        self.browser = browser
        self.model = model
        self.print_steps = print_steps
        self.debug = debug
        self.conversation_history = []
        self.last_action = None  # Track the last action performed

    @property
    def page(self):
        """Access the underlying Playwright page object, or None if absent."""
        return getattr(self.browser, "_page", None)

    def debug_print(self, *args):
        """Print ``args`` with a ``[DEBUG]`` prefix when debug mode is on."""
        if self.debug:
            print("[DEBUG]", *args)

    def take_screenshot(self) -> str:
        """Return the browser wrapper's screenshot (base64 text, decoded later)."""
        return self.browser.screenshot()

    @staticmethod
    def _point(params: Dict[str, Any]) -> Tuple[Any, Any]:
        """Read a target position from ``x``/``y`` or a ``coordinate`` pair."""
        fallback = params.get("coordinate", [0, 0])
        return params.get("x", fallback[0]), params.get("y", fallback[1])

    def execute_action(self, action: Dict[str, Any]) -> Dict[str, Any]:
        """Run one model-requested action on the browser.

        Args:
            action: Mapping with a ``"type"`` and optional ``"parameters"``.

        Returns:
            ``{"success": True}`` on success, otherwise
            ``{"success": False, "error": <message>}``. This method never
            raises for malformed model output, so one bad step cannot abort
            the surrounding task loop.
        """
        # The model occasionally emits a string or null instead of an object.
        if not isinstance(action, dict):
            return {
                "success": False,
                "error": f"Action must be a JSON object, got: {action!r}",
            }

        action_type = action.get("type")
        # Treat an explicit JSON null the same as a missing "parameters" key.
        params = action.get("parameters") or {}

        if self.print_steps:
            print(f"Action: {action_type}({params})")

        try:
            if action_type == "click":
                x, y = self._point(params)
                self.browser.click(x=x, y=y)
                # Small delay to ensure click is registered and element is focused
                time.sleep(0.2)
                self.last_action = {"type": "click", "target": params}
            elif action_type == "type":
                text = params.get("text", "")
                self.browser.type(text=text)
                self.last_action = {"type": "type", "text": text}
            elif action_type == "key":
                # FleetPlaywrightWrapper expects keypress with a list of keys
                key = params.get("key", "")
                self.browser.keypress([key])
                self.last_action = {"type": "key", "key": key}
            elif action_type == "scroll":
                # FleetPlaywrightWrapper expects scroll(x, y, scroll_x, scroll_y)
                x, y = self._point(params)
                direction = params.get("direction", "down")
                amount = params.get("amount", 5)
                # Unknown directions scroll by (0, 0), as before.
                sign_x, sign_y = self._SCROLL_VECTORS.get(direction, (0, 0))
                self.browser.scroll(
                    x=x,
                    y=y,
                    scroll_x=sign_x * amount * 100,
                    scroll_y=sign_y * amount * 100,
                )
                self.last_action = {"type": "scroll"}
            elif action_type == "wait":
                time.sleep(params.get("seconds", 1))
                self.last_action = {"type": "wait"}
            elif action_type == "navigate":
                url = params.get("url", "")
                if not url:
                    # Previously reported success without doing anything.
                    return {
                        "success": False,
                        "error": "navigate action requires a non-empty 'url'",
                    }
                self.browser.goto(url)
                self.last_action = {"type": "navigate", "url": url}
            else:
                return {
                    "success": False,
                    "error": f"Unknown action type: {action_type}",
                }

            return {"success": True}
        except Exception as e:
            return {"success": False, "error": str(e)}

    def create_prompt_with_screenshot(
        self, task: str, screenshot_b64: str
    ) -> List[Any]:
        """Build the prompt text plus a PNG image part for one step.

        Args:
            task: Task description inserted into the prompt.
            screenshot_b64: Base64-encoded screenshot; decoded and attached
                with MIME type ``image/png``.

        Returns:
            ``[prompt_text, image_part]`` to pass as ``contents``.
        """
        # Add context about last action
        last_action_context = ""
        if self.last_action:
            if self.last_action["type"] == "click":
                last_action_context = f"\n\nIMPORTANT: You just clicked at coordinates {self.last_action['target']}. If you clicked on a text input field, search bar, or any editable element, you MUST now use the 'type' action to enter text. Do not click the same element again."
            elif self.last_action["type"] == "type":
                last_action_context = f"\n\nYou just typed: '{self.last_action['text']}'. You may now need to press Enter or click a button to submit."

        prompt_text = (
            "You are an AI agent that can interact with web browsers. "
            f"Your task is to: {task}\n\n"
            "You can see the current state of the browser in the screenshot provided."
            f"{last_action_context}\n\n"
            "You can perform the following actions:\n"
            '- click: Click at specific coordinates {"type": "click", "parameters": {"x": x, "y": y}}\n'
            '- type: Type text into the currently focused element {"type": "type", "parameters": {"text": "text to type"}}\n'
            '- key: Press a special key {"type": "key", "parameters": {"key": "Enter"}} (e.g., "Enter", "Tab", "Escape")\n'
            '- scroll: Scroll the page {"type": "scroll", "parameters": {"x": x, "y": y, "direction": "down", "amount": 5}} (direction: up/down/left/right)\n'
            '- wait: Wait for a number of seconds {"type": "wait", "parameters": {"seconds": 1}}\n\n'
            "CRITICAL RULES:\n"
            "1. After clicking on ANY text input, search bar, or form field, you MUST type in the next step\n"
            "2. Never click the same element twice in a row\n"
            "3. If you mention searching for something in your reasoning, you must actually type the search query\n"
            "4. Common workflow: click search bar → type query → press Enter\n\n"
            "Analyze the screenshot and decide what action to take next. Respond with a JSON object containing:\n"
            '- "reasoning": Your analysis of the current state and what needs to be done\n'
            '- "action": The action to perform (as described above)\n'
            '- "completed": true if the task is complete, false otherwise\n\n'
            "Example responses:\n"
            "{\n"
            ' "reasoning": "I can see a search bar at the top. I need to click on it first to focus it.",\n'
            ' "action": {"type": "click", "parameters": {"x": 450, "y": 30}},\n'
            ' "completed": false\n'
            "}\n\n"
            "{\n"
            ' "reasoning": "I just clicked on the search bar and it should now be focused. I need to type my search query for PHI encryption ticket.",\n'
            ' "action": {"type": "type", "parameters": {"text": "PHI encryption"}},\n'
            ' "completed": false\n'
            "}\n\n"
            "{\n"
            ' "reasoning": "I typed the search query. Now I need to press Enter to execute the search.",\n'
            ' "action": {"type": "key", "parameters": {"key": "Enter"}},\n'
            ' "completed": false\n'
            "}"
        )

        return [
            prompt_text,
            types.Part.from_bytes(
                data=base64.b64decode(screenshot_b64), mime_type="image/png"
            ),
        ]

    def solve_task(self, task: str, max_steps: int = 30) -> Tuple[bool, str]:
        """Drive the browser until the model reports completion or steps run out.

        Args:
            task: Task description for the model.
            max_steps: Maximum number of model round-trips. Unparseable
                replies also consume a step.

        Returns:
            ``(True, message)`` when the model sets ``"completed"``, otherwise
            ``(False, reason)``. Exceptions are converted into the reason.
        """
        # The "you just clicked/typed" hint must describe this task only; the
        # agent object is reused for several tasks in interactive mode.
        self.last_action = None

        try:
            for step in range(1, max_steps + 1):
                screenshot = self.take_screenshot()
                prompt_parts = self.create_prompt_with_screenshot(task, screenshot)

                response = client.models.generate_content(
                    model=self.model,
                    contents=prompt_parts,
                    config=types.GenerateContentConfig(
                        response_mime_type="application/json",
                        temperature=0.1,  # Lower temperature for more deterministic behavior
                    ),
                )

                # ``response.text`` can be None (no text parts); "" fails
                # json.loads with JSONDecodeError and takes the retry path
                # instead of aborting the task with a TypeError.
                raw_text = response.text or ""
                try:
                    result = json.loads(raw_text)
                except json.JSONDecodeError as e:
                    self.debug_print(f"Failed to parse Gemini response: {e}")
                    self.debug_print(f"Response text: {raw_text}")
                    print(
                        f"[ERROR] Invalid JSON response from Gemini: {raw_text[:200]}..."
                    )
                    continue

                # Valid JSON that is not an object (list, string, number)
                # cannot carry reasoning/action/completed fields.
                if not isinstance(result, dict):
                    print(
                        f"[ERROR] Invalid JSON response from Gemini: {raw_text[:200]}..."
                    )
                    continue

                self.debug_print(f"Step {step}: {result}")

                if self.print_steps:
                    print(
                        f"Step {step}: {result.get('reasoning', 'No reasoning provided')}"
                    )

                # Debug: Print the full action if in debug mode
                if self.debug and "action" in result:
                    print(f"[DEBUG] Full action: {result['action']}")

                if result.get("completed", False):
                    return True, "Task completed successfully"

                if "action" in result:
                    action_result = self.execute_action(result["action"])
                    if not action_result["success"]:
                        failure = f"Action failed: {action_result.get('error')}"
                        self.debug_print(failure)
                        # Surface failures in normal runs too, not only in debug.
                        if self.print_steps and not self.debug:
                            print(f"[WARNING] {failure}")
                else:
                    print(f"[WARNING] No action in response: {result}")

                # Small delay to let the page update
                time.sleep(0.5)

            return False, f"Max steps ({max_steps}) reached without completing the task"

        except Exception as e:
            return False, f"Error during task execution: {str(e)}"
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def extract_function_name(function_str: str) -> str:
    """Return the name of the first ``def`` / ``async def`` found in the source text.

    Raises:
        ValueError: If no function definition header is present.
    """
    found = re.search(r"(?:async\s+)?def\s+(\w+)\s*\(", function_str)
    if found is None:
        raise ValueError(f"No function name found in {function_str}")
    return found.group(1)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def evaluate_problem(
    problem: Problem,
    problem_idx: int,
    total_problems: int,
    env_key: str,
    max_steps: int = 30,
) -> Tuple[str, bool, Optional[str]]:
    """Solve one problem in a fresh environment and run its verifier.

    Args:
        problem: Problem record; ``id``, ``problem`` and ``verifier_func`` are read.
        problem_idx: Zero-based index, used only for log prefixes.
        total_problems: Total count, used only for log prefixes.
        env_key: Environment key passed to ``flt.env.make``.
        max_steps: Step budget forwarded to ``GeminiAgent.solve_task``.

    Returns:
        ``(problem_id, verifier_success, error)``. ``error`` carries the
        agent's failure message, or the exception text if processing
        failed; it is ``None`` when the agent reported completion.
    """
    tag = f"[Problem {problem_idx + 1}/{total_problems}]"
    env = None
    browser = None

    try:
        # Create environment
        env = flt.env.make(env_key)
        print(f"{tag} Created environment for {problem['id']}: {env.urls.app}")

        # Create browser wrapper
        browser = flt.FleetPlaywrightWrapper(env)
        browser.start()

        # Create agent
        agent = GeminiAgent(browser, print_steps=True, debug=False)

        # Solve the problem
        print(f"{tag} Solving {problem['id']}...")
        solved, message = agent.solve_task(problem["problem"], max_steps=max_steps)

        if not solved:
            print(f"{tag} Failed to solve: {message}")

        # The verifier runs even when the agent did not report completion;
        # the environment state decides pass/fail.
        function_name = extract_function_name(problem["verifier_func"])
        print(f"{tag} Verifying {function_name} ({problem['id']})...")
        response = env.verify_raw(problem["verifier_func"], function_name)

        print(
            f"{tag} Result for {problem['id']}: {'✓' if response.success else '✗'}"
        )

        # Keep the agent's failure reason so the summary can show it.
        return problem["id"], response.success, None if solved else message

    except Exception as e:
        print(f"{tag} Fatal error processing {problem['id']}: {e}")
        return problem["id"], False, str(e)
    finally:
        # Close each resource independently: a failing browser.close() must
        # not leak the environment, and cleanup errors must not replace the
        # result tuple being returned.
        if browser:
            try:
                browser.close()
            except Exception as e:
                print(f"{tag} Error closing browser for {problem['id']}: {e}")
        if env:
            try:
                env.close()
            except Exception as e:
                print(f"{tag} Error closing environment for {problem['id']}: {e}")
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def interactive_mode():
    """Run a prompt loop that feeds typed tasks to a GeminiAgent.

    Creates a ``"hubspot"`` environment and a browser wrapper, then reads
    tasks from stdin until the user enters ``quit``/``exit``/``q``, presses
    Ctrl-C, or stdin reaches end of input. The browser and the environment
    are closed on the way out.
    """
    # Create a Fleet environment instance
    instance = flt.env.make("hubspot")
    browser = None

    try:
        # Start the browser inside the try so a startup failure still
        # releases the environment.
        browser = flt.FleetPlaywrightWrapper(instance)
        browser.start()

        agent = GeminiAgent(browser, print_steps=True, debug=False)

        print("Gemini Agent Interactive Mode")
        print("Type your task or 'quit' to exit")
        print("-" * 60)

        while True:
            try:
                user_input = input("\n> ")
            except (EOFError, KeyboardInterrupt):
                # EOFError is an Exception subclass; if it reached the
                # generic handler below, the loop would spin forever on a
                # closed stdin.
                print("\nShutting down...")
                break

            command = user_input.strip()
            if not command:
                continue  # nothing to do for a blank line
            if command.lower() in ("quit", "exit", "q"):
                break

            try:
                success, message = agent.solve_task(command)
                print(f"\nResult: {'Success' if success else 'Failed'} - {message}")
            except KeyboardInterrupt:
                print("\nShutting down...")
                break
            except Exception as e:
                print(f"Error: {e}")

    finally:
        # Close the environment even if closing the browser fails.
        try:
            if browser is not None:
                browser.close()
        finally:
            instance.close()
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def evaluate_from_json(
    json_file: str,
    max_concurrent: int = 3,
    max_steps: int = 30,
    env_key: str = "fira:v1.3.1",
):
    """Evaluate every problem in a JSON file and print a pass/fail summary.

    Args:
        json_file: Path to a JSON document with a top-level ``"problems"`` list.
        max_concurrent: Worker-thread count for the thread pool.
        max_steps: Step budget forwarded to ``evaluate_problem``.
        env_key: Environment key forwarded to ``evaluate_problem``.

    Raises:
        FileNotFoundError: If ``json_file`` does not exist.
    """
    file_path = Path(json_file)
    if not file_path.exists():
        raise FileNotFoundError(f"Error: File '{json_file}' not found")

    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    problems: List[Problem] = data["problems"]

    print(f"Loaded {len(problems)} problems from '{json_file}'")
    if not problems:
        # Nothing to run; also avoids dividing by zero in the summary.
        print("No problems to evaluate")
        return

    print(f"Running with max {max_concurrent} concurrent tasks")
    print("-" * 60)

    # Process problems with thread pool for concurrency
    results = []
    with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
        # Submit all tasks
        future_to_problem = {
            executor.submit(
                evaluate_problem, problem, idx, len(problems), env_key, max_steps
            ): (problem, idx)
            for idx, problem in enumerate(problems)
        }

        # Collect results as they complete
        for future in as_completed(future_to_problem):
            problem, _ = future_to_problem[future]
            try:
                results.append(future.result())
            except Exception as e:
                # One crashed worker must not discard the other results.
                results.append((problem["id"], False, str(e)))

    # Display results
    print("\n" + "=" * 60)
    print("EVALUATION RESULTS")
    print("=" * 60)

    successes = 0
    for problem_id, success, error in results:
        status = "✓ PASS" if success else "✗ FAIL"
        print(f"{status} | {problem_id}")
        if error and not success:
            print(f" └─ Error: {error}")
        if success:
            successes += 1

    print("-" * 60)
    print(f"Total problems: {len(problems)}")
    print(f"Successes: {successes}")
    print(f"Failures: {len(problems) - successes}")
    print(f"Success rate: {successes / len(problems):.2%}")
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def main():
    """Parse the command line and run either batch evaluation or interactive mode.

    ``--eval PATH`` takes precedence over ``--interactive``.

    Raises:
        ValueError: If neither ``--eval`` nor ``--interactive`` is given.
    """
    cli = argparse.ArgumentParser(description="Gemini Agent for Fleet SDK")
    cli.add_argument(
        "--eval", type=str, help="Path to JSON file with problems to evaluate"
    )
    cli.add_argument(
        "--max-concurrent",
        type=int,
        default=3,
        help="Maximum number of concurrent evaluations (default: 3)",
    )
    cli.add_argument(
        "--max-steps",
        type=int,
        default=30,
        help="Maximum steps per problem (default: 30)",
    )
    cli.add_argument(
        "--interactive", action="store_true", help="Run in interactive mode"
    )

    options = cli.parse_args()

    # Guard-clause dispatch: batch evaluation first, then the interactive loop.
    if options.eval:
        evaluate_from_json(options.eval, options.max_concurrent, options.max_steps)
        return
    if options.interactive:
        interactive_mode()
        return
    raise ValueError("No arguments provided")
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
if __name__ == "__main__":
|
|
432
|
+
main()
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import asyncio
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
from typing import TypedDict, List, Optional, Tuple
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import fleet as flt
|
|
8
|
+
from nova_act import NovaAct, ActResult
|
|
9
|
+
from dotenv import load_dotenv
|
|
10
|
+
|
|
11
|
+
load_dotenv()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
MAX_STEPS = 30
|
|
15
|
+
MAX_CONCURRENT_TASKS = 5 # Limit concurrent tasks to avoid overwhelming the system
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Problem(TypedDict):
    """Shape of one entry in the ``"problems"`` list of the input JSON file."""

    # Identifier used in progress logs and in the final results table.
    id: str
    # Natural-language task text passed to ``nova.act``.
    problem: str
    # Not read by the code in this example.
    category: str
    # Not read by the code in this example.
    difficulty: str
    # Source code of the verifier function; its name is extracted with a
    # regex and both are passed to ``env.verify_raw``.
    verifier_func: str
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def extract_function_name(function_str: str) -> str:
    """Return the name of the first ``def`` / ``async def`` in ``function_str``.

    Args:
        function_str: Source text expected to contain a function definition.

    Returns:
        The identifier following the first ``def`` keyword. The function
        never returns ``None``; a missing definition is reported by raising.

    Raises:
        ValueError: If no function definition header is found.
    """
    match = re.search(r"(?:async\s+)?def\s+(\w+)\s*\(", function_str)
    if match is None:
        raise ValueError(f"No function name found in {function_str}")
    return match.group(1)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
async def process_problem(
    problem: Problem, problem_idx: int, total_problems: int, env_key: str
) -> Tuple[str, bool, Optional[str]]:
    """Solve one problem with NovaAct in a fresh environment, then verify it.

    Returns ``(problem_id, verifier_success, error)``. ``error`` is the text
    of an exception raised while solving (verification still runs in that
    case), the text of a fatal exception, or ``None``.
    """
    env = None
    try:
        # Each problem gets its own environment instance.
        env = await flt.env.make_async(env_key)
        print(
            f"[Problem {problem_idx + 1}/{total_problems}] Created environment for {problem['id']}: {env.urls.app}"
        )

        # NovaAct is synchronous, so it is run in a worker thread.
        def run_nova() -> ActResult:
            with NovaAct(starting_page=env.urls.app, headless=True) as nova:
                return nova.act(problem["problem"], max_steps=MAX_STEPS)

        # Stays None unless the solving phase raises.
        error_msg = None
        try:
            print(
                f"[Problem {problem_idx + 1}/{total_problems}] Solving {problem['id']}..."
            )
            await asyncio.to_thread(run_nova)
        except Exception as solve_error:
            print(
                f"[Problem {problem_idx + 1}/{total_problems}] Error during solving {problem['id']}: {solve_error}"
            )
            error_msg = str(solve_error)

        # Verification runs whether or not solving raised.
        function_name = extract_function_name(problem["verifier_func"])
        print(
            f"[Problem {problem_idx + 1}/{total_problems}] Verifying {function_name} ({problem['id']})..."
        )
        response = await env.verify_raw(problem["verifier_func"], function_name)

        verdict = "✓" if response.success else "✗"
        print(
            f"[Problem {problem_idx + 1}/{total_problems}] Result for {problem['id']}: {verdict}"
        )

        return problem["id"], response.success, error_msg

    except Exception as fatal:
        print(
            f"[Problem {problem_idx + 1}/{total_problems}] Fatal error processing {problem['id']}: {fatal}"
        )
        return problem["id"], False, str(fatal)
    finally:
        # Best-effort cleanup: closing errors are logged, never raised.
        if env:
            try:
                await env.close()
                print(
                    f"[Problem {problem_idx + 1}/{total_problems}] Closed environment for {problem['id']}"
                )
            except Exception as close_error:
                print(
                    f"[Problem {problem_idx + 1}/{total_problems}] Error closing environment for {problem['id']}: {close_error}"
                )
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
async def main():
    """Load problems from a JSON file, evaluate them concurrently, print a summary.

    Command line: ``json_file`` (required) and ``--max-concurrent`` (default
    ``MAX_CONCURRENT_TASKS``), which bounds how many problems run at once.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
        SystemExit: Via ``parser.error`` when ``--max-concurrent`` is below 1.
    """
    parser = argparse.ArgumentParser(
        description="Load and display Jira problems from JSON file"
    )
    parser.add_argument(
        "json_file", type=str, help="Path to the JSON file containing problems"
    )
    parser.add_argument(
        "--max-concurrent",
        type=int,
        default=MAX_CONCURRENT_TASKS,
        help=f"Maximum number of concurrent tasks (default: {MAX_CONCURRENT_TASKS})",
    )
    args = parser.parse_args()

    # Semaphore(0) would block every task forever, and a negative value
    # raises deep inside asyncio; reject both at the CLI boundary.
    if args.max_concurrent < 1:
        parser.error("--max-concurrent must be at least 1")

    file_path = Path(args.json_file)
    if not file_path.exists():
        raise FileNotFoundError(f"Error: File '{args.json_file}' not found")

    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        problems: List[Problem] = data["problems"]

        print(f"Loaded {len(problems)} problems from '{args.json_file}'")
        if not problems:
            # Nothing to run; also avoids dividing by zero in the summary.
            print("No problems to evaluate")
            return

        print(f"Running with max {args.max_concurrent} concurrent tasks")
        print("-" * 60)

        # Create a semaphore to limit concurrent tasks
        semaphore = asyncio.Semaphore(args.max_concurrent)

        async def process_with_semaphore(
            problem: Problem, idx: int
        ) -> Tuple[str, bool, Optional[str]]:
            async with semaphore:
                return await process_problem(problem, idx, len(problems), "fira:v1.2.7")

        # Process all problems concurrently (with semaphore limiting)
        tasks = [
            process_with_semaphore(problem, i) for i, problem in enumerate(problems)
        ]

        results = await asyncio.gather(*tasks)

        # Count successes and display summary
        print("\n" + "=" * 60)
        print("EVALUATION RESULTS")
        print("=" * 60)

        successes = 0
        for problem_id, success, error in results:
            status = "✓ PASS" if success else "✗ FAIL"
            print(f"{status} | {problem_id}")
            if error and not success:
                print(f" └─ Error: {error}")
            if success:
                successes += 1

        print("-" * 60)
        print(f"Total problems: {len(problems)}")
        print(f"Successes: {successes}")
        print(f"Failures: {len(problems) - successes}")
        print(f"Success rate: {successes / len(problems):.2%}")

    except Exception as e:
        print(f"Fatal error: {e}")
        raise
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
if __name__ == "__main__":
|
|
164
|
+
asyncio.run(main())
|