lybic-guiagents 0.1.0 (lybic_guiagents-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lybic-guiagents has been flagged by the registry scanner as possibly problematic.

Files changed (85)
  1. desktop_env/__init__.py +1 -0
  2. desktop_env/actions.py +203 -0
  3. desktop_env/controllers/__init__.py +0 -0
  4. desktop_env/controllers/python.py +471 -0
  5. desktop_env/controllers/setup.py +882 -0
  6. desktop_env/desktop_env.py +509 -0
  7. desktop_env/evaluators/__init__.py +5 -0
  8. desktop_env/evaluators/getters/__init__.py +41 -0
  9. desktop_env/evaluators/getters/calc.py +15 -0
  10. desktop_env/evaluators/getters/chrome.py +1774 -0
  11. desktop_env/evaluators/getters/file.py +154 -0
  12. desktop_env/evaluators/getters/general.py +42 -0
  13. desktop_env/evaluators/getters/gimp.py +38 -0
  14. desktop_env/evaluators/getters/impress.py +126 -0
  15. desktop_env/evaluators/getters/info.py +24 -0
  16. desktop_env/evaluators/getters/misc.py +406 -0
  17. desktop_env/evaluators/getters/replay.py +20 -0
  18. desktop_env/evaluators/getters/vlc.py +86 -0
  19. desktop_env/evaluators/getters/vscode.py +35 -0
  20. desktop_env/evaluators/metrics/__init__.py +160 -0
  21. desktop_env/evaluators/metrics/basic_os.py +68 -0
  22. desktop_env/evaluators/metrics/chrome.py +493 -0
  23. desktop_env/evaluators/metrics/docs.py +1011 -0
  24. desktop_env/evaluators/metrics/general.py +665 -0
  25. desktop_env/evaluators/metrics/gimp.py +637 -0
  26. desktop_env/evaluators/metrics/libreoffice.py +28 -0
  27. desktop_env/evaluators/metrics/others.py +92 -0
  28. desktop_env/evaluators/metrics/pdf.py +31 -0
  29. desktop_env/evaluators/metrics/slides.py +957 -0
  30. desktop_env/evaluators/metrics/table.py +585 -0
  31. desktop_env/evaluators/metrics/thunderbird.py +176 -0
  32. desktop_env/evaluators/metrics/utils.py +719 -0
  33. desktop_env/evaluators/metrics/vlc.py +524 -0
  34. desktop_env/evaluators/metrics/vscode.py +283 -0
  35. desktop_env/providers/__init__.py +35 -0
  36. desktop_env/providers/aws/__init__.py +0 -0
  37. desktop_env/providers/aws/manager.py +278 -0
  38. desktop_env/providers/aws/provider.py +186 -0
  39. desktop_env/providers/aws/provider_with_proxy.py +315 -0
  40. desktop_env/providers/aws/proxy_pool.py +193 -0
  41. desktop_env/providers/azure/__init__.py +0 -0
  42. desktop_env/providers/azure/manager.py +87 -0
  43. desktop_env/providers/azure/provider.py +207 -0
  44. desktop_env/providers/base.py +97 -0
  45. desktop_env/providers/gcp/__init__.py +0 -0
  46. desktop_env/providers/gcp/manager.py +0 -0
  47. desktop_env/providers/gcp/provider.py +0 -0
  48. desktop_env/providers/virtualbox/__init__.py +0 -0
  49. desktop_env/providers/virtualbox/manager.py +463 -0
  50. desktop_env/providers/virtualbox/provider.py +124 -0
  51. desktop_env/providers/vmware/__init__.py +0 -0
  52. desktop_env/providers/vmware/manager.py +455 -0
  53. desktop_env/providers/vmware/provider.py +105 -0
  54. gui_agents/__init__.py +0 -0
  55. gui_agents/agents/Action.py +209 -0
  56. gui_agents/agents/__init__.py +0 -0
  57. gui_agents/agents/agent_s.py +832 -0
  58. gui_agents/agents/global_state.py +610 -0
  59. gui_agents/agents/grounding.py +651 -0
  60. gui_agents/agents/hardware_interface.py +129 -0
  61. gui_agents/agents/manager.py +568 -0
  62. gui_agents/agents/translator.py +132 -0
  63. gui_agents/agents/worker.py +355 -0
  64. gui_agents/cli_app.py +560 -0
  65. gui_agents/core/__init__.py +0 -0
  66. gui_agents/core/engine.py +1496 -0
  67. gui_agents/core/knowledge.py +449 -0
  68. gui_agents/core/mllm.py +555 -0
  69. gui_agents/tools/__init__.py +0 -0
  70. gui_agents/tools/tools.py +727 -0
  71. gui_agents/unit_test/__init__.py +0 -0
  72. gui_agents/unit_test/run_tests.py +65 -0
  73. gui_agents/unit_test/test_manager.py +330 -0
  74. gui_agents/unit_test/test_worker.py +269 -0
  75. gui_agents/utils/__init__.py +0 -0
  76. gui_agents/utils/analyze_display.py +301 -0
  77. gui_agents/utils/common_utils.py +263 -0
  78. gui_agents/utils/display_viewer.py +281 -0
  79. gui_agents/utils/embedding_manager.py +53 -0
  80. gui_agents/utils/image_axis_utils.py +27 -0
  81. lybic_guiagents-0.1.0.dist-info/METADATA +416 -0
  82. lybic_guiagents-0.1.0.dist-info/RECORD +85 -0
  83. lybic_guiagents-0.1.0.dist-info/WHEEL +5 -0
  84. lybic_guiagents-0.1.0.dist-info/licenses/LICENSE +201 -0
  85. lybic_guiagents-0.1.0.dist-info/top_level.txt +2 -0
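
The file diff below appears to correspond to item 67 in the list above, gui_agents/core/knowledge.py (+449 lines): the KnowledgeBase class that backs web-search query formulation plus episodic and narrative memory retrieval for the agents. A hedged usage sketch follows the diff.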
@@ -0,0 +1,449 @@
+ import json
+ import os
+ from typing import Dict, Tuple, List, Union
+ import numpy as np
+ from sklearn.metrics.pairwise import cosine_similarity
+ from gui_agents.utils.common_utils import (
+     load_embeddings,
+     load_knowledge_base,
+     save_embeddings,
+ )
+ from gui_agents.tools.tools import Tools
+ from gui_agents.agents.global_state import GlobalState
+ from gui_agents.store.registry import Registry
+ from gui_agents.core.mllm import CostManager
+
+ def get_embedding_dim(model_name):
+     if model_name == "doubao-embedding-large-text-250515":
+         return 2048
+     elif model_name == "doubao-embedding-text-240715":
+         return 2560
+     elif model_name == "text-embedding-ada-002":
+         return 1536
+     elif model_name == "text-embedding-3-small":
+         return 1536
+     elif model_name == "text-embedding-3-large":
+         return 3072
+     elif model_name == "gemini-embedding-001":
+         return 3072
+     elif model_name == "jina-embeddings-v4":
+         return 2048
+     elif model_name == "jina-embeddings-v3":
+         return 1024
+     elif model_name == "text-embedding-v4":
+         return 1024
+     elif model_name == "text-embedding-v3":
+         return 1024
+     elif model_name == "embedding-2" or model_name == "embedding-3":
+         return 2048
+     else:
+         return None
+
+ class KnowledgeBase:
+     def __init__(
+         self,
+         embedding_engine: Tools,
+         local_kb_path: str,
+         platform: str,
+         Tools_dict: Dict,
+         save_knowledge: bool = True,
+     ):
+         self.platform = platform
+
+         self.local_kb_path = local_kb_path
+
+         # initialize embedding engine
+         self.embedding_engine = embedding_engine
+
+         # Initialize paths for different memory types
+         self.episodic_memory_path = os.path.join(
+             self.local_kb_path, self.platform, "episodic_memory.json"
+         )
+         self.narrative_memory_path = os.path.join(
+             self.local_kb_path, self.platform, "narrative_memory.json"
+         )
+         embedding_model_name = ""
+         if hasattr(self.embedding_engine, "tools") and "embedding" in self.embedding_engine.tools:
+             embedding_model_name = self.embedding_engine.tools["embedding"].model_name
+         else:
+             embedding_model_name = "default"
+         embedding_dim = get_embedding_dim(embedding_model_name)
+         self.embeddings_path = os.path.join(
+             self.local_kb_path, self.platform, f"embeddings_{embedding_model_name}_{embedding_dim}.pkl"
+         )
+
+         # Initialize trajectory tracking
+         self.task_trajectory = ""
+         self.current_subtask_trajectory = ""
+         self.current_search_query = ""
+
+         self.query_formulator = Tools()
+         self.query_formulator.register_tool(
+             "query_formulator",
+             Tools_dict["query_formulator"]["provider"],
+             Tools_dict["query_formulator"]["model"],
+         )
+
+         self.knowledge_fusion_agent = Tools()
+         self.knowledge_fusion_agent.register_tool(
+             "context_fusion",
+             Tools_dict["context_fusion"]["provider"],
+             Tools_dict["context_fusion"]["model"],
+         )
+
+         self.narrative_summarization_agent = Tools()
+         self.narrative_summarization_agent.register_tool(
+             "narrative_summarization",
+             Tools_dict["narrative_summarization"]["provider"],
+             Tools_dict["narrative_summarization"]["model"],
+         )
+
+         self.episode_summarization_agent = Tools()
+         self.episode_summarization_agent.register_tool(
+             "episode_summarization",
+             Tools_dict["episode_summarization"]["provider"],
+             Tools_dict["episode_summarization"]["model"],
+         )
+
+         self.save_knowledge = save_knowledge
+
+     def retrieve_knowledge(
+         self, instruction: str, search_query: str, search_engine: Tools
+     ) -> Tuple[str, List[int], str]:
+         """Retrieve knowledge using the search engine.
+
+         Args:
+             instruction (str): task instruction
+             search_query (str): search query to use
+             search_engine (Tools): search engine tool to use
+
+         Returns:
+             Tuple[str, List[int], str]: the search results, token usage, and cost string
+         """
+         search_results, total_tokens, cost_string = search_engine.execute_tool(
+             "websearch", {"str_input": instruction + " " + search_query}
+         )
+
+         return search_results, total_tokens, cost_string
+
+     def formulate_query(self, instruction: str, observation: Dict) -> Tuple[str, List[int], str]:
+         """Formulate a search query based on the instruction and current state.
+
+         Args:
+             instruction (str): The task instruction
+             observation (Dict): Current observation including screenshot
+
+         Returns:
+             Tuple[str, List[int], str]: the formulated query, token usage, and cost string
+         """
+         query_path = os.path.join(
+             self.local_kb_path, self.platform, "formulate_query.json"
+         )
+         try:
+             with open(query_path, "r") as f:
+                 formulate_query = json.load(f)
+         except Exception:
+             formulate_query = {}
+
+         if instruction in formulate_query:
+             return formulate_query[instruction], [0, 0, 0], ""
+
+         self.query_formulator.tools["query_formulator"].llm_agent.reset()
+
+         content, total_tokens, cost_string = self.query_formulator.execute_tool("query_formulator", {
+             "str_input": f"The task is: {instruction}\n" +
+             "To use google search to get some useful information, first carefully analyze " +
+             "the screenshot of the current desktop UI state, then given the task " +
+             "instruction, formulate a question that can be used to search on the Internet " +
+             "for information in helping with the task execution.\n" +
+             "The question should not be too general or too specific. Please ONLY provide " +
+             "the question.\nQuestion:",
+             "img_input": observation["screenshot"] if "screenshot" in observation else None
+         })
+
+         search_query = content.strip().replace('"', "")
+
+         print("search query: ", search_query)
+         formulate_query[instruction] = search_query
+         with open(query_path, "w") as f:
+             json.dump(formulate_query, f, indent=2)
+
+         return search_query, total_tokens, cost_string
+
+     def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
+         """Retrieve narrative experience using embeddings.
+
+         Args:
+             instruction (str): The task instruction
+
+         Returns:
+             Tuple[str, str, List[int], str]: the similar task key, its narrative experience,
+             token usage, and cost string
+         """
+         knowledge_base = load_knowledge_base(self.narrative_memory_path)
+         if not knowledge_base:
+             return "None", "None", [0, 0, 0], ""
+
+         embeddings = load_embeddings(self.embeddings_path)
+
+         # Get or create instruction embedding
+         instruction_embedding = embeddings.get(instruction)
+         total_tokens, cost_string = [0, 0, 0], ""
+
+         if instruction_embedding is None:
+             instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
+                 "embedding", {"str_input": instruction}
+             )
+             embeddings[instruction] = instruction_embedding
+             for i in range(len(total_tokens)):
+                 total_tokens[i] += tokens[i]
+             cost_string = cost_string_now
+
+         # Get or create embeddings for knowledge base entries
+         candidate_embeddings = []
+         for key in knowledge_base:
+             candidate_embedding = embeddings.get(key)
+             if candidate_embedding is None:
+                 candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
+                     "embedding", {"str_input": key}
+                 )
+                 for i in range(len(tokens)):
+                     total_tokens[i] += tokens[i]
+                 cost_string = CostManager.add_costs(cost_string, cost_string_now)
+                 embeddings[key] = candidate_embedding
+
+             candidate_embeddings.append(candidate_embedding)
+
+         save_embeddings(self.embeddings_path, embeddings)
+
+         similarities = cosine_similarity(
+             instruction_embedding, np.vstack(candidate_embeddings)
+         )[0]
+         sorted_indices = np.argsort(similarities)[::-1]
+
+         keys = list(knowledge_base.keys())
+         # Skip the exact-match entry if the instruction itself is already in the KB
+         idx = 1 if keys[sorted_indices[0]] == instruction else 0
+         return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string
+
+     def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str, List[int], str]:
+         """Retrieve similar task experience using embeddings.
+
+         Args:
+             instruction (str): The task instruction
+
+         Returns:
+             Tuple[str, str, List[int], str]: the similar task key, its episodic experience,
+             token usage, and cost string
+         """
+         knowledge_base = load_knowledge_base(self.episodic_memory_path)
+         if not knowledge_base:
+             return "None", "None", [0, 0, 0], ""
+
+         embeddings = load_embeddings(self.embeddings_path)
+
+         # Get or create instruction embedding
+         instruction_embedding = embeddings.get(instruction)
+         total_tokens, cost_string = [0, 0, 0], ""
+
+         if instruction_embedding is None:
+             instruction_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
+                 "embedding", {"str_input": instruction}
+             )
+             embeddings[instruction] = instruction_embedding
+             for i in range(len(total_tokens)):
+                 total_tokens[i] += tokens[i]
+             cost_string = cost_string_now
+
+         # Get or create embeddings for knowledge base entries
+         candidate_embeddings = []
+         for key in knowledge_base:
+             candidate_embedding = embeddings.get(key)
+             if candidate_embedding is None:
+                 candidate_embedding, tokens, cost_string_now = self.embedding_engine.execute_tool(
+                     "embedding", {"str_input": key}
+                 )
+                 for i in range(len(total_tokens)):
+                     total_tokens[i] += tokens[i]
+                 cost_string = CostManager.add_costs(cost_string, cost_string_now)
+                 embeddings[key] = candidate_embedding
+
+             candidate_embeddings.append(candidate_embedding)
+
+         save_embeddings(self.embeddings_path, embeddings)
+
+         similarities = cosine_similarity(
+             instruction_embedding, np.vstack(candidate_embeddings)
+         )[0]
+         sorted_indices = np.argsort(similarities)[::-1]
+
+         keys = list(knowledge_base.keys())
+         # Skip the exact-match entry if the instruction itself is already in the KB
+         idx = 1 if keys[sorted_indices[0]] == instruction else 0
+         return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]], total_tokens, cost_string
+
+     def knowledge_fusion(
+         self,
+         observation: Dict,
+         instruction: str,
+         web_knowledge: str,
+         similar_task: str,
+         experience: str,
+     ) -> Tuple[str, List[int], str]:
+         """Combine web knowledge with similar task experience."""
+         content, total_tokens, cost = self.knowledge_fusion_agent.execute_tool("context_fusion", {
+             "str_input": f"Task: {instruction}\n" +
+             f"**Web search result**:\n{web_knowledge}\n\n" +
+             f"**Retrieved similar task experience**:\n" +
+             f"Similar task:{similar_task}\n{experience}\n\n" +
+             "Based on the web search result and the retrieved similar task experience, " +
+             "if you think the similar task experience is indeed useful to the main task, " +
+             "integrate it with the web search result. Provide the final knowledge in a numbered list.",
+             "img_input": observation["screenshot"] if "screenshot" in observation else None
+         })
+
+         return content, total_tokens, cost
+
+     def save_episodic_memory(self, subtask_key: str, subtask_traj: str) -> Union[str, None]:
+         """Save episodic memory (subtask level knowledge).
+
+         Args:
+             subtask_key (str): Key identifying the subtask
+             subtask_traj (str): Trajectory/experience of the subtask
+
+         Returns:
+             The stored summary for the subtask, or None if saving is disabled.
+         """
+         if not self.save_knowledge:
+             return None
+
+         try:
+             kb = load_knowledge_base(self.episodic_memory_path)
+         except Exception:
+             kb = {}
+
+         if subtask_key not in kb:
+             # summarize_episode returns (summary, token usage, cost); store only the summary text
+             subtask_summarization, _, _ = self.summarize_episode(subtask_traj)
+             kb[subtask_key] = subtask_summarization
+
+             if self.save_knowledge:
+                 os.makedirs(os.path.dirname(self.episodic_memory_path), exist_ok=True)
+                 with open(self.episodic_memory_path, "w") as fout:
+                     json.dump(kb, fout, indent=2)
+
+         return kb.get(subtask_key)
+
+     def save_narrative_memory(self, task_key: str, task_traj: str) -> Union[str, None]:
+         """Save narrative memory (task level knowledge).
+
+         Args:
+             task_key (str): Key identifying the task
+             task_traj (str): Full trajectory/experience of the task
+
+         Returns:
+             The stored summary for the task, or None if saving is disabled.
+         """
+         if not self.save_knowledge:
+             return None
+
+         try:
+             kb = load_knowledge_base(self.narrative_memory_path)
+         except Exception:
+             kb = {}
+
+         if task_key not in kb:
+             # summarize_narrative returns (summary, token usage, cost); store only the summary text
+             task_summarization, _, _ = self.summarize_narrative(task_traj)
+             kb[task_key] = task_summarization
+
+             if self.save_knowledge:
+                 os.makedirs(os.path.dirname(self.narrative_memory_path), exist_ok=True)
+                 with open(self.narrative_memory_path, "w") as fout:
+                     json.dump(kb, fout, indent=2)
+
+         return kb.get(task_key)
+
+     def initialize_task_trajectory(self, instruction: str) -> None:
+         """Initialize a new task trajectory.
+
+         Args:
+             instruction (str): The task instruction
+         """
+         self.task_trajectory = f"Task:\n{instruction}"
+         self.current_search_query = ""
+         self.current_subtask_trajectory = ""
+
+     def update_task_trajectory(self, meta_data: Dict) -> None:
+         """Update the task trajectory with new metadata.
+
+         Args:
+             meta_data (Dict): Metadata from the agent's prediction
+         """
+         if not self.current_search_query and "search_query" in meta_data:
+             self.current_search_query = meta_data["search_query"]
+
+         self.task_trajectory += (
+             "\n\nReflection:\n"
+             + str(meta_data["reflection"])
+             + "\n\n----------------------\n\nPlan:\n"
+             + meta_data["executor_plan"]
+         )
+
+     def handle_subtask_trajectory(self, meta_data: Dict) -> bool:
+         """Handle subtask trajectory updates based on subtask status.
+
+         Args:
+             meta_data (Dict): Metadata containing subtask information
+
+         Returns:
+             bool: Whether the subtask was completed
+         """
+         subtask_status = meta_data["subtask_status"]
+         subtask = meta_data["subtask"]
+         subtask_info = meta_data["subtask_info"]
+
+         if subtask_status in ["Start", "Done"]:
+             # If there's an existing subtask trajectory, finalize it
+             if self.current_subtask_trajectory:
+                 self.current_subtask_trajectory += "\nSubtask Completed.\n"
+                 subtask_key = self.current_subtask_trajectory.split(
+                     "\n----------------------\n\nPlan:\n"
+                 )[0]
+                 self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
+                 self.current_subtask_trajectory = ""
+                 return True
+
+             # Start new subtask trajectory
+             self.current_subtask_trajectory = (
+                 f"Task:\n{self.current_search_query}\n\n"
+                 f"Subtask: {subtask}\n"
+                 f"Subtask Instruction: {subtask_info}\n"
+                 f"----------------------\n\n"
+                 f'Plan:\n{meta_data["executor_plan"]}\n'
+             )
+             return False
+
+         elif subtask_status == "In":
+             # Continue current subtask trajectory
+             self.current_subtask_trajectory += (
+                 f'\n----------------------\n\nPlan:\n{meta_data["executor_plan"]}\n'
+             )
+             return False
+
+     def finalize_task(self) -> None:
+         """Finalize the task by saving any remaining trajectories."""
+         # Save any remaining subtask trajectory
+         if self.current_subtask_trajectory:
+             self.current_subtask_trajectory += "\nSubtask Completed.\n"
+             subtask_key = self.current_subtask_trajectory.split(
+                 "\n----------------------\n\nPlan:\n"
+             )[0]
+             self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)
+
+         # Save the complete task trajectory
+         if self.task_trajectory and self.current_search_query:
+             self.save_narrative_memory(self.current_search_query, self.task_trajectory)
+
+         # Reset trajectories
+         self.task_trajectory = ""
+         self.current_subtask_trajectory = ""
+         self.current_search_query = ""
+
+     def summarize_episode(self, trajectory: str) -> Tuple[str, List[int], str]:
+         """Summarize the episode experience for lifelong learning reflection.
+
+         Args:
+             trajectory (str): The episode experience to be summarized
+
+         Returns:
+             Tuple[str, List[int], str]: the summarized episode experience, token usage, and cost string
+         """
+         # Create reflection on the whole trajectory for the next trial, keeping earlier messages as exemplars
+         content, total_tokens, cost = self.episode_summarization_agent.execute_tool(
+             "episode_summarization", {"str_input": trajectory}
+         )
+
+         return content, total_tokens, cost
+
+     def summarize_narrative(self, trajectory: str) -> Tuple[str, List[int], str]:
+         """Summarize the narrative experience for lifelong learning reflection.
+
+         Args:
+             trajectory (str): The narrative experience to be summarized
+
+         Returns:
+             Tuple[str, List[int], str]: the summarized narrative experience, token usage, and cost string
+         """
+         # Create reflection on the whole trajectory for the next trial
+         content, total_tokens, cost = self.narrative_summarization_agent.execute_tool(
+             "narrative_summarization", {"str_input": trajectory}
+         )
+
+         return content, total_tokens, cost