computer-use-ootb-internal 0.0.188__py3-none-any.whl → 0.0.189__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,237 +1,237 @@
1
- import argparse
2
- import time
3
- import json
4
- import platform
5
- import uuid
6
- import base64
7
- import datetime
8
- from datetime import datetime, timedelta, timezone
9
-
10
- from computer_use_ootb_internal.computer_use_demo.executor.teachmode_executor import TeachmodeExecutor
11
- from computer_use_ootb_internal.computer_use_demo.gui_agent.llm_utils.llm_utils import is_image_path
12
- from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.simple_parser.utils import get_screen_resize_factor
13
- from computer_use_ootb_internal.computer_use_demo.tools.aws_request import send_request_to_server
14
- from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.uia_tools.screenshot_service import get_screenshot_external_cmd
15
-
16
-
17
- utc_plus_8 = timezone(timedelta(hours=8))
18
-
19
-
20
- def simple_teachmode_sampling_loop(
21
- model: str,
22
- task: str,
23
- api_keys: dict = None,
24
- action_history: list[dict] = None,
25
- selected_screen: int = 0,
26
- user_id: str = None,
27
- trace_id: str = None,
28
- server_url: str = "http://localhost:5000/generate_action",
29
- max_steps: int = 20,
30
- full_screen_game_mode: int = 0, # 0: disabled, 1: starrail, 2: starrail browser
31
- ):
32
- """
33
- Synchronous sampling loop for assistant/tool interactions in 'teach mode'.
34
- """
35
- # Initialize action_history if it's None
36
- if action_history is None:
37
- action_history = []
38
-
39
- # if platform.system() != "Windows":
40
- # raise ValueError("Teach mode is only supported on Windows.")
41
-
42
- # # Set StarRail mode based on input parameter
43
- # # 0: disabled, 1: starrail, 2: starrail browser
44
- # full_screen_game_mode = 0
45
-
46
- # # TODO: set full_screen_game_mode adaptively
47
- # if "star_rail" in user_id or "star_rail" in user_id:
48
- # full_screen_game_mode = 1
49
-
50
- # if "star_rail_dev" in trace_id or "star_rail_dev" in user_id or "hero_case" in user_id or "official" in user_id:
51
- # full_screen_game_mode = 2
52
-
53
- print(f"Full Screen Game Mode: {full_screen_game_mode}")
54
- executor = TeachmodeExecutor(
55
- selected_screen=selected_screen,
56
- full_screen_game_mode=full_screen_game_mode,
57
- )
58
-
59
- timestamp = datetime.now(utc_plus_8).strftime("%m%d-%H%M%S")
60
-
61
- step_count = 1
62
- unique_task_id = f"{timestamp}_uid_{user_id}_tid_{trace_id}_{str(uuid.uuid4())[:6]}"
63
-
64
- print("[simple_teachmode_sampling_loop] starting task: ", task)
65
- print(f"[simple_teachmode_sampling_loop] unique_task_id: {unique_task_id}")
66
-
67
-
68
- while step_count < max_steps:
69
-
70
- print(f"step_count: {step_count}")
71
-
72
- # Pause briefly so we don't spam screenshots
73
- time.sleep(1)
74
-
75
- uia_meta, sc_path = get_screenshot_external_cmd(
76
- selected_screen=selected_screen,
77
- capture_uia_data=full_screen_game_mode==0
78
- )
79
-
80
- # yield {"role": "assistant", "content": "screenshot", "type": "action", "action_type": "screenshot"}
81
-
82
- if is_image_path(sc_path):
83
- # yield {"role": "assistant", "content": sc_path, "type": "image", "action_type": "screenshot"}
84
- with open(sc_path, "rb") as image_file:
85
- sc_base64 = base64.b64encode(image_file.read()).decode('utf-8')
86
- yield {"role": "assistant", "content": sc_base64, "type": "image_base64", "action_type": "screenshot"}
87
-
88
- payload = {
89
- "task_id": unique_task_id,
90
- "uia_data": uia_meta,
91
- "screenshot_path": sc_path,
92
- "query": task,
93
- "action_history": action_history,
94
- "mode": "teach",
95
- "user_id": user_id,
96
- "trace_id": trace_id,
97
- "scale_factor": get_screen_resize_factor(),
98
- "os_name": platform.system(),
99
- "api_keys": api_keys,
100
- }
101
-
102
- # Send request to Marbot Run server
103
- infer_server_response = send_request_to_server(payload, server_url)
104
-
105
- # infer_server_response = {
106
- # 'status': 'success',
107
- # 'generated_plan': plan_details,
108
- # 'generated_action': action,
109
- # 'todo_md': todo_md_content,
110
- # 'milestones': milestones,
111
- # 'current_step': current_step,
112
- # }
113
-
114
-
115
- if infer_server_response is None:
116
- print("No response from Marbot Run server. Exiting.")
117
- yield {"role": "assistant", "content": "No response from Marbot Run server. Exiting.", "type": "error"}
118
- action_history = []
119
- break
120
-
121
- try:
122
- step_plan = infer_server_response["generated_plan"]
123
- step_plan_observation = step_plan["observation"]
124
- step_plan_reasoning = step_plan["reasoning"]
125
- step_plan_info = step_plan["step_info"]
126
- step_action = infer_server_response["generated_action"]["content"]
127
- step_traj_idx = infer_server_response["current_traj_step"]
128
-
129
- # chat_visable_content = f"{step_plan_observation}{step_plan_reasoning}"
130
-
131
- except Exception as e:
132
- print("Error parsing generated_action content:", e)
133
- yield {"role": "assistant", "content": "Error parsing response from Marbot Run server. Exiting.", "type": "error"}
134
- break
135
-
136
- yield {"role": "assistant", "content": step_plan_observation, "type": "text"}
137
- yield {"role": "assistant", "content": step_plan_reasoning, "type": "text"}
138
-
139
- if step_action.get("action") == "STOP":
140
- final_sc, final_sc_path = get_screenshot_external_cmd(selected_screen=selected_screen)
141
-
142
- with open(final_sc_path, "rb") as image_file:
143
- final_sc_base64 = base64.b64encode(image_file.read()).decode('utf-8')
144
- yield {"role": "assistant", "content": "Task completed. Final screenshot:", "type": "text"}
145
- yield {"role": "assistant", "content": final_sc_base64, "type": "image_base64", "action_type": "screenshot"}
146
-
147
- # reset action history
148
- action_history = []
149
- break
150
-
151
- action_history.append(f"Executing guidance trajectory step [{step_traj_idx}]: {{Plan: {step_plan_info}, Action: {step_action}}}\n")
152
-
153
- for exec_message in executor({"role": "assistant", "content": step_action}):
154
- yield exec_message
155
-
156
- step_count += 1
157
-
158
- # reset action history
159
- action_history = []
160
-
161
-
162
-
163
- if __name__ == "__main__":
164
- parser = argparse.ArgumentParser(
165
- description="Run a synchronous sampling loop for assistant/tool interactions in teach-mode."
166
- )
167
- parser.add_argument(
168
- "--model",
169
- default="teach-mode",
170
- help="The model to use",
171
- )
172
- parser.add_argument(
173
- "--task",
174
- default="Click on the Google Chorme icon",
175
- help="The task to be completed by the assistant (e.g., 'Complete some data extraction.').",
176
- )
177
- parser.add_argument(
178
- "--selected_screen",
179
- type=int,
180
- default=0,
181
- help="Index of the screen to capture (default=0).",
182
- )
183
- parser.add_argument(
184
- "--user_id",
185
- default="star_rail",
186
- help="User ID for the session (default='liziqi').",
187
- )
188
- parser.add_argument(
189
- "--trace_id",
190
- default="ONG_JING_JIE_007-0213_0",
191
- help="Trace ID for the session (default='default_trace').",
192
- )
193
- parser.add_argument(
194
- "--api_key_file",
195
- default="api_key.json",
196
- help="Path to the JSON file containing API keys (default='api_key.json').",
197
- )
198
- parser.add_argument(
199
- "--max_steps",
200
- type=int,
201
- default=20,
202
- help="The maximum number of steps to take.",
203
- )
204
- parser.add_argument(
205
- "--full_screen_game_mode",
206
- type=int,
207
- default=0,
208
- help="Full screen game mode (0: disabled, 1: starrail, 2: starrail browser)",
209
- )
210
-
211
- args = parser.parse_args()
212
-
213
- # # Load API keys
214
- # with open(args.api_key_file, "r") as file:
215
- # api_keys = json.load(file)
216
- api_keys = None
217
-
218
- print(f"Starting task: {args.task}")
219
-
220
- # Execute the sampling loop
221
- sampling_loop = simple_teachmode_sampling_loop(
222
- model=args.model,
223
- task=args.task,
224
- selected_screen=args.selected_screen,
225
- user_id=args.user_id,
226
- trace_id=args.trace_id,
227
- api_keys=api_keys,
228
- max_steps=args.max_steps,
229
- full_screen_game_mode=args.full_screen_game_mode,
230
- )
231
-
232
- # # Print each step result
233
- for step in sampling_loop:
234
- print(step)
235
- time.sleep(1)
236
-
237
- print(f"Task '{args.task}' completed. Thanks for using Teachmode-OOTB.")
1
+ import argparse
2
+ import time
3
+ import json
4
+ import platform
5
+ import uuid
6
+ import base64
7
+ import datetime
8
+ from datetime import datetime, timedelta, timezone
9
+
10
+ from computer_use_ootb_internal.computer_use_demo.executor.teachmode_executor import TeachmodeExecutor
11
+ from computer_use_ootb_internal.computer_use_demo.gui_agent.llm_utils.llm_utils import is_image_path
12
+ from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.simple_parser.utils import get_screen_resize_factor
13
+ from computer_use_ootb_internal.computer_use_demo.tools.aws_request import send_request_to_server
14
+ from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.uia_tools.screenshot_service import get_screenshot_external_cmd
15
+
16
+
17
+ utc_plus_8 = timezone(timedelta(hours=8))
18
+
19
+
20
+ def simple_teachmode_sampling_loop(
21
+ model: str,
22
+ task: str,
23
+ api_keys: dict = None,
24
+ action_history: list[dict] = None,
25
+ selected_screen: int = 0,
26
+ user_id: str = None,
27
+ trace_id: str = None,
28
+ server_url: str = "http://localhost:5000/generate_action",
29
+ max_steps: int = 20,
30
+ full_screen_game_mode: int = 0, # 0: disabled, 1: starrail, 2: starrail browser
31
+ ):
32
+ """
33
+ Synchronous sampling loop for assistant/tool interactions in 'teach mode'.
34
+ """
35
+ # Initialize action_history if it's None
36
+ if action_history is None:
37
+ action_history = []
38
+
39
+ # if platform.system() != "Windows":
40
+ # raise ValueError("Teach mode is only supported on Windows.")
41
+
42
+ # # Set StarRail mode based on input parameter
43
+ # # 0: disabled, 1: starrail, 2: starrail browser
44
+ # full_screen_game_mode = 0
45
+
46
+ # # TODO: set full_screen_game_mode adaptively
47
+ # if "star_rail" in user_id or "star_rail" in user_id:
48
+ # full_screen_game_mode = 1
49
+
50
+ # if "star_rail_dev" in trace_id or "star_rail_dev" in user_id or "hero_case" in user_id or "official" in user_id:
51
+ # full_screen_game_mode = 2
52
+
53
+ print(f"Full Screen Game Mode: {full_screen_game_mode}")
54
+ executor = TeachmodeExecutor(
55
+ selected_screen=selected_screen,
56
+ full_screen_game_mode=full_screen_game_mode,
57
+ )
58
+
59
+ timestamp = datetime.now(utc_plus_8).strftime("%m%d-%H%M%S")
60
+
61
+ step_count = 1
62
+ unique_task_id = f"{timestamp}_uid_{user_id}_tid_{trace_id}_{str(uuid.uuid4())[:6]}"
63
+
64
+ print("[simple_teachmode_sampling_loop] starting task: ", task)
65
+ print(f"[simple_teachmode_sampling_loop] unique_task_id: {unique_task_id}")
66
+
67
+
68
+ while step_count < max_steps:
69
+
70
+ print(f"step_count: {step_count}")
71
+
72
+ # Pause briefly so we don't spam screenshots
73
+ time.sleep(1)
74
+
75
+ uia_meta, sc_path = get_screenshot_external_cmd(
76
+ selected_screen=selected_screen,
77
+ capture_uia_data=full_screen_game_mode==0
78
+ )
79
+
80
+ # yield {"role": "assistant", "content": "screenshot", "type": "action", "action_type": "screenshot"}
81
+
82
+ if is_image_path(sc_path):
83
+ # yield {"role": "assistant", "content": sc_path, "type": "image", "action_type": "screenshot"}
84
+ with open(sc_path, "rb") as image_file:
85
+ sc_base64 = base64.b64encode(image_file.read()).decode('utf-8')
86
+ yield {"role": "assistant", "content": sc_base64, "type": "image_base64", "action_type": "screenshot"}
87
+
88
+ payload = {
89
+ "task_id": unique_task_id,
90
+ "uia_data": uia_meta,
91
+ "screenshot_path": sc_path,
92
+ "query": task,
93
+ "action_history": action_history,
94
+ "mode": "teach",
95
+ "user_id": user_id,
96
+ "trace_id": trace_id,
97
+ "scale_factor": get_screen_resize_factor(),
98
+ "os_name": platform.system(),
99
+ "api_keys": api_keys,
100
+ }
101
+
102
+ # Send request to Marbot Run server
103
+ infer_server_response = send_request_to_server(payload, server_url)
104
+
105
+ # infer_server_response = {
106
+ # 'status': 'success',
107
+ # 'generated_plan': plan_details,
108
+ # 'generated_action': action,
109
+ # 'todo_md': todo_md_content,
110
+ # 'milestones': milestones,
111
+ # 'current_step': current_step,
112
+ # }
113
+
114
+
115
+ if infer_server_response is None:
116
+ print("No response from Marbot Run server. Exiting.")
117
+ yield {"role": "assistant", "content": "No response from Marbot Run server. Exiting.", "type": "error"}
118
+ action_history = []
119
+ break
120
+
121
+ try:
122
+ step_plan = infer_server_response["generated_plan"]
123
+ step_plan_observation = step_plan["observation"]
124
+ step_plan_reasoning = step_plan["reasoning"]
125
+ step_plan_info = step_plan["step_info"]
126
+ step_action = infer_server_response["generated_action"]["content"]
127
+ step_traj_idx = infer_server_response["current_traj_step"]
128
+
129
+ # chat_visable_content = f"{step_plan_observation}{step_plan_reasoning}"
130
+
131
+ except Exception as e:
132
+ print("Error parsing generated_action content:", e)
133
+ yield {"role": "assistant", "content": "Error parsing response from Marbot Run server. Exiting.", "type": "error"}
134
+ break
135
+
136
+ yield {"role": "assistant", "content": step_plan_observation, "type": "text"}
137
+ yield {"role": "assistant", "content": step_plan_reasoning, "type": "text"}
138
+
139
+ if step_action.get("action") == "STOP":
140
+ final_sc, final_sc_path = get_screenshot_external_cmd(selected_screen=selected_screen)
141
+
142
+ with open(final_sc_path, "rb") as image_file:
143
+ final_sc_base64 = base64.b64encode(image_file.read()).decode('utf-8')
144
+ yield {"role": "assistant", "content": "Task completed. Final screenshot:", "type": "text"}
145
+ yield {"role": "assistant", "content": final_sc_base64, "type": "image_base64", "action_type": "screenshot"}
146
+
147
+ # reset action history
148
+ action_history = []
149
+ break
150
+
151
+ action_history.append(f"Executing guidance trajectory step [{step_traj_idx}]: {{Plan: {step_plan_info}, Action: {step_action}}}\n")
152
+
153
+ for exec_message in executor({"role": "assistant", "content": step_action}):
154
+ yield exec_message
155
+
156
+ step_count += 1
157
+
158
+ # reset action history
159
+ action_history = []
160
+
161
+
162
+
163
+ if __name__ == "__main__":
164
+ parser = argparse.ArgumentParser(
165
+ description="Run a synchronous sampling loop for assistant/tool interactions in teach-mode."
166
+ )
167
+ parser.add_argument(
168
+ "--model",
169
+ default="teach-mode",
170
+ help="The model to use",
171
+ )
172
+ parser.add_argument(
173
+ "--task",
174
+ default="Click on the Google Chorme icon",
175
+ help="The task to be completed by the assistant (e.g., 'Complete some data extraction.').",
176
+ )
177
+ parser.add_argument(
178
+ "--selected_screen",
179
+ type=int,
180
+ default=0,
181
+ help="Index of the screen to capture (default=0).",
182
+ )
183
+ parser.add_argument(
184
+ "--user_id",
185
+ default="star_rail",
186
+ help="User ID for the session (default='liziqi').",
187
+ )
188
+ parser.add_argument(
189
+ "--trace_id",
190
+ default="ONG_JING_JIE_007-0213_0",
191
+ help="Trace ID for the session (default='default_trace').",
192
+ )
193
+ parser.add_argument(
194
+ "--api_key_file",
195
+ default="api_key.json",
196
+ help="Path to the JSON file containing API keys (default='api_key.json').",
197
+ )
198
+ parser.add_argument(
199
+ "--max_steps",
200
+ type=int,
201
+ default=20,
202
+ help="The maximum number of steps to take.",
203
+ )
204
+ parser.add_argument(
205
+ "--full_screen_game_mode",
206
+ type=int,
207
+ default=0,
208
+ help="Full screen game mode (0: disabled, 1: starrail, 2: starrail browser)",
209
+ )
210
+
211
+ args = parser.parse_args()
212
+
213
+ # # Load API keys
214
+ # with open(args.api_key_file, "r") as file:
215
+ # api_keys = json.load(file)
216
+ api_keys = None
217
+
218
+ print(f"Starting task: {args.task}")
219
+
220
+ # Execute the sampling loop
221
+ sampling_loop = simple_teachmode_sampling_loop(
222
+ model=args.model,
223
+ task=args.task,
224
+ selected_screen=args.selected_screen,
225
+ user_id=args.user_id,
226
+ trace_id=args.trace_id,
227
+ api_keys=api_keys,
228
+ max_steps=args.max_steps,
229
+ full_screen_game_mode=args.full_screen_game_mode,
230
+ )
231
+
232
+ # # Print each step result
233
+ for step in sampling_loop:
234
+ print(step)
235
+ time.sleep(1)
236
+
237
+ print(f"Task '{args.task}' completed. Thanks for using Teachmode-OOTB.")