computer-use-ootb-internal 0.0.179__py3-none-any.whl → 0.0.180__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,99 @@
1
+ import os
2
+ import platform
3
+ import subprocess
4
+ import logging
5
+ from pathlib import Path
6
+ import time
7
+
8
# Module-level logger shared by the preparation helpers below.
log = logging.getLogger(__name__)


def _find_desktop_dir():
    """Return the first existing desktop directory for the current user, or None.

    Candidates are tried in this order so that any machine on which the
    original hard-coded lookup succeeded still resolves to the same folder:

    1. ``C:/Users/<USERNAME>/Desktop``
    2. ``C:/Documents and Settings/<USERNAME>/Desktop``
    3. ``%USERPROFILE%/Desktop`` (covers profiles on another drive or whose
       folder name differs from ``USERNAME``)
    """
    candidates = []

    username = os.environ.get("USERNAME", "")
    if username:
        log.info(f"Using Windows username: {username}")
        candidates.append(Path(f"C:/Users/{username}/Desktop"))
        candidates.append(Path(f"C:/Documents and Settings/{username}/Desktop"))
    else:
        # Not fatal on its own: USERPROFILE may still point at the profile.
        log.warning("Could not determine Windows username from environment")

    profile_dir = os.environ.get("USERPROFILE", "")
    if profile_dir:
        profile_desktop = Path(profile_dir) / "Desktop"
        if profile_desktop not in candidates:
            candidates.append(profile_desktop)

    for position, candidate in enumerate(candidates):
        if candidate.exists():
            if position:
                log.info(f"Using alternative desktop path: {candidate}")
            return candidate
        log.warning(f"Desktop path not found at: {candidate}")

    log.error("Failed to find user's desktop directory")
    return None


def _close_running_word():
    """Force-terminate running Word processes; problems are logged, never raised."""
    log.info("Attempting to close existing Microsoft Word processes...")
    # Forcefully terminate Word processes by image name.
    kill_cmd = ['taskkill', '/F', '/IM', 'WINWORD.EXE']
    try:
        kill_result = subprocess.run(kill_cmd, capture_output=True, text=True, check=False)
    except FileNotFoundError:
        log.error("Error: 'taskkill' command not found. Make sure it's in the system PATH.")
        return
    except Exception as e:
        # Best-effort step: a failure to kill Word must not abort preparation.
        log.error(f"Error occurred while trying to close Word: {e}", exc_info=True)
        return

    combined_output = f"{kill_result.stdout or ''}{kill_result.stderr or ''}".lower()
    if kill_result.returncode == 0:
        log.info("Successfully sent termination signal to WINWORD.EXE processes.")
    elif kill_result.returncode == 128 or "not found" in combined_output:
        # NOTE(review): taskkill is commonly observed to exit with 128 when no
        # process matches the image name - confirm on the target Windows
        # versions. The English "not found" text match is kept as a fallback
        # because the message itself is localized.
        log.info("No running WINWORD.EXE processes found to close.")
    else:
        log.warning(f"taskkill command finished with return code {kill_result.returncode}. Output: {kill_result.stdout} Stderr: {kill_result.stderr}")

    # Give the terminated processes a moment to exit before continuing.
    time.sleep(2)


def run_preparation(state):
    """Prepare the Word environment on Windows.

    Locates ``template.docx`` on the current user's desktop and kills any
    running ``WINWORD.EXE`` processes. Launching Word with the template is
    currently disabled: the command that would be used is only logged.

    Every failure is logged and swallowed; the function always returns None.

    Args:
        state: Not used by this function.
    """
    if platform.system() != "Windows":
        log.warning("Word preparation skipped: Not running on Windows.")
        return

    log.info("Word preparation: Starting on Windows platform...")

    try:
        try:
            desktop_path = _find_desktop_dir()
        except OSError as e:
            log.error(f"Error determining Windows user desktop: {e}", exc_info=True)
            return
        if desktop_path is None:
            return

        # Construct path to template file
        template_file = desktop_path / "template.docx"
        log.info(f"Looking for template file at: {template_file}")
        if not template_file.exists():
            log.error(f"Template file not found at: {template_file}")
            return

        _close_running_word()

        log.info(f"Attempting to open {template_file} with Word maximized on Windows...")
        # `start /max` would open the document in a maximized Word window.
        cmd = ['cmd', '/c', 'start', '/max', 'winword', str(template_file)]
        # TODO: the launch itself is temporarily disabled; only the command is
        # logged. Re-enable with subprocess.run(cmd, check=False,
        # capture_output=True, text=True) and log a non-zero return code.
        log.info(f"(Skipped) Would open file with command: {' '.join(cmd)}")

    except Exception as e:
        # Top-level boundary: preparation must never propagate an exception.
        log.error(f"An unexpected error occurred during Word preparation: {e}", exc_info=True)
@@ -1,237 +1,237 @@
1
- import argparse
2
- import time
3
- import json
4
- import platform
5
- import uuid
6
- import base64
7
- import datetime
8
- from datetime import datetime, timedelta, timezone
9
-
10
- from computer_use_ootb_internal.computer_use_demo.executor.teachmode_executor import TeachmodeExecutor
11
- from computer_use_ootb_internal.computer_use_demo.gui_agent.llm_utils.llm_utils import is_image_path
12
- from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.simple_parser.utils import get_screen_resize_factor
13
- from computer_use_ootb_internal.computer_use_demo.tools.aws_request import send_request_to_server
14
- from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.uia_tools.screenshot_service import get_screenshot_external_cmd
15
-
16
-
17
# Fixed UTC+8 offset used to timestamp the generated task id.
utc_plus_8 = timezone(timedelta(hours=8))


def simple_teachmode_sampling_loop(
    model: str,
    task: str,
    api_keys: dict = None,
    action_history: list[dict] = None,
    selected_screen: int = 0,
    user_id: str = None,
    trace_id: str = None,
    server_url: str = "http://localhost:5000/generate_action",
    max_steps: int = 20,
    full_screen_game_mode: int = 0,  # 0: disabled, 1: starrail, 2: starrail browser
):
    """Synchronous sampling loop for assistant/tool interactions in 'teach mode'.

    This is a generator. Each step captures a screenshot, posts it with the
    task and the action history to ``server_url``, yields the returned
    observation and reasoning, and runs the returned action through a
    ``TeachmodeExecutor``, re-yielding the executor's messages. The loop ends
    when the server returns nothing, the response cannot be parsed, the action
    is ``STOP``, or ``max_steps`` steps have been taken.

    Args:
        model: Not used by this function.
        task: Task description, sent to the server as ``query``.
        api_keys: Forwarded to the server unchanged.
        action_history: List that one summary string per executed step is
            appended to, in place. A new list is used when None. The caller's
            list is never cleared by this function.
        selected_screen: Screen index passed to the screenshot helper and the
            executor.
        user_id: Included in the payload and in the generated task id.
        trace_id: Included in the payload and in the generated task id.
        server_url: Endpoint the payload is sent to.
        max_steps: Maximum number of steps to take.
        full_screen_game_mode: Passed to the executor; UIA data is captured
            only when this is 0.

    Yields:
        dict: Messages with ``role``, ``content`` and ``type`` keys
        (``image_base64``, ``text`` or ``error``), plus whatever the executor
        yields.
    """
    if action_history is None:
        action_history = []

    print(f"Full Screen Game Mode: {full_screen_game_mode}")
    executor = TeachmodeExecutor(
        selected_screen=selected_screen,
        full_screen_game_mode=full_screen_game_mode,
    )

    timestamp = datetime.now(utc_plus_8).strftime("%m%d-%H%M%S")
    unique_task_id = f"{timestamp}_uid_{user_id}_tid_{trace_id}_{str(uuid.uuid4())[:6]}"

    print("[simple_teachmode_sampling_loop] starting task: ", task)
    print(f"[simple_teachmode_sampling_loop] unique_task_id: {unique_task_id}")

    # Steps are numbered from 1; the inclusive upper bound makes the loop take
    # up to `max_steps` steps (the previous `<` comparison stopped one short).
    for step_count in range(1, max_steps + 1):
        print(f"step_count: {step_count}")

        # Pause briefly so we don't spam screenshots
        time.sleep(1)

        uia_meta, sc_path = get_screenshot_external_cmd(
            selected_screen=selected_screen,
            capture_uia_data=full_screen_game_mode == 0,
        )

        if is_image_path(sc_path):
            with open(sc_path, "rb") as image_file:
                sc_base64 = base64.b64encode(image_file.read()).decode('utf-8')
            yield {"role": "assistant", "content": sc_base64, "type": "image_base64", "action_type": "screenshot"}

        payload = {
            "task_id": unique_task_id,
            "uia_data": uia_meta,
            "screenshot_path": sc_path,
            "query": task,
            "action_history": action_history,
            "mode": "teach",
            "user_id": user_id,
            "trace_id": trace_id,
            "scale_factor": get_screen_resize_factor(),
            "os_name": platform.system(),
            "api_keys": api_keys,
        }

        # Send request to Marbot Run server
        infer_server_response = send_request_to_server(payload, server_url)

        if infer_server_response is None:
            print("No response from Marbot Run server. Exiting.")
            yield {"role": "assistant", "content": "No response from Marbot Run server. Exiting.", "type": "error"}
            break

        # Keys read from the response: generated_plan (observation, reasoning,
        # step_info), generated_action.content and current_traj_step.
        try:
            step_plan = infer_server_response["generated_plan"]
            step_plan_observation = step_plan["observation"]
            step_plan_reasoning = step_plan["reasoning"]
            step_plan_info = step_plan["step_info"]
            step_action = infer_server_response["generated_action"]["content"]
            step_traj_idx = infer_server_response["current_traj_step"]
            # Evaluated here so a malformed action is reported as a parse
            # error instead of raising out of the generator.
            stop_requested = step_action.get("action") == "STOP"
        except (LookupError, TypeError, AttributeError) as e:
            print("Error parsing generated_action content:", e)
            yield {"role": "assistant", "content": "Error parsing response from Marbot Run server. Exiting.", "type": "error"}
            break

        yield {"role": "assistant", "content": step_plan_observation, "type": "text"}
        yield {"role": "assistant", "content": step_plan_reasoning, "type": "text"}

        if stop_requested:
            _, final_sc_path = get_screenshot_external_cmd(selected_screen=selected_screen)

            # Same guard as the per-step screenshot above: only read the file
            # when the helper reports a usable image path.
            if is_image_path(final_sc_path):
                with open(final_sc_path, "rb") as image_file:
                    final_sc_base64 = base64.b64encode(image_file.read()).decode('utf-8')
                yield {"role": "assistant", "content": "Task completed. Final screenshot:", "type": "text"}
                yield {"role": "assistant", "content": final_sc_base64, "type": "image_base64", "action_type": "screenshot"}
            else:
                yield {"role": "assistant", "content": "Task completed.", "type": "text"}
            break

        action_history.append(f"Executing guidance trajectory step [{step_traj_idx}]: {{Plan: {step_plan_info}, Action: {step_action}}}\n")

        for exec_message in executor({"role": "assistant", "content": step_action}):
            yield exec_message
160
-
161
-
162
-
163
def build_arg_parser():
    """Build the command-line parser used when this module runs as a script.

    Defaults shown in the help text are rendered with argparse's
    ``%(default)s`` so the documented value cannot drift from the real one.
    """
    parser = argparse.ArgumentParser(
        description="Run a synchronous sampling loop for assistant/tool interactions in teach-mode."
    )
    parser.add_argument(
        "--model",
        default="teach-mode",
        help="The model to use",
    )
    parser.add_argument(
        "--task",
        # NOTE(review): "Chorme" looks like a typo for "Chrome"; left as is so
        # the CLI default does not change.
        default="Click on the Google Chorme icon",
        help="The task to be completed by the assistant (e.g., 'Complete some data extraction.').",
    )
    parser.add_argument(
        "--selected_screen",
        type=int,
        default=0,
        help="Index of the screen to capture (default=0).",
    )
    parser.add_argument(
        "--user_id",
        default="star_rail",
        help="User ID for the session (default='%(default)s').",
    )
    parser.add_argument(
        "--trace_id",
        default="ONG_JING_JIE_007-0213_0",
        help="Trace ID for the session (default='%(default)s').",
    )
    parser.add_argument(
        "--api_key_file",
        default="api_key.json",
        help="Path to the JSON file containing API keys (default='%(default)s').",
    )
    parser.add_argument(
        "--max_steps",
        type=int,
        default=20,
        help="The maximum number of steps to take.",
    )
    parser.add_argument(
        "--full_screen_game_mode",
        type=int,
        default=0,
        help="Full screen game mode (0: disabled, 1: starrail, 2: starrail browser)",
    )
    return parser


def main(argv=None):
    """Parse arguments, run the teach-mode loop and print every yielded step.

    Args:
        argv: Argument list to parse; None makes argparse read ``sys.argv``.
    """
    args = build_arg_parser().parse_args(argv)

    # API key loading is disabled: --api_key_file is parsed but never read,
    # and no keys are passed to the loop.
    api_keys = None

    print(f"Starting task: {args.task}")

    # Execute the sampling loop
    sampling_loop = simple_teachmode_sampling_loop(
        model=args.model,
        task=args.task,
        selected_screen=args.selected_screen,
        user_id=args.user_id,
        trace_id=args.trace_id,
        api_keys=api_keys,
        max_steps=args.max_steps,
        full_screen_game_mode=args.full_screen_game_mode,
    )

    # Print each step result
    for step in sampling_loop:
        print(step)
        time.sleep(1)

    print(f"Task '{args.task}' completed. Thanks for using Teachmode-OOTB.")


if __name__ == "__main__":
    main()
1
+ import argparse
2
+ import time
3
+ import json
4
+ import platform
5
+ import uuid
6
+ import base64
7
+ import datetime
8
+ from datetime import datetime, timedelta, timezone
9
+
10
+ from computer_use_ootb_internal.computer_use_demo.executor.teachmode_executor import TeachmodeExecutor
11
+ from computer_use_ootb_internal.computer_use_demo.gui_agent.llm_utils.llm_utils import is_image_path
12
+ from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.simple_parser.utils import get_screen_resize_factor
13
+ from computer_use_ootb_internal.computer_use_demo.tools.aws_request import send_request_to_server
14
+ from computer_use_ootb_internal.computer_use_demo.gui_agent.gui_parser.uia_tools.screenshot_service import get_screenshot_external_cmd
15
+
16
+
17
# Fixed UTC+8 offset used to timestamp the generated task id.
utc_plus_8 = timezone(timedelta(hours=8))


def simple_teachmode_sampling_loop(
    model: str,
    task: str,
    api_keys: dict = None,
    action_history: list[dict] = None,
    selected_screen: int = 0,
    user_id: str = None,
    trace_id: str = None,
    server_url: str = "http://localhost:5000/generate_action",
    max_steps: int = 20,
    full_screen_game_mode: int = 0,  # 0: disabled, 1: starrail, 2: starrail browser
):
    """Synchronous sampling loop for assistant/tool interactions in 'teach mode'.

    This is a generator. Each step captures a screenshot, posts it with the
    task and the action history to ``server_url``, yields the returned
    observation and reasoning, and runs the returned action through a
    ``TeachmodeExecutor``, re-yielding the executor's messages. The loop ends
    when the server returns nothing, the response cannot be parsed, the action
    is ``STOP``, or ``max_steps`` steps have been taken.

    Args:
        model: Not used by this function.
        task: Task description, sent to the server as ``query``.
        api_keys: Forwarded to the server unchanged.
        action_history: List that one summary string per executed step is
            appended to, in place. A new list is used when None. The caller's
            list is never cleared by this function.
        selected_screen: Screen index passed to the screenshot helper and the
            executor.
        user_id: Included in the payload and in the generated task id.
        trace_id: Included in the payload and in the generated task id.
        server_url: Endpoint the payload is sent to.
        max_steps: Maximum number of steps to take.
        full_screen_game_mode: Passed to the executor; UIA data is captured
            only when this is 0.

    Yields:
        dict: Messages with ``role``, ``content`` and ``type`` keys
        (``image_base64``, ``text`` or ``error``), plus whatever the executor
        yields.
    """
    if action_history is None:
        action_history = []

    print(f"Full Screen Game Mode: {full_screen_game_mode}")
    executor = TeachmodeExecutor(
        selected_screen=selected_screen,
        full_screen_game_mode=full_screen_game_mode,
    )

    timestamp = datetime.now(utc_plus_8).strftime("%m%d-%H%M%S")
    unique_task_id = f"{timestamp}_uid_{user_id}_tid_{trace_id}_{str(uuid.uuid4())[:6]}"

    print("[simple_teachmode_sampling_loop] starting task: ", task)
    print(f"[simple_teachmode_sampling_loop] unique_task_id: {unique_task_id}")

    # Steps are numbered from 1; the inclusive upper bound makes the loop take
    # up to `max_steps` steps (the previous `<` comparison stopped one short).
    for step_count in range(1, max_steps + 1):
        print(f"step_count: {step_count}")

        # Pause briefly so we don't spam screenshots
        time.sleep(1)

        uia_meta, sc_path = get_screenshot_external_cmd(
            selected_screen=selected_screen,
            capture_uia_data=full_screen_game_mode == 0,
        )

        if is_image_path(sc_path):
            with open(sc_path, "rb") as image_file:
                sc_base64 = base64.b64encode(image_file.read()).decode('utf-8')
            yield {"role": "assistant", "content": sc_base64, "type": "image_base64", "action_type": "screenshot"}

        payload = {
            "task_id": unique_task_id,
            "uia_data": uia_meta,
            "screenshot_path": sc_path,
            "query": task,
            "action_history": action_history,
            "mode": "teach",
            "user_id": user_id,
            "trace_id": trace_id,
            "scale_factor": get_screen_resize_factor(),
            "os_name": platform.system(),
            "api_keys": api_keys,
        }

        # Send request to Marbot Run server
        infer_server_response = send_request_to_server(payload, server_url)

        if infer_server_response is None:
            print("No response from Marbot Run server. Exiting.")
            yield {"role": "assistant", "content": "No response from Marbot Run server. Exiting.", "type": "error"}
            break

        # Keys read from the response: generated_plan (observation, reasoning,
        # step_info), generated_action.content and current_traj_step.
        try:
            step_plan = infer_server_response["generated_plan"]
            step_plan_observation = step_plan["observation"]
            step_plan_reasoning = step_plan["reasoning"]
            step_plan_info = step_plan["step_info"]
            step_action = infer_server_response["generated_action"]["content"]
            step_traj_idx = infer_server_response["current_traj_step"]
            # Evaluated here so a malformed action is reported as a parse
            # error instead of raising out of the generator.
            stop_requested = step_action.get("action") == "STOP"
        except (LookupError, TypeError, AttributeError) as e:
            print("Error parsing generated_action content:", e)
            yield {"role": "assistant", "content": "Error parsing response from Marbot Run server. Exiting.", "type": "error"}
            break

        yield {"role": "assistant", "content": step_plan_observation, "type": "text"}
        yield {"role": "assistant", "content": step_plan_reasoning, "type": "text"}

        if stop_requested:
            _, final_sc_path = get_screenshot_external_cmd(selected_screen=selected_screen)

            # Same guard as the per-step screenshot above: only read the file
            # when the helper reports a usable image path.
            if is_image_path(final_sc_path):
                with open(final_sc_path, "rb") as image_file:
                    final_sc_base64 = base64.b64encode(image_file.read()).decode('utf-8')
                yield {"role": "assistant", "content": "Task completed. Final screenshot:", "type": "text"}
                yield {"role": "assistant", "content": final_sc_base64, "type": "image_base64", "action_type": "screenshot"}
            else:
                yield {"role": "assistant", "content": "Task completed.", "type": "text"}
            break

        action_history.append(f"Executing guidance trajectory step [{step_traj_idx}]: {{Plan: {step_plan_info}, Action: {step_action}}}\n")

        for exec_message in executor({"role": "assistant", "content": step_action}):
            yield exec_message
160
+
161
+
162
+
163
def build_arg_parser():
    """Build the command-line parser used when this module runs as a script.

    Defaults shown in the help text are rendered with argparse's
    ``%(default)s`` so the documented value cannot drift from the real one.
    """
    parser = argparse.ArgumentParser(
        description="Run a synchronous sampling loop for assistant/tool interactions in teach-mode."
    )
    parser.add_argument(
        "--model",
        default="teach-mode",
        help="The model to use",
    )
    parser.add_argument(
        "--task",
        # NOTE(review): "Chorme" looks like a typo for "Chrome"; left as is so
        # the CLI default does not change.
        default="Click on the Google Chorme icon",
        help="The task to be completed by the assistant (e.g., 'Complete some data extraction.').",
    )
    parser.add_argument(
        "--selected_screen",
        type=int,
        default=0,
        help="Index of the screen to capture (default=0).",
    )
    parser.add_argument(
        "--user_id",
        default="star_rail",
        help="User ID for the session (default='%(default)s').",
    )
    parser.add_argument(
        "--trace_id",
        default="ONG_JING_JIE_007-0213_0",
        help="Trace ID for the session (default='%(default)s').",
    )
    parser.add_argument(
        "--api_key_file",
        default="api_key.json",
        help="Path to the JSON file containing API keys (default='%(default)s').",
    )
    parser.add_argument(
        "--max_steps",
        type=int,
        default=20,
        help="The maximum number of steps to take.",
    )
    parser.add_argument(
        "--full_screen_game_mode",
        type=int,
        default=0,
        help="Full screen game mode (0: disabled, 1: starrail, 2: starrail browser)",
    )
    return parser


def main(argv=None):
    """Parse arguments, run the teach-mode loop and print every yielded step.

    Args:
        argv: Argument list to parse; None makes argparse read ``sys.argv``.
    """
    args = build_arg_parser().parse_args(argv)

    # API key loading is disabled: --api_key_file is parsed but never read,
    # and no keys are passed to the loop.
    api_keys = None

    print(f"Starting task: {args.task}")

    # Execute the sampling loop
    sampling_loop = simple_teachmode_sampling_loop(
        model=args.model,
        task=args.task,
        selected_screen=args.selected_screen,
        user_id=args.user_id,
        trace_id=args.trace_id,
        api_keys=api_keys,
        max_steps=args.max_steps,
        full_screen_game_mode=args.full_screen_game_mode,
    )

    # Print each step result
    for step in sampling_loop:
        print(step)
        time.sleep(1)

    print(f"Task '{args.task}' completed. Thanks for using Teachmode-OOTB.")


if __name__ == "__main__":
    main()