npcsh 1.0.11__py3-none-any.whl → 1.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
npcsh/plonk.py CHANGED
@@ -1,9 +1,17 @@
1
1
  from npcpy.data.image import capture_screenshot
2
2
  import time
3
+ import os
3
4
  import platform
4
5
  from npcpy.llm_funcs import get_llm_response
5
- from npcpy.work.desktop import perform_action, action_space
6
- from PIL import Image, ImageDraw, ImageFont
6
+ from npcpy.work.desktop import perform_action
7
+ import matplotlib.pyplot as plt
8
+ import matplotlib.patches as patches
9
+ from PIL import Image
10
+ import numpy as np
11
+ import imagehash # Using perceptual hashing for smarter screen comparison
12
+ from npcsh._state import NPCSH_VISION_MODEL, NPCSH_VISION_PROVIDER
13
+ import argparse
14
+ from npcpy.npc_compiler import NPC
7
15
 
8
16
  def get_system_examples():
9
17
  system = platform.system()
@@ -14,396 +22,321 @@ def get_system_examples():
14
22
  else:
15
23
  return "Examples: firefox &, gedit &, gnome-calculator &"
16
24
 
17
- def execute_plonk_command(request, action_space, model, provider, npc=None, max_iterations=10, debug=False):
18
- synthesized_summary = []
19
-
20
- """Synthesizes information gathered during the computer use run and logs key data points for
21
- analysis. This function can be extended to store or report the synthesized knowledge as required.
22
- """
23
-
24
- system = platform.system()
25
- system_examples = get_system_examples()
26
-
27
- messages = []
28
- last_action_feedback = "None"
29
- last_click_coords = None
30
-
31
- iteration_count = 0
32
- while iteration_count < max_iterations:
33
- # Gathering summary of actions performed this iteration
34
- synthesized_info = {
35
- 'iteration': iteration_count + 1,
36
- 'last_action_feedback': last_action_feedback,
37
- 'last_click_coords': last_click_coords
38
- }
39
- synthesized_summary.append(synthesized_info)
40
-
41
- if debug:
42
- print(f"Synthesized info at iteration {iteration_count + 1}: {synthesized_info}")
43
-
44
- if debug:
45
- print(f"Iteration {iteration_count + 1}/{max_iterations}")
46
-
47
- # YOUR PROMPT, UNTOUCHED
48
- prompt_template = f"""
49
- Goal: {request}
50
- Feedback from last action: {last_action_feedback}
51
-
52
- Your task is to control the computer to achieve the goal.
53
-
54
- THOUGHT PROCESS:
55
- 1. Analyze the screen. Is the application I need (e.g., a web browser) already open?
56
- 2. If YES, `click` it. If NO, use `bash` to launch it. Use the examples: {system_examples}.
57
-
58
-
59
- CRITICAL COMPLETION RULE:
60
- Once the goal is visually complete on the screen, your ONLY next action is to use the 'quit' action.
61
-
62
- Your response MUST be a JSON object with an "actions" key.
63
- All clicking actions should use percentage coordinates relative
64
- to the screen size, as we will
65
- manually translate them to the proper screen size.
66
- your x and y values for clicks must ALWAYS be between 0 and 100.
67
- The x and y are (0,0) at the TOP LEFT CORNER OF THE SCREEN.
68
- The bottom right corner of the screen is (100,100).
69
- the bottom left corner is (0,100) and the top right corner is (100,0).
70
-
71
-
72
-
73
-
74
- ---
75
- EXAMPLE 1: Task "Create and save a file named 'memo.txt' with the text 'Meeting at 3pm'"
76
- {{
77
- "actions": [
78
- {{ "type": "bash", "command": "gedit &" }},
79
- {{ "type": "wait", "duration": 2 }},
80
- {{'type':'click', 'x': 10, 'y': 30}},
81
- {{ "type": "type", "text": "Meeting at 3pm" }},
82
- {{ "type": "hotkey", "keys": ["ctrl", "s"] }},
83
- {{ "type": "wait", "duration": 1 }},
84
- {{ "type": "type", "text": "memo.txt" }},
85
- {{ "type": "key", "keys": ["enter"] }},
86
- ]
87
- }}
88
- ---
89
- EXAMPLE 2: Task "Search for news about space exploration"
90
- {{
91
- "actions": [
92
- {{ "type": "bash", "command": "firefox &" }},
93
- {{ "type": "wait", "duration": 3 }},
94
- {{ "type": "type", "text": "news about space exploration" }},
95
- {{ "type": "key", "keys": ["enter"] }},
96
- ]
97
- }}
98
-
99
- ---
100
-
101
- Once a task has been verified and completed, your action list should only be
102
- {{
103
- "actions": [
104
- {{ "type": "quit" }}
105
- ]
106
- }}
107
- """
108
-
109
- screenshot_path = capture_screenshot(npc=npc, full=True).get('file_path')
110
- if not screenshot_path:
111
- time.sleep(2)
112
- continue
113
-
114
- image_to_send_path = screenshot_path
115
- if last_click_coords:
116
- try:
117
- img = Image.open(screenshot_path)
118
- draw = ImageDraw.Draw(img)
119
- width, height = img.size
120
- x_pixel = int(last_click_coords['x'] * width / 100)
121
- y_pixel = int(last_click_coords['y'] * height / 100)
122
-
123
- try:
124
- font = ImageFont.truetype("DejaVuSans-Bold.ttf", size=48)
125
- except IOError:
126
- font = ImageFont.load_default()
127
-
128
- draw.text((x_pixel - 8, y_pixel - 12),
129
- f"+{last_click_coords['x'],last_click_coords['y']}",
130
- fill="red",
131
- font=font)
132
-
133
- marked_image_path = "/tmp/marked_screenshot.png"
134
- img.save(marked_image_path)
135
- image_to_send_path = marked_image_path
136
- print(f"Drew marker at ({x_pixel}, {y_pixel}) on new screenshot.")
137
- except Exception as e:
138
- print(f"Failed to draw marker on image: {e}")
139
-
140
- response = get_llm_response(
141
- prompt=prompt_template,
142
- model=model,
143
- provider=provider,
144
- npc=npc,
145
- images=[image_to_send_path],
146
- messages=messages,
147
- format="json",
148
- )
25
def format_plonk_summary(synthesized_summary: list) -> str:
    """Format the per-iteration summary of a plonk session as a markdown report.

    Args:
        synthesized_summary: List of dicts, one per iteration. The keys read
            are ``iteration``, ``last_action_feedback`` and
            ``last_click_coords``; a missing key is rendered as ``N/A``
            (iteration) or ``None`` (the other two).

    Returns:
        A markdown string with one ``### Iteration`` section per entry, or a
        fixed one-line message when the summary is empty or ``None``.
    """
    if not synthesized_summary:
        return "Plonk session ended with no actions performed."

    # Collect the sections and join once instead of growing a string with `+=`.
    sections = ["## Plonk Session Summary\n\n"]
    for info in synthesized_summary:
        iteration = info.get('iteration', 'N/A')
        feedback = info.get('last_action_feedback', 'None')
        coords = info.get('last_click_coords', 'None')
        sections.append(
            f"### Iteration {iteration}\n"
            f"- **Feedback:** {feedback}\n"
            f"- **Last Click:** {coords}\n\n"
        )
    return "".join(sections)
39
+
40
def get_image_hash(image_path):
    """Return a perceptual hash of the image at ``image_path``.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        The result of ``imagehash.phash`` for the image, or ``None`` if the
        image could not be opened or hashed (the error is printed).
    """
    try:
        # Perceptual hash is more robust to minor changes like a blinking cursor.
        # The `with` block closes the underlying file once the hash is computed;
        # this function runs once per loop iteration, so an unclosed handle
        # would otherwise accumulate.
        with Image.open(image_path) as img:
            return imagehash.phash(img)
    except Exception as e:
        # Best-effort: callers treat None as "no hash available".
        print(f"Could not generate image hash: {e}")
        return None
48
+
49
def add_click_vector_trail(image_path, click_history, output_path):
    """Draw the recent click trail on top of a screenshot and save the result.

    Every click is rendered as a numbered circle with a coordinate label, and
    consecutive clicks are connected by arrows.

    Args:
        image_path: Path of the screenshot to annotate.
        click_history: Sequence of dicts with ``'x'`` and ``'y'`` entries. They
            are scaled by ``width / 100`` and ``height / 100``, i.e. treated as
            percentages of the image size.
        output_path: Path the annotated image is written to.

    Returns:
        ``True`` if the annotated image was saved, ``False`` if any step
        failed (the error is printed).
    """
    fig = None
    try:
        # Load the pixels and release the file handle immediately.
        with Image.open(image_path) as img:
            img_array = np.array(img)
        height, width = img_array.shape[:2]

        fig, ax = plt.subplots(1, 1, figsize=(width / 100, height / 100), dpi=100)
        ax.imshow(img_array)

        clicks = list(click_history)
        font_size = max(12, min(width, height) // 80)
        colors = plt.cm.viridis(np.linspace(0.3, 1.0, len(clicks)))
        radius = 25

        # Draw arrows connecting consecutive clicks first so the markers are
        # painted on top of them.
        for start, end in zip(clicks, clicks[1:]):
            x1, y1 = start['x'] * width / 100, start['y'] * height / 100
            x2, y2 = end['x'] * width / 100, end['y'] * height / 100
            ax.annotate('', xy=(x2, y2), xytext=(x1, y1),
                        arrowprops=dict(arrowstyle='->,head_width=0.6,head_length=0.8',
                                        lw=3, color='cyan', alpha=0.9,
                                        shrinkA=25, shrinkB=25))

        # Draw a numbered marker and a coordinate label for every click.
        for i, click in enumerate(clicks):
            x_pixel = int(click['x'] * width / 100)
            y_pixel = int(click['y'] * height / 100)

            circle = patches.Circle((x_pixel, y_pixel), radius=radius,
                                    linewidth=3, edgecolor='white',
                                    facecolor=colors[i], alpha=0.9)
            ax.add_patch(circle)

            # Sequence number inside the circle.
            ax.text(x_pixel, y_pixel, str(i + 1),
                    fontsize=font_size + 4,
                    color='white', weight='bold', ha='center', va='center')

            # Coordinate label to the right of the circle, vertically centered.
            coord_text = f"({click['x']}, {click['y']})"
            ax.text(x_pixel + radius + 5,
                    y_pixel,
                    coord_text,
                    fontsize=font_size,
                    color='white',
                    weight='bold',
                    ha='left', va='center',
                    bbox=dict(boxstyle="round,pad=0.2", facecolor=colors[i],
                              alpha=0.9, edgecolor='white'))

        ax.set_xlim(0, width)
        ax.set_ylim(height, 0)  # inverted y-axis: origin at the top-left corner
        ax.axis('off')
        fig.tight_layout(pad=0)
        fig.savefig(output_path, bbox_inches='tight', pad_inches=0, dpi=100)
        return True
    except Exception as e:
        print(f"Failed to add click trail with matplotlib: {e}")
        return False
    finally:
        # Close this specific figure on every path; otherwise a failure while
        # drawing or saving would leave the figure open for the rest of the
        # session.
        if fig is not None:
            plt.close(fig)
207
110
 
208
- def repl_loop():
209
- print("Assistant REPL - Type your plonk command or 'exit' to quit.")
210
- while True:
211
- user_input = input("Enter your command: ").strip()
212
- if user_input.lower() == 'exit':
213
- print("Exiting REPL. Goodbye!")
214
- break
215
- if not user_input:
216
- continue
217
-
218
- # Run the plonk command and get synthesized summary
219
- synthesized_summary = execute_plonk_command(
220
- request=user_input,
221
- action_space=action_space,
222
- model="gpt-4o-mini",
223
- provider="openai",
224
- max_iterations=8,
225
- debug=True
226
- )
227
-
228
- if synthesized_summary and isinstance(synthesized_summary, list):
229
- print("Command executed with synthesized summary.")
230
- synthesize_and_display_summary(synthesized_summary)
231
- else:
232
- print("Command did not complete within iteration limit or returned no summary.")
233
-
234
-
235
- def execute_plonk_command(request, action_space, model, provider, npc=None, max_iterations=10, debug=False):
236
- """Synthesizes information gathered during the computer use run and logs key data points for
237
- analysis. This function can be extended to store or report the synthesized knowledge as required.
238
- """
239
-
240
- system = platform.system()
111
def execute_plonk_command(request, model, provider, npc=None, plonk_context=None, max_iterations=10, debug=False):
    """Drive the desktop with a vision model in a screenshot/act loop.

    Each iteration captures a screenshot, optionally overlays the recent click
    trail, asks the model for a JSON ``{"actions": [...]}`` response and
    executes the returned actions with ``perform_action``. A ``quit`` action
    from the model is ignored; the loop ends after ``max_iterations`` or when
    the user ends input at the guidance prompt. Pressing Ctrl+C during an
    iteration pauses the loop and lets the user append guidance to the goal.

    Args:
        request: Natural-language goal; user guidance is appended to it.
        model: Model name passed to ``get_llm_response``.
        provider: Provider name passed to ``get_llm_response``.
        npc: Optional NPC passed to ``get_llm_response``.
        plonk_context: Optional text injected into the prompt under an
            "IMPORTANT TEAM CONTEXT" banner.
        max_iterations: Maximum number of loop iterations (a failed screenshot
            or an invalid model response also consumes one).
        debug: When true, print progress and raw model responses.

    Returns:
        A list with one dict per iteration that reached the model-prompting
        stage, holding ``iteration``, ``last_action_feedback`` and
        ``last_click_coords`` as they were at the start of that iteration.
    """
    import tempfile  # local import: only needed for the marked-screenshot path

    HASH_DISTANCE_THRESHOLD = 3
    MAX_CLICK_HISTORY = 6

    system_examples = get_system_examples()
    messages = []
    last_action_feedback = "None"
    synthesized_summary = []

    current_screen_hash = None
    click_history = []

    # Use the platform temp directory instead of a hard-coded "/tmp", which
    # does not exist on every platform.
    marked_image_path = os.path.join(tempfile.gettempdir(), "marked_screenshot.png")

    # Prompt fragments that do not change between iterations are built once.
    context_injection = ""
    if plonk_context:
        context_injection = f"""
        ---
        IMPORTANT TEAM CONTEXT FOR THIS TASK:
        {plonk_context}
        ---
        """

    completion_example_text = """
    {
    "actions": [],
    "status": "Task appears complete. Waiting for user approval to proceed or finish."
    }
    """

    quit_rule_text = 'NEVER include {"type": "quit"} in your actions - the user controls when to stop.'

    # NOTE(review): examples 2 and 4 end the actions list with a trailing
    # comma, which is not valid JSON - confirm whether that is intentional.
    prompt_examples = """
    ---
    EXAMPLE 1: Task "Create and save a file named 'memo.txt' with the text 'Meeting at 3pm'"
    {
    "actions": [
    { "type": "bash", "command": "gedit &" },

    {"type":"click", "x": 10, "y": 30}
    ]
    }
    ---
    EXAMPLE 2: Task "Search for news about space exploration"
    {
    "actions": [
    { "type": "bash", "command": "firefox &" },

    ]
    }
    ---
    EXAMPLE 3: Task "Click the red button on the form"
    {
    "actions": [
    { "type": "click", "x": 75, "y": 45 }
    ]
    }
    ---
    EXAMPLE 4: Task "Open Gmail and draft a reply to most recent email"
    {
    "actions": [
    { "type": "bash", "command": "open -a Safari" },

    ]
    }
    """

    for iteration_count in range(max_iterations):
        try:
            screenshot_info = capture_screenshot(full=True)
            screenshot_path = screenshot_info.get('file_path') if screenshot_info else None

            if not screenshot_path:
                last_action_feedback = "Error: Failed to capture screenshot."
                time.sleep(1)
                continue

            # A perceptual-hash distance above the threshold is treated as a
            # new screen, which invalidates the click trail. get_image_hash
            # returns None on failure; in that case the comparison is skipped
            # and the previous state is kept, because subtracting None from a
            # hash would raise a TypeError that nothing here catches.
            new_screen_hash = get_image_hash(screenshot_path)
            if new_screen_hash is not None:
                if current_screen_hash is None:
                    hash_distance = None
                    screen_changed = True
                else:
                    hash_distance = new_screen_hash - current_screen_hash
                    screen_changed = hash_distance > HASH_DISTANCE_THRESHOLD
                if screen_changed:
                    if debug and hash_distance is not None:
                        print(f"Screen changed (hash distance: {hash_distance}) - resetting click history.")
                    click_history = []
                    current_screen_hash = new_screen_hash

            synthesized_summary.append({
                'iteration': iteration_count + 1,
                'last_action_feedback': last_action_feedback,
                'last_click_coords': click_history[-1] if click_history else None,
            })

            if debug:
                print(f"Iteration {iteration_count + 1}/{max_iterations}")

            # Rebuilt every iteration because the goal (after user guidance)
            # and the feedback from the last action both change.
            prompt_template = f"""
            Goal: {request}
            Feedback from last action: {last_action_feedback}

            {context_injection}

            Your task is to control the computer to achieve the goal.

            IMPORTANT: You should take actions step-by-step and verify each step works before proceeding.
            DO NOT plan all actions at once - take a few actions, then look at the screen again.

            CRITICAL: NEVER use the 'quit' action automatically. Even if the task appears complete,
            continue working or wait for user guidance. The user will decide when to quit.

            THOUGHT PROCESS:
            1. Analyze the screen. Is the application I need (e.g., a web browser) already open?
            2. If YES, `click` it. If NO, use `bash` to launch it. Use the examples: {system_examples}.
            3. Take 2-3 actions maximum, then let me see the screen again to verify progress.
            4. If task appears complete, explain status but DO NOT quit - wait for user direction.

            Your response MUST be a JSON object with an "actions" key.
            All clicking actions should use percentage coordinates relative
            to the screen size.
            The x and y are (0,0) at the TOP LEFT CORNER OF THE SCREEN.

            MAXIMUM 3 ACTIONS PER RESPONSE - then let me see the screen to verify progress.
            Never do more than one click, type, or hotkey event per response. It is important to take a sequence of
            slow actions separated to avoid making mistakes and falling in loops.

            If the task appears complete, you can include an empty actions list and explain:
            {completion_example_text}

            {quit_rule_text}
            """ + prompt_examples

            # Send the annotated screenshot when a click trail exists and could
            # be drawn; otherwise fall back to the raw screenshot.
            image_to_send_path = screenshot_path
            if click_history and add_click_vector_trail(screenshot_path, click_history, marked_image_path):
                image_to_send_path = marked_image_path
                if debug:
                    print(f"Drew click trail with {len(click_history)} points.")

            response = get_llm_response(prompt_template, model=model, provider=provider, npc=npc,
                                        images=[image_to_send_path], messages=messages, format="json")
            messages = response.get("messages", messages)
            response_data = response.get('response')

            if debug:
                print(response_data)

            if not isinstance(response_data, dict) or "actions" not in response_data:
                last_action_feedback = f"Invalid JSON response from model: {response_data}"
                continue

            actions_list = response_data.get("actions", [])
            if not isinstance(actions_list, list):
                last_action_feedback = "Model did not return a list in the 'actions' key."
                continue

            for action in actions_list:
                if debug:
                    print(f"Executing action: {action}")

                # The model output is untrusted: skip entries that are not
                # JSON objects instead of crashing on `.get`.
                if not isinstance(action, dict):
                    last_action_feedback = f"Skipped malformed action (expected a JSON object): {action}"
                    continue

                if action.get("type") == "quit":
                    print("⚠️ Model attempted to quit automatically. Ignoring.")
                    continue

                result = perform_action(action)
                last_action_feedback = result.get("message") or result.get("output")

                if action.get("type") == "click":
                    click_history.append({"x": action.get("x"), "y": action.get("y")})
                    # Keep only the most recent clicks in the trail.
                    if len(click_history) > MAX_CLICK_HISTORY:
                        click_history.pop(0)

                if result.get("status") == "error":
                    last_action_feedback = f"Action failed: {last_action_feedback}"
                    print(f"Action failed, providing feedback to model: {last_action_feedback}")
                    break
                time.sleep(1)

            # Only string statuses are inspected; any other type is ignored
            # rather than crashing on `.lower()`.
            status = response_data.get("status")
            if isinstance(status, str) and "complete" in status.lower():
                print(f"🎯 Model reports: {status}")
                print(" Press Ctrl+C to provide guidance or approval, or let it continue...")

            if not actions_list:
                last_action_feedback = "No actions were returned by the model. Re-evaluating."
                if debug:
                    print(last_action_feedback)

        except KeyboardInterrupt:
            print("\n⚠️ Plonk paused. Provide additional guidance or press Enter to continue.")
            try:
                user_guidance = input("Guidance > ").strip()
            except (EOFError, KeyboardInterrupt):
                # End of input, or a second Ctrl+C at the prompt, ends the
                # session while still returning the summary collected so far.
                print("\nExiting plonk mode.")
                break

            if user_guidance:
                request += f"\n\n---\nUser Guidance: {user_guidance}\n---"
                last_action_feedback = "User provided new guidance to correct the course."
                print(" Guidance received. Resuming with updated instructions...")
            else:
                last_action_feedback = "User paused and resumed without new guidance."
                print("✅ No guidance provided. Resuming...")

    return synthesized_summary
406
313
 
314
def main():
    """Command-line entry point: parse arguments, run a plonk session, print its summary.

    The model and provider are resolved in this order: the command-line
    option, then the loaded NPC's ``model``/``provider`` attribute, then
    ``NPCSH_VISION_MODEL``/``NPCSH_VISION_PROVIDER``.
    """
    parser = argparse.ArgumentParser(description="Execute GUI automation tasks using vision models")
    parser.add_argument("request", help="The task to perform")
    parser.add_argument("--model", help="Model to use")
    parser.add_argument("--provider", help="Provider to use")
    parser.add_argument("--max-iterations", type=int, default=10, help="Maximum iterations")
    parser.add_argument("--debug", action="store_true", help="Enable debug output")
    parser.add_argument("--npc", type=str, default=os.path.expanduser('~/.npcsh/npc_team/plonk.npc'), help="Path to NPC file")

    args = parser.parse_args()

    # Expand "~" once and use the same path for the existence check and for
    # loading, so a user-supplied "~/..." path is not checked in one form and
    # opened in another.
    npc_path = os.path.expanduser(args.npc)
    npc = NPC(file=npc_path) if os.path.exists(npc_path) else None

    # Fall through to the configured vision defaults when there is no NPC or
    # the NPC does not define a model/provider.
    model = args.model or getattr(npc, "model", None) or NPCSH_VISION_MODEL
    provider = args.provider or getattr(npc, "provider", None) or NPCSH_VISION_PROVIDER

    summary = execute_plonk_command(
        request=args.request,
        model=model,
        provider=provider,
        npc=npc,
        max_iterations=args.max_iterations,
        debug=args.debug,
    )

    print(format_plonk_summary(summary))
407
340
 
408
341
  if __name__ == "__main__":
409
- repl_loop()
342
+ main()