npcsh 1.0.11__py3-none-any.whl → 1.0.13__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that public registry.
- npcsh/_state.py +89 -1
- npcsh/alicanto.py +22 -7
- npcsh/npcsh.py +434 -492
- npcsh/plonk.py +300 -367
- npcsh/routes.py +367 -162
- npcsh/spool.py +162 -221
- npcsh-1.0.13.dist-info/METADATA +775 -0
- npcsh-1.0.13.dist-info/RECORD +21 -0
- npcsh-1.0.11.dist-info/METADATA +0 -596
- npcsh-1.0.11.dist-info/RECORD +0 -21
- {npcsh-1.0.11.dist-info → npcsh-1.0.13.dist-info}/WHEEL +0 -0
- {npcsh-1.0.11.dist-info → npcsh-1.0.13.dist-info}/entry_points.txt +0 -0
- {npcsh-1.0.11.dist-info → npcsh-1.0.13.dist-info}/licenses/LICENSE +0 -0
- {npcsh-1.0.11.dist-info → npcsh-1.0.13.dist-info}/top_level.txt +0 -0
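The largest change in this release is the rewrite of npcsh/plonk.py shown below: the GUI-automation loop now compares successive screenshots with a perceptual hash, resets its click history only when the screen actually changes, and overlays the recent click trail on the screenshot sent to the vision model. A minimal sketch of that screen-change check, assuming only the imagehash and Pillow packages from the new import list (the helper name and return shape here are illustrative and mirror the diff rather than quote it):

# Illustrative sketch of the perceptual-hash screen-change check used in the diff below.
from PIL import Image
import imagehash

HASH_DISTANCE_THRESHOLD = 3  # same tolerance as in the diff

def screen_changed(previous_hash, screenshot_path):
    """Return (changed, new_hash); phash tolerates tiny changes like a blinking cursor."""
    new_hash = imagehash.phash(Image.open(screenshot_path))
    if previous_hash is None:
        return True, new_hash
    # Subtracting two ImageHash objects yields their Hamming distance.
    return (new_hash - previous_hash) > HASH_DISTANCE_THRESHOLD, new_hash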
npcsh/plonk.py
CHANGED
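For orientation before the line-by-line diff: the rewritten module keeps execute_plonk_command() as the programmatic entry point (now taking model, provider, an optional NPC, and an optional plonk_context) and adds an argparse-based main(), so the file can also be run directly, e.g. python -m npcsh.plonk "open firefox" --debug (the console-script names in entry_points.txt are unchanged in this release). A hypothetical programmatic call; the model and provider values are placeholders, not defaults mandated by the package:

# Hypothetical usage of the new signature shown in the diff below.
from npcsh.plonk import execute_plonk_command, format_plonk_summary

summary = execute_plonk_command(
    request="open a text editor and type hello",
    model="gpt-4o-mini",   # placeholder
    provider="openai",     # placeholder
    max_iterations=5,
    debug=True,
)
print(format_plonk_summary(summary))

The full diff follows.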
@@ -1,9 +1,17 @@
 from npcpy.data.image import capture_screenshot
 import time
+import os
 import platform
 from npcpy.llm_funcs import get_llm_response
-from npcpy.work.desktop import perform_action
-
+from npcpy.work.desktop import perform_action
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from PIL import Image
+import numpy as np
+import imagehash  # Using perceptual hashing for smarter screen comparison
+from npcsh._state import NPCSH_VISION_MODEL, NPCSH_VISION_PROVIDER
+import argparse
+from npcpy.npc_compiler import NPC
 
 def get_system_examples():
     system = platform.system()
@@ -14,396 +22,321 @@ def get_system_examples():
     else:
         return "Examples: firefox &, gedit &, gnome-calculator &"
 
-def
-
-
-
-    analysis. This function can be extended to store or report the synthesized knowledge as required.
-    """
-
-    system = platform.system()
-    system_examples = get_system_examples()
-
-    messages = []
-    last_action_feedback = "None"
-    last_click_coords = None
-
-    iteration_count = 0
-    while iteration_count < max_iterations:
-        # Gathering summary of actions performed this iteration
-        synthesized_info = {
-            'iteration': iteration_count + 1,
-            'last_action_feedback': last_action_feedback,
-            'last_click_coords': last_click_coords
-        }
-        synthesized_summary.append(synthesized_info)
-
-        if debug:
-            print(f"Synthesized info at iteration {iteration_count + 1}: {synthesized_info}")
-
-        if debug:
-            print(f"Iteration {iteration_count + 1}/{max_iterations}")
-
-        # YOUR PROMPT, UNTOUCHED
-        prompt_template = f"""
-        Goal: {request}
-        Feedback from last action: {last_action_feedback}
-
-        Your task is to control the computer to achieve the goal.
-
-        THOUGHT PROCESS:
-        1. Analyze the screen. Is the application I need (e.g., a web browser) already open?
-        2. If YES, `click` it. If NO, use `bash` to launch it. Use the examples: {system_examples}.
-
-
-        CRITICAL COMPLETION RULE:
-        Once the goal is visually complete on the screen, your ONLY next action is to use the 'quit' action.
-
-        Your response MUST be a JSON object with an "actions" key.
-        All clicking actions should use percentage coordinates relative
-        to the screen size, as we will
-        manually translate them to the proper screen size.
-        your x and y values for clicks must ALWAYS be between 0 and 100.
-        The x and y are (0,0) at the TOP LEFT CORNER OF THE SCREEN.
-        The bottom right corner of the screen is (100,100).
-        the bottom left corner is (0,100) and the top right corner is (100,0).
-
-
-
-
-        ---
-        EXAMPLE 1: Task "Create and save a file named 'memo.txt' with the text 'Meeting at 3pm'"
-        {{
-          "actions": [
-            {{ "type": "bash", "command": "gedit &" }},
-            {{ "type": "wait", "duration": 2 }},
-            {{'type':'click', 'x': 10, 'y': 30}},
-            {{ "type": "type", "text": "Meeting at 3pm" }},
-            {{ "type": "hotkey", "keys": ["ctrl", "s"] }},
-            {{ "type": "wait", "duration": 1 }},
-            {{ "type": "type", "text": "memo.txt" }},
-            {{ "type": "key", "keys": ["enter"] }},
-          ]
-        }}
-        ---
-        EXAMPLE 2: Task "Search for news about space exploration"
-        {{
-          "actions": [
-            {{ "type": "bash", "command": "firefox &" }},
-            {{ "type": "wait", "duration": 3 }},
-            {{ "type": "type", "text": "news about space exploration" }},
-            {{ "type": "key", "keys": ["enter"] }},
-          ]
-        }}
-
-        ---
-
-        Once a task has been verified and completed, your action list should only be
-        {{
-          "actions": [
-            {{ "type": "quit" }}
-          ]
-        }}
-        """
-
-        screenshot_path = capture_screenshot(npc=npc, full=True).get('file_path')
-        if not screenshot_path:
-            time.sleep(2)
-            continue
-
-        image_to_send_path = screenshot_path
-        if last_click_coords:
-            try:
-                img = Image.open(screenshot_path)
-                draw = ImageDraw.Draw(img)
-                width, height = img.size
-                x_pixel = int(last_click_coords['x'] * width / 100)
-                y_pixel = int(last_click_coords['y'] * height / 100)
-
-                try:
-                    font = ImageFont.truetype("DejaVuSans-Bold.ttf", size=48)
-                except IOError:
-                    font = ImageFont.load_default()
-
-                draw.text((x_pixel - 8, y_pixel - 12),
-                          f"+{last_click_coords['x'],last_click_coords['y']}",
-                          fill="red",
-                          font=font)
-
-                marked_image_path = "/tmp/marked_screenshot.png"
-                img.save(marked_image_path)
-                image_to_send_path = marked_image_path
-                print(f"Drew marker at ({x_pixel}, {y_pixel}) on new screenshot.")
-            except Exception as e:
-                print(f"Failed to draw marker on image: {e}")
-
-        response = get_llm_response(
-            prompt=prompt_template,
-            model=model,
-            provider=provider,
-            npc=npc,
-            images=[image_to_send_path],
-            messages=messages,
-            format="json",
-        )
+def format_plonk_summary(synthesized_summary: list) -> str:
+    """Formats the summary of a plonk session into a readable markdown report."""
+    if not synthesized_summary:
+        return "Plonk session ended with no actions performed."
 
-
-
+    output = "## Plonk Session Summary\n\n"
+    for info in synthesized_summary:
+        iteration = info.get('iteration', 'N/A')
+        feedback = info.get('last_action_feedback', 'None')
+        coords = info.get('last_click_coords', 'None')
+        output += f"### Iteration {iteration}\n"
+        output += f"- **Feedback:** {feedback}\n"
+        output += f"- **Last Click:** {coords}\n\n"
+    return output
+
+def get_image_hash(image_path):
+    """Generate a perceptual hash of the image to detect screen changes intelligently."""
+    try:
+        # Perceptual hash is more robust to minor changes like a blinking cursor
+        return imagehash.phash(Image.open(image_path))
+    except Exception as e:
+        print(f"Could not generate image hash: {e}")
+        return None
+
+def add_click_vector_trail(image_path, click_history, output_path):
+    """Add click markers showing the progression/trail of clicks with arrows and numbers."""
+    try:
+        img = Image.open(image_path)
+        img_array = np.array(img)
+        height, width = img_array.shape[:2]
 
-
-
-        if not isinstance(response_data, dict) or "actions" not in response_data:
-            last_action_feedback = f"Invalid JSON response from model: {response_data}"
-            continue
-
-        actions_list = response_data.get("actions", [])
+        fig, ax = plt.subplots(1, 1, figsize=(width/100, height/100), dpi=100)
+        ax.imshow(img_array)
 
-
-
-            continue
+        font_size = max(12, min(width, height) // 80)
+        colors = plt.cm.viridis(np.linspace(0.3, 1.0, len(click_history)))
 
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-                last_click_coords = {"x": action.get("x"), "y": action.get("y")}
+        # Draw arrows connecting clicks first
+        if len(click_history) > 1:
+            for i in range(len(click_history) - 1):
+                x1, y1 = (click_history[i]['x'] * width / 100, click_history[i]['y'] * height / 100)
+                x2, y2 = (click_history[i+1]['x'] * width / 100, click_history[i+1]['y'] * height / 100)
+                ax.annotate('', xy=(x2, y2), xytext=(x1, y1),
+                            arrowprops=dict(arrowstyle='->,head_width=0.6,head_length=0.8',
+                                            lw=3, color='cyan', alpha=0.9, shrinkA=25, shrinkB=25))
+
+        # Draw numbered points and coordinate labels for ALL points
+        for i, click in enumerate(click_history):
+            x_pixel = int(click['x'] * width / 100)
+            y_pixel = int(click['y'] * height / 100)
 
-
-
-
-
+            radius = 25
+            circle = patches.Circle((x_pixel, y_pixel), radius=radius,
+                                    linewidth=3, edgecolor='white',
+                                    facecolor=colors[i], alpha=0.9)
+            ax.add_patch(circle)
+
+            # Draw the number inside the circle
+            ax.text(x_pixel, y_pixel, str(i+1),
+                    fontsize=font_size + 4,
+                    color='white', weight='bold', ha='center', va='center')
+
+            # FIXED: Draw the coordinate text label for EVERY point
+            coord_text = f"({click['x']}, {click['y']})"
+            ax.text(x_pixel + radius + 5,  # Position text to the right of the circle
+                    y_pixel,  # Vertically centered with the circle
+                    coord_text,
+                    fontsize=font_size,
+                    color='white',
+                    weight='bold',
+                    ha='left', va='center',
+                    bbox=dict(boxstyle="round,pad=0.2", facecolor=colors[i],
+                              alpha=0.9, edgecolor='white'))
 
-
-
-
+        ax.set_xlim(0, width)
+        ax.set_ylim(height, 0)
+        ax.axis('off')
+        plt.tight_layout(pad=0)
+        plt.savefig(output_path, bbox_inches='tight', pad_inches=0, dpi=100)
+        plt.close()
 
-
-
-
-
-def synthesize_and_display_summary(synthesized_summary, debug=False):
-    """Synthesizes information gathered during the computer use run and logs key data points."""
-    if not synthesized_summary:
-        print("No synthesized info to display.")
-        return
-
-    print("\nSynthesized Summary of Computer Use Run:")
-    for info in synthesized_summary:
-        print(f"Iteration {info['iteration']}:\n"
-              f"  Last Action Feedback: {info['last_action_feedback']}\n"
-              f"  Last Click Coordinates: {info['last_click_coords']}")
-    print("End of synthesized summary.\n")
-
-
+        return True
+    except Exception as e:
+        print(f"Failed to add click trail with matplotlib: {e}")
+        return False
 
-def
-    print("Assistant REPL - Type your plonk command or 'exit' to quit.")
-    while True:
-        user_input = input("Enter your command: ").strip()
-        if user_input.lower() == 'exit':
-            print("Exiting REPL. Goodbye!")
-            break
-        if not user_input:
-            continue
-
-        # Run the plonk command and get synthesized summary
-        synthesized_summary = execute_plonk_command(
-            request=user_input,
-            action_space=action_space,
-            model="gpt-4o-mini",
-            provider="openai",
-            max_iterations=8,
-            debug=True
-        )
-
-        if synthesized_summary and isinstance(synthesized_summary, list):
-            print("Command executed with synthesized summary.")
-            synthesize_and_display_summary(synthesized_summary)
-        else:
-            print("Command did not complete within iteration limit or returned no summary.")
-
-
-def execute_plonk_command(request, action_space, model, provider, npc=None, max_iterations=10, debug=False):
-    """Synthesizes information gathered during the computer use run and logs key data points for
-    analysis. This function can be extended to store or report the synthesized knowledge as required.
-    """
-
-    system = platform.system()
+def execute_plonk_command(request, model, provider, npc=None, plonk_context=None, max_iterations=10, debug=False):
     system_examples = get_system_examples()
-
     messages = []
     last_action_feedback = "None"
     last_click_coords = None
-
-    iteration_count = 0
-
     synthesized_summary = []
+
+    current_screen_hash = None
+    click_history = []
+    HASH_DISTANCE_THRESHOLD = 3
+
+    for iteration_count in range(max_iterations):
+        try:
+            screenshot_info = capture_screenshot(full=True)
+            screenshot_path = screenshot_info.get('file_path') if screenshot_info else None
+
+            if not screenshot_path:
+                last_action_feedback = "Error: Failed to capture screenshot."
+                time.sleep(1)
+                continue
 
-
-
-
-
-
-
-
-
-
-
+            new_screen_hash = get_image_hash(screenshot_path)
+
+            if current_screen_hash is None or (new_screen_hash - current_screen_hash > HASH_DISTANCE_THRESHOLD):
+                if debug and current_screen_hash is not None:
+                    print(f"Screen changed (hash distance: {new_screen_hash - current_screen_hash}) - resetting click history.")
+                click_history = []
+            current_screen_hash = new_screen_hash
+
+            summary_info = {
+                'iteration': iteration_count + 1,
+                'last_action_feedback': last_action_feedback,
+                'last_click_coords': click_history[-1] if click_history else None
+            }
+            synthesized_summary.append(summary_info)
 
-
-
+            if debug:
+                print(f"Iteration {iteration_count + 1}/{max_iterations}")
+
+            context_injection = ""
+            if plonk_context:
+                context_injection = f"""
+                ---
+                IMPORTANT TEAM CONTEXT FOR THIS TASK:
+                {plonk_context}
+                ---
+                """
+
+            completion_example_text = """
+            {
+                "actions": [],
+                "status": "Task appears complete. Waiting for user approval to proceed or finish."
+            }
+            """
+
+            quit_rule_text = 'NEVER include {"type": "quit"} in your actions - the user controls when to stop.'
+
+            prompt_examples = """
+            ---
+            EXAMPLE 1: Task "Create and save a file named 'memo.txt' with the text 'Meeting at 3pm'"
+            {
+              "actions": [
+                { "type": "bash", "command": "gedit &" },
+
+                {"type":"click", "x": 10, "y": 30}
+              ]
+            }
+            ---
+            EXAMPLE 2: Task "Search for news about space exploration"
+            {
+              "actions": [
+                { "type": "bash", "command": "firefox &" },
+
+              ]
+            }
+            ---
+            EXAMPLE 3: Task "Click the red button on the form"
+            {
+              "actions": [
+                { "type": "click", "x": 75, "y": 45 }
+              ]
+            }
+            ---
+            EXAMPLE 4: Task "Open Gmail and draft a reply to most recent email"
+            {
+              "actions": [
+                { "type": "bash", "command": "open -a Safari" },
+
+              ]
+            }
+            """
+
+            prompt_template = f"""
+            Goal: {request}
+            Feedback from last action: {last_action_feedback}
 
-
-        Goal: {request}
-        Feedback from last action: {last_action_feedback}
+            {context_injection}
 
-
-
-
-
-
-
-
-
-
+            Your task is to control the computer to achieve the goal.
+
+            IMPORTANT: You should take actions step-by-step and verify each step works before proceeding.
+            DO NOT plan all actions at once - take a few actions, then look at the screen again.
+
+            CRITICAL: NEVER use the 'quit' action automatically. Even if the task appears complete,
+            continue working or wait for user guidance. The user will decide when to quit.
+
+            THOUGHT PROCESS:
+            1. Analyze the screen. Is the application I need (e.g., a web browser) already open?
+            2. If YES, `click` it. If NO, use `bash` to launch it. Use the examples: {system_examples}.
+            3. Take 2-3 actions maximum, then let me see the screen again to verify progress.
+            4. If task appears complete, explain status but DO NOT quit - wait for user direction.
+
+            Your response MUST be a JSON object with an "actions" key.
+            All clicking actions should use percentage coordinates relative
+            to the screen size.
+            The x and y are (0,0) at the TOP LEFT CORNER OF THE SCREEN.
+
+            MAXIMUM 3 ACTIONS PER RESPONSE - then let me see the screen to verify progress.
+            Never do more than one click, type, or hotkey event per response. It is important to take a sequence of
+            slow actions separated to avoid making mistakes and falling in loops.
+
+            If the task appears complete, you can include an empty actions list and explain:
+            {completion_example_text}
+
+            {quit_rule_text}
+            """ + prompt_examples
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            {{ "type": "wait", "duration": 2 }},
-            {{'type':'click', 'x': 10, 'y': 30}},
-            {{ "type": "type", "text": "Meeting at 3pm" }},
-            {{ "type": "hotkey", "keys": ["ctrl", "s"] }},
-            {{ "type": "wait", "duration": 1 }},
-            {{ "type": "type", "text": "memo.txt" }},
-            {{ "type": "key", "keys": ["enter"] }},
-          ]
-        }}
-        ---
-        EXAMPLE 2: Task "Search for news about space exploration"
-        {{
-          "actions": [
-            {{ "type": "bash", "command": "firefox &" }},
-            {{ "type": "wait", "duration": 3 }},
-            {{ "type": "type", "text": "news about space exploration" }},
-            {{ "type": "key", "keys": ["enter"] }},
-          ]
-        }}
-
-        ---
-
-        Once a task has been verified and completed, your action list should only be
-        {{
-          "actions": [
-            {{ "type": "quit" }}
-          ]
-        }}
-        """
+            image_to_send_path = screenshot_path
+
+            if click_history:
+                marked_image_path = "/tmp/marked_screenshot.png"
+                if add_click_vector_trail(screenshot_path, click_history, marked_image_path):
+                    image_to_send_path = marked_image_path
+                    if debug:
+                        print(f"Drew click trail with {len(click_history)} points.")
+
+            response = get_llm_response(prompt_template, model=model, provider=provider, npc=npc,
+                                        images=[image_to_send_path], messages=messages, format="json")
+            messages = response.get("messages", messages)
+            response_data = response.get('response')
+
+            if debug:
+                print(response_data)
 
-
-
-            continue
+            if not isinstance(response_data, dict) or "actions" not in response_data:
+                last_action_feedback = f"Invalid JSON response from model: {response_data}"
+                continue
 
-
-
-
-
-
-
-
-
+            actions_list = response_data.get("actions", [])
+            if not isinstance(actions_list, list):
+                last_action_feedback = "Model did not return a list in the 'actions' key."
+                continue
+
+            for action in actions_list:
+                if debug:
+                    print(f"Executing action: {action}")
 
-
-
-
-
-
-
-
-
-
+                if action.get("type") == "quit":
+                    print("⚠️ Model attempted to quit automatically. Ignoring.")
+                    continue
+
+                result = perform_action(action)
+                last_action_feedback = result.get("message") or result.get("output")
+
+                if action.get("type") == "click":
+                    click_info = {"x": action.get("x"), "y": action.get("y")}
+                    click_history.append(click_info)
+                    if len(click_history) > 6:
+                        click_history.pop(0)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            format="json",
-        )
-
-        if "messages" in response:
-            messages = response["messages"]
-
-        response_data = response.get('response')
-
-        if not isinstance(response_data, dict) or "actions" not in response_data:
-            last_action_feedback = f"Invalid JSON response from model: {response_data}"
-            continue
-
-        actions_list = response_data.get("actions", [])
+                if result.get("status") == "error":
+                    last_action_feedback = f"Action failed: {last_action_feedback}"
+                    print(f"Action failed, providing feedback to model: {last_action_feedback}")
+                    break
+                time.sleep(1)
+
+            if response_data.get("status") and "complete" in response_data.get("status", "").lower():
+                print(f"🎯 Model reports: {response_data.get('status')}")
+                print(" Press Ctrl+C to provide guidance or approval, or let it continue...")
+
+            if not actions_list:
+                last_action_feedback = "No actions were returned by the model. Re-evaluating."
+                if debug:
+                    print(last_action_feedback)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                if action.get("type") == "click":
-                    last_click_coords = {"x": action.get("x"), "y": action.get("y")}
-
-                if result.get("status") == "error":
-                    print(f"Action failed, providing feedback to model: {last_action_feedback}")
+        except KeyboardInterrupt:
+            print("\n⚠️ Plonk paused. Provide additional guidance or press Enter to continue.")
+            try:
+                user_guidance = input("Guidance > ").strip()
+                if user_guidance:
+                    request += f"\n\n---\nUser Guidance: {user_guidance}\n---"
+                    last_action_feedback = "User provided new guidance to correct the course."
+                    print("✅ Guidance received. Resuming with updated instructions...")
+                else:
+                    last_action_feedback = "User paused and resumed without new guidance."
+                    print("✅ No guidance provided. Resuming...")
+                continue
+            except EOFError:
+                print("\nExiting plonk mode.")
                 break
-
-
-        if not actions_list:
-            last_action_feedback = "No actions were returned. The task is likely not complete. Re-evaluating."
-            print(last_action_feedback)
-
-        iteration_count += 1
+
     return synthesized_summary
 
+def main():
+    parser = argparse.ArgumentParser(description="Execute GUI automation tasks using vision models")
+    parser.add_argument("request", help="The task to perform")
+    parser.add_argument("--model", help="Model to use")
+    parser.add_argument("--provider", help="Provider to use")
+    parser.add_argument("--max-iterations", type=int, default=10, help="Maximum iterations")
+    parser.add_argument("--debug", action="store_true", help="Enable debug output")
+    parser.add_argument("--npc", type=str, default=os.path.expanduser('~/.npcsh/npc_team/plonk.npc'), help="Path to NPC file")
+
+    args = parser.parse_args()
+
+    npc = NPC(file=args.npc) if os.path.exists(os.path.expanduser(args.npc)) else None
+
+    model = args.model or (npc.model if npc else NPCSH_VISION_MODEL)
+    provider = args.provider or (npc.provider if npc else NPCSH_VISION_PROVIDER)
+
+    summary = execute_plonk_command(
+        request=args.request,
+        model=model,
+        provider=provider,
+        npc=npc,
+        max_iterations=args.max_iterations,
+        debug=args.debug
+    )
+
+    print(format_plonk_summary(summary))
 
 if __name__ == "__main__":
-
+    main()