npcsh 0.3.31__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- npcsh/_state.py +942 -0
- npcsh/alicanto.py +1074 -0
- npcsh/guac.py +785 -0
- npcsh/mcp_helpers.py +357 -0
- npcsh/mcp_npcsh.py +822 -0
- npcsh/mcp_server.py +184 -0
- npcsh/npc.py +218 -0
- npcsh/npcsh.py +1161 -0
- npcsh/plonk.py +387 -269
- npcsh/pti.py +234 -0
- npcsh/routes.py +958 -0
- npcsh/spool.py +315 -0
- npcsh/wander.py +550 -0
- npcsh/yap.py +573 -0
- npcsh-1.0.0.dist-info/METADATA +596 -0
- npcsh-1.0.0.dist-info/RECORD +21 -0
- {npcsh-0.3.31.dist-info → npcsh-1.0.0.dist-info}/WHEEL +1 -1
- npcsh-1.0.0.dist-info/entry_points.txt +9 -0
- {npcsh-0.3.31.dist-info → npcsh-1.0.0.dist-info}/licenses/LICENSE +1 -1
- npcsh/audio.py +0 -210
- npcsh/cli.py +0 -545
- npcsh/command_history.py +0 -566
- npcsh/conversation.py +0 -291
- npcsh/data_models.py +0 -46
- npcsh/dataframes.py +0 -163
- npcsh/embeddings.py +0 -168
- npcsh/helpers.py +0 -641
- npcsh/image.py +0 -298
- npcsh/image_gen.py +0 -79
- npcsh/knowledge_graph.py +0 -1006
- npcsh/llm_funcs.py +0 -2027
- npcsh/load_data.py +0 -83
- npcsh/main.py +0 -5
- npcsh/model_runner.py +0 -189
- npcsh/npc_compiler.py +0 -2870
- npcsh/npc_sysenv.py +0 -383
- npcsh/npc_team/assembly_lines/test_pipeline.py +0 -181
- npcsh/npc_team/corca.npc +0 -13
- npcsh/npc_team/foreman.npc +0 -7
- npcsh/npc_team/npcsh.ctx +0 -11
- npcsh/npc_team/sibiji.npc +0 -4
- npcsh/npc_team/templates/analytics/celona.npc +0 -0
- npcsh/npc_team/templates/hr_support/raone.npc +0 -0
- npcsh/npc_team/templates/humanities/eriane.npc +0 -4
- npcsh/npc_team/templates/it_support/lineru.npc +0 -0
- npcsh/npc_team/templates/marketing/slean.npc +0 -4
- npcsh/npc_team/templates/philosophy/maurawa.npc +0 -0
- npcsh/npc_team/templates/sales/turnic.npc +0 -4
- npcsh/npc_team/templates/software/welxor.npc +0 -0
- npcsh/npc_team/tools/bash_executer.tool +0 -32
- npcsh/npc_team/tools/calculator.tool +0 -8
- npcsh/npc_team/tools/code_executor.tool +0 -16
- npcsh/npc_team/tools/generic_search.tool +0 -27
- npcsh/npc_team/tools/image_generation.tool +0 -25
- npcsh/npc_team/tools/local_search.tool +0 -149
- npcsh/npc_team/tools/npcsh_executor.tool +0 -9
- npcsh/npc_team/tools/screen_cap.tool +0 -27
- npcsh/npc_team/tools/sql_executor.tool +0 -26
- npcsh/response.py +0 -623
- npcsh/search.py +0 -248
- npcsh/serve.py +0 -1460
- npcsh/shell.py +0 -538
- npcsh/shell_helpers.py +0 -3529
- npcsh/stream.py +0 -700
- npcsh/video.py +0 -49
- npcsh-0.3.31.data/data/npcsh/npc_team/bash_executer.tool +0 -32
- npcsh-0.3.31.data/data/npcsh/npc_team/calculator.tool +0 -8
- npcsh-0.3.31.data/data/npcsh/npc_team/celona.npc +0 -0
- npcsh-0.3.31.data/data/npcsh/npc_team/code_executor.tool +0 -16
- npcsh-0.3.31.data/data/npcsh/npc_team/corca.npc +0 -13
- npcsh-0.3.31.data/data/npcsh/npc_team/eriane.npc +0 -4
- npcsh-0.3.31.data/data/npcsh/npc_team/foreman.npc +0 -7
- npcsh-0.3.31.data/data/npcsh/npc_team/generic_search.tool +0 -27
- npcsh-0.3.31.data/data/npcsh/npc_team/image_generation.tool +0 -25
- npcsh-0.3.31.data/data/npcsh/npc_team/lineru.npc +0 -0
- npcsh-0.3.31.data/data/npcsh/npc_team/local_search.tool +0 -149
- npcsh-0.3.31.data/data/npcsh/npc_team/maurawa.npc +0 -0
- npcsh-0.3.31.data/data/npcsh/npc_team/npcsh.ctx +0 -11
- npcsh-0.3.31.data/data/npcsh/npc_team/npcsh_executor.tool +0 -9
- npcsh-0.3.31.data/data/npcsh/npc_team/raone.npc +0 -0
- npcsh-0.3.31.data/data/npcsh/npc_team/screen_cap.tool +0 -27
- npcsh-0.3.31.data/data/npcsh/npc_team/sibiji.npc +0 -4
- npcsh-0.3.31.data/data/npcsh/npc_team/slean.npc +0 -4
- npcsh-0.3.31.data/data/npcsh/npc_team/sql_executor.tool +0 -26
- npcsh-0.3.31.data/data/npcsh/npc_team/test_pipeline.py +0 -181
- npcsh-0.3.31.data/data/npcsh/npc_team/turnic.npc +0 -4
- npcsh-0.3.31.data/data/npcsh/npc_team/welxor.npc +0 -0
- npcsh-0.3.31.dist-info/METADATA +0 -1853
- npcsh-0.3.31.dist-info/RECORD +0 -76
- npcsh-0.3.31.dist-info/entry_points.txt +0 -3
- {npcsh-0.3.31.dist-info → npcsh-1.0.0.dist-info}/top_level.txt +0 -0
npcsh/plonk.py
CHANGED
|
@@ -1,291 +1,409 @@
|
|
|
1
|
-
import
|
|
1
|
+
from npcpy.data.image import capture_screenshot
|
|
2
2
|
import time
|
|
3
|
+
import platform
|
|
4
|
+
from npcpy.llm_funcs import get_llm_response
|
|
5
|
+
from npcpy.work.desktop import perform_action, action_space
|
|
6
|
+
from PIL import Image, ImageDraw, ImageFont
|
|
3
7
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
action_space = {
|
|
19
|
-
"hotkey": {"key": "string"}, # For pressing hotkeys
|
|
20
|
-
"click": {
|
|
21
|
-
"x": "int between 0 and 100",
|
|
22
|
-
"y": "int between 0 and 100",
|
|
23
|
-
}, # For clicking
|
|
24
|
-
"drag": {
|
|
25
|
-
"x": "int between 0 and 100",
|
|
26
|
-
"y": "int between 0 and 100",
|
|
27
|
-
"duration": "int",
|
|
28
|
-
}, # For dragging
|
|
29
|
-
"wait": {"duration": "int"}, # For waiting
|
|
30
|
-
"type": {"text": "string"},
|
|
31
|
-
"right_click": {"x": "int between 0 and 100", "y": "int between 0 and 100"},
|
|
32
|
-
"double_click": {"x": "int between 0 and 100", "y": "int between 0 and 100"},
|
|
33
|
-
"bash": {"command": "string"},
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def perform_action(action):
|
|
38
|
-
"""
|
|
39
|
-
Execute different types of actions using PyAutoGUI
|
|
40
|
-
"""
|
|
41
|
-
try:
|
|
42
|
-
pyautogui.PAUSE = 1 # Add a small pause between actions
|
|
43
|
-
pyautogui.FAILSAFE = (
|
|
44
|
-
True # Enable fail-safe to stop script by moving mouse to corner
|
|
45
|
-
)
|
|
46
|
-
|
|
47
|
-
print(f"Action received: {action}") # Debug print
|
|
48
|
-
|
|
49
|
-
if action["type"] == "click":
|
|
50
|
-
pyautogui.click(x=action.get("x"), y=action.get("y"))
|
|
51
|
-
|
|
52
|
-
elif action["type"] == "double_click":
|
|
53
|
-
pyautogui.doubleClick(x=action.get("x"), y=action.get("y"))
|
|
54
|
-
|
|
55
|
-
elif action["type"] == "right_click":
|
|
56
|
-
pyautogui.rightClick(x=action.get("x"), y=action.get("y"))
|
|
57
|
-
|
|
58
|
-
elif action["type"] == "drag":
|
|
59
|
-
pyautogui.dragTo(
|
|
60
|
-
x=action.get("x"), y=action.get("y"), duration=action.get("duration", 1)
|
|
61
|
-
)
|
|
62
|
-
|
|
63
|
-
elif action["type"] == "type":
|
|
64
|
-
text = action.get("text", "")
|
|
65
|
-
if isinstance(text, dict):
|
|
66
|
-
text = text.get("text", "")
|
|
67
|
-
pyautogui.typewrite(text)
|
|
68
|
-
|
|
69
|
-
elif action["type"] == "hotkey":
|
|
70
|
-
keys = action.get("text", "")
|
|
71
|
-
print(f"Hotkey action: {keys}") # Debug print
|
|
72
|
-
if isinstance(keys, str):
|
|
73
|
-
keys = [keys]
|
|
74
|
-
elif isinstance(keys, dict):
|
|
75
|
-
keys = [keys.get("key", "")]
|
|
76
|
-
pyautogui.hotkey(*keys)
|
|
77
|
-
|
|
78
|
-
elif action["type"] == "wait":
|
|
79
|
-
time.sleep(action.get("duration", 1)) # Wait for the given time in seconds
|
|
80
|
-
|
|
81
|
-
elif action["type"] == "bash":
|
|
82
|
-
command = action.get("command", "")
|
|
83
|
-
print(f"Running bash command: {command}") # Debug print
|
|
84
|
-
subprocess.Popen(
|
|
85
|
-
command, shell=True
|
|
86
|
-
) # Run the command without waiting for it to complete
|
|
87
|
-
print(f"Bash Command Output: {result.stdout.decode()}") # Debug output
|
|
88
|
-
print(f"Bash Command Error: {result.stderr.decode()}") # Debug error
|
|
89
|
-
|
|
90
|
-
return {"status": "success"}
|
|
91
|
-
|
|
92
|
-
except Exception as e:
|
|
93
|
-
return {"status": "error", "message": str(e)}
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def plonk(request, action_space, model=None, provider=None, npc=None):
|
|
97
|
-
"""
|
|
98
|
-
Main interaction loop with LLM for action determination
|
|
99
|
-
|
|
100
|
-
Args:
|
|
101
|
-
request (str): The task to be performed
|
|
102
|
-
action_space (dict): Available action types and the inputs they require
|
|
103
|
-
npc (optional): NPC object for context and screenshot
|
|
104
|
-
**kwargs: Additional arguments for LLM response
|
|
8
|
+
def get_system_examples():
|
|
9
|
+
system = platform.system()
|
|
10
|
+
if system == "Windows":
|
|
11
|
+
return "Examples: start firefox, notepad, calc, explorer"
|
|
12
|
+
elif system == "Darwin":
|
|
13
|
+
return "Examples: open -a Firefox, open -a TextEdit, open -a Calculator"
|
|
14
|
+
else:
|
|
15
|
+
return "Examples: firefox &, gedit &, gnome-calculator &"
|
|
16
|
+
|
|
17
|
+
def execute_plonk_command(request, action_space, model, provider, npc=None, max_iterations=10, debug=False):
|
|
18
|
+
synthesized_summary = []
|
|
19
|
+
|
|
20
|
+
"""Synthesizes information gathered during the computer use run and logs key data points for
|
|
21
|
+
analysis. This function can be extended to store or report the synthesized knowledge as required.
|
|
105
22
|
"""
|
|
106
|
-
prompt = f"""
|
|
107
|
-
Here is a request from a user:
|
|
108
|
-
{request}
|
|
109
|
-
|
|
110
|
-
Your job is to choose certain actions, take screenshots,
|
|
111
|
-
and evaluate what the next step is to complete the task.
|
|
112
|
-
|
|
113
|
-
You can choose from the following action types:
|
|
114
|
-
{json.dumps(action_space)}
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
Attached to the message is a screenshot of the current screen.
|
|
119
|
-
|
|
120
|
-
Please use that information to determine the next steps.
|
|
121
|
-
Your response must be a JSON with an 'actions' key containing a list of actions.
|
|
122
|
-
Each action should have a 'type' and any necessary parameters.https://www.reddit.com
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
For example:
|
|
126
|
-
Your response should look like:
|
|
127
23
|
|
|
24
|
+
system = platform.system()
|
|
25
|
+
system_examples = get_system_examples()
|
|
26
|
+
|
|
27
|
+
messages = []
|
|
28
|
+
last_action_feedback = "None"
|
|
29
|
+
last_click_coords = None
|
|
30
|
+
|
|
31
|
+
iteration_count = 0
|
|
32
|
+
while iteration_count < max_iterations:
|
|
33
|
+
# Gathering summary of actions performed this iteration
|
|
34
|
+
synthesized_info = {
|
|
35
|
+
'iteration': iteration_count + 1,
|
|
36
|
+
'last_action_feedback': last_action_feedback,
|
|
37
|
+
'last_click_coords': last_click_coords
|
|
38
|
+
}
|
|
39
|
+
synthesized_summary.append(synthesized_info)
|
|
40
|
+
|
|
41
|
+
if debug:
|
|
42
|
+
print(f"Synthesized info at iteration {iteration_count + 1}: {synthesized_info}")
|
|
43
|
+
|
|
44
|
+
if debug:
|
|
45
|
+
print(f"Iteration {iteration_count + 1}/{max_iterations}")
|
|
46
|
+
|
|
47
|
+
# YOUR PROMPT, UNTOUCHED
|
|
48
|
+
prompt_template = f"""
|
|
49
|
+
Goal: {request}
|
|
50
|
+
Feedback from last action: {last_action_feedback}
|
|
51
|
+
|
|
52
|
+
Your task is to control the computer to achieve the goal.
|
|
53
|
+
|
|
54
|
+
THOUGHT PROCESS:
|
|
55
|
+
1. Analyze the screen. Is the application I need (e.g., a web browser) already open?
|
|
56
|
+
2. If YES, `click` it. If NO, use `bash` to launch it. Use the examples: {system_examples}.
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
CRITICAL COMPLETION RULE:
|
|
60
|
+
Once the goal is visually complete on the screen, your ONLY next action is to use the 'quit' action.
|
|
61
|
+
|
|
62
|
+
Your response MUST be a JSON object with an "actions" key.
|
|
63
|
+
All clicking actions should use percentage coordinates relative
|
|
64
|
+
to the screen size, as we will
|
|
65
|
+
manually translate them to the proper screen size.
|
|
66
|
+
your x and y values for clicks must ALWAYS be between 0 and 100.
|
|
67
|
+
The x and y are (0,0) at the TOP LEFT CORNER OF THE SCREEN.
|
|
68
|
+
The bottom right corner of the screen is (100,100).
|
|
69
|
+
the bottom left corner is (0,100) and the top right corner is (100,0).
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
EXAMPLE 1: Task "Create and save a file named 'memo.txt' with the text 'Meeting at 3pm'"
|
|
128
76
|
{{
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
77
|
+
"actions": [
|
|
78
|
+
{{ "type": "bash", "command": "gedit &" }},
|
|
79
|
+
{{ "type": "wait", "duration": 2 }},
|
|
80
|
+
{{'type':'click', 'x': 10, 'y': 30}},
|
|
81
|
+
{{ "type": "type", "text": "Meeting at 3pm" }},
|
|
82
|
+
{{ "type": "hotkey", "keys": ["ctrl", "s"] }},
|
|
83
|
+
{{ "type": "wait", "duration": 1 }},
|
|
84
|
+
{{ "type": "type", "text": "memo.txt" }},
|
|
85
|
+
{{ "type": "key", "keys": ["enter"] }},
|
|
86
|
+
]
|
|
87
|
+
}}
|
|
88
|
+
---
|
|
89
|
+
EXAMPLE 2: Task "Search for news about space exploration"
|
|
90
|
+
{{
|
|
91
|
+
"actions": [
|
|
92
|
+
{{ "type": "bash", "command": "firefox &" }},
|
|
93
|
+
{{ "type": "wait", "duration": 3 }},
|
|
94
|
+
{{ "type": "type", "text": "news about space exploration" }},
|
|
95
|
+
{{ "type": "key", "keys": ["enter"] }},
|
|
96
|
+
]
|
|
97
|
+
}}
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
Once a task has been verified and completed, your action list should only be
|
|
102
|
+
{{
|
|
103
|
+
"actions": [
|
|
104
|
+
{{ "type": "quit" }}
|
|
105
|
+
]
|
|
106
|
+
}}
|
|
107
|
+
"""
|
|
108
|
+
|
|
109
|
+
screenshot_path = capture_screenshot(npc=npc, full=True).get('file_path')
|
|
110
|
+
if not screenshot_path:
|
|
111
|
+
time.sleep(2)
|
|
112
|
+
continue
|
|
113
|
+
|
|
114
|
+
image_to_send_path = screenshot_path
|
|
115
|
+
if last_click_coords:
|
|
116
|
+
try:
|
|
117
|
+
img = Image.open(screenshot_path)
|
|
118
|
+
draw = ImageDraw.Draw(img)
|
|
119
|
+
width, height = img.size
|
|
120
|
+
x_pixel = int(last_click_coords['x'] * width / 100)
|
|
121
|
+
y_pixel = int(last_click_coords['y'] * height / 100)
|
|
122
|
+
|
|
123
|
+
try:
|
|
124
|
+
font = ImageFont.truetype("DejaVuSans-Bold.ttf", size=48)
|
|
125
|
+
except IOError:
|
|
126
|
+
font = ImageFont.load_default()
|
|
127
|
+
|
|
128
|
+
draw.text((x_pixel - 8, y_pixel - 12),
|
|
129
|
+
f"+{last_click_coords['x'],last_click_coords['y']}",
|
|
130
|
+
fill="red",
|
|
131
|
+
font=font)
|
|
132
|
+
|
|
133
|
+
marked_image_path = "/tmp/marked_screenshot.png"
|
|
134
|
+
img.save(marked_image_path)
|
|
135
|
+
image_to_send_path = marked_image_path
|
|
136
|
+
print(f"Drew marker at ({x_pixel}, {y_pixel}) on new screenshot.")
|
|
137
|
+
except Exception as e:
|
|
138
|
+
print(f"Failed to draw marker on image: {e}")
|
|
139
|
+
|
|
155
140
|
response = get_llm_response(
|
|
156
|
-
prompt,
|
|
157
|
-
images=[screenshot],
|
|
141
|
+
prompt=prompt_template,
|
|
158
142
|
model=model,
|
|
159
143
|
provider=provider,
|
|
160
144
|
npc=npc,
|
|
145
|
+
images=[image_to_send_path],
|
|
146
|
+
messages=messages,
|
|
161
147
|
format="json",
|
|
162
148
|
)
|
|
163
|
-
# print("LLM Response:", response, type(response))
|
|
164
|
-
# Check if task is complete
|
|
165
|
-
print(response["response"])
|
|
166
|
-
if not response["response"].get("actions", []):
|
|
167
|
-
return response
|
|
168
|
-
|
|
169
|
-
# Execute actions
|
|
170
|
-
for action in response["response"]["actions"]:
|
|
171
|
-
print("Performing action:", action)
|
|
172
|
-
action_result = perform_action(action)
|
|
173
|
-
perform_action({"type": "wait", "duration": 5})
|
|
174
149
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
150
|
+
if "messages" in response:
|
|
151
|
+
messages = response["messages"]
|
|
152
|
+
|
|
153
|
+
response_data = response.get('response')
|
|
154
|
+
|
|
155
|
+
if not isinstance(response_data, dict) or "actions" not in response_data:
|
|
156
|
+
last_action_feedback = f"Invalid JSON response from model: {response_data}"
|
|
157
|
+
continue
|
|
158
|
+
|
|
159
|
+
actions_list = response_data.get("actions", [])
|
|
160
|
+
|
|
161
|
+
if not isinstance(actions_list, list):
|
|
162
|
+
last_action_feedback = "Model did not return a list in the 'actions' key."
|
|
163
|
+
continue
|
|
164
|
+
|
|
165
|
+
# Reset last click before processing new actions
|
|
166
|
+
last_click_coords = None
|
|
167
|
+
for action in actions_list:
|
|
168
|
+
if debug:
|
|
169
|
+
print(f"Executing action: {action}")
|
|
170
|
+
if action.get("type") == "quit":
|
|
171
|
+
print("Task complete: Model returned 'quit' action.")
|
|
172
|
+
return "SUCCESS"
|
|
173
|
+
|
|
174
|
+
result = perform_action(action)
|
|
175
|
+
last_action_feedback = result.get("message") or result.get("output")
|
|
176
|
+
|
|
177
|
+
if action.get("type") == "click":
|
|
178
|
+
last_click_coords = {"x": action.get("x"), "y": action.get("y")}
|
|
179
|
+
|
|
180
|
+
if result.get("status") == "error":
|
|
181
|
+
print(f"Action failed, providing feedback to model: {last_action_feedback}")
|
|
182
|
+
break
|
|
183
|
+
time.sleep(1)
|
|
184
|
+
|
|
185
|
+
if not actions_list:
|
|
186
|
+
last_action_feedback = "No actions were returned. The task is likely not complete. Re-evaluating."
|
|
187
|
+
print(last_action_feedback)
|
|
188
|
+
|
|
189
|
+
iteration_count += 1
|
|
190
|
+
|
|
191
|
+
return None
|
|
192
|
+
|
|
193
|
+
def synthesize_and_display_summary(synthesized_summary, debug=False):
|
|
194
|
+
"""Synthesizes information gathered during the computer use run and logs key data points."""
|
|
195
|
+
if not synthesized_summary:
|
|
196
|
+
print("No synthesized info to display.")
|
|
197
|
+
return
|
|
198
|
+
|
|
199
|
+
print("\nSynthesized Summary of Computer Use Run:")
|
|
200
|
+
for info in synthesized_summary:
|
|
201
|
+
print(f"Iteration {info['iteration']}:\n"
|
|
202
|
+
f" Last Action Feedback: {info['last_action_feedback']}\n"
|
|
203
|
+
f" Last Click Coordinates: {info['last_click_coords']}")
|
|
204
|
+
print("End of synthesized summary.\n")
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def repl_loop():
|
|
209
|
+
print("Assistant REPL - Type your plonk command or 'exit' to quit.")
|
|
210
|
+
while True:
|
|
211
|
+
user_input = input("Enter your command: ").strip()
|
|
212
|
+
if user_input.lower() == 'exit':
|
|
213
|
+
print("Exiting REPL. Goodbye!")
|
|
214
|
+
break
|
|
215
|
+
if not user_input:
|
|
216
|
+
continue
|
|
217
|
+
|
|
218
|
+
# Run the plonk command and get synthesized summary
|
|
219
|
+
synthesized_summary = execute_plonk_command(
|
|
220
|
+
request=user_input,
|
|
221
|
+
action_space=action_space,
|
|
222
|
+
model="gpt-4o-mini",
|
|
223
|
+
provider="openai",
|
|
224
|
+
max_iterations=8,
|
|
225
|
+
debug=True
|
|
226
|
+
)
|
|
178
227
|
|
|
179
|
-
|
|
180
|
-
|
|
228
|
+
if synthesized_summary and isinstance(synthesized_summary, list):
|
|
229
|
+
print("Command executed with synthesized summary.")
|
|
230
|
+
synthesize_and_display_summary(synthesized_summary)
|
|
231
|
+
else:
|
|
232
|
+
print("Command did not complete within iteration limit or returned no summary.")
|
|
181
233
|
|
|
182
234
|
|
|
183
|
-
def
|
|
235
|
+
def execute_plonk_command(request, action_space, model, provider, npc=None, max_iterations=10, debug=False):
|
|
236
|
+
"""Synthesizes information gathered during the computer use run and logs key data points for
|
|
237
|
+
analysis. This function can be extended to store or report the synthesized knowledge as required.
|
|
184
238
|
"""
|
|
185
|
-
Test function to open a web browser and navigate to Reddit using plonk
|
|
186
|
-
"""
|
|
187
|
-
# Define the action space for web navigation
|
|
188
|
-
|
|
189
|
-
# Request to navigate to Reddit
|
|
190
|
-
request = "Open a web browser and go to reddit.com"
|
|
191
|
-
|
|
192
|
-
# Determine the browser launch hotkey based on the operating system
|
|
193
|
-
import platform
|
|
194
239
|
|
|
195
240
|
system = platform.system()
|
|
241
|
+
system_examples = get_system_examples()
|
|
242
|
+
|
|
243
|
+
messages = []
|
|
244
|
+
last_action_feedback = "None"
|
|
245
|
+
last_click_coords = None
|
|
246
|
+
|
|
247
|
+
iteration_count = 0
|
|
248
|
+
|
|
249
|
+
synthesized_summary = []
|
|
250
|
+
|
|
251
|
+
while iteration_count < max_iterations:
|
|
252
|
+
synthesized_info = {
|
|
253
|
+
'iteration': iteration_count + 1,
|
|
254
|
+
'last_action_feedback': last_action_feedback,
|
|
255
|
+
'last_click_coords': last_click_coords
|
|
256
|
+
}
|
|
257
|
+
synthesized_summary.append(synthesized_info)
|
|
258
|
+
|
|
259
|
+
if debug:
|
|
260
|
+
print(f"Synthesized info at iteration {iteration_count + 1}: {synthesized_info}")
|
|
261
|
+
|
|
262
|
+
if debug:
|
|
263
|
+
print(f"Iteration {iteration_count + 1}/{max_iterations}")
|
|
264
|
+
|
|
265
|
+
prompt_template = f"""
|
|
266
|
+
Goal: {request}
|
|
267
|
+
Feedback from last action: {last_action_feedback}
|
|
268
|
+
|
|
269
|
+
Your task is to control the computer to achieve the goal.
|
|
270
|
+
|
|
271
|
+
THOUGHT PROCESS:
|
|
272
|
+
1. Analyze the screen. Is the application I need (e.g., a web browser) already open?
|
|
273
|
+
2. If YES, `click` it. If NO, use `bash` to launch it. Use the examples: {system_examples}.
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
CRITICAL COMPLETION RULE:
|
|
277
|
+
Once the goal is visually complete on the screen, your ONLY next action is to use the 'quit' action.
|
|
278
|
+
|
|
279
|
+
Your response MUST be a JSON object with an "actions" key.
|
|
280
|
+
All clicking actions should use percentage coordinates relative
|
|
281
|
+
to the screen size, as we will
|
|
282
|
+
manually translate them to the proper screen size.
|
|
283
|
+
your x and y values for clicks must ALWAYS be between 0 and 100.
|
|
284
|
+
The x and y are (0,0) at the TOP LEFT CORNER OF THE SCREEN.
|
|
285
|
+
The bottom right corner of the screen is (100,100).
|
|
286
|
+
the bottom left corner is (0,100) and the top right corner is (100,0).
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
---
|
|
291
|
+
EXAMPLE 1: Task "Create and save a file named 'memo.txt' with the text 'Meeting at 3pm'"
|
|
292
|
+
{{
|
|
293
|
+
"actions": [
|
|
294
|
+
{{ "type": "bash", "command": "gedit &" }},
|
|
295
|
+
{{ "type": "wait", "duration": 2 }},
|
|
296
|
+
{{'type':'click', 'x': 10, 'y': 30}},
|
|
297
|
+
{{ "type": "type", "text": "Meeting at 3pm" }},
|
|
298
|
+
{{ "type": "hotkey", "keys": ["ctrl", "s"] }},
|
|
299
|
+
{{ "type": "wait", "duration": 1 }},
|
|
300
|
+
{{ "type": "type", "text": "memo.txt" }},
|
|
301
|
+
{{ "type": "key", "keys": ["enter"] }},
|
|
302
|
+
]
|
|
303
|
+
}}
|
|
304
|
+
---
|
|
305
|
+
EXAMPLE 2: Task "Search for news about space exploration"
|
|
306
|
+
{{
|
|
307
|
+
"actions": [
|
|
308
|
+
{{ "type": "bash", "command": "firefox &" }},
|
|
309
|
+
{{ "type": "wait", "duration": 3 }},
|
|
310
|
+
{{ "type": "type", "text": "news about space exploration" }},
|
|
311
|
+
{{ "type": "key", "keys": ["enter"] }},
|
|
312
|
+
]
|
|
313
|
+
}}
|
|
314
|
+
|
|
315
|
+
---
|
|
316
|
+
|
|
317
|
+
Once a task has been verified and completed, your action list should only be
|
|
318
|
+
{{
|
|
319
|
+
"actions": [
|
|
320
|
+
{{ "type": "quit" }}
|
|
321
|
+
]
|
|
322
|
+
}}
|
|
323
|
+
"""
|
|
324
|
+
|
|
325
|
+
screenshot_path = capture_screenshot(npc=npc, full=True).get('file_path')
|
|
326
|
+
if not screenshot_path:
|
|
327
|
+
time.sleep(2)
|
|
328
|
+
continue
|
|
329
|
+
|
|
330
|
+
image_to_send_path = screenshot_path
|
|
331
|
+
if last_click_coords:
|
|
332
|
+
try:
|
|
333
|
+
img = Image.open(screenshot_path)
|
|
334
|
+
draw = ImageDraw.Draw(img)
|
|
335
|
+
width, height = img.size
|
|
336
|
+
x_pixel = int(last_click_coords['x'] * width / 100)
|
|
337
|
+
y_pixel = int(last_click_coords['y'] * height / 100)
|
|
338
|
+
|
|
339
|
+
try:
|
|
340
|
+
font = ImageFont.truetype("DejaVuSans-Bold.ttf", size=48)
|
|
341
|
+
except IOError:
|
|
342
|
+
font = ImageFont.load_default()
|
|
343
|
+
|
|
344
|
+
draw.text((x_pixel - 8, y_pixel - 12),
|
|
345
|
+
f"+{last_click_coords['x'],last_click_coords['y']}",
|
|
346
|
+
fill="red",
|
|
347
|
+
font=font)
|
|
348
|
+
|
|
349
|
+
marked_image_path = "/tmp/marked_screenshot.png"
|
|
350
|
+
img.save(marked_image_path)
|
|
351
|
+
image_to_send_path = marked_image_path
|
|
352
|
+
print(f"Drew marker at ({x_pixel}, {y_pixel}) on new screenshot.")
|
|
353
|
+
except Exception as e:
|
|
354
|
+
print(f"Failed to draw marker on image: {e}")
|
|
355
|
+
|
|
356
|
+
response = get_llm_response(
|
|
357
|
+
prompt=prompt_template,
|
|
358
|
+
model=model,
|
|
359
|
+
provider=provider,
|
|
360
|
+
npc=npc,
|
|
361
|
+
images=[image_to_send_path],
|
|
362
|
+
messages=messages,
|
|
363
|
+
format="json",
|
|
364
|
+
)
|
|
196
365
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
):
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
Here is an example of a question and answer that you might generate:
|
|
239
|
-
|
|
240
|
-
Question: "Set up an automation system that will open a web browser every morning
|
|
241
|
-
and go to my bank account and export my transactions."
|
|
242
|
-
|
|
243
|
-
Answer:
|
|
244
|
-
"{{'plonk plan': ```
|
|
245
|
-
|
|
246
|
-
from npcsh.llm_funcs import get_llm_response
|
|
247
|
-
|
|
248
|
-
automation_script = get_llm_response( '''
|
|
249
|
-
Write a python script that will request input from a user about what bank they use. Then use selenium to open the browser and navigate to the bank's website.
|
|
250
|
-
Get the user's username and password and log in, also through raw input.
|
|
251
|
-
Then navigate to the transactions page and export the transactions. Ensure you are sufficiently logging information at each step of the way so that the results can be
|
|
252
|
-
debugged efficiently.
|
|
253
|
-
Return the script without any additional comment or Markdown formatting. It is imperative that you do not include any additional text.
|
|
254
|
-
''')
|
|
255
|
-
# write the automation script to a file
|
|
256
|
-
automation_script_file = open('automation_script.py', 'w')
|
|
257
|
-
automation_script_file.write(automation_script)
|
|
258
|
-
automation_script_file.close()
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
scheduling_script = get_llm_response( f'''
|
|
262
|
-
Write a bash script that will set up an OS scheduler to run the automation script every morning at 8 am.
|
|
263
|
-
The automation script is located at ./automation_script.py.
|
|
264
|
-
You'll need to ensure that the full path is used in the scheduling script.
|
|
265
|
-
Return the script without any additional comment or Markdown formatting.
|
|
266
|
-
It is imperative that you do not include any additional text.
|
|
267
|
-
Do not leave any placeholder paths or variables in the script.
|
|
268
|
-
They must be able to execute without
|
|
269
|
-
any further modification by you or the user.
|
|
270
|
-
''')
|
|
271
|
-
# write the scheduling script to a file
|
|
272
|
-
scheduling_script_file = open('scheduling_script.sh', 'w')
|
|
273
|
-
scheduling_script_file.write(scheduling_script)
|
|
274
|
-
|
|
275
|
-
scheduling_script_file.close()
|
|
276
|
-
# attempt to run the scheduling script
|
|
277
|
-
import subprocess
|
|
278
|
-
subprocess.run(['bash', 'scheduling_script.sh'])
|
|
279
|
-
```}}
|
|
280
|
-
|
|
281
|
-
In this example, we have set up a plan that will require multiple other LLM calls to generate the necessary items to
|
|
282
|
-
accomplish the user's request.
|
|
283
|
-
|
|
284
|
-
"""
|
|
285
|
-
|
|
286
|
-
return get_llm_response(prompt)
|
|
366
|
+
if "messages" in response:
|
|
367
|
+
messages = response["messages"]
|
|
368
|
+
|
|
369
|
+
response_data = response.get('response')
|
|
370
|
+
|
|
371
|
+
if not isinstance(response_data, dict) or "actions" not in response_data:
|
|
372
|
+
last_action_feedback = f"Invalid JSON response from model: {response_data}"
|
|
373
|
+
continue
|
|
374
|
+
|
|
375
|
+
actions_list = response_data.get("actions", [])
|
|
376
|
+
|
|
377
|
+
if not isinstance(actions_list, list):
|
|
378
|
+
last_action_feedback = "Model did not return a list in the 'actions' key."
|
|
379
|
+
continue
|
|
380
|
+
|
|
381
|
+
last_click_coords = None
|
|
382
|
+
for action in actions_list:
|
|
383
|
+
if debug:
|
|
384
|
+
print(f"Executing action: {action}")
|
|
385
|
+
if action.get("type") == "quit":
|
|
386
|
+
print("Task complete: Model returned 'quit' action.")
|
|
387
|
+
return synthesized_summary
|
|
388
|
+
|
|
389
|
+
result = perform_action(action)
|
|
390
|
+
last_action_feedback = result.get("message") or result.get("output")
|
|
391
|
+
|
|
392
|
+
if action.get("type") == "click":
|
|
393
|
+
last_click_coords = {"x": action.get("x"), "y": action.get("y")}
|
|
394
|
+
|
|
395
|
+
if result.get("status") == "error":
|
|
396
|
+
print(f"Action failed, providing feedback to model: {last_action_feedback}")
|
|
397
|
+
break
|
|
398
|
+
time.sleep(1)
|
|
399
|
+
|
|
400
|
+
if not actions_list:
|
|
401
|
+
last_action_feedback = "No actions were returned. The task is likely not complete. Re-evaluating."
|
|
402
|
+
print(last_action_feedback)
|
|
403
|
+
|
|
404
|
+
iteration_count += 1
|
|
405
|
+
return synthesized_summary
|
|
287
406
|
|
|
288
407
|
|
|
289
|
-
# Optional: If you want to run this as a standalone script
|
|
290
408
|
if __name__ == "__main__":
|
|
291
|
-
|
|
409
|
+
repl_loop()
|