npcsh 0.3.32__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. npcsh/_state.py +942 -0
  2. npcsh/alicanto.py +1074 -0
  3. npcsh/guac.py +785 -0
  4. npcsh/mcp_helpers.py +357 -0
  5. npcsh/mcp_npcsh.py +822 -0
  6. npcsh/mcp_server.py +184 -0
  7. npcsh/npc.py +218 -0
  8. npcsh/npcsh.py +1161 -0
  9. npcsh/plonk.py +387 -269
  10. npcsh/pti.py +234 -0
  11. npcsh/routes.py +958 -0
  12. npcsh/spool.py +315 -0
  13. npcsh/wander.py +550 -0
  14. npcsh/yap.py +573 -0
  15. npcsh-1.0.1.dist-info/METADATA +596 -0
  16. npcsh-1.0.1.dist-info/RECORD +21 -0
  17. {npcsh-0.3.32.dist-info → npcsh-1.0.1.dist-info}/WHEEL +1 -1
  18. npcsh-1.0.1.dist-info/entry_points.txt +9 -0
  19. {npcsh-0.3.32.dist-info → npcsh-1.0.1.dist-info}/licenses/LICENSE +1 -1
  20. npcsh/audio.py +0 -569
  21. npcsh/audio_gen.py +0 -1
  22. npcsh/cli.py +0 -543
  23. npcsh/command_history.py +0 -566
  24. npcsh/conversation.py +0 -54
  25. npcsh/data_models.py +0 -46
  26. npcsh/dataframes.py +0 -171
  27. npcsh/embeddings.py +0 -168
  28. npcsh/helpers.py +0 -646
  29. npcsh/image.py +0 -298
  30. npcsh/image_gen.py +0 -79
  31. npcsh/knowledge_graph.py +0 -1006
  32. npcsh/llm_funcs.py +0 -2195
  33. npcsh/load_data.py +0 -83
  34. npcsh/main.py +0 -5
  35. npcsh/model_runner.py +0 -189
  36. npcsh/npc_compiler.py +0 -2879
  37. npcsh/npc_sysenv.py +0 -388
  38. npcsh/npc_team/assembly_lines/test_pipeline.py +0 -181
  39. npcsh/npc_team/corca.npc +0 -13
  40. npcsh/npc_team/foreman.npc +0 -7
  41. npcsh/npc_team/npcsh.ctx +0 -11
  42. npcsh/npc_team/sibiji.npc +0 -4
  43. npcsh/npc_team/templates/analytics/celona.npc +0 -0
  44. npcsh/npc_team/templates/hr_support/raone.npc +0 -0
  45. npcsh/npc_team/templates/humanities/eriane.npc +0 -4
  46. npcsh/npc_team/templates/it_support/lineru.npc +0 -0
  47. npcsh/npc_team/templates/marketing/slean.npc +0 -4
  48. npcsh/npc_team/templates/philosophy/maurawa.npc +0 -0
  49. npcsh/npc_team/templates/sales/turnic.npc +0 -4
  50. npcsh/npc_team/templates/software/welxor.npc +0 -0
  51. npcsh/npc_team/tools/bash_executer.tool +0 -32
  52. npcsh/npc_team/tools/calculator.tool +0 -8
  53. npcsh/npc_team/tools/code_executor.tool +0 -16
  54. npcsh/npc_team/tools/generic_search.tool +0 -27
  55. npcsh/npc_team/tools/image_generation.tool +0 -25
  56. npcsh/npc_team/tools/local_search.tool +0 -149
  57. npcsh/npc_team/tools/npcsh_executor.tool +0 -9
  58. npcsh/npc_team/tools/screen_cap.tool +0 -27
  59. npcsh/npc_team/tools/sql_executor.tool +0 -26
  60. npcsh/response.py +0 -272
  61. npcsh/search.py +0 -252
  62. npcsh/serve.py +0 -1467
  63. npcsh/shell.py +0 -524
  64. npcsh/shell_helpers.py +0 -3919
  65. npcsh/stream.py +0 -233
  66. npcsh/video.py +0 -52
  67. npcsh/video_gen.py +0 -69
  68. npcsh-0.3.32.data/data/npcsh/npc_team/bash_executer.tool +0 -32
  69. npcsh-0.3.32.data/data/npcsh/npc_team/calculator.tool +0 -8
  70. npcsh-0.3.32.data/data/npcsh/npc_team/celona.npc +0 -0
  71. npcsh-0.3.32.data/data/npcsh/npc_team/code_executor.tool +0 -16
  72. npcsh-0.3.32.data/data/npcsh/npc_team/corca.npc +0 -13
  73. npcsh-0.3.32.data/data/npcsh/npc_team/eriane.npc +0 -4
  74. npcsh-0.3.32.data/data/npcsh/npc_team/foreman.npc +0 -7
  75. npcsh-0.3.32.data/data/npcsh/npc_team/generic_search.tool +0 -27
  76. npcsh-0.3.32.data/data/npcsh/npc_team/image_generation.tool +0 -25
  77. npcsh-0.3.32.data/data/npcsh/npc_team/lineru.npc +0 -0
  78. npcsh-0.3.32.data/data/npcsh/npc_team/local_search.tool +0 -149
  79. npcsh-0.3.32.data/data/npcsh/npc_team/maurawa.npc +0 -0
  80. npcsh-0.3.32.data/data/npcsh/npc_team/npcsh.ctx +0 -11
  81. npcsh-0.3.32.data/data/npcsh/npc_team/npcsh_executor.tool +0 -9
  82. npcsh-0.3.32.data/data/npcsh/npc_team/raone.npc +0 -0
  83. npcsh-0.3.32.data/data/npcsh/npc_team/screen_cap.tool +0 -27
  84. npcsh-0.3.32.data/data/npcsh/npc_team/sibiji.npc +0 -4
  85. npcsh-0.3.32.data/data/npcsh/npc_team/slean.npc +0 -4
  86. npcsh-0.3.32.data/data/npcsh/npc_team/sql_executor.tool +0 -26
  87. npcsh-0.3.32.data/data/npcsh/npc_team/test_pipeline.py +0 -181
  88. npcsh-0.3.32.data/data/npcsh/npc_team/turnic.npc +0 -4
  89. npcsh-0.3.32.data/data/npcsh/npc_team/welxor.npc +0 -0
  90. npcsh-0.3.32.dist-info/METADATA +0 -779
  91. npcsh-0.3.32.dist-info/RECORD +0 -78
  92. npcsh-0.3.32.dist-info/entry_points.txt +0 -3
  93. {npcsh-0.3.32.dist-info → npcsh-1.0.1.dist-info}/top_level.txt +0 -0
npcsh/plonk.py CHANGED
@@ -1,291 +1,409 @@
1
- import json
1
+ from npcpy.data.image import capture_screenshot
2
2
  import time
3
+ import platform
4
+ from npcpy.llm_funcs import get_llm_response
5
+ from npcpy.work.desktop import perform_action, action_space
6
+ from PIL import Image, ImageDraw, ImageFont
3
7
 
4
- try:
5
- import pyautogui
6
- except KeyError as e:
7
- print(f"Could not load pyautogui due to the following error: {e}")
8
-
9
- from npcsh.image import capture_screenshot
10
- from npcsh.llm_funcs import get_llm_response
11
-
12
- import subprocess
13
- import os
14
-
15
-
16
- from typing import Any
17
-
18
- action_space = {
19
- "hotkey": {"key": "string"}, # For pressing hotkeys
20
- "click": {
21
- "x": "int between 0 and 100",
22
- "y": "int between 0 and 100",
23
- }, # For clicking
24
- "drag": {
25
- "x": "int between 0 and 100",
26
- "y": "int between 0 and 100",
27
- "duration": "int",
28
- }, # For dragging
29
- "wait": {"duration": "int"}, # For waiting
30
- "type": {"text": "string"},
31
- "right_click": {"x": "int between 0 and 100", "y": "int between 0 and 100"},
32
- "double_click": {"x": "int between 0 and 100", "y": "int between 0 and 100"},
33
- "bash": {"command": "string"},
34
- }
35
-
36
-
37
- def perform_action(action):
38
- """
39
- Execute different types of actions using PyAutoGUI
40
- """
41
- try:
42
- pyautogui.PAUSE = 1 # Add a small pause between actions
43
- pyautogui.FAILSAFE = (
44
- True # Enable fail-safe to stop script by moving mouse to corner
45
- )
46
-
47
- print(f"Action received: {action}") # Debug print
48
-
49
- if action["type"] == "click":
50
- pyautogui.click(x=action.get("x"), y=action.get("y"))
51
-
52
- elif action["type"] == "double_click":
53
- pyautogui.doubleClick(x=action.get("x"), y=action.get("y"))
54
-
55
- elif action["type"] == "right_click":
56
- pyautogui.rightClick(x=action.get("x"), y=action.get("y"))
57
-
58
- elif action["type"] == "drag":
59
- pyautogui.dragTo(
60
- x=action.get("x"), y=action.get("y"), duration=action.get("duration", 1)
61
- )
62
-
63
- elif action["type"] == "type":
64
- text = action.get("text", "")
65
- if isinstance(text, dict):
66
- text = text.get("text", "")
67
- pyautogui.typewrite(text)
68
-
69
- elif action["type"] == "hotkey":
70
- keys = action.get("text", "")
71
- print(f"Hotkey action: {keys}") # Debug print
72
- if isinstance(keys, str):
73
- keys = [keys]
74
- elif isinstance(keys, dict):
75
- keys = [keys.get("key", "")]
76
- pyautogui.hotkey(*keys)
77
-
78
- elif action["type"] == "wait":
79
- time.sleep(action.get("duration", 1)) # Wait for the given time in seconds
80
-
81
- elif action["type"] == "bash":
82
- command = action.get("command", "")
83
- print(f"Running bash command: {command}") # Debug print
84
- subprocess.Popen(
85
- command, shell=True
86
- ) # Run the command without waiting for it to complete
87
- print(f"Bash Command Output: {result.stdout.decode()}") # Debug output
88
- print(f"Bash Command Error: {result.stderr.decode()}") # Debug error
89
-
90
- return {"status": "success"}
91
-
92
- except Exception as e:
93
- return {"status": "error", "message": str(e)}
94
-
95
-
96
- def plonk(request, action_space, model=None, provider=None, npc=None):
97
- """
98
- Main interaction loop with LLM for action determination
99
-
100
- Args:
101
- request (str): The task to be performed
102
- action_space (dict): Available action types and the inputs they require
103
- npc (optional): NPC object for context and screenshot
104
- **kwargs: Additional arguments for LLM response
8
def get_system_examples():
    """Return example shell commands for launching GUI apps on the current OS.

    The string is interpolated into the LLM prompt so the model emits
    platform-appropriate `bash` actions.
    """
    examples_by_os = {
        "Windows": "Examples: start firefox, notepad, calc, explorer",
        "Darwin": "Examples: open -a Firefox, open -a TextEdit, open -a Calculator",
    }
    # Any other platform (Linux, BSDs, ...) falls back to common Linux launchers.
    linux_fallback = "Examples: firefox &, gedit &, gnome-calculator &"
    return examples_by_os.get(platform.system(), linux_fallback)
16
+
17
def execute_plonk_command(request, action_space, model, provider, npc=None, max_iterations=10, debug=False):
    """Drive an LLM-guided computer-use loop until the goal is met or the cap is hit.

    NOTE(review): this module defines ``execute_plonk_command`` twice; this first
    definition is shadowed by the later duplicate, which is the binding actually
    in effect at import time. Their return contracts are kept aligned here (the
    per-iteration summary list) so removing either duplicate is safe.

    Args:
        request: Natural-language goal the model should accomplish.
        action_space: Kept for API compatibility; the prompt describes the
            available actions inline, so this argument is not read here.
        model: Model identifier forwarded to get_llm_response.
        provider: Provider identifier forwarded to get_llm_response.
        npc: Optional NPC context object used for screenshots and LLM calls.
        max_iterations: Hard cap on screenshot -> LLM -> action cycles.
        debug: When True, print per-iteration diagnostics.

    Returns:
        List of per-iteration summary dicts with keys 'iteration',
        'last_action_feedback' and 'last_click_coords'.
    """
    synthesized_summary = []

    system_examples = get_system_examples()

    messages = []
    last_action_feedback = "None"
    last_click_coords = None

    # A for-loop over range() guarantees the iteration cap is respected even on
    # the early `continue` paths; the previous manual while-loop only
    # incremented its counter at the bottom, so a persistently failing
    # screenshot or malformed model response looped forever.
    for iteration_count in range(max_iterations):
        synthesized_info = {
            'iteration': iteration_count + 1,
            'last_action_feedback': last_action_feedback,
            'last_click_coords': last_click_coords
        }
        synthesized_summary.append(synthesized_info)

        if debug:
            print(f"Synthesized info at iteration {iteration_count + 1}: {synthesized_info}")
            print(f"Iteration {iteration_count + 1}/{max_iterations}")

        prompt_template = f"""
        Goal: {request}
        Feedback from last action: {last_action_feedback}

        Your task is to control the computer to achieve the goal.

        THOUGHT PROCESS:
        1. Analyze the screen. Is the application I need (e.g., a web browser) already open?
        2. If YES, `click` it. If NO, use `bash` to launch it. Use the examples: {system_examples}.


        CRITICAL COMPLETION RULE:
        Once the goal is visually complete on the screen, your ONLY next action is to use the 'quit' action.

        Your response MUST be a JSON object with an "actions" key.
        All clicking actions should use percentage coordinates relative
        to the screen size, as we will
        manually translate them to the proper screen size.
        your x and y values for clicks must ALWAYS be between 0 and 100.
        The x and y are (0,0) at the TOP LEFT CORNER OF THE SCREEN.
        The bottom right corner of the screen is (100,100).
        the bottom left corner is (0,100) and the top right corner is (100,0).




        ---
        EXAMPLE 1: Task "Create and save a file named 'memo.txt' with the text 'Meeting at 3pm'"
        {{
        "actions": [
        {{ "type": "bash", "command": "gedit &" }},
        {{ "type": "wait", "duration": 2 }},
        {{'type':'click', 'x': 10, 'y': 30}},
        {{ "type": "type", "text": "Meeting at 3pm" }},
        {{ "type": "hotkey", "keys": ["ctrl", "s"] }},
        {{ "type": "wait", "duration": 1 }},
        {{ "type": "type", "text": "memo.txt" }},
        {{ "type": "key", "keys": ["enter"] }},
        ]
        }}
        ---
        EXAMPLE 2: Task "Search for news about space exploration"
        {{
        "actions": [
        {{ "type": "bash", "command": "firefox &" }},
        {{ "type": "wait", "duration": 3 }},
        {{ "type": "type", "text": "news about space exploration" }},
        {{ "type": "key", "keys": ["enter"] }},
        ]
        }}

        ---

        Once a task has been verified and completed, your action list should only be
        {{
        "actions": [
        {{ "type": "quit" }}
        ]
        }}
        """

        # capture_screenshot may return None (not just a dict lacking
        # 'file_path'); guard both cases before calling .get().
        shot = capture_screenshot(npc=npc, full=True)
        screenshot_path = shot.get('file_path') if shot else None
        if not screenshot_path:
            time.sleep(2)
            continue

        image_to_send_path = screenshot_path
        if last_click_coords:
            try:
                img = Image.open(screenshot_path)
                draw = ImageDraw.Draw(img)
                width, height = img.size
                # Convert percentage coordinates (0-100) to pixel positions.
                x_pixel = int(last_click_coords['x'] * width / 100)
                y_pixel = int(last_click_coords['y'] * height / 100)

                try:
                    font = ImageFont.truetype("DejaVuSans-Bold.ttf", size=48)
                except IOError:
                    font = ImageFont.load_default()

                # The label renders the percent coords as a tuple, e.g. "+(10, 30)".
                draw.text((x_pixel - 8, y_pixel - 12),
                          f"+{last_click_coords['x'],last_click_coords['y']}",
                          fill="red",
                          font=font)

                marked_image_path = "/tmp/marked_screenshot.png"
                img.save(marked_image_path)
                image_to_send_path = marked_image_path
                print(f"Drew marker at ({x_pixel}, {y_pixel}) on new screenshot.")
            except Exception as e:
                # Marker drawing is best-effort; fall back to the raw screenshot.
                print(f"Failed to draw marker on image: {e}")

        response = get_llm_response(
            prompt=prompt_template,
            model=model,
            provider=provider,
            npc=npc,
            images=[image_to_send_path],
            messages=messages,
            format="json",
        )

        if "messages" in response:
            messages = response["messages"]

        response_data = response.get('response')

        if not isinstance(response_data, dict) or "actions" not in response_data:
            last_action_feedback = f"Invalid JSON response from model: {response_data}"
            continue

        actions_list = response_data.get("actions", [])

        if not isinstance(actions_list, list):
            last_action_feedback = "Model did not return a list in the 'actions' key."
            continue

        # Reset last click before processing the new batch of actions.
        last_click_coords = None
        for action in actions_list:
            if debug:
                print(f"Executing action: {action}")
            if action.get("type") == "quit":
                print("Task complete: Model returned 'quit' action.")
                return synthesized_summary

            result = perform_action(action)
            last_action_feedback = result.get("message") or result.get("output")

            if action.get("type") == "click":
                last_click_coords = {"x": action.get("x"), "y": action.get("y")}

            if result.get("status") == "error":
                print(f"Action failed, providing feedback to model: {last_action_feedback}")
                break
            time.sleep(1)

        if not actions_list:
            last_action_feedback = "No actions were returned. The task is likely not complete. Re-evaluating."
            print(last_action_feedback)

    return synthesized_summary
192
+
193
def synthesize_and_display_summary(synthesized_summary, debug=False):
    """Pretty-print the per-iteration summary records gathered during a run.

    Args:
        synthesized_summary: List of dicts carrying 'iteration',
            'last_action_feedback' and 'last_click_coords' keys.
        debug: Unused; accepted for signature compatibility with callers.
    """
    if not synthesized_summary:
        print("No synthesized info to display.")
        return

    print("\nSynthesized Summary of Computer Use Run:")
    report_blocks = [
        f"Iteration {entry['iteration']}:\n"
        f" Last Action Feedback: {entry['last_action_feedback']}\n"
        f" Last Click Coordinates: {entry['last_click_coords']}"
        for entry in synthesized_summary
    ]
    for block in report_blocks:
        print(block)
    print("End of synthesized summary.\n")
205
+
206
+
207
+
208
def repl_loop():
    """Interactive prompt loop: read a task, run it through execute_plonk_command,
    then display the synthesized summary (or a failure notice). Type 'exit' to quit.
    """
    print("Assistant REPL - Type your plonk command or 'exit' to quit.")
    while True:
        command = input("Enter your command: ").strip()
        # Blank lines can never equal 'exit', so the order of these guards
        # does not change behavior relative to checking 'exit' first.
        if not command:
            continue
        if command.lower() == 'exit':
            print("Exiting REPL. Goodbye!")
            break

        # Run the plonk command and collect the per-iteration summary.
        summary = execute_plonk_command(
            request=command,
            action_space=action_space,
            model="gpt-4o-mini",
            provider="openai",
            max_iterations=8,
            debug=True,
        )

        if summary and isinstance(summary, list):
            print("Command executed with synthesized summary.")
            synthesize_and_display_summary(summary)
        else:
            print("Command did not complete within iteration limit or returned no summary.")
181
233
 
182
234
 
183
- def test_open_reddit(npc: Any = None):
235
def execute_plonk_command(request, action_space, model, provider, npc=None, max_iterations=10, debug=False):
    """Drive an LLM-guided computer-use loop until the goal is met or the cap is hit.

    This is the active binding of ``execute_plonk_command`` (it follows, and thus
    shadows, an earlier duplicate definition in this module). Each iteration:
    capture a screenshot, optionally mark the last click position on it, ask the
    model for the next actions, execute them, and record a summary entry.

    Args:
        request: Natural-language goal the model should accomplish.
        action_space: Kept for API compatibility; the prompt describes the
            available actions inline, so this argument is not read here.
        model: Model identifier forwarded to get_llm_response.
        provider: Provider identifier forwarded to get_llm_response.
        npc: Optional NPC context object used for screenshots and LLM calls.
        max_iterations: Hard cap on screenshot -> LLM -> action cycles.
        debug: When True, print per-iteration diagnostics.

    Returns:
        List of per-iteration summary dicts with keys 'iteration',
        'last_action_feedback' and 'last_click_coords'.
    """
    system_examples = get_system_examples()

    messages = []
    last_action_feedback = "None"
    last_click_coords = None

    synthesized_summary = []

    # A for-loop over range() guarantees the iteration cap is respected even on
    # the early `continue` paths; the previous manual while-loop only
    # incremented its counter at the bottom, so a persistently failing
    # screenshot or malformed model response looped forever.
    for iteration_count in range(max_iterations):
        synthesized_info = {
            'iteration': iteration_count + 1,
            'last_action_feedback': last_action_feedback,
            'last_click_coords': last_click_coords
        }
        synthesized_summary.append(synthesized_info)

        if debug:
            print(f"Synthesized info at iteration {iteration_count + 1}: {synthesized_info}")
            print(f"Iteration {iteration_count + 1}/{max_iterations}")

        prompt_template = f"""
        Goal: {request}
        Feedback from last action: {last_action_feedback}

        Your task is to control the computer to achieve the goal.

        THOUGHT PROCESS:
        1. Analyze the screen. Is the application I need (e.g., a web browser) already open?
        2. If YES, `click` it. If NO, use `bash` to launch it. Use the examples: {system_examples}.


        CRITICAL COMPLETION RULE:
        Once the goal is visually complete on the screen, your ONLY next action is to use the 'quit' action.

        Your response MUST be a JSON object with an "actions" key.
        All clicking actions should use percentage coordinates relative
        to the screen size, as we will
        manually translate them to the proper screen size.
        your x and y values for clicks must ALWAYS be between 0 and 100.
        The x and y are (0,0) at the TOP LEFT CORNER OF THE SCREEN.
        The bottom right corner of the screen is (100,100).
        the bottom left corner is (0,100) and the top right corner is (100,0).




        ---
        EXAMPLE 1: Task "Create and save a file named 'memo.txt' with the text 'Meeting at 3pm'"
        {{
        "actions": [
        {{ "type": "bash", "command": "gedit &" }},
        {{ "type": "wait", "duration": 2 }},
        {{'type':'click', 'x': 10, 'y': 30}},
        {{ "type": "type", "text": "Meeting at 3pm" }},
        {{ "type": "hotkey", "keys": ["ctrl", "s"] }},
        {{ "type": "wait", "duration": 1 }},
        {{ "type": "type", "text": "memo.txt" }},
        {{ "type": "key", "keys": ["enter"] }},
        ]
        }}
        ---
        EXAMPLE 2: Task "Search for news about space exploration"
        {{
        "actions": [
        {{ "type": "bash", "command": "firefox &" }},
        {{ "type": "wait", "duration": 3 }},
        {{ "type": "type", "text": "news about space exploration" }},
        {{ "type": "key", "keys": ["enter"] }},
        ]
        }}

        ---

        Once a task has been verified and completed, your action list should only be
        {{
        "actions": [
        {{ "type": "quit" }}
        ]
        }}
        """

        # capture_screenshot may return None (not just a dict lacking
        # 'file_path'); guard both cases before calling .get().
        shot = capture_screenshot(npc=npc, full=True)
        screenshot_path = shot.get('file_path') if shot else None
        if not screenshot_path:
            time.sleep(2)
            continue

        image_to_send_path = screenshot_path
        if last_click_coords:
            try:
                img = Image.open(screenshot_path)
                draw = ImageDraw.Draw(img)
                width, height = img.size
                # Convert percentage coordinates (0-100) to pixel positions.
                x_pixel = int(last_click_coords['x'] * width / 100)
                y_pixel = int(last_click_coords['y'] * height / 100)

                try:
                    font = ImageFont.truetype("DejaVuSans-Bold.ttf", size=48)
                except IOError:
                    font = ImageFont.load_default()

                # The label renders the percent coords as a tuple, e.g. "+(10, 30)".
                draw.text((x_pixel - 8, y_pixel - 12),
                          f"+{last_click_coords['x'],last_click_coords['y']}",
                          fill="red",
                          font=font)

                marked_image_path = "/tmp/marked_screenshot.png"
                img.save(marked_image_path)
                image_to_send_path = marked_image_path
                print(f"Drew marker at ({x_pixel}, {y_pixel}) on new screenshot.")
            except Exception as e:
                # Marker drawing is best-effort; fall back to the raw screenshot.
                print(f"Failed to draw marker on image: {e}")

        response = get_llm_response(
            prompt=prompt_template,
            model=model,
            provider=provider,
            npc=npc,
            images=[image_to_send_path],
            messages=messages,
            format="json",
        )

        if "messages" in response:
            messages = response["messages"]

        response_data = response.get('response')

        if not isinstance(response_data, dict) or "actions" not in response_data:
            last_action_feedback = f"Invalid JSON response from model: {response_data}"
            continue

        actions_list = response_data.get("actions", [])

        if not isinstance(actions_list, list):
            last_action_feedback = "Model did not return a list in the 'actions' key."
            continue

        # Reset last click before processing the new batch of actions.
        last_click_coords = None
        for action in actions_list:
            if debug:
                print(f"Executing action: {action}")
            if action.get("type") == "quit":
                print("Task complete: Model returned 'quit' action.")
                return synthesized_summary

            result = perform_action(action)
            last_action_feedback = result.get("message") or result.get("output")

            if action.get("type") == "click":
                last_click_coords = {"x": action.get("x"), "y": action.get("y")}

            if result.get("status") == "error":
                print(f"Action failed, providing feedback to model: {last_action_feedback}")
                break
            time.sleep(1)

        if not actions_list:
            last_action_feedback = "No actions were returned. The task is likely not complete. Re-evaluating."
            print(last_action_feedback)

    return synthesized_summary
287
406
 
288
407
 
289
- # Optional: If you want to run this as a standalone script
290
408
if __name__ == "__main__":
    # Script entry point: launch the interactive plonk REPL.
    repl_loop()