khoj 1.41.1.dev40__py3-none-any.whl → 1.41.1.dev90__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. khoj/database/adapters/__init__.py +1 -1
  2. khoj/interface/compiled/404/index.html +1 -1
  3. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
  4. khoj/interface/compiled/agents/index.html +1 -1
  5. khoj/interface/compiled/agents/index.txt +1 -1
  6. khoj/interface/compiled/automations/index.html +1 -1
  7. khoj/interface/compiled/automations/index.txt +1 -1
  8. khoj/interface/compiled/chat/index.html +2 -2
  9. khoj/interface/compiled/chat/index.txt +2 -2
  10. khoj/interface/compiled/index.html +1 -1
  11. khoj/interface/compiled/index.txt +1 -1
  12. khoj/interface/compiled/search/index.html +1 -1
  13. khoj/interface/compiled/search/index.txt +1 -1
  14. khoj/interface/compiled/settings/index.html +1 -1
  15. khoj/interface/compiled/settings/index.txt +1 -1
  16. khoj/interface/compiled/share/chat/index.html +2 -2
  17. khoj/interface/compiled/share/chat/index.txt +2 -2
  18. khoj/processor/conversation/anthropic/anthropic_chat.py +5 -0
  19. khoj/processor/conversation/google/gemini_chat.py +5 -0
  20. khoj/processor/conversation/google/utils.py +4 -0
  21. khoj/processor/conversation/openai/gpt.py +5 -0
  22. khoj/processor/conversation/prompts.py +12 -1
  23. khoj/processor/conversation/utils.py +13 -1
  24. khoj/processor/operator/grounding_agent.py +345 -0
  25. khoj/processor/operator/grounding_agent_uitars.py +973 -0
  26. khoj/processor/operator/operate_browser.py +152 -0
  27. khoj/processor/operator/operator_actions.py +149 -0
  28. khoj/processor/operator/operator_agent_anthropic.py +383 -0
  29. khoj/processor/operator/operator_agent_base.py +80 -0
  30. khoj/processor/operator/operator_agent_binary.py +336 -0
  31. khoj/processor/operator/operator_agent_openai.py +349 -0
  32. khoj/processor/operator/operator_environment_base.py +37 -0
  33. khoj/processor/operator/operator_environment_browser.py +395 -0
  34. khoj/routers/api_chat.py +42 -3
  35. khoj/routers/helpers.py +14 -3
  36. khoj/routers/research.py +48 -1
  37. khoj/utils/helpers.py +17 -0
  38. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/METADATA +5 -3
  39. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/RECORD +44 -34
  40. khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
  41. /khoj/interface/compiled/_next/static/{ifuY0XkcvaIiCG3xJl8zw → WLmcH2J-wz36GlS6O8HSL}/_buildManifest.js +0 -0
  42. /khoj/interface/compiled/_next/static/{ifuY0XkcvaIiCG3xJl8zw → WLmcH2J-wz36GlS6O8HSL}/_ssgManifest.js +0 -0
  43. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/WHEEL +0 -0
  44. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/entry_points.txt +0 -0
  45. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,395 @@
1
+ import asyncio
2
+ import base64
3
+ import io
4
+ import logging
5
+ import os
6
+ from typing import Optional, Set, Union
7
+
8
+ from khoj.processor.operator.operator_actions import OperatorAction, Point
9
+ from khoj.processor.operator.operator_environment_base import (
10
+ Environment,
11
+ EnvState,
12
+ EnvStepResult,
13
+ )
14
+ from khoj.utils.helpers import convert_image_to_webp
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ try:
19
+ from playwright.async_api import Browser, Page, Playwright, async_playwright
20
+ except ImportError:
21
+ logger.debug(
22
+ "Playwright not found. To use browser operator, run 'pip install playwright' and 'playwright install' first."
23
+ )
24
+
25
+
26
+ # --- Concrete BrowserEnvironment ---
27
+ class BrowserEnvironment(Environment):
28
+ def __init__(self):
29
+ self.playwright: Optional[Playwright] = None
30
+ self.browser: Optional[Browser] = None
31
+ self.page: Optional[Page] = None
32
+ self.width: int = 1024
33
+ self.height: int = 768
34
+ self.visited_urls: Set[str] = set()
35
+ self.excluded_urls = {"about:blank", "https://duckduckgo.com", "https://www.bing.com", "https://www.google.com"}
36
+ self.navigation_history: list[str] = []
37
+ self.mouse_pos = Point(x=self.width / 2, y=self.height / 2)
38
+
39
+ async def start(self, width: int = 1024, height: int = 768) -> None:
40
+ self.width = width
41
+ self.height = height
42
+ self.playwright = await async_playwright().start()
43
+
44
+ if cdp_url := os.getenv("KHOJ_CDP_URL"):
45
+ self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
46
+ else:
47
+ launch_args = [f"--window-size={width},{height}", "--disable-extensions", "--disable-file-system"]
48
+ self.browser = await self.playwright.chromium.launch(
49
+ chromium_sandbox=True, headless=False, args=launch_args, env={}
50
+ )
51
+
52
+ # Get the initial browser, page or create one if none exist
53
+ default_context = self.browser.contexts[0] if self.browser.contexts else await self.browser.new_context()
54
+ self.page = default_context.pages[0] if default_context.pages else await default_context.new_page()
55
+
56
+ # Define a handler for page load events to capture URLs
57
+ async def handle_load(loaded_page: Page):
58
+ url = loaded_page.url
59
+ if not url:
60
+ return
61
+
62
+ if not self.navigation_history or self.navigation_history[-1] != url:
63
+ self.navigation_history.append(url)
64
+
65
+ if url not in self.excluded_urls and url not in self.visited_urls:
66
+ logger.debug(f"Page loaded: {url}")
67
+ self.visited_urls.add(url)
68
+
69
+ # Listen for load events on the main page
70
+ self.page.on("load", handle_load)
71
+
72
+ # Define a handler for new pages
73
+ async def handle_new_page(new_page: Page):
74
+ # Get the target URL of the new page
75
+ target_url = new_page.url
76
+ # Close the new page if it is not closed
77
+ if not new_page.is_closed():
78
+ await new_page.close()
79
+ # Open the target url in the current page instead
80
+ if target_url and target_url != "about:blank" and self.page:
81
+ logger.debug(f"Load {target_url} in current page instead of new tab.")
82
+ await self.page.goto(target_url)
83
+
84
+ # Listen for new pages being created in the context
85
+ default_context.on("page", handle_new_page)
86
+
87
+ # If page url is blank, navigate to DuckDuckGo
88
+ if self.page.url == "about:blank":
89
+ await self.page.goto("https://duckduckgo.com")
90
+ await self.page.set_viewport_size({"width": self.width, "height": self.height})
91
+ logger.info("Browser environment started.")
92
+
93
+ async def _get_screenshot(self) -> Optional[str]:
94
+ if not self.page or self.page.is_closed():
95
+ return None
96
+ try:
97
+ screenshot_bytes = await self.page.screenshot(caret="initial", full_page=False, type="png")
98
+ # Draw mouse position on the screenshot image
99
+ if self.mouse_pos:
100
+ screenshot_bytes = await self._draw_mouse_position(screenshot_bytes, self.mouse_pos)
101
+ screenshot_webp_bytes = convert_image_to_webp(screenshot_bytes)
102
+ return base64.b64encode(screenshot_webp_bytes).decode("utf-8")
103
+ except Exception as e:
104
+ logger.error(f"Failed to get screenshot: {e}")
105
+ return None
106
+
107
+ async def _draw_mouse_position(self, screenshot_bytes: bytes, mouse_pos: Point) -> bytes:
108
+ from PIL import Image, ImageDraw
109
+
110
+ # Load the screenshot into a PIL image
111
+ image = Image.open(io.BytesIO(screenshot_bytes))
112
+
113
+ # Draw a red circle at the mouse position
114
+ draw = ImageDraw.Draw(image)
115
+ radius = 5
116
+ draw.ellipse(
117
+ (mouse_pos.x - radius, mouse_pos.y - radius, mouse_pos.x + radius, mouse_pos.y + radius), fill="red"
118
+ )
119
+
120
+ # Save the modified image to a bytes buffer
121
+ output_buffer = io.BytesIO()
122
+ image.save(output_buffer, format="PNG")
123
+ return output_buffer.getvalue()
124
+
125
+ async def get_state(self) -> EnvState:
126
+ if not self.page or self.page.is_closed():
127
+ return EnvState(url="about:blank", screenshot=None)
128
+ url = self.page.url
129
+ screenshot = await self._get_screenshot()
130
+ return EnvState(url=url, screenshot=screenshot)
131
+
132
+ async def step(self, action: OperatorAction) -> EnvStepResult:
133
+ if not self.page or self.page.is_closed():
134
+ return EnvStepResult(error="Browser page is not available or closed.")
135
+
136
+ before_state = await self.get_state()
137
+ output: Optional[Union[str, dict]] = None
138
+ error: Optional[str] = None
139
+ step_type: str = "text"
140
+ try:
141
+ match action.type:
142
+ case "click":
143
+ x, y, button = action.x, action.y, action.button
144
+ if button == "wheel":
145
+ await self.page.mouse.wheel(x, y)
146
+ output = f"Scrolled wheel at ({x}, {y})"
147
+ else:
148
+ modifiers = self.parse_key_combination(action.modifiers) if action.modifiers else []
149
+ for modifier in modifiers:
150
+ await self.page.keyboard.down(modifier)
151
+ await self.page.mouse.click(x, y, button=button)
152
+ for modifier in reversed(modifiers):
153
+ await self.page.keyboard.up(modifier)
154
+ output = f"{button.capitalize()} clicked at ({x}, {y})"
155
+ self.mouse_pos = Point(x=x, y=y)
156
+ logger.debug(f"Action: {action.type} {button} at ({x},{y})")
157
+
158
+ case "double_click":
159
+ x, y = action.x, action.y
160
+ await self.page.mouse.dblclick(x, y)
161
+ self.mouse_pos = Point(x=x, y=y)
162
+ output = f"Double clicked at ({x}, {y})"
163
+ logger.debug(f"Action: {action.type} at ({x},{y})")
164
+
165
+ case "triple_click":
166
+ x, y = action.x, action.y
167
+ await self.page.mouse.click(x, y, click_count=3)
168
+ self.mouse_pos = Point(x=x, y=y)
169
+ output = f"Triple clicked at ({x}, {y})"
170
+ logger.debug(f"Action: {action.type} at ({x},{y})")
171
+
172
+ case "scroll":
173
+ # Prefer explicit scroll_x/y if provided (from OpenAI style)
174
+ if action.scroll_x is not None or action.scroll_y is not None:
175
+ scroll_x = action.scroll_x or 0
176
+ scroll_y = action.scroll_y or 0
177
+ if action.x is not None and action.y is not None:
178
+ await self.page.mouse.move(action.x, action.y)
179
+ self.mouse_pos = Point(x=action.x, y=action.y)
180
+ await self.page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
181
+ output = f"Scrolled by ({scroll_x}, {scroll_y})"
182
+ logger.debug(f"Action: {action.type} by ({scroll_x},{scroll_y}) at ({action.x},{action.y})")
183
+ # Otherwise use direction/amount (from Anthropic style)
184
+ elif action.scroll_direction:
185
+ scale = 40.0
186
+ dx, dy = 0.0, 0.0
187
+ amount = action.scroll_amount or 1
188
+ if action.scroll_direction == "up":
189
+ dy = -scale * amount
190
+ elif action.scroll_direction == "down":
191
+ dy = scale * amount
192
+ elif action.scroll_direction == "left":
193
+ dx = -scale * amount
194
+ elif action.scroll_direction == "right":
195
+ dx = scale * amount
196
+
197
+ if action.x is not None and action.y is not None:
198
+ await self.page.mouse.move(action.x, action.y)
199
+ self.mouse_pos = Point(x=action.x, y=action.y)
200
+ await self.page.mouse.wheel(dx, dy)
201
+ output = f"Scrolled {action.scroll_direction} by {amount}"
202
+ logger.debug(
203
+ f"Action: {action.type} {action.scroll_direction} by {amount} at ({action.x},{action.y})"
204
+ )
205
+ else:
206
+ error = "Scroll action requires either scroll_x/y or scroll_direction"
207
+
208
+ case "keypress":
209
+ keys = action.keys
210
+ if len(keys) > 1: # Handle combinations like ctrl+a
211
+ modifiers = [BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(k.lower(), k) for k in keys[:-1]]
212
+ main_key = BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(keys[-1].lower(), keys[-1])
213
+ key_string = "+".join(modifiers + [main_key])
214
+ await self.page.keyboard.press(key_string)
215
+ elif keys: # Single key
216
+ key_string = BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(keys[0].lower(), keys[0])
217
+ await self.page.keyboard.press(key_string)
218
+ else:
219
+ error = "Keypress action requires at least one key"
220
+ key_string = "N/A"
221
+ output = f"Pressed key(s): {key_string}"
222
+ logger.debug(f"Action: {action.type} '{key_string}'")
223
+
224
+ case "type":
225
+ text = action.text
226
+ await self.page.keyboard.type(text)
227
+ output = f"Typed text: {text}"
228
+ logger.debug(f"Action: {action.type} '{text}'")
229
+
230
+ case "wait":
231
+ duration = action.duration
232
+ await asyncio.sleep(duration)
233
+ output = f"Waited for {duration} seconds"
234
+ logger.debug(f"Action: {action.type} for {duration}s")
235
+
236
+ case "screenshot":
237
+ step_type = "image"
238
+ output = {"image": before_state.screenshot, "url": before_state.url}
239
+ logger.debug(f"Action: {action.type}")
240
+
241
+ case "move":
242
+ x, y = action.x, action.y
243
+ await self.page.mouse.move(x, y)
244
+ self.mouse_pos = Point(x=x, y=y)
245
+ output = f"Moved mouse to ({x}, {y})"
246
+ logger.debug(f"Action: {action.type} to ({x},{y})")
247
+
248
+ case "drag":
249
+ path = action.path
250
+ if not path:
251
+ error = "Missing path for drag action"
252
+ else:
253
+ await self.page.mouse.move(path[0].x, path[0].y)
254
+ await self.page.mouse.down()
255
+ for point in path[1:]:
256
+ await self.page.mouse.move(point.x, point.y)
257
+ await self.page.mouse.up()
258
+ self.mouse_pos = Point(x=path[-1].x, y=path[-1].y)
259
+ output = f"Drag along path starting at ({path[0].x},{path[0].y})"
260
+ logger.debug(f"Action: {action.type} with {len(path)} points")
261
+
262
+ case "mouse_down":
263
+ await self.page.mouse.down(button=action.button)
264
+ output = f"{action.button.capitalize()} mouse button down"
265
+ logger.debug(f"Action: {action.type} {action.button}")
266
+
267
+ case "mouse_up":
268
+ await self.page.mouse.up(button=action.button)
269
+ output = f"{action.button.capitalize()} mouse button up"
270
+ logger.debug(f"Action: {action.type} {action.button}")
271
+
272
+ case "hold_key":
273
+ keys_to_parse = action.text
274
+ duration = action.duration
275
+ keys = self.parse_key_combination(keys_to_parse)
276
+ for key in keys:
277
+ await self.page.keyboard.down(key)
278
+ await asyncio.sleep(duration)
279
+ for key in reversed(keys):
280
+ await self.page.keyboard.up(key)
281
+ output = f"Held key{'s' if len(keys) > 1 else ''} {keys_to_parse} for {duration} seconds"
282
+ logger.debug(f"Action: {action.type} '{keys_to_parse}' for {duration}s")
283
+
284
+ case "key_down":
285
+ key = action.key
286
+ await self.page.keyboard.down(key)
287
+ output = f"Key down: {key}"
288
+ logger.debug(f"Action: {action.type} {key}")
289
+
290
+ case "key_up":
291
+ key = action.key
292
+ await self.page.keyboard.up(key)
293
+ output = f"Key up: {key}"
294
+ logger.debug(f"Action: {action.type} {key}")
295
+
296
+ case "cursor_position":
297
+ # Playwright doesn't directly expose mouse position easily without JS injection
298
+ # Returning a placeholder for now
299
+ output = "Cursor position requested (not directly available)"
300
+ logger.debug(f"Action: {action.type}")
301
+
302
+ case "goto":
303
+ url = action.url
304
+ if not url:
305
+ error = "Missing URL for goto action"
306
+ else:
307
+ await self.page.goto(url)
308
+ output = f"Navigated to {url}"
309
+ logger.debug(f"Action: {action.type} to {url}")
310
+
311
+ case "back":
312
+ if len(self.navigation_history) > 1:
313
+ self.navigation_history.pop()
314
+ previous_url = self.navigation_history[-1]
315
+ await self.page.goto(previous_url)
316
+ output = f"Navigated back to {previous_url}"
317
+ else:
318
+ output = "No previous URL to navigate back"
319
+ previous_url = "about:blank"
320
+ logger.debug(f"Action: {action.type} to {previous_url}")
321
+
322
+ case _:
323
+ error = f"Unrecognized action type: {action.type}"
324
+ logger.warning(error)
325
+
326
+ except Exception as e:
327
+ error = f"Error executing action {action.type}: {e}"
328
+ logger.exception(f"Error during step execution for action: {action.model_dump_json()}")
329
+
330
+ after_state = await self.get_state()
331
+ return EnvStepResult(
332
+ type=step_type,
333
+ output=output,
334
+ error=error,
335
+ current_url=after_state.url,
336
+ screenshot_base64=after_state.screenshot,
337
+ )
338
+
339
+ def reset(self) -> None:
340
+ self.visited_urls.clear()
341
+
342
+ async def close(self) -> None:
343
+ if self.browser:
344
+ await self.browser.close()
345
+ logger.info("Browser closed.")
346
+ if self.playwright:
347
+ await self.playwright.stop()
348
+ logger.info("Playwright stopped.")
349
+ self.browser = None
350
+ self.playwright = None
351
+ self.page = None
352
+
353
+ # Mapping of Operator Agent keys to Playwright keys
354
+ CUA_KEY_TO_PLAYWRIGHT_KEY = {
355
+ "/": "Divide",
356
+ "\\": "Backslash",
357
+ "alt": "Alt",
358
+ "arrowdown": "ArrowDown",
359
+ "arrowleft": "ArrowLeft",
360
+ "arrowright": "ArrowRight",
361
+ "arrowup": "ArrowUp",
362
+ "backspace": "Backspace",
363
+ "capslock": "CapsLock",
364
+ "cmd": "Meta",
365
+ "ctrl": "ControlOrMeta",
366
+ "delete": "Delete",
367
+ "end": "End",
368
+ "enter": "Enter",
369
+ "return": "Enter",
370
+ "esc": "Escape",
371
+ "home": "Home",
372
+ "insert": "Insert",
373
+ "option": "Alt",
374
+ "pagedown": "PageDown",
375
+ "pageup": "PageUp",
376
+ "shift": "Shift",
377
+ "space": " ",
378
+ "super": "Meta",
379
+ "tab": "Tab",
380
+ "win": "Meta",
381
+ }
382
+
383
+ @staticmethod
384
+ def parse_key_combination(text: str) -> list[str]:
385
+ """
386
+ Parse an xdotool-style key combination (e.g., "ctrl+o", "shift+tab")
387
+ and return a list of Playwright-compatible key names.
388
+ """
389
+ if "+" in text:
390
+ keys = text.split("+")
391
+ # Map each key to its Playwright equivalent
392
+ return [BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(k.lower(), k) for k in keys]
393
+ else:
394
+ # Single key
395
+ return [BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(text.lower(), text)]
khoj/routers/api_chat.py CHANGED
@@ -31,6 +31,7 @@ from khoj.processor.conversation.utils import (
31
31
  save_to_conversation_log,
32
32
  )
33
33
  from khoj.processor.image.generate import text_to_image
34
+ from khoj.processor.operator.operate_browser import operate_browser
34
35
  from khoj.processor.speech.text_to_speech import generate_text_to_speech
35
36
  from khoj.processor.tools.online_search import (
36
37
  deduplicate_organic_results,
@@ -78,6 +79,7 @@ from khoj.utils.helpers import (
78
79
  get_country_name_from_timezone,
79
80
  get_device,
80
81
  is_none_or_empty,
82
+ is_operator_enabled,
81
83
  )
82
84
  from khoj.utils.rawconfig import (
83
85
  ChatRequestBody,
@@ -569,6 +571,8 @@ async def chat_options(
569
571
  ) -> Response:
570
572
  cmd_options = {}
571
573
  for cmd in ConversationCommand:
574
+ if cmd == ConversationCommand.Operator and not is_operator_enabled():
575
+ continue
572
576
  if cmd in command_descriptions:
573
577
  cmd_options[cmd.value] = command_descriptions[cmd]
574
578
 
@@ -882,6 +886,7 @@ async def chat(
882
886
  researched_results = ""
883
887
  online_results: Dict = dict()
884
888
  code_results: Dict = dict()
889
+ operator_results: Dict[str, str] = {}
885
890
  generated_asset_results: Dict = dict()
886
891
  ## Extract Document References
887
892
  compiled_references: List[Any] = []
@@ -956,7 +961,8 @@ async def chat(
956
961
  code_results.update(research_result.codeContext)
957
962
  if research_result.context:
958
963
  compiled_references.extend(research_result.context)
959
-
964
+ if research_result.operatorContext:
965
+ operator_results.update(research_result.operatorContext)
960
966
  researched_results += research_result.summarizedResult
961
967
 
962
968
  else:
@@ -1207,14 +1213,45 @@ async def chat(
1207
1213
  yield result[ChatEvent.STATUS]
1208
1214
  else:
1209
1215
  code_results = result
1210
- async for result in send_event(ChatEvent.STATUS, f"**Ran code snippets**: {len(code_results)}"):
1211
- yield result
1212
1216
  except ValueError as e:
1213
1217
  program_execution_context.append(f"Failed to run code")
1214
1218
  logger.warning(
1215
1219
  f"Failed to use code tool: {e}. Attempting to respond without code results",
1216
1220
  exc_info=True,
1217
1221
  )
1222
+ if ConversationCommand.Operator in conversation_commands:
1223
+ try:
1224
+ async for result in operate_browser(
1225
+ defiltered_query,
1226
+ user,
1227
+ meta_log,
1228
+ location,
1229
+ query_images=uploaded_images,
1230
+ query_files=attached_file_context,
1231
+ send_status_func=partial(send_event, ChatEvent.STATUS),
1232
+ agent=agent,
1233
+ cancellation_event=cancellation_event,
1234
+ tracer=tracer,
1235
+ ):
1236
+ if isinstance(result, dict) and ChatEvent.STATUS in result:
1237
+ yield result[ChatEvent.STATUS]
1238
+ else:
1239
+ operator_results = {result["query"]: result["result"]}
1240
+ # Add webpages visited while operating browser to references
1241
+ if result.get("webpages"):
1242
+ if not online_results.get(defiltered_query):
1243
+ online_results[defiltered_query] = {"webpages": result["webpages"]}
1244
+ elif not online_results[defiltered_query].get("webpages"):
1245
+ online_results[defiltered_query]["webpages"] = result["webpages"]
1246
+ else:
1247
+ online_results[defiltered_query]["webpages"] += result["webpages"]
1248
+ except ValueError as e:
1249
+ program_execution_context.append(f"Browser operation error: {e}")
1250
+ logger.warning(f"Failed to operate browser with {e}", exc_info=True)
1251
+ async for result in send_event(
1252
+ ChatEvent.STATUS, "Operating browser failed. I'll try respond appropriately"
1253
+ ):
1254
+ yield result
1218
1255
 
1219
1256
  ## Send Gathered References
1220
1257
  unique_online_results = deduplicate_organic_results(online_results)
@@ -1225,6 +1262,7 @@ async def chat(
1225
1262
  "context": compiled_references,
1226
1263
  "onlineContext": unique_online_results,
1227
1264
  "codeContext": code_results,
1265
+ "operatorContext": operator_results,
1228
1266
  },
1229
1267
  ):
1230
1268
  yield result
@@ -1340,6 +1378,7 @@ async def chat(
1340
1378
  compiled_references,
1341
1379
  online_results,
1342
1380
  code_results,
1381
+ operator_results,
1343
1382
  inferred_queries,
1344
1383
  conversation_commands,
1345
1384
  user,
khoj/routers/helpers.py CHANGED
@@ -113,6 +113,7 @@ from khoj.utils.helpers import (
113
113
  get_file_type,
114
114
  in_debug_mode,
115
115
  is_none_or_empty,
116
+ is_operator_enabled,
116
117
  is_valid_url,
117
118
  log_telemetry,
118
119
  mode_descriptions_for_llm,
@@ -253,6 +254,8 @@ def get_conversation_command(query: str) -> ConversationCommand:
253
254
  return ConversationCommand.Code
254
255
  elif query.startswith("/research"):
255
256
  return ConversationCommand.Research
257
+ elif query.startswith("/operator") and is_operator_enabled():
258
+ return ConversationCommand.Operator
256
259
  else:
257
260
  return ConversationCommand.Default
258
261
 
@@ -362,6 +365,8 @@ async def aget_data_sources_and_output_format(
362
365
  # Skip showing Notes tool as an option if user has no entries
363
366
  if source == ConversationCommand.Notes and not user_has_entries:
364
367
  continue
368
+ if source == ConversationCommand.Operator and not is_operator_enabled():
369
+ continue
365
370
  source_options[source.value] = description
366
371
  if len(agent_sources) == 0 or source.value in agent_sources:
367
372
  source_options_str += f'- "{source.value}": "{description}"\n'
@@ -1349,6 +1354,7 @@ async def agenerate_chat_response(
1349
1354
  compiled_references: List[Dict] = [],
1350
1355
  online_results: Dict[str, Dict] = {},
1351
1356
  code_results: Dict[str, Dict] = {},
1357
+ operator_results: Dict[str, str] = {},
1352
1358
  inferred_queries: List[str] = [],
1353
1359
  conversation_commands: List[ConversationCommand] = [ConversationCommand.Default],
1354
1360
  user: KhojUser = None,
@@ -1385,6 +1391,7 @@ async def agenerate_chat_response(
1385
1391
  compiled_references=compiled_references,
1386
1392
  online_results=online_results,
1387
1393
  code_results=code_results,
1394
+ operator_results=operator_results,
1388
1395
  inferred_queries=inferred_queries,
1389
1396
  client_application=client_application,
1390
1397
  conversation_id=conversation_id,
@@ -1404,6 +1411,7 @@ async def agenerate_chat_response(
1404
1411
  compiled_references = []
1405
1412
  online_results = {}
1406
1413
  code_results = {}
1414
+ operator_results = {}
1407
1415
  deepthought = True
1408
1416
 
1409
1417
  chat_model = await ConversationAdapters.aget_valid_chat_model(user, conversation, is_subscribed)
@@ -1446,6 +1454,7 @@ async def agenerate_chat_response(
1446
1454
  query_images=query_images,
1447
1455
  online_results=online_results,
1448
1456
  code_results=code_results,
1457
+ operator_results=operator_results,
1449
1458
  conversation_log=meta_log,
1450
1459
  model=chat_model_name,
1451
1460
  api_key=api_key,
@@ -1475,6 +1484,7 @@ async def agenerate_chat_response(
1475
1484
  query_images=query_images,
1476
1485
  online_results=online_results,
1477
1486
  code_results=code_results,
1487
+ operator_results=operator_results,
1478
1488
  conversation_log=meta_log,
1479
1489
  model=chat_model.name,
1480
1490
  api_key=api_key,
@@ -1500,9 +1510,10 @@ async def agenerate_chat_response(
1500
1510
  chat_response_generator = converse_gemini(
1501
1511
  compiled_references,
1502
1512
  query_to_run,
1503
- online_results,
1504
- code_results,
1505
- meta_log,
1513
+ online_results=online_results,
1514
+ code_results=code_results,
1515
+ operator_results=operator_results,
1516
+ conversation_log=meta_log,
1506
1517
  model=chat_model.name,
1507
1518
  api_key=api_key,
1508
1519
  api_base_url=api_base_url,
khoj/routers/research.py CHANGED
@@ -17,6 +17,7 @@ from khoj.processor.conversation.utils import (
17
17
  construct_tool_chat_history,
18
18
  load_complex_json,
19
19
  )
20
+ from khoj.processor.operator.operate_browser import operate_browser
20
21
  from khoj.processor.tools.online_search import read_webpages, search_online
21
22
  from khoj.processor.tools.run_code import run_code
22
23
  from khoj.routers.api import extract_references_and_questions
@@ -28,6 +29,7 @@ from khoj.routers.helpers import (
28
29
  from khoj.utils.helpers import (
29
30
  ConversationCommand,
30
31
  is_none_or_empty,
32
+ is_operator_enabled,
31
33
  timer,
32
34
  tool_description_for_research_llm,
33
35
  truncate_code_context,
@@ -98,6 +100,9 @@ async def apick_next_tool(
98
100
  agent_tools = agent.input_tools if agent else []
99
101
  user_has_entries = await EntryAdapters.auser_has_entries(user)
100
102
  for tool, description in tool_description_for_research_llm.items():
103
+ # Skip showing operator tool as an option if not enabled
104
+ if tool == ConversationCommand.Operator and not is_operator_enabled():
105
+ continue
101
106
  # Skip showing Notes tool as an option if user has no entries
102
107
  if tool == ConversationCommand.Notes:
103
108
  if not user_has_entries:
@@ -232,6 +237,7 @@ async def execute_information_collection(
232
237
  online_results: Dict = dict()
233
238
  code_results: Dict = dict()
234
239
  document_results: List[Dict[str, str]] = []
240
+ operator_results: Dict[str, str] = {}
235
241
  summarize_files: str = ""
236
242
  this_iteration = InformationCollectionIteration(tool=None, query=query)
237
243
 
@@ -398,6 +404,38 @@ async def execute_information_collection(
398
404
  this_iteration.warning = f"Error running code: {e}"
399
405
  logger.warning(this_iteration.warning, exc_info=True)
400
406
 
407
+ elif this_iteration.tool == ConversationCommand.Operator:
408
+ try:
409
+ async for result in operate_browser(
410
+ this_iteration.query,
411
+ user,
412
+ construct_tool_chat_history(previous_iterations, ConversationCommand.Operator),
413
+ location,
414
+ send_status_func,
415
+ query_images=query_images,
416
+ agent=agent,
417
+ query_files=query_files,
418
+ cancellation_event=cancellation_event,
419
+ tracer=tracer,
420
+ ):
421
+ if isinstance(result, dict) and ChatEvent.STATUS in result:
422
+ yield result[ChatEvent.STATUS]
423
+ else:
424
+ operator_results = {result["query"]: result["result"]}
425
+ this_iteration.operatorContext = operator_results
426
+ # Add webpages visited while operating browser to references
427
+ if result.get("webpages"):
428
+ if not online_results.get(this_iteration.query):
429
+ online_results[this_iteration.query] = {"webpages": result["webpages"]}
430
+ elif not online_results[this_iteration.query].get("webpages"):
431
+ online_results[this_iteration.query]["webpages"] = result["webpages"]
432
+ else:
433
+ online_results[this_iteration.query]["webpages"] += result["webpages"]
434
+ this_iteration.onlineContext = online_results
435
+ except Exception as e:
436
+ this_iteration.warning = f"Error operating browser: {e}"
437
+ logger.error(this_iteration.warning, exc_info=True)
438
+
401
439
  elif this_iteration.tool == ConversationCommand.Summarize:
402
440
  try:
403
441
  async for result in generate_summary_from_files(
@@ -424,7 +462,14 @@ async def execute_information_collection(
424
462
 
425
463
  current_iteration += 1
426
464
 
427
- if document_results or online_results or code_results or summarize_files or this_iteration.warning:
465
+ if (
466
+ document_results
467
+ or online_results
468
+ or code_results
469
+ or operator_results
470
+ or summarize_files
471
+ or this_iteration.warning
472
+ ):
428
473
  results_data = f"\n<iteration>{current_iteration}\n<tool>{this_iteration.tool}</tool>\n<query>{this_iteration.query}</query>\n<results>"
429
474
  if document_results:
430
475
  results_data += f"\n<document_references>\n{yaml.dump(document_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</document_references>"
@@ -432,6 +477,8 @@ async def execute_information_collection(
432
477
  results_data += f"\n<online_results>\n{yaml.dump(online_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</online_results>"
433
478
  if code_results:
434
479
  results_data += f"\n<code_results>\n{yaml.dump(truncate_code_context(code_results), allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</code_results>"
480
+ if operator_results:
481
+ results_data += f"\n<browser_operator_results>\n{next(iter(operator_results.values()))}\n</browser_operator_results>"
435
482
  if summarize_files:
436
483
  results_data += f"\n<summarized_files>\n{yaml.dump(summarize_files, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</summarized_files>"
437
484
  if this_iteration.warning: