khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev97__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. khoj/database/adapters/__init__.py +17 -6
  2. khoj/interface/compiled/404/index.html +2 -2
  3. khoj/interface/compiled/_next/static/chunks/{2327-f03b2a77f67b8f8c.js → 2327-aa22697ed9c8d54a.js} +1 -1
  4. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
  5. khoj/interface/compiled/_next/static/chunks/{8515-010dd769c584b672.js → 8515-f305779d95dd5780.js} +1 -1
  6. khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +1 -0
  8. khoj/interface/compiled/_next/static/chunks/app/chat/{page-14ac9d1ad5cb84c5.js → page-7e780dc11eb5e5d3.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/{webpack-1169ca6e9e7e6247.js → webpack-21f76f7f59582bc7.js} +1 -1
  10. khoj/interface/compiled/agents/index.html +2 -2
  11. khoj/interface/compiled/agents/index.txt +2 -2
  12. khoj/interface/compiled/automations/index.html +2 -2
  13. khoj/interface/compiled/automations/index.txt +2 -2
  14. khoj/interface/compiled/chat/index.html +2 -2
  15. khoj/interface/compiled/chat/index.txt +2 -2
  16. khoj/interface/compiled/index.html +2 -2
  17. khoj/interface/compiled/index.txt +2 -2
  18. khoj/interface/compiled/search/index.html +2 -2
  19. khoj/interface/compiled/search/index.txt +2 -2
  20. khoj/interface/compiled/settings/index.html +2 -2
  21. khoj/interface/compiled/settings/index.txt +2 -2
  22. khoj/interface/compiled/share/chat/index.html +2 -2
  23. khoj/interface/compiled/share/chat/index.txt +2 -2
  24. khoj/processor/conversation/anthropic/anthropic_chat.py +7 -2
  25. khoj/processor/conversation/anthropic/utils.py +37 -19
  26. khoj/processor/conversation/google/gemini_chat.py +7 -2
  27. khoj/processor/conversation/offline/chat_model.py +2 -2
  28. khoj/processor/conversation/openai/gpt.py +7 -2
  29. khoj/processor/conversation/prompts.py +13 -2
  30. khoj/processor/conversation/utils.py +34 -6
  31. khoj/processor/operator/grounding_agent.py +345 -0
  32. khoj/processor/operator/grounding_agent_uitars.py +973 -0
  33. khoj/processor/operator/operate_browser.py +165 -0
  34. khoj/processor/operator/operator_actions.py +149 -0
  35. khoj/processor/operator/operator_agent_anthropic.py +402 -0
  36. khoj/processor/operator/operator_agent_base.py +80 -0
  37. khoj/processor/operator/operator_agent_binary.py +336 -0
  38. khoj/processor/operator/operator_agent_openai.py +349 -0
  39. khoj/processor/operator/operator_environment_base.py +37 -0
  40. khoj/processor/operator/operator_environment_browser.py +395 -0
  41. khoj/routers/api_chat.py +44 -6
  42. khoj/routers/helpers.py +18 -8
  43. khoj/routers/research.py +48 -1
  44. khoj/utils/constants.py +6 -0
  45. khoj/utils/helpers.py +17 -0
  46. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/METADATA +4 -2
  47. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/RECORD +52 -42
  48. khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
  49. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +0 -1
  50. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +0 -1
  51. /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → o6zlo73DbD2lS92jWHS8o}/_buildManifest.js +0 -0
  52. /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → o6zlo73DbD2lS92jWHS8o}/_ssgManifest.js +0 -0
  53. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/WHEEL +0 -0
  54. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/entry_points.txt +0 -0
  55. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,395 @@
1
+ import asyncio
2
+ import base64
3
+ import io
4
+ import logging
5
+ import os
6
+ from typing import Optional, Set, Union
7
+
8
+ from khoj.processor.operator.operator_actions import OperatorAction, Point
9
+ from khoj.processor.operator.operator_environment_base import (
10
+ Environment,
11
+ EnvState,
12
+ EnvStepResult,
13
+ )
14
+ from khoj.utils.helpers import convert_image_to_webp
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ try:
19
+ from playwright.async_api import Browser, Page, Playwright, async_playwright
20
+ except ImportError:
21
+ logger.debug(
22
+ "Playwright not found. To use browser operator, run 'pip install playwright' and 'playwright install' first."
23
+ )
24
+
25
+
26
+ # --- Concrete BrowserEnvironment ---
27
+ class BrowserEnvironment(Environment):
28
+ def __init__(self):
29
+ self.playwright: Optional[Playwright] = None
30
+ self.browser: Optional[Browser] = None
31
+ self.page: Optional[Page] = None
32
+ self.width: int = 1024
33
+ self.height: int = 768
34
+ self.visited_urls: Set[str] = set()
35
+ self.excluded_urls = {"about:blank", "https://duckduckgo.com", "https://www.bing.com", "https://www.google.com"}
36
+ self.navigation_history: list[str] = []
37
+ self.mouse_pos = Point(x=self.width / 2, y=self.height / 2)
38
+
39
+ async def start(self, width: int = 1024, height: int = 768) -> None:
40
+ self.width = width
41
+ self.height = height
42
+ self.playwright = await async_playwright().start()
43
+
44
+ if cdp_url := os.getenv("KHOJ_CDP_URL"):
45
+ self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
46
+ else:
47
+ launch_args = [f"--window-size={width},{height}", "--disable-extensions", "--disable-file-system"]
48
+ self.browser = await self.playwright.chromium.launch(
49
+ chromium_sandbox=True, headless=False, args=launch_args, env={}
50
+ )
51
+
52
+ # Get the initial browser, page or create one if none exist
53
+ default_context = self.browser.contexts[0] if self.browser.contexts else await self.browser.new_context()
54
+ self.page = default_context.pages[0] if default_context.pages else await default_context.new_page()
55
+
56
+ # Define a handler for page load events to capture URLs
57
+ async def handle_load(loaded_page: Page):
58
+ url = loaded_page.url
59
+ if not url:
60
+ return
61
+
62
+ if not self.navigation_history or self.navigation_history[-1] != url:
63
+ self.navigation_history.append(url)
64
+
65
+ if url not in self.excluded_urls and url not in self.visited_urls:
66
+ logger.debug(f"Page loaded: {url}")
67
+ self.visited_urls.add(url)
68
+
69
+ # Listen for load events on the main page
70
+ self.page.on("load", handle_load)
71
+
72
+ # Define a handler for new pages
73
+ async def handle_new_page(new_page: Page):
74
+ # Get the target URL of the new page
75
+ target_url = new_page.url
76
+ # Close the new page if it is not closed
77
+ if not new_page.is_closed():
78
+ await new_page.close()
79
+ # Open the target url in the current page instead
80
+ if target_url and target_url != "about:blank" and self.page:
81
+ logger.debug(f"Load {target_url} in current page instead of new tab.")
82
+ await self.page.goto(target_url)
83
+
84
+ # Listen for new pages being created in the context
85
+ default_context.on("page", handle_new_page)
86
+
87
+ # If page url is blank, navigate to DuckDuckGo
88
+ if self.page.url == "about:blank":
89
+ await self.page.goto("https://duckduckgo.com")
90
+ await self.page.set_viewport_size({"width": self.width, "height": self.height})
91
+ logger.info("Browser environment started.")
92
+
93
+ async def _get_screenshot(self) -> Optional[str]:
94
+ if not self.page or self.page.is_closed():
95
+ return None
96
+ try:
97
+ screenshot_bytes = await self.page.screenshot(caret="initial", full_page=False, type="png")
98
+ # Draw mouse position on the screenshot image
99
+ if self.mouse_pos:
100
+ screenshot_bytes = await self._draw_mouse_position(screenshot_bytes, self.mouse_pos)
101
+ screenshot_webp_bytes = convert_image_to_webp(screenshot_bytes)
102
+ return base64.b64encode(screenshot_webp_bytes).decode("utf-8")
103
+ except Exception as e:
104
+ logger.error(f"Failed to get screenshot: {e}")
105
+ return None
106
+
107
+ async def _draw_mouse_position(self, screenshot_bytes: bytes, mouse_pos: Point) -> bytes:
108
+ from PIL import Image, ImageDraw
109
+
110
+ # Load the screenshot into a PIL image
111
+ image = Image.open(io.BytesIO(screenshot_bytes))
112
+
113
+ # Draw a red circle at the mouse position
114
+ draw = ImageDraw.Draw(image)
115
+ radius = 5
116
+ draw.ellipse(
117
+ (mouse_pos.x - radius, mouse_pos.y - radius, mouse_pos.x + radius, mouse_pos.y + radius), fill="red"
118
+ )
119
+
120
+ # Save the modified image to a bytes buffer
121
+ output_buffer = io.BytesIO()
122
+ image.save(output_buffer, format="PNG")
123
+ return output_buffer.getvalue()
124
+
125
+ async def get_state(self) -> EnvState:
126
+ if not self.page or self.page.is_closed():
127
+ return EnvState(url="about:blank", screenshot=None)
128
+ url = self.page.url
129
+ screenshot = await self._get_screenshot()
130
+ return EnvState(url=url, screenshot=screenshot)
131
+
132
+ async def step(self, action: OperatorAction) -> EnvStepResult:
133
+ if not self.page or self.page.is_closed():
134
+ return EnvStepResult(error="Browser page is not available or closed.")
135
+
136
+ before_state = await self.get_state()
137
+ output: Optional[Union[str, dict]] = None
138
+ error: Optional[str] = None
139
+ step_type: str = "text"
140
+ try:
141
+ match action.type:
142
+ case "click":
143
+ x, y, button = action.x, action.y, action.button
144
+ if button == "wheel":
145
+ await self.page.mouse.wheel(x, y)
146
+ output = f"Scrolled wheel at ({x}, {y})"
147
+ else:
148
+ modifiers = self.parse_key_combination(action.modifiers) if action.modifiers else []
149
+ for modifier in modifiers:
150
+ await self.page.keyboard.down(modifier)
151
+ await self.page.mouse.click(x, y, button=button)
152
+ for modifier in reversed(modifiers):
153
+ await self.page.keyboard.up(modifier)
154
+ output = f"{button.capitalize()} clicked at ({x}, {y})"
155
+ self.mouse_pos = Point(x=x, y=y)
156
+ logger.debug(f"Action: {action.type} {button} at ({x},{y})")
157
+
158
+ case "double_click":
159
+ x, y = action.x, action.y
160
+ await self.page.mouse.dblclick(x, y)
161
+ self.mouse_pos = Point(x=x, y=y)
162
+ output = f"Double clicked at ({x}, {y})"
163
+ logger.debug(f"Action: {action.type} at ({x},{y})")
164
+
165
+ case "triple_click":
166
+ x, y = action.x, action.y
167
+ await self.page.mouse.click(x, y, click_count=3)
168
+ self.mouse_pos = Point(x=x, y=y)
169
+ output = f"Triple clicked at ({x}, {y})"
170
+ logger.debug(f"Action: {action.type} at ({x},{y})")
171
+
172
+ case "scroll":
173
+ # Prefer explicit scroll_x/y if provided (from OpenAI style)
174
+ if action.scroll_x is not None or action.scroll_y is not None:
175
+ scroll_x = action.scroll_x or 0
176
+ scroll_y = action.scroll_y or 0
177
+ if action.x is not None and action.y is not None:
178
+ await self.page.mouse.move(action.x, action.y)
179
+ self.mouse_pos = Point(x=action.x, y=action.y)
180
+ await self.page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
181
+ output = f"Scrolled by ({scroll_x}, {scroll_y})"
182
+ logger.debug(f"Action: {action.type} by ({scroll_x},{scroll_y}) at ({action.x},{action.y})")
183
+ # Otherwise use direction/amount (from Anthropic style)
184
+ elif action.scroll_direction:
185
+ scale = 40.0
186
+ dx, dy = 0.0, 0.0
187
+ amount = action.scroll_amount or 1
188
+ if action.scroll_direction == "up":
189
+ dy = -scale * amount
190
+ elif action.scroll_direction == "down":
191
+ dy = scale * amount
192
+ elif action.scroll_direction == "left":
193
+ dx = -scale * amount
194
+ elif action.scroll_direction == "right":
195
+ dx = scale * amount
196
+
197
+ if action.x is not None and action.y is not None:
198
+ await self.page.mouse.move(action.x, action.y)
199
+ self.mouse_pos = Point(x=action.x, y=action.y)
200
+ await self.page.mouse.wheel(dx, dy)
201
+ output = f"Scrolled {action.scroll_direction} by {amount}"
202
+ logger.debug(
203
+ f"Action: {action.type} {action.scroll_direction} by {amount} at ({action.x},{action.y})"
204
+ )
205
+ else:
206
+ error = "Scroll action requires either scroll_x/y or scroll_direction"
207
+
208
+ case "keypress":
209
+ keys = action.keys
210
+ if len(keys) > 1: # Handle combinations like ctrl+a
211
+ modifiers = [BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(k.lower(), k) for k in keys[:-1]]
212
+ main_key = BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(keys[-1].lower(), keys[-1])
213
+ key_string = "+".join(modifiers + [main_key])
214
+ await self.page.keyboard.press(key_string)
215
+ elif keys: # Single key
216
+ key_string = BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(keys[0].lower(), keys[0])
217
+ await self.page.keyboard.press(key_string)
218
+ else:
219
+ error = "Keypress action requires at least one key"
220
+ key_string = "N/A"
221
+ output = f"Pressed key(s): {key_string}"
222
+ logger.debug(f"Action: {action.type} '{key_string}'")
223
+
224
+ case "type":
225
+ text = action.text
226
+ await self.page.keyboard.type(text)
227
+ output = f"Typed text: {text}"
228
+ logger.debug(f"Action: {action.type} '{text}'")
229
+
230
+ case "wait":
231
+ duration = action.duration
232
+ await asyncio.sleep(duration)
233
+ output = f"Waited for {duration} seconds"
234
+ logger.debug(f"Action: {action.type} for {duration}s")
235
+
236
+ case "screenshot":
237
+ step_type = "image"
238
+ output = {"image": before_state.screenshot, "url": before_state.url}
239
+ logger.debug(f"Action: {action.type}")
240
+
241
+ case "move":
242
+ x, y = action.x, action.y
243
+ await self.page.mouse.move(x, y)
244
+ self.mouse_pos = Point(x=x, y=y)
245
+ output = f"Moved mouse to ({x}, {y})"
246
+ logger.debug(f"Action: {action.type} to ({x},{y})")
247
+
248
+ case "drag":
249
+ path = action.path
250
+ if not path:
251
+ error = "Missing path for drag action"
252
+ else:
253
+ await self.page.mouse.move(path[0].x, path[0].y)
254
+ await self.page.mouse.down()
255
+ for point in path[1:]:
256
+ await self.page.mouse.move(point.x, point.y)
257
+ await self.page.mouse.up()
258
+ self.mouse_pos = Point(x=path[-1].x, y=path[-1].y)
259
+ output = f"Drag along path starting at ({path[0].x},{path[0].y})"
260
+ logger.debug(f"Action: {action.type} with {len(path)} points")
261
+
262
+ case "mouse_down":
263
+ await self.page.mouse.down(button=action.button)
264
+ output = f"{action.button.capitalize()} mouse button down"
265
+ logger.debug(f"Action: {action.type} {action.button}")
266
+
267
+ case "mouse_up":
268
+ await self.page.mouse.up(button=action.button)
269
+ output = f"{action.button.capitalize()} mouse button up"
270
+ logger.debug(f"Action: {action.type} {action.button}")
271
+
272
+ case "hold_key":
273
+ keys_to_parse = action.text
274
+ duration = action.duration
275
+ keys = self.parse_key_combination(keys_to_parse)
276
+ for key in keys:
277
+ await self.page.keyboard.down(key)
278
+ await asyncio.sleep(duration)
279
+ for key in reversed(keys):
280
+ await self.page.keyboard.up(key)
281
+ output = f"Held key{'s' if len(keys) > 1 else ''} {keys_to_parse} for {duration} seconds"
282
+ logger.debug(f"Action: {action.type} '{keys_to_parse}' for {duration}s")
283
+
284
+ case "key_down":
285
+ key = action.key
286
+ await self.page.keyboard.down(key)
287
+ output = f"Key down: {key}"
288
+ logger.debug(f"Action: {action.type} {key}")
289
+
290
+ case "key_up":
291
+ key = action.key
292
+ await self.page.keyboard.up(key)
293
+ output = f"Key up: {key}"
294
+ logger.debug(f"Action: {action.type} {key}")
295
+
296
+ case "cursor_position":
297
+ # Playwright doesn't directly expose mouse position easily without JS injection
298
+ # Returning a placeholder for now
299
+ output = "Cursor position requested (not directly available)"
300
+ logger.debug(f"Action: {action.type}")
301
+
302
+ case "goto":
303
+ url = action.url
304
+ if not url:
305
+ error = "Missing URL for goto action"
306
+ else:
307
+ await self.page.goto(url)
308
+ output = f"Navigated to {url}"
309
+ logger.debug(f"Action: {action.type} to {url}")
310
+
311
+ case "back":
312
+ if len(self.navigation_history) > 1:
313
+ self.navigation_history.pop()
314
+ previous_url = self.navigation_history[-1]
315
+ await self.page.goto(previous_url)
316
+ output = f"Navigated back to {previous_url}"
317
+ else:
318
+ output = "No previous URL to navigate back"
319
+ previous_url = "about:blank"
320
+ logger.debug(f"Action: {action.type} to {previous_url}")
321
+
322
+ case _:
323
+ error = f"Unrecognized action type: {action.type}"
324
+ logger.warning(error)
325
+
326
+ except Exception as e:
327
+ error = f"Error executing action {action.type}: {e}"
328
+ logger.exception(f"Error during step execution for action: {action.model_dump_json()}")
329
+
330
+ after_state = await self.get_state()
331
+ return EnvStepResult(
332
+ type=step_type,
333
+ output=output,
334
+ error=error,
335
+ current_url=after_state.url,
336
+ screenshot_base64=after_state.screenshot,
337
+ )
338
+
339
+ def reset(self) -> None:
340
+ self.visited_urls.clear()
341
+
342
+ async def close(self) -> None:
343
+ if self.browser:
344
+ await self.browser.close()
345
+ logger.info("Browser closed.")
346
+ if self.playwright:
347
+ await self.playwright.stop()
348
+ logger.info("Playwright stopped.")
349
+ self.browser = None
350
+ self.playwright = None
351
+ self.page = None
352
+
353
+ # Mapping of Operator Agent keys to Playwright keys
354
+ CUA_KEY_TO_PLAYWRIGHT_KEY = {
355
+ "/": "Divide",
356
+ "\\": "Backslash",
357
+ "alt": "Alt",
358
+ "arrowdown": "ArrowDown",
359
+ "arrowleft": "ArrowLeft",
360
+ "arrowright": "ArrowRight",
361
+ "arrowup": "ArrowUp",
362
+ "backspace": "Backspace",
363
+ "capslock": "CapsLock",
364
+ "cmd": "Meta",
365
+ "ctrl": "ControlOrMeta",
366
+ "delete": "Delete",
367
+ "end": "End",
368
+ "enter": "Enter",
369
+ "return": "Enter",
370
+ "esc": "Escape",
371
+ "home": "Home",
372
+ "insert": "Insert",
373
+ "option": "Alt",
374
+ "pagedown": "PageDown",
375
+ "pageup": "PageUp",
376
+ "shift": "Shift",
377
+ "space": " ",
378
+ "super": "Meta",
379
+ "tab": "Tab",
380
+ "win": "Meta",
381
+ }
382
+
383
+ @staticmethod
384
+ def parse_key_combination(text: str) -> list[str]:
385
+ """
386
+ Parse an xdotool-style key combination (e.g., "ctrl+o", "shift+tab")
387
+ and return a list of Playwright-compatible key names.
388
+ """
389
+ if "+" in text:
390
+ keys = text.split("+")
391
+ # Map each key to its Playwright equivalent
392
+ return [BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(k.lower(), k) for k in keys]
393
+ else:
394
+ # Single key
395
+ return [BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(text.lower(), text)]
khoj/routers/api_chat.py CHANGED
@@ -31,6 +31,7 @@ from khoj.processor.conversation.utils import (
31
31
  save_to_conversation_log,
32
32
  )
33
33
  from khoj.processor.image.generate import text_to_image
34
+ from khoj.processor.operator.operate_browser import operate_browser
34
35
  from khoj.processor.speech.text_to_speech import generate_text_to_speech
35
36
  from khoj.processor.tools.online_search import (
36
37
  deduplicate_organic_results,
@@ -78,6 +79,7 @@ from khoj.utils.helpers import (
78
79
  get_country_name_from_timezone,
79
80
  get_device,
80
81
  is_none_or_empty,
82
+ is_operator_enabled,
81
83
  )
82
84
  from khoj.utils.rawconfig import (
83
85
  ChatRequestBody,
@@ -569,6 +571,8 @@ async def chat_options(
569
571
  ) -> Response:
570
572
  cmd_options = {}
571
573
  for cmd in ConversationCommand:
574
+ if cmd == ConversationCommand.Operator and not is_operator_enabled():
575
+ continue
572
576
  if cmd in command_descriptions:
573
577
  cmd_options[cmd.value] = command_descriptions[cmd]
574
578
 
@@ -856,9 +860,9 @@ async def chat(
856
860
  async for result in send_llm_response(f"Conversation {conversation_id} not found", tracer.get("usage")):
857
861
  yield result
858
862
  return
859
- conversation_id = conversation.id
863
+ conversation_id = str(conversation.id)
860
864
 
861
- async for event in send_event(ChatEvent.METADATA, {"conversationId": str(conversation_id), "turnId": turn_id}):
865
+ async for event in send_event(ChatEvent.METADATA, {"conversationId": conversation_id, "turnId": turn_id}):
862
866
  yield event
863
867
 
864
868
  agent: Agent | None = None
@@ -882,6 +886,7 @@ async def chat(
882
886
  researched_results = ""
883
887
  online_results: Dict = dict()
884
888
  code_results: Dict = dict()
889
+ operator_results: Dict[str, str] = {}
885
890
  generated_asset_results: Dict = dict()
886
891
  ## Extract Document References
887
892
  compiled_references: List[Any] = []
@@ -956,7 +961,8 @@ async def chat(
956
961
  code_results.update(research_result.codeContext)
957
962
  if research_result.context:
958
963
  compiled_references.extend(research_result.context)
959
-
964
+ if research_result.operatorContext:
965
+ operator_results.update(research_result.operatorContext)
960
966
  researched_results += research_result.summarizedResult
961
967
 
962
968
  else:
@@ -1207,14 +1213,45 @@ async def chat(
1207
1213
  yield result[ChatEvent.STATUS]
1208
1214
  else:
1209
1215
  code_results = result
1210
- async for result in send_event(ChatEvent.STATUS, f"**Ran code snippets**: {len(code_results)}"):
1211
- yield result
1212
1216
  except ValueError as e:
1213
1217
  program_execution_context.append(f"Failed to run code")
1214
1218
  logger.warning(
1215
1219
  f"Failed to use code tool: {e}. Attempting to respond without code results",
1216
1220
  exc_info=True,
1217
1221
  )
1222
+ if ConversationCommand.Operator in conversation_commands:
1223
+ try:
1224
+ async for result in operate_browser(
1225
+ defiltered_query,
1226
+ user,
1227
+ meta_log,
1228
+ location,
1229
+ query_images=uploaded_images,
1230
+ query_files=attached_file_context,
1231
+ send_status_func=partial(send_event, ChatEvent.STATUS),
1232
+ agent=agent,
1233
+ cancellation_event=cancellation_event,
1234
+ tracer=tracer,
1235
+ ):
1236
+ if isinstance(result, dict) and ChatEvent.STATUS in result:
1237
+ yield result[ChatEvent.STATUS]
1238
+ else:
1239
+ operator_results = {result["query"]: result["result"]}
1240
+ # Add webpages visited while operating browser to references
1241
+ if result.get("webpages"):
1242
+ if not online_results.get(defiltered_query):
1243
+ online_results[defiltered_query] = {"webpages": result["webpages"]}
1244
+ elif not online_results[defiltered_query].get("webpages"):
1245
+ online_results[defiltered_query]["webpages"] = result["webpages"]
1246
+ else:
1247
+ online_results[defiltered_query]["webpages"] += result["webpages"]
1248
+ except ValueError as e:
1249
+ program_execution_context.append(f"Browser operation error: {e}")
1250
+ logger.warning(f"Failed to operate browser with {e}", exc_info=True)
1251
+ async for result in send_event(
1252
+ ChatEvent.STATUS, "Operating browser failed. I'll try respond appropriately"
1253
+ ):
1254
+ yield result
1218
1255
 
1219
1256
  ## Send Gathered References
1220
1257
  unique_online_results = deduplicate_organic_results(online_results)
@@ -1225,6 +1262,7 @@ async def chat(
1225
1262
  "context": compiled_references,
1226
1263
  "onlineContext": unique_online_results,
1227
1264
  "codeContext": code_results,
1265
+ "operatorContext": operator_results,
1228
1266
  },
1229
1267
  ):
1230
1268
  yield result
@@ -1340,11 +1378,11 @@ async def chat(
1340
1378
  compiled_references,
1341
1379
  online_results,
1342
1380
  code_results,
1381
+ operator_results,
1343
1382
  inferred_queries,
1344
1383
  conversation_commands,
1345
1384
  user,
1346
1385
  request.user.client_app,
1347
- conversation_id,
1348
1386
  location,
1349
1387
  user_name,
1350
1388
  researched_results,
khoj/routers/helpers.py CHANGED
@@ -113,6 +113,7 @@ from khoj.utils.helpers import (
113
113
  get_file_type,
114
114
  in_debug_mode,
115
115
  is_none_or_empty,
116
+ is_operator_enabled,
116
117
  is_valid_url,
117
118
  log_telemetry,
118
119
  mode_descriptions_for_llm,
@@ -253,6 +254,8 @@ def get_conversation_command(query: str) -> ConversationCommand:
253
254
  return ConversationCommand.Code
254
255
  elif query.startswith("/research"):
255
256
  return ConversationCommand.Research
257
+ elif query.startswith("/operator") and is_operator_enabled():
258
+ return ConversationCommand.Operator
256
259
  else:
257
260
  return ConversationCommand.Default
258
261
 
@@ -362,6 +365,8 @@ async def aget_data_sources_and_output_format(
362
365
  # Skip showing Notes tool as an option if user has no entries
363
366
  if source == ConversationCommand.Notes and not user_has_entries:
364
367
  continue
368
+ if source == ConversationCommand.Operator and not is_operator_enabled():
369
+ continue
365
370
  source_options[source.value] = description
366
371
  if len(agent_sources) == 0 or source.value in agent_sources:
367
372
  source_options_str += f'- "{source.value}": "{description}"\n'
@@ -1349,11 +1354,11 @@ async def agenerate_chat_response(
1349
1354
  compiled_references: List[Dict] = [],
1350
1355
  online_results: Dict[str, Dict] = {},
1351
1356
  code_results: Dict[str, Dict] = {},
1357
+ operator_results: Dict[str, str] = {},
1352
1358
  inferred_queries: List[str] = [],
1353
1359
  conversation_commands: List[ConversationCommand] = [ConversationCommand.Default],
1354
1360
  user: KhojUser = None,
1355
1361
  client_application: ClientApplication = None,
1356
- conversation_id: str = None,
1357
1362
  location_data: LocationData = None,
1358
1363
  user_name: Optional[str] = None,
1359
1364
  meta_research: str = "",
@@ -1385,9 +1390,10 @@ async def agenerate_chat_response(
1385
1390
  compiled_references=compiled_references,
1386
1391
  online_results=online_results,
1387
1392
  code_results=code_results,
1393
+ operator_results=operator_results,
1388
1394
  inferred_queries=inferred_queries,
1389
1395
  client_application=client_application,
1390
- conversation_id=conversation_id,
1396
+ conversation_id=str(conversation.id),
1391
1397
  query_images=query_images,
1392
1398
  train_of_thought=train_of_thought,
1393
1399
  raw_query_files=raw_query_files,
@@ -1404,6 +1410,7 @@ async def agenerate_chat_response(
1404
1410
  compiled_references = []
1405
1411
  online_results = {}
1406
1412
  code_results = {}
1413
+ operator_results = {}
1407
1414
  deepthought = True
1408
1415
 
1409
1416
  chat_model = await ConversationAdapters.aget_valid_chat_model(user, conversation, is_subscribed)
@@ -1441,11 +1448,12 @@ async def agenerate_chat_response(
1441
1448
  api_key = openai_chat_config.api_key
1442
1449
  chat_model_name = chat_model.name
1443
1450
  chat_response_generator = converse_openai(
1444
- compiled_references,
1445
1451
  query_to_run,
1452
+ compiled_references,
1446
1453
  query_images=query_images,
1447
1454
  online_results=online_results,
1448
1455
  code_results=code_results,
1456
+ operator_results=operator_results,
1449
1457
  conversation_log=meta_log,
1450
1458
  model=chat_model_name,
1451
1459
  api_key=api_key,
@@ -1470,11 +1478,12 @@ async def agenerate_chat_response(
1470
1478
  api_key = chat_model.ai_model_api.api_key
1471
1479
  api_base_url = chat_model.ai_model_api.api_base_url
1472
1480
  chat_response_generator = converse_anthropic(
1473
- compiled_references,
1474
1481
  query_to_run,
1482
+ compiled_references,
1475
1483
  query_images=query_images,
1476
1484
  online_results=online_results,
1477
1485
  code_results=code_results,
1486
+ operator_results=operator_results,
1478
1487
  conversation_log=meta_log,
1479
1488
  model=chat_model.name,
1480
1489
  api_key=api_key,
@@ -1498,11 +1507,12 @@ async def agenerate_chat_response(
1498
1507
  api_key = chat_model.ai_model_api.api_key
1499
1508
  api_base_url = chat_model.ai_model_api.api_base_url
1500
1509
  chat_response_generator = converse_gemini(
1501
- compiled_references,
1502
1510
  query_to_run,
1503
- online_results,
1504
- code_results,
1505
- meta_log,
1511
+ compiled_references,
1512
+ online_results=online_results,
1513
+ code_results=code_results,
1514
+ operator_results=operator_results,
1515
+ conversation_log=meta_log,
1506
1516
  model=chat_model.name,
1507
1517
  api_key=api_key,
1508
1518
  api_base_url=api_base_url,