khoj 1.41.1.dev40__py3-none-any.whl → 1.41.1.dev90__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/database/adapters/__init__.py +1 -1
- khoj/interface/compiled/404/index.html +1 -1
- khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
- khoj/interface/compiled/agents/index.html +1 -1
- khoj/interface/compiled/agents/index.txt +1 -1
- khoj/interface/compiled/automations/index.html +1 -1
- khoj/interface/compiled/automations/index.txt +1 -1
- khoj/interface/compiled/chat/index.html +2 -2
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/index.html +1 -1
- khoj/interface/compiled/index.txt +1 -1
- khoj/interface/compiled/search/index.html +1 -1
- khoj/interface/compiled/search/index.txt +1 -1
- khoj/interface/compiled/settings/index.html +1 -1
- khoj/interface/compiled/settings/index.txt +1 -1
- khoj/interface/compiled/share/chat/index.html +2 -2
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/processor/conversation/anthropic/anthropic_chat.py +5 -0
- khoj/processor/conversation/google/gemini_chat.py +5 -0
- khoj/processor/conversation/google/utils.py +4 -0
- khoj/processor/conversation/openai/gpt.py +5 -0
- khoj/processor/conversation/prompts.py +12 -1
- khoj/processor/conversation/utils.py +13 -1
- khoj/processor/operator/grounding_agent.py +345 -0
- khoj/processor/operator/grounding_agent_uitars.py +973 -0
- khoj/processor/operator/operate_browser.py +152 -0
- khoj/processor/operator/operator_actions.py +149 -0
- khoj/processor/operator/operator_agent_anthropic.py +383 -0
- khoj/processor/operator/operator_agent_base.py +80 -0
- khoj/processor/operator/operator_agent_binary.py +336 -0
- khoj/processor/operator/operator_agent_openai.py +349 -0
- khoj/processor/operator/operator_environment_base.py +37 -0
- khoj/processor/operator/operator_environment_browser.py +395 -0
- khoj/routers/api_chat.py +42 -3
- khoj/routers/helpers.py +14 -3
- khoj/routers/research.py +48 -1
- khoj/utils/helpers.py +17 -0
- {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/METADATA +5 -3
- {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/RECORD +44 -34
- khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
- /khoj/interface/compiled/_next/static/{ifuY0XkcvaIiCG3xJl8zw → WLmcH2J-wz36GlS6O8HSL}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{ifuY0XkcvaIiCG3xJl8zw → WLmcH2J-wz36GlS6O8HSL}/_ssgManifest.js +0 -0
- {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/WHEEL +0 -0
- {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/entry_points.txt +0 -0
- {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,395 @@
|
|
1
|
+
import asyncio
|
2
|
+
import base64
|
3
|
+
import io
|
4
|
+
import logging
|
5
|
+
import os
|
6
|
+
from typing import Optional, Set, Union
|
7
|
+
|
8
|
+
from khoj.processor.operator.operator_actions import OperatorAction, Point
|
9
|
+
from khoj.processor.operator.operator_environment_base import (
|
10
|
+
Environment,
|
11
|
+
EnvState,
|
12
|
+
EnvStepResult,
|
13
|
+
)
|
14
|
+
from khoj.utils.helpers import convert_image_to_webp
|
15
|
+
|
16
|
+
logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
try:
|
19
|
+
from playwright.async_api import Browser, Page, Playwright, async_playwright
|
20
|
+
except ImportError:
|
21
|
+
logger.debug(
|
22
|
+
"Playwright not found. To use browser operator, run 'pip install playwright' and 'playwright install' first."
|
23
|
+
)
|
24
|
+
|
25
|
+
|
26
|
+
# --- Concrete BrowserEnvironment ---
|
27
|
+
class BrowserEnvironment(Environment):
|
28
|
+
def __init__(self):
|
29
|
+
self.playwright: Optional[Playwright] = None
|
30
|
+
self.browser: Optional[Browser] = None
|
31
|
+
self.page: Optional[Page] = None
|
32
|
+
self.width: int = 1024
|
33
|
+
self.height: int = 768
|
34
|
+
self.visited_urls: Set[str] = set()
|
35
|
+
self.excluded_urls = {"about:blank", "https://duckduckgo.com", "https://www.bing.com", "https://www.google.com"}
|
36
|
+
self.navigation_history: list[str] = []
|
37
|
+
self.mouse_pos = Point(x=self.width / 2, y=self.height / 2)
|
38
|
+
|
39
|
+
async def start(self, width: int = 1024, height: int = 768) -> None:
|
40
|
+
self.width = width
|
41
|
+
self.height = height
|
42
|
+
self.playwright = await async_playwright().start()
|
43
|
+
|
44
|
+
if cdp_url := os.getenv("KHOJ_CDP_URL"):
|
45
|
+
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
|
46
|
+
else:
|
47
|
+
launch_args = [f"--window-size={width},{height}", "--disable-extensions", "--disable-file-system"]
|
48
|
+
self.browser = await self.playwright.chromium.launch(
|
49
|
+
chromium_sandbox=True, headless=False, args=launch_args, env={}
|
50
|
+
)
|
51
|
+
|
52
|
+
# Get the initial browser, page or create one if none exist
|
53
|
+
default_context = self.browser.contexts[0] if self.browser.contexts else await self.browser.new_context()
|
54
|
+
self.page = default_context.pages[0] if default_context.pages else await default_context.new_page()
|
55
|
+
|
56
|
+
# Define a handler for page load events to capture URLs
|
57
|
+
async def handle_load(loaded_page: Page):
|
58
|
+
url = loaded_page.url
|
59
|
+
if not url:
|
60
|
+
return
|
61
|
+
|
62
|
+
if not self.navigation_history or self.navigation_history[-1] != url:
|
63
|
+
self.navigation_history.append(url)
|
64
|
+
|
65
|
+
if url not in self.excluded_urls and url not in self.visited_urls:
|
66
|
+
logger.debug(f"Page loaded: {url}")
|
67
|
+
self.visited_urls.add(url)
|
68
|
+
|
69
|
+
# Listen for load events on the main page
|
70
|
+
self.page.on("load", handle_load)
|
71
|
+
|
72
|
+
# Define a handler for new pages
|
73
|
+
async def handle_new_page(new_page: Page):
|
74
|
+
# Get the target URL of the new page
|
75
|
+
target_url = new_page.url
|
76
|
+
# Close the new page if it is not closed
|
77
|
+
if not new_page.is_closed():
|
78
|
+
await new_page.close()
|
79
|
+
# Open the target url in the current page instead
|
80
|
+
if target_url and target_url != "about:blank" and self.page:
|
81
|
+
logger.debug(f"Load {target_url} in current page instead of new tab.")
|
82
|
+
await self.page.goto(target_url)
|
83
|
+
|
84
|
+
# Listen for new pages being created in the context
|
85
|
+
default_context.on("page", handle_new_page)
|
86
|
+
|
87
|
+
# If page url is blank, navigate to DuckDuckGo
|
88
|
+
if self.page.url == "about:blank":
|
89
|
+
await self.page.goto("https://duckduckgo.com")
|
90
|
+
await self.page.set_viewport_size({"width": self.width, "height": self.height})
|
91
|
+
logger.info("Browser environment started.")
|
92
|
+
|
93
|
+
async def _get_screenshot(self) -> Optional[str]:
|
94
|
+
if not self.page or self.page.is_closed():
|
95
|
+
return None
|
96
|
+
try:
|
97
|
+
screenshot_bytes = await self.page.screenshot(caret="initial", full_page=False, type="png")
|
98
|
+
# Draw mouse position on the screenshot image
|
99
|
+
if self.mouse_pos:
|
100
|
+
screenshot_bytes = await self._draw_mouse_position(screenshot_bytes, self.mouse_pos)
|
101
|
+
screenshot_webp_bytes = convert_image_to_webp(screenshot_bytes)
|
102
|
+
return base64.b64encode(screenshot_webp_bytes).decode("utf-8")
|
103
|
+
except Exception as e:
|
104
|
+
logger.error(f"Failed to get screenshot: {e}")
|
105
|
+
return None
|
106
|
+
|
107
|
+
async def _draw_mouse_position(self, screenshot_bytes: bytes, mouse_pos: Point) -> bytes:
|
108
|
+
from PIL import Image, ImageDraw
|
109
|
+
|
110
|
+
# Load the screenshot into a PIL image
|
111
|
+
image = Image.open(io.BytesIO(screenshot_bytes))
|
112
|
+
|
113
|
+
# Draw a red circle at the mouse position
|
114
|
+
draw = ImageDraw.Draw(image)
|
115
|
+
radius = 5
|
116
|
+
draw.ellipse(
|
117
|
+
(mouse_pos.x - radius, mouse_pos.y - radius, mouse_pos.x + radius, mouse_pos.y + radius), fill="red"
|
118
|
+
)
|
119
|
+
|
120
|
+
# Save the modified image to a bytes buffer
|
121
|
+
output_buffer = io.BytesIO()
|
122
|
+
image.save(output_buffer, format="PNG")
|
123
|
+
return output_buffer.getvalue()
|
124
|
+
|
125
|
+
async def get_state(self) -> EnvState:
|
126
|
+
if not self.page or self.page.is_closed():
|
127
|
+
return EnvState(url="about:blank", screenshot=None)
|
128
|
+
url = self.page.url
|
129
|
+
screenshot = await self._get_screenshot()
|
130
|
+
return EnvState(url=url, screenshot=screenshot)
|
131
|
+
|
132
|
+
async def step(self, action: OperatorAction) -> EnvStepResult:
|
133
|
+
if not self.page or self.page.is_closed():
|
134
|
+
return EnvStepResult(error="Browser page is not available or closed.")
|
135
|
+
|
136
|
+
before_state = await self.get_state()
|
137
|
+
output: Optional[Union[str, dict]] = None
|
138
|
+
error: Optional[str] = None
|
139
|
+
step_type: str = "text"
|
140
|
+
try:
|
141
|
+
match action.type:
|
142
|
+
case "click":
|
143
|
+
x, y, button = action.x, action.y, action.button
|
144
|
+
if button == "wheel":
|
145
|
+
await self.page.mouse.wheel(x, y)
|
146
|
+
output = f"Scrolled wheel at ({x}, {y})"
|
147
|
+
else:
|
148
|
+
modifiers = self.parse_key_combination(action.modifiers) if action.modifiers else []
|
149
|
+
for modifier in modifiers:
|
150
|
+
await self.page.keyboard.down(modifier)
|
151
|
+
await self.page.mouse.click(x, y, button=button)
|
152
|
+
for modifier in reversed(modifiers):
|
153
|
+
await self.page.keyboard.up(modifier)
|
154
|
+
output = f"{button.capitalize()} clicked at ({x}, {y})"
|
155
|
+
self.mouse_pos = Point(x=x, y=y)
|
156
|
+
logger.debug(f"Action: {action.type} {button} at ({x},{y})")
|
157
|
+
|
158
|
+
case "double_click":
|
159
|
+
x, y = action.x, action.y
|
160
|
+
await self.page.mouse.dblclick(x, y)
|
161
|
+
self.mouse_pos = Point(x=x, y=y)
|
162
|
+
output = f"Double clicked at ({x}, {y})"
|
163
|
+
logger.debug(f"Action: {action.type} at ({x},{y})")
|
164
|
+
|
165
|
+
case "triple_click":
|
166
|
+
x, y = action.x, action.y
|
167
|
+
await self.page.mouse.click(x, y, click_count=3)
|
168
|
+
self.mouse_pos = Point(x=x, y=y)
|
169
|
+
output = f"Triple clicked at ({x}, {y})"
|
170
|
+
logger.debug(f"Action: {action.type} at ({x},{y})")
|
171
|
+
|
172
|
+
case "scroll":
|
173
|
+
# Prefer explicit scroll_x/y if provided (from OpenAI style)
|
174
|
+
if action.scroll_x is not None or action.scroll_y is not None:
|
175
|
+
scroll_x = action.scroll_x or 0
|
176
|
+
scroll_y = action.scroll_y or 0
|
177
|
+
if action.x is not None and action.y is not None:
|
178
|
+
await self.page.mouse.move(action.x, action.y)
|
179
|
+
self.mouse_pos = Point(x=action.x, y=action.y)
|
180
|
+
await self.page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
|
181
|
+
output = f"Scrolled by ({scroll_x}, {scroll_y})"
|
182
|
+
logger.debug(f"Action: {action.type} by ({scroll_x},{scroll_y}) at ({action.x},{action.y})")
|
183
|
+
# Otherwise use direction/amount (from Anthropic style)
|
184
|
+
elif action.scroll_direction:
|
185
|
+
scale = 40.0
|
186
|
+
dx, dy = 0.0, 0.0
|
187
|
+
amount = action.scroll_amount or 1
|
188
|
+
if action.scroll_direction == "up":
|
189
|
+
dy = -scale * amount
|
190
|
+
elif action.scroll_direction == "down":
|
191
|
+
dy = scale * amount
|
192
|
+
elif action.scroll_direction == "left":
|
193
|
+
dx = -scale * amount
|
194
|
+
elif action.scroll_direction == "right":
|
195
|
+
dx = scale * amount
|
196
|
+
|
197
|
+
if action.x is not None and action.y is not None:
|
198
|
+
await self.page.mouse.move(action.x, action.y)
|
199
|
+
self.mouse_pos = Point(x=action.x, y=action.y)
|
200
|
+
await self.page.mouse.wheel(dx, dy)
|
201
|
+
output = f"Scrolled {action.scroll_direction} by {amount}"
|
202
|
+
logger.debug(
|
203
|
+
f"Action: {action.type} {action.scroll_direction} by {amount} at ({action.x},{action.y})"
|
204
|
+
)
|
205
|
+
else:
|
206
|
+
error = "Scroll action requires either scroll_x/y or scroll_direction"
|
207
|
+
|
208
|
+
case "keypress":
|
209
|
+
keys = action.keys
|
210
|
+
if len(keys) > 1: # Handle combinations like ctrl+a
|
211
|
+
modifiers = [BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(k.lower(), k) for k in keys[:-1]]
|
212
|
+
main_key = BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(keys[-1].lower(), keys[-1])
|
213
|
+
key_string = "+".join(modifiers + [main_key])
|
214
|
+
await self.page.keyboard.press(key_string)
|
215
|
+
elif keys: # Single key
|
216
|
+
key_string = BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(keys[0].lower(), keys[0])
|
217
|
+
await self.page.keyboard.press(key_string)
|
218
|
+
else:
|
219
|
+
error = "Keypress action requires at least one key"
|
220
|
+
key_string = "N/A"
|
221
|
+
output = f"Pressed key(s): {key_string}"
|
222
|
+
logger.debug(f"Action: {action.type} '{key_string}'")
|
223
|
+
|
224
|
+
case "type":
|
225
|
+
text = action.text
|
226
|
+
await self.page.keyboard.type(text)
|
227
|
+
output = f"Typed text: {text}"
|
228
|
+
logger.debug(f"Action: {action.type} '{text}'")
|
229
|
+
|
230
|
+
case "wait":
|
231
|
+
duration = action.duration
|
232
|
+
await asyncio.sleep(duration)
|
233
|
+
output = f"Waited for {duration} seconds"
|
234
|
+
logger.debug(f"Action: {action.type} for {duration}s")
|
235
|
+
|
236
|
+
case "screenshot":
|
237
|
+
step_type = "image"
|
238
|
+
output = {"image": before_state.screenshot, "url": before_state.url}
|
239
|
+
logger.debug(f"Action: {action.type}")
|
240
|
+
|
241
|
+
case "move":
|
242
|
+
x, y = action.x, action.y
|
243
|
+
await self.page.mouse.move(x, y)
|
244
|
+
self.mouse_pos = Point(x=x, y=y)
|
245
|
+
output = f"Moved mouse to ({x}, {y})"
|
246
|
+
logger.debug(f"Action: {action.type} to ({x},{y})")
|
247
|
+
|
248
|
+
case "drag":
|
249
|
+
path = action.path
|
250
|
+
if not path:
|
251
|
+
error = "Missing path for drag action"
|
252
|
+
else:
|
253
|
+
await self.page.mouse.move(path[0].x, path[0].y)
|
254
|
+
await self.page.mouse.down()
|
255
|
+
for point in path[1:]:
|
256
|
+
await self.page.mouse.move(point.x, point.y)
|
257
|
+
await self.page.mouse.up()
|
258
|
+
self.mouse_pos = Point(x=path[-1].x, y=path[-1].y)
|
259
|
+
output = f"Drag along path starting at ({path[0].x},{path[0].y})"
|
260
|
+
logger.debug(f"Action: {action.type} with {len(path)} points")
|
261
|
+
|
262
|
+
case "mouse_down":
|
263
|
+
await self.page.mouse.down(button=action.button)
|
264
|
+
output = f"{action.button.capitalize()} mouse button down"
|
265
|
+
logger.debug(f"Action: {action.type} {action.button}")
|
266
|
+
|
267
|
+
case "mouse_up":
|
268
|
+
await self.page.mouse.up(button=action.button)
|
269
|
+
output = f"{action.button.capitalize()} mouse button up"
|
270
|
+
logger.debug(f"Action: {action.type} {action.button}")
|
271
|
+
|
272
|
+
case "hold_key":
|
273
|
+
keys_to_parse = action.text
|
274
|
+
duration = action.duration
|
275
|
+
keys = self.parse_key_combination(keys_to_parse)
|
276
|
+
for key in keys:
|
277
|
+
await self.page.keyboard.down(key)
|
278
|
+
await asyncio.sleep(duration)
|
279
|
+
for key in reversed(keys):
|
280
|
+
await self.page.keyboard.up(key)
|
281
|
+
output = f"Held key{'s' if len(keys) > 1 else ''} {keys_to_parse} for {duration} seconds"
|
282
|
+
logger.debug(f"Action: {action.type} '{keys_to_parse}' for {duration}s")
|
283
|
+
|
284
|
+
case "key_down":
|
285
|
+
key = action.key
|
286
|
+
await self.page.keyboard.down(key)
|
287
|
+
output = f"Key down: {key}"
|
288
|
+
logger.debug(f"Action: {action.type} {key}")
|
289
|
+
|
290
|
+
case "key_up":
|
291
|
+
key = action.key
|
292
|
+
await self.page.keyboard.up(key)
|
293
|
+
output = f"Key up: {key}"
|
294
|
+
logger.debug(f"Action: {action.type} {key}")
|
295
|
+
|
296
|
+
case "cursor_position":
|
297
|
+
# Playwright doesn't directly expose mouse position easily without JS injection
|
298
|
+
# Returning a placeholder for now
|
299
|
+
output = "Cursor position requested (not directly available)"
|
300
|
+
logger.debug(f"Action: {action.type}")
|
301
|
+
|
302
|
+
case "goto":
|
303
|
+
url = action.url
|
304
|
+
if not url:
|
305
|
+
error = "Missing URL for goto action"
|
306
|
+
else:
|
307
|
+
await self.page.goto(url)
|
308
|
+
output = f"Navigated to {url}"
|
309
|
+
logger.debug(f"Action: {action.type} to {url}")
|
310
|
+
|
311
|
+
case "back":
|
312
|
+
if len(self.navigation_history) > 1:
|
313
|
+
self.navigation_history.pop()
|
314
|
+
previous_url = self.navigation_history[-1]
|
315
|
+
await self.page.goto(previous_url)
|
316
|
+
output = f"Navigated back to {previous_url}"
|
317
|
+
else:
|
318
|
+
output = "No previous URL to navigate back"
|
319
|
+
previous_url = "about:blank"
|
320
|
+
logger.debug(f"Action: {action.type} to {previous_url}")
|
321
|
+
|
322
|
+
case _:
|
323
|
+
error = f"Unrecognized action type: {action.type}"
|
324
|
+
logger.warning(error)
|
325
|
+
|
326
|
+
except Exception as e:
|
327
|
+
error = f"Error executing action {action.type}: {e}"
|
328
|
+
logger.exception(f"Error during step execution for action: {action.model_dump_json()}")
|
329
|
+
|
330
|
+
after_state = await self.get_state()
|
331
|
+
return EnvStepResult(
|
332
|
+
type=step_type,
|
333
|
+
output=output,
|
334
|
+
error=error,
|
335
|
+
current_url=after_state.url,
|
336
|
+
screenshot_base64=after_state.screenshot,
|
337
|
+
)
|
338
|
+
|
339
|
+
def reset(self) -> None:
|
340
|
+
self.visited_urls.clear()
|
341
|
+
|
342
|
+
async def close(self) -> None:
|
343
|
+
if self.browser:
|
344
|
+
await self.browser.close()
|
345
|
+
logger.info("Browser closed.")
|
346
|
+
if self.playwright:
|
347
|
+
await self.playwright.stop()
|
348
|
+
logger.info("Playwright stopped.")
|
349
|
+
self.browser = None
|
350
|
+
self.playwright = None
|
351
|
+
self.page = None
|
352
|
+
|
353
|
+
# Mapping of Operator Agent keys to Playwright keys
|
354
|
+
CUA_KEY_TO_PLAYWRIGHT_KEY = {
|
355
|
+
"/": "Divide",
|
356
|
+
"\\": "Backslash",
|
357
|
+
"alt": "Alt",
|
358
|
+
"arrowdown": "ArrowDown",
|
359
|
+
"arrowleft": "ArrowLeft",
|
360
|
+
"arrowright": "ArrowRight",
|
361
|
+
"arrowup": "ArrowUp",
|
362
|
+
"backspace": "Backspace",
|
363
|
+
"capslock": "CapsLock",
|
364
|
+
"cmd": "Meta",
|
365
|
+
"ctrl": "ControlOrMeta",
|
366
|
+
"delete": "Delete",
|
367
|
+
"end": "End",
|
368
|
+
"enter": "Enter",
|
369
|
+
"return": "Enter",
|
370
|
+
"esc": "Escape",
|
371
|
+
"home": "Home",
|
372
|
+
"insert": "Insert",
|
373
|
+
"option": "Alt",
|
374
|
+
"pagedown": "PageDown",
|
375
|
+
"pageup": "PageUp",
|
376
|
+
"shift": "Shift",
|
377
|
+
"space": " ",
|
378
|
+
"super": "Meta",
|
379
|
+
"tab": "Tab",
|
380
|
+
"win": "Meta",
|
381
|
+
}
|
382
|
+
|
383
|
+
@staticmethod
|
384
|
+
def parse_key_combination(text: str) -> list[str]:
|
385
|
+
"""
|
386
|
+
Parse an xdotool-style key combination (e.g., "ctrl+o", "shift+tab")
|
387
|
+
and return a list of Playwright-compatible key names.
|
388
|
+
"""
|
389
|
+
if "+" in text:
|
390
|
+
keys = text.split("+")
|
391
|
+
# Map each key to its Playwright equivalent
|
392
|
+
return [BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(k.lower(), k) for k in keys]
|
393
|
+
else:
|
394
|
+
# Single key
|
395
|
+
return [BrowserEnvironment.CUA_KEY_TO_PLAYWRIGHT_KEY.get(text.lower(), text)]
|
khoj/routers/api_chat.py
CHANGED
@@ -31,6 +31,7 @@ from khoj.processor.conversation.utils import (
|
|
31
31
|
save_to_conversation_log,
|
32
32
|
)
|
33
33
|
from khoj.processor.image.generate import text_to_image
|
34
|
+
from khoj.processor.operator.operate_browser import operate_browser
|
34
35
|
from khoj.processor.speech.text_to_speech import generate_text_to_speech
|
35
36
|
from khoj.processor.tools.online_search import (
|
36
37
|
deduplicate_organic_results,
|
@@ -78,6 +79,7 @@ from khoj.utils.helpers import (
|
|
78
79
|
get_country_name_from_timezone,
|
79
80
|
get_device,
|
80
81
|
is_none_or_empty,
|
82
|
+
is_operator_enabled,
|
81
83
|
)
|
82
84
|
from khoj.utils.rawconfig import (
|
83
85
|
ChatRequestBody,
|
@@ -569,6 +571,8 @@ async def chat_options(
|
|
569
571
|
) -> Response:
|
570
572
|
cmd_options = {}
|
571
573
|
for cmd in ConversationCommand:
|
574
|
+
if cmd == ConversationCommand.Operator and not is_operator_enabled():
|
575
|
+
continue
|
572
576
|
if cmd in command_descriptions:
|
573
577
|
cmd_options[cmd.value] = command_descriptions[cmd]
|
574
578
|
|
@@ -882,6 +886,7 @@ async def chat(
|
|
882
886
|
researched_results = ""
|
883
887
|
online_results: Dict = dict()
|
884
888
|
code_results: Dict = dict()
|
889
|
+
operator_results: Dict[str, str] = {}
|
885
890
|
generated_asset_results: Dict = dict()
|
886
891
|
## Extract Document References
|
887
892
|
compiled_references: List[Any] = []
|
@@ -956,7 +961,8 @@ async def chat(
|
|
956
961
|
code_results.update(research_result.codeContext)
|
957
962
|
if research_result.context:
|
958
963
|
compiled_references.extend(research_result.context)
|
959
|
-
|
964
|
+
if research_result.operatorContext:
|
965
|
+
operator_results.update(research_result.operatorContext)
|
960
966
|
researched_results += research_result.summarizedResult
|
961
967
|
|
962
968
|
else:
|
@@ -1207,14 +1213,45 @@ async def chat(
|
|
1207
1213
|
yield result[ChatEvent.STATUS]
|
1208
1214
|
else:
|
1209
1215
|
code_results = result
|
1210
|
-
async for result in send_event(ChatEvent.STATUS, f"**Ran code snippets**: {len(code_results)}"):
|
1211
|
-
yield result
|
1212
1216
|
except ValueError as e:
|
1213
1217
|
program_execution_context.append(f"Failed to run code")
|
1214
1218
|
logger.warning(
|
1215
1219
|
f"Failed to use code tool: {e}. Attempting to respond without code results",
|
1216
1220
|
exc_info=True,
|
1217
1221
|
)
|
1222
|
+
if ConversationCommand.Operator in conversation_commands:
|
1223
|
+
try:
|
1224
|
+
async for result in operate_browser(
|
1225
|
+
defiltered_query,
|
1226
|
+
user,
|
1227
|
+
meta_log,
|
1228
|
+
location,
|
1229
|
+
query_images=uploaded_images,
|
1230
|
+
query_files=attached_file_context,
|
1231
|
+
send_status_func=partial(send_event, ChatEvent.STATUS),
|
1232
|
+
agent=agent,
|
1233
|
+
cancellation_event=cancellation_event,
|
1234
|
+
tracer=tracer,
|
1235
|
+
):
|
1236
|
+
if isinstance(result, dict) and ChatEvent.STATUS in result:
|
1237
|
+
yield result[ChatEvent.STATUS]
|
1238
|
+
else:
|
1239
|
+
operator_results = {result["query"]: result["result"]}
|
1240
|
+
# Add webpages visited while operating browser to references
|
1241
|
+
if result.get("webpages"):
|
1242
|
+
if not online_results.get(defiltered_query):
|
1243
|
+
online_results[defiltered_query] = {"webpages": result["webpages"]}
|
1244
|
+
elif not online_results[defiltered_query].get("webpages"):
|
1245
|
+
online_results[defiltered_query]["webpages"] = result["webpages"]
|
1246
|
+
else:
|
1247
|
+
online_results[defiltered_query]["webpages"] += result["webpages"]
|
1248
|
+
except ValueError as e:
|
1249
|
+
program_execution_context.append(f"Browser operation error: {e}")
|
1250
|
+
logger.warning(f"Failed to operate browser with {e}", exc_info=True)
|
1251
|
+
async for result in send_event(
|
1252
|
+
ChatEvent.STATUS, "Operating browser failed. I'll try respond appropriately"
|
1253
|
+
):
|
1254
|
+
yield result
|
1218
1255
|
|
1219
1256
|
## Send Gathered References
|
1220
1257
|
unique_online_results = deduplicate_organic_results(online_results)
|
@@ -1225,6 +1262,7 @@ async def chat(
|
|
1225
1262
|
"context": compiled_references,
|
1226
1263
|
"onlineContext": unique_online_results,
|
1227
1264
|
"codeContext": code_results,
|
1265
|
+
"operatorContext": operator_results,
|
1228
1266
|
},
|
1229
1267
|
):
|
1230
1268
|
yield result
|
@@ -1340,6 +1378,7 @@ async def chat(
|
|
1340
1378
|
compiled_references,
|
1341
1379
|
online_results,
|
1342
1380
|
code_results,
|
1381
|
+
operator_results,
|
1343
1382
|
inferred_queries,
|
1344
1383
|
conversation_commands,
|
1345
1384
|
user,
|
khoj/routers/helpers.py
CHANGED
@@ -113,6 +113,7 @@ from khoj.utils.helpers import (
|
|
113
113
|
get_file_type,
|
114
114
|
in_debug_mode,
|
115
115
|
is_none_or_empty,
|
116
|
+
is_operator_enabled,
|
116
117
|
is_valid_url,
|
117
118
|
log_telemetry,
|
118
119
|
mode_descriptions_for_llm,
|
@@ -253,6 +254,8 @@ def get_conversation_command(query: str) -> ConversationCommand:
|
|
253
254
|
return ConversationCommand.Code
|
254
255
|
elif query.startswith("/research"):
|
255
256
|
return ConversationCommand.Research
|
257
|
+
elif query.startswith("/operator") and is_operator_enabled():
|
258
|
+
return ConversationCommand.Operator
|
256
259
|
else:
|
257
260
|
return ConversationCommand.Default
|
258
261
|
|
@@ -362,6 +365,8 @@ async def aget_data_sources_and_output_format(
|
|
362
365
|
# Skip showing Notes tool as an option if user has no entries
|
363
366
|
if source == ConversationCommand.Notes and not user_has_entries:
|
364
367
|
continue
|
368
|
+
if source == ConversationCommand.Operator and not is_operator_enabled():
|
369
|
+
continue
|
365
370
|
source_options[source.value] = description
|
366
371
|
if len(agent_sources) == 0 or source.value in agent_sources:
|
367
372
|
source_options_str += f'- "{source.value}": "{description}"\n'
|
@@ -1349,6 +1354,7 @@ async def agenerate_chat_response(
|
|
1349
1354
|
compiled_references: List[Dict] = [],
|
1350
1355
|
online_results: Dict[str, Dict] = {},
|
1351
1356
|
code_results: Dict[str, Dict] = {},
|
1357
|
+
operator_results: Dict[str, str] = {},
|
1352
1358
|
inferred_queries: List[str] = [],
|
1353
1359
|
conversation_commands: List[ConversationCommand] = [ConversationCommand.Default],
|
1354
1360
|
user: KhojUser = None,
|
@@ -1385,6 +1391,7 @@ async def agenerate_chat_response(
|
|
1385
1391
|
compiled_references=compiled_references,
|
1386
1392
|
online_results=online_results,
|
1387
1393
|
code_results=code_results,
|
1394
|
+
operator_results=operator_results,
|
1388
1395
|
inferred_queries=inferred_queries,
|
1389
1396
|
client_application=client_application,
|
1390
1397
|
conversation_id=conversation_id,
|
@@ -1404,6 +1411,7 @@ async def agenerate_chat_response(
|
|
1404
1411
|
compiled_references = []
|
1405
1412
|
online_results = {}
|
1406
1413
|
code_results = {}
|
1414
|
+
operator_results = {}
|
1407
1415
|
deepthought = True
|
1408
1416
|
|
1409
1417
|
chat_model = await ConversationAdapters.aget_valid_chat_model(user, conversation, is_subscribed)
|
@@ -1446,6 +1454,7 @@ async def agenerate_chat_response(
|
|
1446
1454
|
query_images=query_images,
|
1447
1455
|
online_results=online_results,
|
1448
1456
|
code_results=code_results,
|
1457
|
+
operator_results=operator_results,
|
1449
1458
|
conversation_log=meta_log,
|
1450
1459
|
model=chat_model_name,
|
1451
1460
|
api_key=api_key,
|
@@ -1475,6 +1484,7 @@ async def agenerate_chat_response(
|
|
1475
1484
|
query_images=query_images,
|
1476
1485
|
online_results=online_results,
|
1477
1486
|
code_results=code_results,
|
1487
|
+
operator_results=operator_results,
|
1478
1488
|
conversation_log=meta_log,
|
1479
1489
|
model=chat_model.name,
|
1480
1490
|
api_key=api_key,
|
@@ -1500,9 +1510,10 @@ async def agenerate_chat_response(
|
|
1500
1510
|
chat_response_generator = converse_gemini(
|
1501
1511
|
compiled_references,
|
1502
1512
|
query_to_run,
|
1503
|
-
online_results,
|
1504
|
-
code_results,
|
1505
|
-
|
1513
|
+
online_results=online_results,
|
1514
|
+
code_results=code_results,
|
1515
|
+
operator_results=operator_results,
|
1516
|
+
conversation_log=meta_log,
|
1506
1517
|
model=chat_model.name,
|
1507
1518
|
api_key=api_key,
|
1508
1519
|
api_base_url=api_base_url,
|
khoj/routers/research.py
CHANGED
@@ -17,6 +17,7 @@ from khoj.processor.conversation.utils import (
|
|
17
17
|
construct_tool_chat_history,
|
18
18
|
load_complex_json,
|
19
19
|
)
|
20
|
+
from khoj.processor.operator.operate_browser import operate_browser
|
20
21
|
from khoj.processor.tools.online_search import read_webpages, search_online
|
21
22
|
from khoj.processor.tools.run_code import run_code
|
22
23
|
from khoj.routers.api import extract_references_and_questions
|
@@ -28,6 +29,7 @@ from khoj.routers.helpers import (
|
|
28
29
|
from khoj.utils.helpers import (
|
29
30
|
ConversationCommand,
|
30
31
|
is_none_or_empty,
|
32
|
+
is_operator_enabled,
|
31
33
|
timer,
|
32
34
|
tool_description_for_research_llm,
|
33
35
|
truncate_code_context,
|
@@ -98,6 +100,9 @@ async def apick_next_tool(
|
|
98
100
|
agent_tools = agent.input_tools if agent else []
|
99
101
|
user_has_entries = await EntryAdapters.auser_has_entries(user)
|
100
102
|
for tool, description in tool_description_for_research_llm.items():
|
103
|
+
# Skip showing operator tool as an option if not enabled
|
104
|
+
if tool == ConversationCommand.Operator and not is_operator_enabled():
|
105
|
+
continue
|
101
106
|
# Skip showing Notes tool as an option if user has no entries
|
102
107
|
if tool == ConversationCommand.Notes:
|
103
108
|
if not user_has_entries:
|
@@ -232,6 +237,7 @@ async def execute_information_collection(
|
|
232
237
|
online_results: Dict = dict()
|
233
238
|
code_results: Dict = dict()
|
234
239
|
document_results: List[Dict[str, str]] = []
|
240
|
+
operator_results: Dict[str, str] = {}
|
235
241
|
summarize_files: str = ""
|
236
242
|
this_iteration = InformationCollectionIteration(tool=None, query=query)
|
237
243
|
|
@@ -398,6 +404,38 @@ async def execute_information_collection(
|
|
398
404
|
this_iteration.warning = f"Error running code: {e}"
|
399
405
|
logger.warning(this_iteration.warning, exc_info=True)
|
400
406
|
|
407
|
+
elif this_iteration.tool == ConversationCommand.Operator:
|
408
|
+
try:
|
409
|
+
async for result in operate_browser(
|
410
|
+
this_iteration.query,
|
411
|
+
user,
|
412
|
+
construct_tool_chat_history(previous_iterations, ConversationCommand.Operator),
|
413
|
+
location,
|
414
|
+
send_status_func,
|
415
|
+
query_images=query_images,
|
416
|
+
agent=agent,
|
417
|
+
query_files=query_files,
|
418
|
+
cancellation_event=cancellation_event,
|
419
|
+
tracer=tracer,
|
420
|
+
):
|
421
|
+
if isinstance(result, dict) and ChatEvent.STATUS in result:
|
422
|
+
yield result[ChatEvent.STATUS]
|
423
|
+
else:
|
424
|
+
operator_results = {result["query"]: result["result"]}
|
425
|
+
this_iteration.operatorContext = operator_results
|
426
|
+
# Add webpages visited while operating browser to references
|
427
|
+
if result.get("webpages"):
|
428
|
+
if not online_results.get(this_iteration.query):
|
429
|
+
online_results[this_iteration.query] = {"webpages": result["webpages"]}
|
430
|
+
elif not online_results[this_iteration.query].get("webpages"):
|
431
|
+
online_results[this_iteration.query]["webpages"] = result["webpages"]
|
432
|
+
else:
|
433
|
+
online_results[this_iteration.query]["webpages"] += result["webpages"]
|
434
|
+
this_iteration.onlineContext = online_results
|
435
|
+
except Exception as e:
|
436
|
+
this_iteration.warning = f"Error operating browser: {e}"
|
437
|
+
logger.error(this_iteration.warning, exc_info=True)
|
438
|
+
|
401
439
|
elif this_iteration.tool == ConversationCommand.Summarize:
|
402
440
|
try:
|
403
441
|
async for result in generate_summary_from_files(
|
@@ -424,7 +462,14 @@ async def execute_information_collection(
|
|
424
462
|
|
425
463
|
current_iteration += 1
|
426
464
|
|
427
|
-
if
|
465
|
+
if (
|
466
|
+
document_results
|
467
|
+
or online_results
|
468
|
+
or code_results
|
469
|
+
or operator_results
|
470
|
+
or summarize_files
|
471
|
+
or this_iteration.warning
|
472
|
+
):
|
428
473
|
results_data = f"\n<iteration>{current_iteration}\n<tool>{this_iteration.tool}</tool>\n<query>{this_iteration.query}</query>\n<results>"
|
429
474
|
if document_results:
|
430
475
|
results_data += f"\n<document_references>\n{yaml.dump(document_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</document_references>"
|
@@ -432,6 +477,8 @@ async def execute_information_collection(
|
|
432
477
|
results_data += f"\n<online_results>\n{yaml.dump(online_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</online_results>"
|
433
478
|
if code_results:
|
434
479
|
results_data += f"\n<code_results>\n{yaml.dump(truncate_code_context(code_results), allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</code_results>"
|
480
|
+
if operator_results:
|
481
|
+
results_data += f"\n<browser_operator_results>\n{next(iter(operator_results.values()))}\n</browser_operator_results>"
|
435
482
|
if summarize_files:
|
436
483
|
results_data += f"\n<summarized_files>\n{yaml.dump(summarize_files, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n</summarized_files>"
|
437
484
|
if this_iteration.warning:
|