khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev97__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. khoj/database/adapters/__init__.py +17 -6
  2. khoj/interface/compiled/404/index.html +2 -2
  3. khoj/interface/compiled/_next/static/chunks/{2327-f03b2a77f67b8f8c.js → 2327-aa22697ed9c8d54a.js} +1 -1
  4. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
  5. khoj/interface/compiled/_next/static/chunks/{8515-010dd769c584b672.js → 8515-f305779d95dd5780.js} +1 -1
  6. khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +1 -0
  8. khoj/interface/compiled/_next/static/chunks/app/chat/{page-14ac9d1ad5cb84c5.js → page-7e780dc11eb5e5d3.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/{webpack-1169ca6e9e7e6247.js → webpack-21f76f7f59582bc7.js} +1 -1
  10. khoj/interface/compiled/agents/index.html +2 -2
  11. khoj/interface/compiled/agents/index.txt +2 -2
  12. khoj/interface/compiled/automations/index.html +2 -2
  13. khoj/interface/compiled/automations/index.txt +2 -2
  14. khoj/interface/compiled/chat/index.html +2 -2
  15. khoj/interface/compiled/chat/index.txt +2 -2
  16. khoj/interface/compiled/index.html +2 -2
  17. khoj/interface/compiled/index.txt +2 -2
  18. khoj/interface/compiled/search/index.html +2 -2
  19. khoj/interface/compiled/search/index.txt +2 -2
  20. khoj/interface/compiled/settings/index.html +2 -2
  21. khoj/interface/compiled/settings/index.txt +2 -2
  22. khoj/interface/compiled/share/chat/index.html +2 -2
  23. khoj/interface/compiled/share/chat/index.txt +2 -2
  24. khoj/processor/conversation/anthropic/anthropic_chat.py +7 -2
  25. khoj/processor/conversation/anthropic/utils.py +37 -19
  26. khoj/processor/conversation/google/gemini_chat.py +7 -2
  27. khoj/processor/conversation/offline/chat_model.py +2 -2
  28. khoj/processor/conversation/openai/gpt.py +7 -2
  29. khoj/processor/conversation/prompts.py +13 -2
  30. khoj/processor/conversation/utils.py +34 -6
  31. khoj/processor/operator/grounding_agent.py +345 -0
  32. khoj/processor/operator/grounding_agent_uitars.py +973 -0
  33. khoj/processor/operator/operate_browser.py +165 -0
  34. khoj/processor/operator/operator_actions.py +149 -0
  35. khoj/processor/operator/operator_agent_anthropic.py +402 -0
  36. khoj/processor/operator/operator_agent_base.py +80 -0
  37. khoj/processor/operator/operator_agent_binary.py +336 -0
  38. khoj/processor/operator/operator_agent_openai.py +349 -0
  39. khoj/processor/operator/operator_environment_base.py +37 -0
  40. khoj/processor/operator/operator_environment_browser.py +395 -0
  41. khoj/routers/api_chat.py +44 -6
  42. khoj/routers/helpers.py +18 -8
  43. khoj/routers/research.py +48 -1
  44. khoj/utils/constants.py +6 -0
  45. khoj/utils/helpers.py +17 -0
  46. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/METADATA +4 -2
  47. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/RECORD +52 -42
  48. khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
  49. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +0 -1
  50. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +0 -1
  51. /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → o6zlo73DbD2lS92jWHS8o}/_buildManifest.js +0 -0
  52. /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → o6zlo73DbD2lS92jWHS8o}/_ssgManifest.js +0 -0
  53. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/WHEEL +0 -0
  54. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/entry_points.txt +0 -0
  55. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,165 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import os
5
+ from typing import Callable, List, Optional
6
+
7
+ from khoj.database.adapters import AgentAdapters, ConversationAdapters
8
+ from khoj.database.models import Agent, ChatModel, KhojUser
9
+ from khoj.processor.operator.operator_actions import *
10
+ from khoj.processor.operator.operator_agent_anthropic import AnthropicOperatorAgent
11
+ from khoj.processor.operator.operator_agent_base import OperatorAgent
12
+ from khoj.processor.operator.operator_agent_binary import BinaryOperatorAgent
13
+ from khoj.processor.operator.operator_agent_openai import OpenAIOperatorAgent
14
+ from khoj.processor.operator.operator_environment_base import EnvStepResult
15
+ from khoj.processor.operator.operator_environment_browser import BrowserEnvironment
16
+ from khoj.routers.helpers import ChatEvent
17
+ from khoj.utils.helpers import timer
18
+ from khoj.utils.rawconfig import LocationData
19
+
20
# Module-level logger, named after this module so log records can be filtered per module.
logger = logging.getLogger(__name__)
21
+
22
+
23
+ # --- Browser Operator Function ---
24
async def operate_browser(
    query: str,
    user: KhojUser,
    conversation_log: dict,
    location_data: LocationData,
    send_status_func: Optional[Callable] = None,
    query_images: Optional[List[str]] = None,  # TODO: Handle query images
    agent: Optional[Agent] = None,
    query_files: Optional[str] = None,  # TODO: Handle query files
    cancellation_event: Optional[asyncio.Event] = None,
    tracer: Optional[dict] = None,
):
    """Drive a browser with a vision-enabled operator agent to answer `query`.

    This is an async generator. While running it yields `{ChatEvent.STATUS: event}`
    dicts for every event produced by `send_status_func`, and as its last item it
    yields a result dict with the keys `query`, `result` and `webpages`.

    Args:
        query: The task for the operator agent to carry out.
        user: User on whose behalf chat models are looked up.
        conversation_log: Currently unused by this function.
        location_data: Currently unused by this function.
        send_status_func: Optional callable returning an async iterable of status
            events for a given status message. No status is emitted when omitted.
        query_images: Currently unused (see TODO).
        agent: Optional agent whose chat model is preferred as the reasoning model.
        query_files: Currently unused (see TODO).
        cancellation_event: Optional event; once set, the operator loop stops at
            its next check.
        tracer: Dict handed to the operator agent. A fresh dict is created per
            call when omitted, so state is never shared between calls.

    Raises:
        ValueError: If no vision enabled chat model is configured, or if the
            fallback binary agent has no supported visual grounding model.
    """
    # Avoid a shared mutable default. Identity of a caller supplied dict is preserved.
    if tracer is None:
        tracer = {}

    response, summary_message, user_input_message = None, None, None
    environment: Optional[BrowserEnvironment] = None
    # Tracks whether the loop was stopped by the cancellation event,
    # so the final response does not misreport it as an iteration limit.
    cancelled = False

    # Get the agent chat model
    agent_chat_model = await AgentAdapters.aget_agent_chat_model(agent, user) if agent else None
    reasoning_model: ChatModel = await ConversationAdapters.aget_default_chat_model(user, agent_chat_model)
    if not reasoning_model or not reasoning_model.vision_enabled:
        reasoning_model = await ConversationAdapters.aget_vision_enabled_config()
    if not reasoning_model:
        raise ValueError("No vision enabled chat model found. Configure a vision chat model to operate browser.")

    # Initialize Agent
    max_iterations = int(os.getenv("KHOJ_OPERATOR_ITERATIONS", 40))
    operator_agent: OperatorAgent
    operator_model_type = is_operator_model(reasoning_model.name)
    if operator_model_type == ChatModel.ModelType.OPENAI:
        operator_agent = OpenAIOperatorAgent(query, reasoning_model, max_iterations, tracer)
    elif operator_model_type == ChatModel.ModelType.ANTHROPIC:
        operator_agent = AnthropicOperatorAgent(query, reasoning_model, max_iterations, tracer)
    else:
        # Any other reasoning model is paired with a separate visual grounding model.
        grounding_model_name = "ui-tars-1.5"
        grounding_model = await ConversationAdapters.aget_chat_model_by_name(grounding_model_name)
        if (
            not grounding_model
            or not grounding_model.vision_enabled
            or not grounding_model.model_type == ChatModel.ModelType.OPENAI
        ):
            raise ValueError("No supported visual grounding model for binary operator agent found.")
        operator_agent = BinaryOperatorAgent(query, reasoning_model, grounding_model, max_iterations, tracer)

    # Initialize Environment
    if send_status_func:
        async for event in send_status_func("**Launching Browser**"):
            yield {ChatEvent.STATUS: event}
    environment = BrowserEnvironment()
    # NOTE(review): start() runs outside the try below, so a failure here skips the
    # cleanup in `finally`. Confirm close() is safe after a failed start before moving it.
    await environment.start(width=1024, height=768)

    # Start Operator Loop
    try:
        summarize_prompt = f"Use the results of our research to provide a comprehensive, self-contained answer for the target query:\n{query}."
        task_completed = False
        iterations = 0

        with timer(f"Operating browser with {reasoning_model.model_type} {reasoning_model.name}", logger):
            while iterations < max_iterations and not task_completed:
                if cancellation_event and cancellation_event.is_set():
                    logger.debug("Browser operator cancelled by client disconnect")
                    cancelled = True
                    break

                iterations += 1

                # 1. Get current environment state
                browser_state = await environment.get_state()

                # 2. Agent decides action(s)
                agent_result = await operator_agent.act(browser_state)

                # 3. Execute actions in the environment
                env_steps: List[EnvStepResult] = []
                for action in agent_result.actions:
                    if cancellation_event and cancellation_event.is_set():
                        logger.debug("Browser operator cancelled by client disconnect")
                        break
                    # Handle request for user action and break the loop
                    if isinstance(action, RequestUserAction):
                        user_input_message = action.request
                        if send_status_func:
                            async for event in send_status_func(f"**Requesting User Input**:\n{action.request}"):
                                yield {ChatEvent.STATUS: event}
                        break
                    env_step = await environment.step(action)
                    env_steps.append(env_step)

                # Render status update
                # Prefer the screenshot taken after the last executed action,
                # else fall back to the state captured before acting.
                latest_screenshot = f"data:image/webp;base64,{env_steps[-1].screenshot_base64 if env_steps else browser_state.screenshot}"
                render_payload = agent_result.rendered_response
                render_payload["image"] = latest_screenshot
                render_content = f"**Action**: {json.dumps(render_payload)}"
                if send_status_func:
                    async for event in send_status_func(f"**Operating Browser**:\n{render_content}"):
                        yield {ChatEvent.STATUS: event}

                # Check if termination conditions are met
                task_completed = not agent_result.actions  # No actions requested by agent
                trigger_iteration_limit = iterations == max_iterations
                if user_input_message:
                    logger.info(f"User input requested: {user_input_message}")
                    break
                if task_completed or trigger_iteration_limit:
                    # Summarize results of operator run on last iteration
                    operator_agent.add_action_results(env_steps, agent_result)
                    summary_message = await operator_agent.summarize(summarize_prompt, browser_state)
                    logger.info(f"Task completed: {task_completed}, Iteration limit: {trigger_iteration_limit}")
                    break

                # 4. Update agent on the results of its action on the environment
                operator_agent.add_action_results(env_steps, agent_result)

            # Determine final response message
            if user_input_message:
                response = user_input_message
            elif task_completed:
                response = summary_message
            elif cancelled:
                # No summary exists on this path, so do not claim the iteration limit was hit.
                response = "Operator was cancelled before the task could be completed."
            else:  # Hit iteration limit
                response = f"Operator hit iteration limit ({max_iterations}). If the results seem incomplete try again, assign a smaller task or try a different approach.\nThese were the results till now:\n{summary_message}"
    finally:
        if environment and not user_input_message:  # Don't close browser if user input required
            await environment.close()
        if operator_agent:
            operator_agent.reset()

    yield {
        "query": query,
        "result": user_input_message or response,
        "webpages": [{"link": url, "snippet": ""} for url in environment.visited_urls],
    }
151
+
152
+
153
def is_operator_model(model: str) -> ChatModel.ModelType | None:
    """Map a model name to the model type of the operator model family it belongs to.

    The name is compared against known operator model name prefixes in
    declaration order. The type of the first matching prefix is returned,
    or None when no prefix matches.
    """
    prefix_to_model_type = {
        "gpt-4o": ChatModel.ModelType.OPENAI,
        "claude-3-7-sonnet": ChatModel.ModelType.ANTHROPIC,
        "claude-sonnet-4": ChatModel.ModelType.ANTHROPIC,
        "claude-opus-4": ChatModel.ModelType.ANTHROPIC,
        "ui-tars-1.5": ChatModel.ModelType.OFFLINE,
    }
    matching_types = (
        model_type for prefix, model_type in prefix_to_model_type.items() if model.startswith(prefix)
    )
    return next(matching_types, None)
@@ -0,0 +1,149 @@
1
+ # --- Standardized Action Models ---
2
+ from typing import List, Literal, Optional, Union
3
+
4
+ from pydantic import BaseModel
5
+
6
+
7
class Point(BaseModel):
    """A point described by float `x` and `y` coordinates."""

    x: float
    y: float
10
+
11
+
12
class BaseAction(BaseModel):
    """Common base for all action models. Subclasses narrow `type` to a literal tag."""

    type: str
14
+
15
+
16
class ClickAction(BaseAction):
    """Click action at position (`x`, `y`) with the given mouse `button`.

    `button` defaults to "left". `modifiers` is an optional string and
    defaults to None, meaning no modifiers were specified.
    """

    type: Literal["click"] = "click"
    x: float
    y: float
    button: Literal["left", "right", "middle", "wheel"] = "left"
    # Declared Optional so the None default matches the annotation and
    # an explicitly passed None passes validation as well.
    modifiers: Optional[str] = None
22
+
23
+
24
class DoubleClickAction(BaseAction):
    """Double click action at position (`x`, `y`)."""

    type: Literal["double_click"] = "double_click"
    x: float
    y: float
28
+
29
+
30
class TripleClickAction(BaseAction):
    """Triple click action at position (`x`, `y`)."""

    type: Literal["triple_click"] = "triple_click"
    x: float
    y: float
34
+
35
+
36
class ScrollAction(BaseAction):
    """Scroll action.

    All fields are optional. A scroll may be described by `scroll_x`/`scroll_y`
    or by `scroll_direction` together with `scroll_amount` (defaults to 2.0).
    NOTE(review): which of these representations takes precedence is decided by
    the consumer of this model, not here. Confirm against the environment code.
    """

    type: Literal["scroll"] = "scroll"
    x: Optional[float] = None
    y: Optional[float] = None
    scroll_x: Optional[float] = None
    scroll_y: Optional[float] = None
    scroll_direction: Optional[Literal["up", "down", "left", "right"]] = None
    scroll_amount: Optional[float] = 2.0
44
+
45
+
46
class KeypressAction(BaseAction):
    """Key press action carrying the list of keys to press."""

    type: Literal["keypress"] = "keypress"
    keys: List[str]  # Standardized on list of keys
49
+
50
+
51
class TypeAction(BaseAction):
    """Type action carrying the `text` to type."""

    type: Literal["type"] = "type"
    text: str
54
+
55
+
56
class WaitAction(BaseAction):
    """Wait action with a `duration` that defaults to 1.0.

    NOTE(review): the unit of `duration` is presumably seconds. Confirm against the consumer.
    """

    type: Literal["wait"] = "wait"
    duration: float = 1.0
59
+
60
+
61
class ScreenshotAction(BaseAction):
    """Screenshot action. Carries no fields besides its `type` tag."""

    type: Literal["screenshot"] = "screenshot"
63
+
64
+
65
class MoveAction(BaseAction):
    """Move action targeting position (`x`, `y`)."""

    type: Literal["move"] = "move"
    x: float
    y: float
69
+
70
+
71
class DragAction(BaseAction):
    """Drag action described by a `path` of points."""

    type: Literal["drag"] = "drag"
    path: List[Point]
74
+
75
+
76
class MouseDownAction(BaseAction):
    """Mouse button down action. `button` defaults to "left"."""

    type: Literal["mouse_down"] = "mouse_down"
    button: Literal["left", "right", "middle"] = "left"
79
+
80
+
81
class MouseUpAction(BaseAction):
    """Mouse button up action. `button` defaults to "left"."""

    type: Literal["mouse_up"] = "mouse_up"
    button: Literal["left", "right", "middle"] = "left"
84
+
85
+
86
class HoldKeyAction(BaseAction):
    """Hold key action: a key combination `text` and a `duration` defaulting to 1.0.

    NOTE(review): the unit of `duration` is presumably seconds. Confirm against the consumer.
    """

    type: Literal["hold_key"] = "hold_key"
    text: str  # xdotool style key combination string
    duration: float = 1.0
90
+
91
+
92
class KeyUpAction(BaseAction):
    """Key up action for a single `key`."""

    type: Literal["key_up"] = "key_up"
    key: str
95
+
96
+
97
class KeyDownAction(BaseAction):
    """Key down action for a single `key`."""

    type: Literal["key_down"] = "key_down"
    key: str
100
+
101
+
102
class CursorPositionAction(BaseAction):
    """Cursor position action. Carries no fields besides its `type` tag."""

    type: Literal["cursor_position"] = "cursor_position"
104
+
105
+
106
class GotoAction(BaseAction):
    """Goto action carrying the target `url`."""

    type: Literal["goto"] = "goto"
    url: str
109
+
110
+
111
class BackAction(BaseAction):
    """Back action. Carries no fields besides its `type` tag."""

    type: Literal["back"] = "back"
113
+
114
+
115
class RequestUserAction(BaseAction):
    """Request user action to confirm or provide input.

    `request` holds the message describing what is asked of the user.
    """

    type: Literal["request_user"] = "request_user"
    request: str
120
+
121
+
122
class NoopAction(BaseAction):
    """No operation action. Carries no fields besides its `type` tag."""

    type: Literal["noop"] = "noop"
126
+
127
+
128
# Union of every concrete action model defined in this module.
# Each member carries a distinct literal `type` tag.
OperatorAction = Union[
    ClickAction,
    DoubleClickAction,
    TripleClickAction,
    ScrollAction,
    KeypressAction,
    TypeAction,
    WaitAction,
    ScreenshotAction,
    MoveAction,
    DragAction,
    MouseDownAction,
    MouseUpAction,
    HoldKeyAction,
    KeyDownAction,
    KeyUpAction,
    CursorPositionAction,
    GotoAction,
    BackAction,
    RequestUserAction,
    NoopAction,
]