zrb-1.8.10-py3-none-any.whl → zrb-1.21.29-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of zrb might be problematic (further details were linked from the original page).

Files changed (147):
  1. zrb/__init__.py +126 -113
  2. zrb/__main__.py +1 -1
  3. zrb/attr/type.py +10 -7
  4. zrb/builtin/__init__.py +2 -50
  5. zrb/builtin/git.py +12 -1
  6. zrb/builtin/group.py +31 -15
  7. zrb/builtin/http.py +7 -8
  8. zrb/builtin/llm/attachment.py +40 -0
  9. zrb/builtin/llm/chat_completion.py +274 -0
  10. zrb/builtin/llm/chat_session.py +152 -85
  11. zrb/builtin/llm/chat_session_cmd.py +288 -0
  12. zrb/builtin/llm/chat_trigger.py +79 -0
  13. zrb/builtin/llm/history.py +7 -9
  14. zrb/builtin/llm/llm_ask.py +221 -98
  15. zrb/builtin/llm/tool/api.py +74 -52
  16. zrb/builtin/llm/tool/cli.py +46 -17
  17. zrb/builtin/llm/tool/code.py +71 -90
  18. zrb/builtin/llm/tool/file.py +301 -241
  19. zrb/builtin/llm/tool/note.py +84 -0
  20. zrb/builtin/llm/tool/rag.py +38 -8
  21. zrb/builtin/llm/tool/sub_agent.py +67 -50
  22. zrb/builtin/llm/tool/web.py +146 -122
  23. zrb/builtin/project/add/fastapp/fastapp_template/my_app_name/_zrb/entity/add_entity_util.py +7 -7
  24. zrb/builtin/project/add/fastapp/fastapp_template/my_app_name/_zrb/module/add_module_util.py +5 -5
  25. zrb/builtin/project/add/fastapp/fastapp_util.py +1 -1
  26. zrb/builtin/searxng/config/settings.yml +5671 -0
  27. zrb/builtin/searxng/start.py +21 -0
  28. zrb/builtin/setup/latex/ubuntu.py +1 -0
  29. zrb/builtin/setup/ubuntu.py +1 -1
  30. zrb/builtin/shell/autocomplete/bash.py +4 -3
  31. zrb/builtin/shell/autocomplete/zsh.py +4 -3
  32. zrb/builtin/todo.py +13 -2
  33. zrb/config/config.py +614 -0
  34. zrb/config/default_prompt/file_extractor_system_prompt.md +112 -0
  35. zrb/config/default_prompt/interactive_system_prompt.md +29 -0
  36. zrb/config/default_prompt/persona.md +1 -0
  37. zrb/config/default_prompt/repo_extractor_system_prompt.md +112 -0
  38. zrb/config/default_prompt/repo_summarizer_system_prompt.md +29 -0
  39. zrb/config/default_prompt/summarization_prompt.md +57 -0
  40. zrb/config/default_prompt/system_prompt.md +38 -0
  41. zrb/config/llm_config.py +339 -0
  42. zrb/config/llm_context/config.py +166 -0
  43. zrb/config/llm_context/config_parser.py +40 -0
  44. zrb/config/llm_context/workflow.py +81 -0
  45. zrb/config/llm_rate_limitter.py +190 -0
  46. zrb/{runner → config}/web_auth_config.py +17 -22
  47. zrb/context/any_shared_context.py +17 -1
  48. zrb/context/context.py +16 -2
  49. zrb/context/shared_context.py +18 -8
  50. zrb/group/any_group.py +12 -5
  51. zrb/group/group.py +67 -3
  52. zrb/input/any_input.py +5 -1
  53. zrb/input/base_input.py +18 -6
  54. zrb/input/option_input.py +13 -1
  55. zrb/input/text_input.py +8 -25
  56. zrb/runner/cli.py +25 -23
  57. zrb/runner/common_util.py +24 -19
  58. zrb/runner/web_app.py +3 -3
  59. zrb/runner/web_route/docs_route.py +1 -1
  60. zrb/runner/web_route/error_page/serve_default_404.py +1 -1
  61. zrb/runner/web_route/error_page/show_error_page.py +1 -1
  62. zrb/runner/web_route/home_page/home_page_route.py +2 -2
  63. zrb/runner/web_route/login_api_route.py +1 -1
  64. zrb/runner/web_route/login_page/login_page_route.py +2 -2
  65. zrb/runner/web_route/logout_api_route.py +1 -1
  66. zrb/runner/web_route/logout_page/logout_page_route.py +2 -2
  67. zrb/runner/web_route/node_page/group/show_group_page.py +1 -1
  68. zrb/runner/web_route/node_page/node_page_route.py +1 -1
  69. zrb/runner/web_route/node_page/task/show_task_page.py +1 -1
  70. zrb/runner/web_route/refresh_token_api_route.py +1 -1
  71. zrb/runner/web_route/static/static_route.py +1 -1
  72. zrb/runner/web_route/task_input_api_route.py +6 -6
  73. zrb/runner/web_route/task_session_api_route.py +20 -12
  74. zrb/runner/web_util/cookie.py +1 -1
  75. zrb/runner/web_util/token.py +1 -1
  76. zrb/runner/web_util/user.py +8 -4
  77. zrb/session/any_session.py +24 -17
  78. zrb/session/session.py +50 -25
  79. zrb/session_state_logger/any_session_state_logger.py +9 -4
  80. zrb/session_state_logger/file_session_state_logger.py +16 -6
  81. zrb/session_state_logger/session_state_logger_factory.py +1 -1
  82. zrb/task/any_task.py +30 -9
  83. zrb/task/base/context.py +17 -9
  84. zrb/task/base/execution.py +15 -8
  85. zrb/task/base/lifecycle.py +8 -4
  86. zrb/task/base/monitoring.py +12 -7
  87. zrb/task/base_task.py +69 -5
  88. zrb/task/base_trigger.py +12 -5
  89. zrb/task/cmd_task.py +1 -1
  90. zrb/task/llm/agent.py +154 -161
  91. zrb/task/llm/agent_runner.py +152 -0
  92. zrb/task/llm/config.py +47 -18
  93. zrb/task/llm/conversation_history.py +209 -0
  94. zrb/task/llm/conversation_history_model.py +67 -0
  95. zrb/task/llm/default_workflow/coding/workflow.md +41 -0
  96. zrb/task/llm/default_workflow/copywriting/workflow.md +68 -0
  97. zrb/task/llm/default_workflow/git/workflow.md +118 -0
  98. zrb/task/llm/default_workflow/golang/workflow.md +128 -0
  99. zrb/task/llm/default_workflow/html-css/workflow.md +135 -0
  100. zrb/task/llm/default_workflow/java/workflow.md +146 -0
  101. zrb/task/llm/default_workflow/javascript/workflow.md +158 -0
  102. zrb/task/llm/default_workflow/python/workflow.md +160 -0
  103. zrb/task/llm/default_workflow/researching/workflow.md +153 -0
  104. zrb/task/llm/default_workflow/rust/workflow.md +162 -0
  105. zrb/task/llm/default_workflow/shell/workflow.md +299 -0
  106. zrb/task/llm/error.py +24 -10
  107. zrb/task/llm/file_replacement.py +206 -0
  108. zrb/task/llm/file_tool_model.py +57 -0
  109. zrb/task/llm/history_processor.py +206 -0
  110. zrb/task/llm/history_summarization.py +11 -166
  111. zrb/task/llm/print_node.py +193 -69
  112. zrb/task/llm/prompt.py +242 -45
  113. zrb/task/llm/subagent_conversation_history.py +41 -0
  114. zrb/task/llm/tool_wrapper.py +260 -57
  115. zrb/task/llm/workflow.py +76 -0
  116. zrb/task/llm_task.py +182 -171
  117. zrb/task/make_task.py +2 -3
  118. zrb/task/rsync_task.py +26 -11
  119. zrb/task/scheduler.py +4 -4
  120. zrb/util/attr.py +54 -39
  121. zrb/util/callable.py +23 -0
  122. zrb/util/cli/markdown.py +12 -0
  123. zrb/util/cli/text.py +30 -0
  124. zrb/util/file.py +29 -11
  125. zrb/util/git.py +8 -11
  126. zrb/util/git_diff_model.py +10 -0
  127. zrb/util/git_subtree.py +9 -14
  128. zrb/util/git_subtree_model.py +32 -0
  129. zrb/util/init_path.py +1 -1
  130. zrb/util/markdown.py +62 -0
  131. zrb/util/string/conversion.py +2 -2
  132. zrb/util/todo.py +17 -50
  133. zrb/util/todo_model.py +46 -0
  134. zrb/util/truncate.py +23 -0
  135. zrb/util/yaml.py +204 -0
  136. zrb/xcom/xcom.py +10 -0
  137. zrb-1.21.29.dist-info/METADATA +270 -0
  138. {zrb-1.8.10.dist-info → zrb-1.21.29.dist-info}/RECORD +140 -98
  139. {zrb-1.8.10.dist-info → zrb-1.21.29.dist-info}/WHEEL +1 -1
  140. zrb/config.py +0 -335
  141. zrb/llm_config.py +0 -411
  142. zrb/llm_rate_limitter.py +0 -125
  143. zrb/task/llm/context.py +0 -102
  144. zrb/task/llm/context_enrichment.py +0 -199
  145. zrb/task/llm/history.py +0 -211
  146. zrb-1.8.10.dist-info/METADATA +0 -264
  147. {zrb-1.8.10.dist-info → zrb-1.21.29.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,84 @@
1
+ import os
2
+
3
+ from zrb.config.llm_context.config import llm_context_config
4
+
5
+
6
+ def read_long_term_note() -> str:
7
+ """
8
+ Retrieves the GLOBAL long-term memory shared across ALL sessions and projects.
9
+
10
+ CRITICAL: Consult this first for user preferences, facts, and cross-project context.
11
+
12
+ Returns:
13
+ str: The current global note content.
14
+ """
15
+ contexts = llm_context_config.get_notes()
16
+ return contexts.get("/", "")
17
+
18
+
19
+ def write_long_term_note(content: str) -> str:
20
+ """
21
+ Persists CRITICAL facts to the GLOBAL long-term memory.
22
+
23
+ USE EAGERLY to save or update:
24
+ - User preferences (e.g., "I prefer Python", "No unit tests").
25
+ - User information (e.g., user name, user email address).
26
+ - Important facts (e.g., "My API key is in .env").
27
+ - Cross-project goals.
28
+ - Anything that will be useful for future interaction across projects.
29
+
30
+ WARNING: This OVERWRITES the entire global note.
31
+
32
+ Args:
33
+ content (str): The text to strictly memorize.
34
+
35
+ Returns:
36
+ str: Confirmation message.
37
+ """
38
+ llm_context_config.write_note(content, "/")
39
+ return "Global long-term note saved."
40
+
41
+
42
+ def read_contextual_note(path: str | None = None) -> str:
43
+ """
44
+ Retrieves LOCAL memory specific to a file or directory path.
45
+
46
+ Use to recall project-specific architecture, code summaries, or past decisions
47
+ relevant to the current working location.
48
+
49
+ Args:
50
+ path (str | None): Target file/dir. Defaults to current working directory (CWD).
51
+
52
+ Returns:
53
+ str: The local note content for the path.
54
+ """
55
+ if path is None:
56
+ path = os.getcwd()
57
+ abs_path = os.path.abspath(path)
58
+ contexts = llm_context_config.get_notes(cwd=abs_path)
59
+ return contexts.get(abs_path, "")
60
+
61
+
62
+ def write_contextual_note(content: str, path: str | None = None) -> str:
63
+ """
64
+ Persists LOCAL facts specific to a file or directory.
65
+
66
+ USE EAGERLY to save or update:
67
+ - Architectural patterns for this project/directory.
68
+ - Summaries of large files or directories.
69
+ - Specific guidelines for this project.
70
+ - Anything related to this directory that will be useful for future interaction.
71
+
72
+ WARNING: This OVERWRITES the note for the specific path.
73
+
74
+ Args:
75
+ content (str): The text to memorize for this location.
76
+ path (str | None): Target file/dir. Defaults to CWD.
77
+
78
+ Returns:
79
+ str: Confirmation message.
80
+ """
81
+ if path is None:
82
+ path = os.getcwd()
83
+ llm_context_config.write_note(content, path)
84
+ return f"Contextual note saved for: {path}"
@@ -5,10 +5,11 @@ import os
5
5
  import sys
6
6
  from collections.abc import Callable
7
7
  from textwrap import dedent
8
+ from typing import Any
8
9
 
9
10
  import ulid
10
11
 
11
- from zrb.config import CFG
12
+ from zrb.config.config import CFG
12
13
  from zrb.util.cli.style import stylize_error, stylize_faint
13
14
  from zrb.util.file import read_file
14
15
 
@@ -43,13 +44,41 @@ def create_rag_from_directory(
43
44
  openai_base_url: str | None = None,
44
45
  openai_embedding_model: str | None = None,
45
46
  ):
46
- """Create a RAG retrieval tool function for LLM use.
47
- This factory configures and returns an async function that takes a query,
48
- updates a vector database if needed, performs a similarity search,
49
- and returns relevant document chunks.
47
+ """
48
+ Create a powerful RAG (Retrieval-Augmented Generation) tool for querying a local
49
+ knowledge base.
50
+
51
+ This factory function generates a tool that performs semantic search over a directory of
52
+ documents. It automatically indexes the documents into a vector database (ChromaDB) and
53
+ keeps it updated as files change.
54
+
55
+ The generated tool is ideal for answering questions based on a specific set of documents,
56
+ such as project documentation or internal wikis.
57
+
58
+ Args:
59
+ tool_name (str): The name for the generated RAG tool (e.g., "search_project_docs").
60
+ tool_description (str): A clear description of what the tool does and when to use it.
61
+ This is what the LLM will see.
62
+ document_dir_path (str, optional): The path to the directory containing the documents
63
+ to be indexed.
64
+ vector_db_path (str, optional): The path where the ChromaDB vector database will be
65
+ stored.
66
+ vector_db_collection (str, optional): The name of the collection within the vector
67
+ database.
68
+ chunk_size (int, optional): The size of text chunks for embedding.
69
+ overlap (int, optional): The overlap between text chunks.
70
+ max_result_count (int, optional): The maximum number of search results to return.
71
+ file_reader (list[RAGFileReader], optional): A list of custom file readers for
72
+ specific file types.
73
+ openai_api_key (str, optional): Your OpenAI API key for generating embeddings.
74
+ openai_base_url (str, optional): An optional base URL for the OpenAI API.
75
+ openai_embedding_model (str, optional): The embedding model to use.
76
+
77
+ Returns:
78
+ An asynchronous function that serves as the RAG tool.
50
79
  """
51
80
 
52
- async def retrieve(query: str) -> str:
81
+ async def retrieve(query: str) -> dict[str, Any]:
53
82
  # Docstring will be set dynamically below
54
83
  from chromadb import PersistentClient
55
84
  from chromadb.config import Settings
@@ -164,7 +193,7 @@ def create_rag_from_directory(
164
193
  query_embeddings=query_vector,
165
194
  n_results=max_result_count_val,
166
195
  )
167
- return json.dumps(results)
196
+ return dict(results)
168
197
 
169
198
  retrieve.__name__ = tool_name
170
199
  retrieve.__doc__ = dedent(
@@ -173,7 +202,8 @@ def create_rag_from_directory(
173
202
  Args:
174
203
  query (str): The user query to search for in documents.
175
204
  Returns:
176
- str: JSON string with search results: {{"ids": [...], "documents": [...], ...}}
205
+ dict[str, Any]: dictionary with search results:
206
+ {{"ids": [...], "documents": [...], ...}}
177
207
  """
178
208
  ).strip()
179
209
  return retrieve
@@ -3,60 +3,70 @@ from collections.abc import Callable
3
3
  from textwrap import dedent
4
4
  from typing import TYPE_CHECKING, Any, Coroutine
5
5
 
6
+ from zrb.context.any_context import AnyContext
7
+ from zrb.task.llm.agent import create_agent_instance
8
+ from zrb.task.llm.agent_runner import run_agent_iteration
9
+ from zrb.task.llm.config import get_model, get_model_settings
10
+ from zrb.task.llm.prompt import get_system_and_user_prompt
11
+ from zrb.task.llm.subagent_conversation_history import (
12
+ get_ctx_subagent_history,
13
+ set_ctx_subagent_history,
14
+ )
15
+
6
16
  if TYPE_CHECKING:
7
17
  from pydantic_ai import Tool
8
- from pydantic_ai.mcp import MCPServer
18
+ from pydantic_ai._agent_graph import HistoryProcessor
9
19
  from pydantic_ai.models import Model
10
20
  from pydantic_ai.settings import ModelSettings
11
- else:
12
- Tool = Any
13
- MCPServer = Any
14
- Model = Any
15
- ModelSettings = Any
21
+ from pydantic_ai.toolsets import AbstractToolset
16
22
 
17
- from zrb.context.any_context import AnyContext
18
- from zrb.task.llm.agent import create_agent_instance, run_agent_iteration
19
- from zrb.task.llm.config import get_model, get_model_settings
20
- from zrb.task.llm.prompt import get_combined_system_prompt
21
-
22
- if TYPE_CHECKING:
23
23
  ToolOrCallable = Tool | Callable
24
- else:
25
- ToolOrCallable = Any
26
24
 
27
25
 
28
26
  def create_sub_agent_tool(
29
27
  tool_name: str,
30
28
  tool_description: str,
31
29
  system_prompt: str | None = None,
32
- model: str | Model | None = None,
33
- model_settings: ModelSettings | None = None,
34
- tools: list[ToolOrCallable] = [],
35
- mcp_servers: list[MCPServer] = [],
36
- ) -> Callable[[AnyContext, str], Coroutine[Any, Any, str]]:
30
+ model: "str | Model | None" = None,
31
+ model_settings: "ModelSettings | None" = None,
32
+ tools: "list[ToolOrCallable]" = [],
33
+ toolsets: list["AbstractToolset[None]"] = [],
34
+ yolo_mode: bool | list[str] | None = None,
35
+ history_processors: list["HistoryProcessor"] | None = None,
36
+ log_indent_level: int = 2,
37
+ agent_name: str | None = None,
38
+ auto_summarize: bool = True,
39
+ remember_history: bool = True,
40
+ ) -> Callable[[AnyContext, str], Coroutine[Any, Any, Any]]:
37
41
  """
38
- Create an LLM "sub-agent" tool function for use by a main LLM agent.
42
+ Create a tool that is another AI agent, capable of handling complex, multi-step sub-tasks.
39
43
 
40
- This factory configures and returns an async function that, when called
41
- by the main agent, instantiates and runs a sub-agent (the sub-agent)
42
- with a given query and returns the sub-agent's final response.
44
+ This factory function generates a tool that, when used, spins up a temporary, specialized
45
+ AI agent. This "sub-agent" has its own system prompt, tools, and context, allowing it to
46
+ focus on accomplishing a specific task without being distracted by the main conversation.
47
+
48
+ This is ideal for delegating complex tasks like analyzing a file or a repository.
43
49
 
44
50
  Args:
45
- tool_name: The name of the tool for the main agent.
46
- tool_description: The description of the tool for the main agent.
47
- sub_agent_system_prompt: The system prompt for the sub-agent.
48
- sub_agent_model: The model for the sub-agent (optional).
49
- sub_agent_model_settings: Model settings for the sub-agent (optional).
50
- sub_agent_tools: A list of tools (Tool instances or callables) for the
51
- sub-agent (optional).
52
- sub_agent_mcp_servers: A list of MCP servers for the sub-agent (optional).
51
+ tool_name (str): The name for the generated sub-agent tool.
52
+ tool_description (str): A clear description of the sub-agent's purpose and when to
53
+ use it. This is what the LLM will see.
54
+ system_prompt (str, optional): The system prompt that will guide the sub-agent's
55
+ behavior.
56
+ model (str | Model, optional): The language model the sub-agent will use.
57
+ model_settings (ModelSettings, optional): Specific settings for the sub-agent's model.
58
+ tools (list, optional): A list of tools that will be exclusively available to the
59
+ sub-agent.
60
+ toolsets (list, optional): A list of Toolsets for the sub-agent.
53
61
 
54
62
  Returns:
55
- An async callable function that takes a context and a query string,
56
- runs the sub-agent, and returns the sub-agent's final message content.
63
+ An asynchronous function that serves as the sub-agent tool. When called, it runs the
64
+ sub-agent with a given query and returns its final result.
57
65
  """
66
+ if agent_name is None:
67
+ agent_name = f"{tool_name}_agent"
58
68
 
59
- async def run_sub_agent(ctx: AnyContext, query: str) -> str:
69
+ async def run_sub_agent(ctx: AnyContext, query: str) -> Any:
60
70
  """
61
71
  Runs the sub-agent with the given query.
62
72
  """
@@ -75,16 +85,13 @@ def create_sub_agent_tool(
75
85
  ctx=ctx,
76
86
  model_settings_attr=model_settings,
77
87
  )
78
-
79
88
  if system_prompt is None:
80
- resolved_system_prompt = get_combined_system_prompt(
89
+ resolved_system_prompt, query = get_system_and_user_prompt(
81
90
  ctx=ctx,
91
+ user_message=query,
82
92
  persona_attr=None,
83
- render_persona=False,
84
93
  system_prompt_attr=None,
85
- render_system_prompt=False,
86
94
  special_instruction_prompt_attr=None,
87
- render_special_instruction_prompt=False,
88
95
  )
89
96
  else:
90
97
  resolved_system_prompt = system_prompt
@@ -95,26 +102,36 @@ def create_sub_agent_tool(
95
102
  system_prompt=resolved_system_prompt,
96
103
  model_settings=resolved_model_settings,
97
104
  tools=tools,
98
- mcp_servers=mcp_servers,
105
+ toolsets=toolsets,
106
+ yolo_mode=yolo_mode,
107
+ history_processors=history_processors,
108
+ auto_summarize=auto_summarize,
99
109
  )
100
-
101
110
  sub_agent_run = None
102
111
  # Run the sub-agent iteration
103
- # Start with an empty history for the sub-agent
112
+ history_list = (
113
+ get_ctx_subagent_history(ctx, agent_name) if remember_history else []
114
+ )
104
115
  sub_agent_run = await run_agent_iteration(
105
116
  ctx=ctx,
106
117
  agent=sub_agent_agent,
107
118
  user_prompt=query,
108
- history_list=[], # Start with empty history for the sub-agent
119
+ attachments=[],
120
+ history_list=history_list,
121
+ log_indent_level=log_indent_level,
109
122
  )
110
-
111
123
  # Return the sub-agent's final message content
112
124
  if sub_agent_run and sub_agent_run.result:
113
- # Return the final message content as a string
114
- return json.dumps({"result": sub_agent_run.result.output})
115
- else:
116
- ctx.log_warning("Sub-agent run did not produce a result.")
117
- return "Sub-agent failed to produce a result."
125
+ # Return the final message content
126
+ if remember_history:
127
+ set_ctx_subagent_history(
128
+ ctx,
129
+ agent_name,
130
+ json.loads(sub_agent_run.result.all_messages_json()),
131
+ )
132
+ return sub_agent_run.result.output
133
+ ctx.log_warning("Sub-agent run did not produce a result.")
134
+ raise ValueError(f"{tool_name} not returning any result")
118
135
 
119
136
  # Set the name and docstring for the callable function
120
137
  run_sub_agent.__name__ = tool_name
@@ -126,7 +143,7 @@ def create_sub_agent_tool(
126
143
  query (str): The query or task for the sub-agent.
127
144
 
128
145
  Returns:
129
- str: The final response or result from the sub-agent.
146
+ Any: The final response or result from the sub-agent.
130
147
  """
131
148
  ).strip()
132
149
 
@@ -1,147 +1,171 @@
1
- import json
2
1
  from collections.abc import Callable
2
+ from typing import Any
3
+ from urllib.parse import urljoin
3
4
 
5
+ from zrb.config.config import CFG
6
+ from zrb.config.llm_config import llm_config
7
+
8
+ _DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa
9
+
10
+
11
+ async def open_web_page(url: str) -> dict[str, Any]:
12
+ """
13
+ Fetches, parses, and converts a web page to readable Markdown.
14
+ Preserves semantic structure, removes non-essentials, and extracts all absolute links.
15
+
16
+ Example:
17
+ open_web_page(url='https://www.example.com/article')
4
18
 
5
- async def open_web_page(url: str) -> str:
6
- """Get parsed text content and links from a web page URL.
7
19
  Args:
8
- url (str): The URL of the web page to open.
20
+ url (str): The full URL of the web page.
21
+
9
22
  Returns:
10
- str: JSON: {"content": "parsed text content", "links_on_page": ["url1", ...]}
23
+ dict: Markdown content and a list of absolute links.
11
24
  """
25
+ html_content, links = await _fetch_page_content(url)
26
+ markdown_content = _convert_html_to_markdown(html_content)
27
+ return {"content": markdown_content, "links_on_page": links}
28
+
29
+
30
+ def create_search_internet_tool() -> Callable:
31
+ if llm_config.default_search_internet_tool is not None:
32
+ return llm_config.default_search_internet_tool
33
+
34
+ def search_internet(query: str, page: int = 1) -> dict[str, Any]:
35
+ """
36
+ Performs an internet search using a search engine.
37
+ Use to find information, answer general knowledge, or research topics.
38
+
39
+ Example:
40
+ search_internet(query='latest AI advancements', page=1)
12
41
 
13
- async def get_page_content(page_url: str):
14
- user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa
15
- try:
16
- from playwright.async_api import async_playwright
17
-
18
- async with async_playwright() as p:
19
- browser = await p.chromium.launch(headless=True)
20
- page = await browser.new_page()
21
- await page.set_extra_http_headers({"User-Agent": user_agent})
22
- try:
23
- # Navigate to the URL with a timeout of 30 seconds
24
- await page.goto(page_url, wait_until="networkidle", timeout=30000)
25
- # Wait for the content to load
26
- await page.wait_for_load_state("domcontentloaded")
27
- # Get the page content
28
- content = await page.content()
29
- # Extract all links from the page
30
- links = await page.eval_on_selector_all(
31
- "a[href]",
32
- """
33
- (elements) => elements.map(el => {
34
- const href = el.getAttribute('href');
35
- if (href && !href.startsWith('#') && !href.startsWith('/')) {
36
- return href;
37
- }
38
- return null;
39
- }).filter(href => href !== null)
40
- """,
41
- )
42
- return {"content": content, "links_on_page": links}
43
- finally:
44
- await browser.close()
45
- except ImportError:
46
- import requests
47
-
48
- response = requests.get(url, headers={"User-Agent": user_agent})
49
- if response.status_code != 200:
50
- msg = f"Unable to retrieve search results. Status code: {response.status_code}"
51
- raise Exception(msg)
52
- return {"content": response.text, "links_on_page": []}
53
-
54
- result = await get_page_content(url)
55
- # Parse the HTML content
56
- return json.dumps(parse_html_text(result["content"]))
57
-
58
-
59
- def create_search_internet_tool(serp_api_key: str) -> Callable[[str, int], str]:
60
- def search_internet(query: str, num_results: int = 10) -> str:
61
- """Search the internet using SerpApi (Google Search) and return parsed results.
62
42
  Args:
63
- query (str): Search query.
64
- num_results (int): Search result count. Defaults to 10.
43
+ query (str): The search query.
44
+ page (int, optional): Search result page number. Defaults to 1.
45
+
65
46
  Returns:
66
- str: JSON: {"content": "parsed text content", "links_on_page": ["url1", ...]}
47
+ dict: Summary of search results (titles, links, snippets).
67
48
  """
68
49
  import requests
69
50
 
70
- response = requests.get(
71
- "https://serpapi.com/search",
72
- headers={
73
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa
74
- },
75
- params={
76
- "q": query,
77
- "num": num_results,
78
- "hl": "en",
79
- "safe": "off",
80
- "api_key": serp_api_key,
81
- },
82
- )
51
+ if (
52
+ CFG.SEARCH_INTERNET_METHOD.strip().lower() == "serpapi"
53
+ and CFG.SERPAPI_KEY != ""
54
+ ):
55
+ response = requests.get(
56
+ "https://serpapi.com/search",
57
+ headers={"User-Agent": _DEFAULT_USER_AGENT},
58
+ params={
59
+ "q": query,
60
+ "start": (page - 1) * 10,
61
+ "hl": CFG.SERPAPI_LANG,
62
+ "safe": CFG.SERPAPI_SAFE,
63
+ "api_key": CFG.SERPAPI_KEY,
64
+ },
65
+ )
66
+ elif (
67
+ CFG.SEARCH_INTERNET_METHOD.strip().lower() == "brave"
68
+ and CFG.BRAVE_API_KEY != ""
69
+ ):
70
+ response = requests.get(
71
+ "https://api.search.brave.com/res/v1/web/search",
72
+ headers={
73
+ "User-Agent": _DEFAULT_USER_AGENT,
74
+ "Accept": "application/json",
75
+ "x-subscription-token": CFG.BRAVE_API_KEY,
76
+ },
77
+ params={
78
+ "q": query,
79
+ "count": "10",
80
+ "offset": (page - 1) * 10,
81
+ "safesearch": CFG.BRAVE_API_SAFE,
82
+ "search_lang": CFG.BRAVE_API_LANG,
83
+ "summary": "true",
84
+ },
85
+ )
86
+ else:
87
+ response = requests.get(
88
+ url=f"{CFG.SEARXNG_BASE_URL}/search",
89
+ headers={"User-Agent": _DEFAULT_USER_AGENT},
90
+ params={
91
+ "q": query,
92
+ "format": "json",
93
+ "pageno": page,
94
+ "safesearch": CFG.SEARXNG_SAFE,
95
+ "language": CFG.SEARXNG_LANG,
96
+ },
97
+ )
83
98
  if response.status_code != 200:
84
99
  raise Exception(
85
100
  f"Error: Unable to retrieve search results (status code: {response.status_code})" # noqa
86
101
  )
87
- return json.dumps(parse_html_text(response.text))
102
+ return response.json()
88
103
 
89
104
  return search_internet
90
105
 
91
106
 
92
- def search_wikipedia(query: str) -> str:
93
- """Search Wikipedia using its API.
94
- Args:
95
- query (str): Search query.
96
- Returns:
97
- str: JSON from Wikipedia API: {"batchcomplete": ..., "query": {"search": [...]}}
98
- """
99
- import requests
100
-
101
- params = {"action": "query", "list": "search", "srsearch": query, "format": "json"}
102
- response = requests.get("https://en.wikipedia.org/w/api.php", params=params)
103
- return response.json()
104
-
105
-
106
- def search_arxiv(query: str, num_results: int = 10) -> str:
107
- """Search ArXiv for papers using its API.
108
- Args:
109
- query (str): Search query.
110
- num_results (int): Search result count. Defaults to 10.
111
- Returns:
112
- str: XML string from ArXiv API containing search results.
113
- """
114
- import requests
115
-
116
- params = {"search_query": f"all:{query}", "start": 0, "max_results": num_results}
117
- response = requests.get("http://export.arxiv.org/api/query", params=params)
118
- return response.content
119
-
107
+ async def _fetch_page_content(url: str) -> tuple[str, list[str]]:
108
+ """Fetches the HTML content and all absolute links from a URL."""
109
+ try:
110
+ from playwright.async_api import async_playwright
111
+
112
+ async with async_playwright() as p:
113
+ browser = await p.chromium.launch(headless=True)
114
+ page = await browser.new_page()
115
+ await page.set_extra_http_headers({"User-Agent": _DEFAULT_USER_AGENT})
116
+ try:
117
+ await page.goto(url, wait_until="networkidle", timeout=30000)
118
+ await page.wait_for_load_state("domcontentloaded")
119
+ content = await page.content()
120
+ links = await page.eval_on_selector_all(
121
+ "a[href]",
122
+ """
123
+ (elements, baseUrl) => elements.map(el => {
124
+ const href = el.getAttribute('href');
125
+ if (!href || href.startsWith('#')) return null;
126
+ try {
127
+ return new URL(href, baseUrl).href;
128
+ } catch (e) {
129
+ return null;
130
+ }
131
+ }).filter(href => href !== null)
132
+ """,
133
+ url,
134
+ )
135
+ return content, links
136
+ # return json.dumps({"content": content, "links": links})
137
+ finally:
138
+ await browser.close()
139
+ except Exception:
140
+ import requests
141
+ from bs4 import BeautifulSoup
120
142
 
121
- def parse_html_text(html_text: str) -> dict[str, str]:
143
+ response = requests.get(url, headers={"User-Agent": _DEFAULT_USER_AGENT})
144
+ if response.status_code != 200:
145
+ raise Exception(
146
+ f"Unable to retrieve page content. Status code: {response.status_code}"
147
+ )
148
+ content = response.text
149
+ soup = BeautifulSoup(content, "html.parser")
150
+ links = [
151
+ urljoin(url, a["href"])
152
+ for a in soup.find_all("a", href=True)
153
+ if not a["href"].startswith("#")
154
+ ]
155
+ return content, links
156
+ # return json.dumps({"content": content, "links": links})
157
+
158
+
159
+ def _convert_html_to_markdown(html_text: str) -> str:
160
+ """Converts HTML content to a clean Markdown representation."""
122
161
  from bs4 import BeautifulSoup
162
+ from markdownify import markdownify as md
123
163
 
124
- ignored_tags = [
125
- "script",
126
- "link",
127
- "meta",
128
- "style",
129
- "code",
130
- "footer",
131
- "nav",
132
- "header",
133
- "aside",
134
- ]
135
164
  soup = BeautifulSoup(html_text, "html.parser")
136
- links = []
137
- for anchor in soup.find_all("a"):
138
- if not anchor or "href" not in anchor.attrs:
139
- continue
140
- link: str = anchor["href"]
141
- if link.startswith("#") or link.startswith("/"):
142
- continue
143
- links.append(link)
144
- for tag in soup(ignored_tags):
165
+ # Remove non-content tags
166
+ for tag in soup(
167
+ ["script", "link", "meta", "style", "header", "footer", "nav", "aside"]
168
+ ):
145
169
  tag.decompose()
146
- html_text = soup.get_text(separator=" ", strip=True)
147
- return {"content": html_text, "links_on_page": links}
170
+ # Convert the cleaned HTML to Markdown
171
+ return md(str(soup))