noesium 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- noesium/agents/askura_agent/__init__.py +22 -0
- noesium/agents/askura_agent/askura_agent.py +480 -0
- noesium/agents/askura_agent/conversation.py +164 -0
- noesium/agents/askura_agent/extractor.py +175 -0
- noesium/agents/askura_agent/memory.py +14 -0
- noesium/agents/askura_agent/models.py +239 -0
- noesium/agents/askura_agent/prompts.py +202 -0
- noesium/agents/askura_agent/reflection.py +234 -0
- noesium/agents/askura_agent/summarizer.py +30 -0
- noesium/agents/askura_agent/utils.py +6 -0
- noesium/agents/deep_research/__init__.py +13 -0
- noesium/agents/deep_research/agent.py +398 -0
- noesium/agents/deep_research/prompts.py +84 -0
- noesium/agents/deep_research/schemas.py +42 -0
- noesium/agents/deep_research/state.py +54 -0
- noesium/agents/search/__init__.py +5 -0
- noesium/agents/search/agent.py +474 -0
- noesium/agents/search/state.py +28 -0
- noesium/core/__init__.py +1 -1
- noesium/core/agent/base.py +10 -2
- noesium/core/goalith/decomposer/llm_decomposer.py +1 -1
- noesium/core/llm/__init__.py +1 -1
- noesium/core/llm/base.py +2 -2
- noesium/core/llm/litellm.py +42 -21
- noesium/core/llm/llamacpp.py +25 -4
- noesium/core/llm/ollama.py +43 -22
- noesium/core/llm/openai.py +25 -5
- noesium/core/llm/openrouter.py +1 -1
- noesium/core/toolify/base.py +9 -2
- noesium/core/toolify/config.py +2 -2
- noesium/core/toolify/registry.py +21 -5
- noesium/core/tracing/opik_tracing.py +7 -7
- noesium/core/vector_store/__init__.py +2 -2
- noesium/core/vector_store/base.py +1 -1
- noesium/core/vector_store/pgvector.py +10 -13
- noesium/core/vector_store/weaviate.py +2 -1
- noesium/toolkits/__init__.py +1 -0
- noesium/toolkits/arxiv_toolkit.py +310 -0
- noesium/toolkits/audio_aliyun_toolkit.py +441 -0
- noesium/toolkits/audio_toolkit.py +370 -0
- noesium/toolkits/bash_toolkit.py +332 -0
- noesium/toolkits/document_toolkit.py +454 -0
- noesium/toolkits/file_edit_toolkit.py +552 -0
- noesium/toolkits/github_toolkit.py +395 -0
- noesium/toolkits/gmail_toolkit.py +575 -0
- noesium/toolkits/image_toolkit.py +425 -0
- noesium/toolkits/memory_toolkit.py +398 -0
- noesium/toolkits/python_executor_toolkit.py +334 -0
- noesium/toolkits/search_toolkit.py +451 -0
- noesium/toolkits/serper_toolkit.py +623 -0
- noesium/toolkits/tabular_data_toolkit.py +537 -0
- noesium/toolkits/user_interaction_toolkit.py +365 -0
- noesium/toolkits/video_toolkit.py +168 -0
- noesium/toolkits/wikipedia_toolkit.py +420 -0
- {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/METADATA +56 -48
- {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/RECORD +59 -23
- {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/licenses/LICENSE +1 -1
- {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/WHEEL +0 -0
- {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Prompts for the DeepResearchAgent agent.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# Prompt for the query-generation step.
# Placeholders filled via str.format(): {number_queries}, {current_date},
# {research_topic}. The doubled braces ({{ }}) escape the literal JSON
# example so .format() leaves it intact.
# Fix: the original said "ALL three of these exact keys" but only two keys
# ("rationale" and "query") are listed and expected by the schema.
query_writer_instructions = """Your goal is to generate sophisticated and diverse web search queries for research. These queries are intended for an advanced automated web research tool capable of analyzing complex results, following links, and synthesizing information.

Instructions:
- Always prefer a single search query, only add another query if the original question requests multiple aspects or elements and one query is not enough.
- Each query should focus on one specific aspect of the original question.
- Don't produce more than {number_queries} queries.
- Each query should be LESS than 40 characters.
- Rewritten queries should be in the same language as the original query.
- Queries should be diverse, if the topic is broad, generate more than 1 query.
- Don't generate multiple similar queries, 1 is enough.
- Query should ensure that the most current information is gathered. The current date is {current_date}.

Format:
- Format your response as a JSON object with BOTH of these exact keys:
   - "rationale": Brief explanation of why these queries are relevant
   - "query": A list of search queries

Example:

Topic: Research the latest developments in renewable energy
```json
{{
    "rationale": "To gather comprehensive information about renewable energy developments, we need current data on technological advances, market trends, and policy updates. These queries target the specific information needed for thorough research.",
    "query": ["renewable energy development 2024", "solar power technology advances", "renewable energy market trends"]
}}
```

Context: {research_topic}"""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
reflection_instructions = """You are an expert research assistant analyzing summaries about "{research_topic}".
|
|
36
|
+
|
|
37
|
+
Instructions:
|
|
38
|
+
- Identify knowledge gaps or areas that need deeper exploration and generate a follow-up query. (1 or multiple).
|
|
39
|
+
- If provided summaries are sufficient to answer the user's question, don't generate a follow-up query.
|
|
40
|
+
- The follow-up query should be less than 40 characters.
|
|
41
|
+
- The follow-up query should be in the same language as the original query.
|
|
42
|
+
- Focus on gathering comprehensive and accurate information relevant to the research topic.
|
|
43
|
+
|
|
44
|
+
Output Format:
|
|
45
|
+
- Format your response as a JSON object with these exact keys:
|
|
46
|
+
- "is_sufficient": true or false
|
|
47
|
+
- "knowledge_gap": Describe what information is missing or needs clarification
|
|
48
|
+
- "follow_up_queries": Write a specific question to address this gap
|
|
49
|
+
|
|
50
|
+
Example:
|
|
51
|
+
```json
|
|
52
|
+
{{
|
|
53
|
+
"is_sufficient": true, // or false
|
|
54
|
+
"knowledge_gap": "The summary lacks information about recent developments and current market conditions", // "" if is_sufficient is true
|
|
55
|
+
"follow_up_queries": ["example follow-up query"] // [] if is_sufficient is true
|
|
56
|
+
}}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Reflect carefully on the Summaries to identify knowledge gaps and produce a follow-up query. Then, produce your output following this JSON format:
|
|
60
|
+
|
|
61
|
+
Summaries:
|
|
62
|
+
{summaries}
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
answer_instructions = """Generate a high-quality answer to the user's question based on the provided summaries.
|
|
67
|
+
|
|
68
|
+
Instructions:
|
|
69
|
+
- The current date is {current_date}.
|
|
70
|
+
- You are the final step of a multi-step research process, don't mention that you are the final step.
|
|
71
|
+
- You have access to all the information gathered from the previous steps.
|
|
72
|
+
- You have access to the user's question.
|
|
73
|
+
- Generate a high-quality answer to the user's question based on the provided summaries and the user's question.
|
|
74
|
+
- You MUST include all the citations (if available in the summaries) in the answer correctly.
|
|
75
|
+
- DO NOT mention summary indicators in the answer.
|
|
76
|
+
- Structure the answer logically and comprehensively.
|
|
77
|
+
- Include specific details, facts, and current information when available.
|
|
78
|
+
- Provide actionable insights and practical information when relevant.
|
|
79
|
+
|
|
80
|
+
User Context:
|
|
81
|
+
- {research_topic}
|
|
82
|
+
|
|
83
|
+
Summaries:
|
|
84
|
+
{summaries}"""
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pydantic schemas for structured LLM output in the DeepResearchAgent module.
|
|
3
|
+
Enhanced for use with instructor library.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
# instructor is an optional dependency. When present, the schemas below
# subclass instructor's OpenAISchema; otherwise they fall back to plain
# pydantic BaseModel, and INSTRUCTOR_AVAILABLE lets callers detect the
# degraded mode.
try:
    from instructor import OpenAISchema

    INSTRUCTOR_AVAILABLE = True
except ImportError:
    # Fallback: use BaseModel if instructor is not available
    OpenAISchema = BaseModel
    INSTRUCTOR_AVAILABLE = False
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class SearchQueryList(OpenAISchema):
    """Schema for search query generation using instructor.

    Structured LLM output: the generated web-search queries plus a short
    rationale for why they were chosen.
    """

    # The search queries to run (one focused aspect per query).
    query: List[str] = Field(
        description="A list of search queries to be used for web research. Each query should be specific and focused on one aspect of the research topic."
    )
    # Brief justification for the chosen queries.
    rationale: str = Field(
        description="A brief explanation of why these queries are relevant to the research topic and how they will help gather comprehensive information."
    )
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Reflection(OpenAISchema):
    """Schema for reflection and evaluation using instructor.

    Structured LLM output for the reflection step: whether research is
    complete, what is missing, and which follow-up queries to run next.
    """

    # True when the summaries already answer the question.
    is_sufficient: bool = Field(
        description="Whether the provided summaries are sufficient to answer the user's question comprehensively."
    )
    # What is still missing (expected empty/irrelevant when is_sufficient).
    knowledge_gap: str = Field(
        description="A detailed description of what information is missing or needs clarification to provide a complete answer."
    )
    # Queries to run next to close the gap.
    follow_up_queries: List[str] = Field(
        description="A list of specific follow-up queries to address the identified knowledge gap. Each query should be focused and actionable."
    )
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""
|
|
2
|
+
State definitions for the DeepResearchAgent agent.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import operator
|
|
8
|
+
from typing import Any, Dict, List, TypedDict
|
|
9
|
+
|
|
10
|
+
from langgraph.graph import add_messages
|
|
11
|
+
from typing_extensions import Annotated
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ResearchState(TypedDict):
    """Main state for the research workflow."""

    # Conversation messages; merged across graph steps by add_messages.
    messages: Annotated[list, add_messages]
    # Search queries; operator.add concatenates contributions from steps.
    search_query: Annotated[list, operator.add]
    # Per-query result summaries, accumulated via operator.add.
    search_summaries: Annotated[list, operator.add]
    # Source records gathered during research, accumulated via operator.add.
    sources_gathered: Annotated[list, operator.add]
    # How many queries to generate in the first round.
    initial_search_query_count: int
    # Upper bound on research iterations.
    max_research_loops: int
    # Iterations completed so far.
    research_loop_count: int
    # Arbitrary extra context for the run.
    context: Dict[str, Any]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Query(TypedDict):
    """Individual query with rationale."""

    # The search query text.
    query: str
    # Why this query was generated.
    rationale: str
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class QueryState(TypedDict):
    """State for query generation."""

    # The generated queries with their rationales.
    query_list: List[Query]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class WebSearchState(TypedDict):
    """State for web search operations."""

    # The query to execute in this search branch.
    search_query: str
    # Identifier for this search branch (string-typed).
    id: str
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ReflectionState(TypedDict):
    """State for reflection and evaluation."""

    # Whether gathered summaries already answer the question.
    is_sufficient: bool
    # Description of what information is still missing.
    knowledge_gap: str
    # Follow-up queries; operator.add concatenates contributions.
    follow_up_queries: Annotated[list, operator.add]
    # Research iterations completed so far.
    research_loop_count: int
    # Count of queries already executed.
    number_of_ran_queries: int
|
|
@@ -0,0 +1,474 @@
|
|
|
1
|
+
from typing import Dict, List, Optional, Type, override
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
from langchain_core.runnables import RunnableConfig
|
|
5
|
+
from langgraph.graph import END, START, StateGraph
|
|
6
|
+
|
|
7
|
+
LANGCHAIN_AVAILABLE = True
|
|
8
|
+
except ImportError:
|
|
9
|
+
RunnableConfig = None
|
|
10
|
+
StateGraph = None
|
|
11
|
+
END = None
|
|
12
|
+
START = None
|
|
13
|
+
LANGCHAIN_AVAILABLE = False
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
from wizsearch import PageCrawler, WizSearch, WizSearchConfig
|
|
17
|
+
|
|
18
|
+
WIZSEARCH_AVAILABLE = True
|
|
19
|
+
except ImportError:
|
|
20
|
+
PageCrawler = None
|
|
21
|
+
WizSearch = None
|
|
22
|
+
WizSearchConfig = None
|
|
23
|
+
WIZSEARCH_AVAILABLE = False
|
|
24
|
+
|
|
25
|
+
from noesium.core.agent import BaseGraphicAgent
|
|
26
|
+
from noesium.core.llm import BaseLLMClient
|
|
27
|
+
from noesium.core.utils.logging import get_logger
|
|
28
|
+
|
|
29
|
+
from .state import SearchState
|
|
30
|
+
|
|
31
|
+
# Configure logging
|
|
32
|
+
logger = get_logger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SearchAgent(BaseGraphicAgent):
    """Web search agent with optional AI crawling.

    Runs a LangGraph workflow:
    polish_query -> web_search -> crawl_web -> [rank_results] -> finalize_search.
    Query polishing, content crawling and result reranking are all opt-in
    via constructor flags.
    """

    def __init__(
        self,
        polish_query: bool = False,
        llm_provider: str = "openai",
        rerank_results: bool = False,
        rerank_llm: BaseLLMClient | None = None,
        search_engines: Optional[List[str]] = None,
        max_results_per_engine: int = 5,
        search_timeout: int = 20,
        crawl_content: bool = False,
        content_format: str = "markdown",
        adaptive_crawl: bool = False,
        crawl_depth: int = 1,
        crawl_external_links: bool = False,
        **kwargs,
    ):
        """Initialize the search agent and compile its workflow graph.

        Args:
            polish_query: If True, rewrite the query with the LLM before searching.
            llm_provider: LLM provider name forwarded to the base class.
            rerank_results: If True, re-order results by relevance after crawling.
            rerank_llm: LLM client used for reranking; defaults to ``self.llm``.
            search_engines: Engines to query; defaults to ["tavily", "duckduckgo"].
                (``None`` sentinel replaces the original mutable default list,
                which was shared across instances.)
            max_results_per_engine: Maximum results requested per engine.
            search_timeout: Search timeout in seconds.
            crawl_content: If True, crawl each result URL for full page content.
            content_format: Format for crawled content (e.g. "markdown").
            adaptive_crawl: Enable adaptive crawling in PageCrawler.
            crawl_depth: Link-following depth when crawling.
            crawl_external_links: Whether the crawler may follow external links.
            **kwargs: Additional options forwarded to the base class.
        """
        # Initialize base class
        super().__init__(llm_provider=llm_provider, **kwargs)
        self.enable_query_polishing = polish_query
        self.enable_reranking = rerank_results
        self.rerank_llm = rerank_llm if rerank_llm else self.llm
        # Copy the caller's list so later mutation cannot leak across instances.
        self.search_engines = list(search_engines) if search_engines is not None else ["tavily", "duckduckgo"]
        self.max_results_per_engine = max_results_per_engine
        self.search_timeout = search_timeout
        self.crawl_content = crawl_content
        self.content_format = content_format
        self.adaptive_crawl = adaptive_crawl
        self.crawl_depth = crawl_depth
        self.crawl_external_links = crawl_external_links

        # Build the graph
        self.graph = self._build_graph()

    @override
    async def run(
        self, user_message: str, context: Optional[Dict] = None, config: Optional[RunnableConfig] = None
    ) -> str:
        """
        Run the SearchAgent with a user message as search query.

        Args:
            user_message: The search query to execute
            context: Optional context dictionary (unused currently)
            config: Optional RunnableConfig for graph execution

        Returns:
            str: Formatted search results as a string
        """
        try:
            # Create initial state for the search workflow
            initial_state = {
                "search_query": user_message.strip(),
                "raw_query": "",
                "search_results": None,
                "messages": [],
            }

            logger.info(f"Starting search workflow for query: '{user_message}'")

            # Execute the search workflow
            if config:
                result = await self.graph.ainvoke(initial_state, config=config)
            else:
                result = await self.graph.ainvoke(initial_state)

            # Format and return the results
            return self._format_search_results(result)

        except Exception as e:
            error_msg = f"Search workflow failed: {str(e)}"
            logger.error(error_msg)
            return f"❌ {error_msg}"

    def _format_search_results(self, result: dict) -> str:
        """
        Format search results into a readable string.

        Args:
            result: The result dictionary from the graph execution

        Returns:
            str: Formatted search results
        """
        if not result or "search_results" not in result or not result["search_results"]:
            return "❌ No search results found."

        search_results = result["search_results"]
        output_lines = []

        # Header with query information
        query = result.get("search_query", "unknown")
        raw_query = result.get("raw_query", "")

        output_lines.append(f"🔍 Search Results for: '{query}'")
        if raw_query and raw_query != query:
            output_lines.append(f"📝 Original query: '{raw_query}'")

        output_lines.append("")

        # Search statistics
        total_sources = len(search_results.sources) if search_results.sources else 0
        sources_with_content = sum(1 for source in (search_results.sources or []) if source.content)

        output_lines.append(f"📊 Found {total_sources} results")
        if self.crawl_content and sources_with_content > 0:
            output_lines.append(f"📄 {sources_with_content} sources with crawled content")

        # Response time if available
        if search_results.response_time:
            output_lines.append(f"⏱️ Response time: {search_results.response_time:.2f}s")

        output_lines.append("")

        # Direct answer if available
        if search_results.answer:
            output_lines.append("💡 Direct Answer:")
            output_lines.append(search_results.answer)
            output_lines.append("")

        # Search results
        if search_results.sources:
            output_lines.append("📋 Sources:")
            for i, source in enumerate(search_results.sources, 1):
                output_lines.append(f"\n{i}. **{source.title}**")
                output_lines.append(f" 🔗 {source.url}")

                if source.score:
                    output_lines.append(f" ⭐ Score: {source.score:.3f}")

                if not source.content and self.crawl_content:
                    output_lines.append(" 📝 Content: [Crawling failed]")

        # Features used
        features = []
        if self.enable_query_polishing:
            features.append("Query Polishing")
        if self.enable_reranking:
            features.append("Result Reranking")
        if self.crawl_content:
            features.append(f"Content Crawling ({self.content_format})")

        if features:
            output_lines.append(f"\n🔧 Features used: {', '.join(features)}")

        return "\n".join(output_lines)

    @override
    def get_state_class(self) -> Type:
        """
        Get the state class for this search agent.
        Override this method in subclasses for specialized state.

        Returns:
            The state class to use for the search workflow
        """
        return SearchState

    @override
    def _build_graph(self) -> StateGraph:
        """Create the LangGraph search workflow."""
        state_class = self.get_state_class()
        workflow = StateGraph(state_class)

        # Add nodes
        workflow.add_node("polish_query", self._polish_query_node)
        workflow.add_node("web_search", self._web_search_node)
        workflow.add_node("crawl_web", self._crawl_web_node)
        workflow.add_node("rank_results", self._rank_results_node)
        workflow.add_node("finalize_search", self._finalize_search_node)

        # Set entry point
        workflow.add_edge(START, "polish_query")

        # Linear edges, plus a conditional branch that skips reranking when
        # it is disabled or there is nothing to rank.
        workflow.add_edge("polish_query", "web_search")
        workflow.add_edge("web_search", "crawl_web")
        workflow.add_conditional_edges("crawl_web", self._evaluate_crawl_web, ["rank_results", "finalize_search"])
        workflow.add_edge("rank_results", "finalize_search")
        workflow.add_edge("finalize_search", END)

        return workflow.compile()

    def _polish_query_node(self, state: SearchState, config: RunnableConfig) -> SearchState:
        """Polish the query using LLM to improve search effectiveness.

        No-op when query polishing is disabled. On failure the original
        query is kept.
        """
        if self.enable_query_polishing:
            try:
                # Save original query
                state["raw_query"] = state["search_query"]

                # Create a prompt to polish the query.
                # (Fixed: the original list numbered two items "4.".)
                polish_prompt = f"""
You are a search query optimization expert. Your task is to improve the given search query to make it more effective for web search engines.

Original query: "{state['search_query']}"

Please provide an improved version that:
1. Uses more specific and relevant keywords
2. Removes unnecessary words or ambiguity
3. Maintains the original intent
4. Is optimized for search engines
5. The query is less than 40 characters

Return only the improved query, nothing else.
"""

                # Polish the query using LLM
                polished_query = self.llm.completion(
                    messages=[{"role": "user", "content": polish_prompt}], temperature=0.3, max_tokens=100
                )

                # Update the search query if polishing was successful
                if polished_query and polished_query.strip():
                    state["search_query"] = polished_query.strip()
                    logger.info(f"Query polished from '{state['raw_query']}' to '{state['search_query']}'")
                else:
                    logger.warning("Query polishing failed, using original query")

            except Exception as e:
                logger.error(f"Error polishing query: {e}")
                # Keep original query if polishing fails
                state["raw_query"] = state["search_query"]

        return state

    async def _web_search_node(self, state: SearchState, config: RunnableConfig) -> SearchState:
        """Perform a web search via wizsearch and store the result in state."""
        try:
            # Local name avoids shadowing the RunnableConfig parameter.
            search_config = WizSearchConfig(
                enabled_engines=self.search_engines,
                max_results_per_engine=self.max_results_per_engine,
                timeout=self.search_timeout,
            )
            omnisearch = WizSearch(config=search_config)
            result = await omnisearch.search(query=state["search_query"])
            state["search_results"] = result
            return state
        except Exception as e:
            logger.error(f"Failed to search: {e}")
            raise

    async def _crawl_web_node(self, state: SearchState, config: RunnableConfig) -> SearchState:
        """Crawl each result URL for full page content (when enabled)."""
        # Guard: search may have produced no results; nothing to crawl then.
        if self.crawl_content and state.get("search_results") and state["search_results"].sources:
            for source in state["search_results"].sources:
                crawler = PageCrawler(
                    url=source.url,
                    external_links=self.crawl_external_links,
                    content_format=self.content_format,
                    adaptive_crawl=self.adaptive_crawl,
                    depth=self.crawl_depth,
                )
                content = await crawler.crawl()
                source.content = content
        return state

    def _evaluate_crawl_web(self, state: SearchState, config: RunnableConfig) -> str:
        """Evaluate the crawl web results and decide next step.

        If there are valid search results and reranking is enabled, rank the results.
        Otherwise, finalize the search.

        Args:
            state: The state of the search.
            config: The config of the search.

        Returns:
            str: The next node to execute ('rank_results' or 'finalize_search')
        """
        # Check if we have search results
        if "search_results" not in state or not state["search_results"] or not state["search_results"].sources:
            logger.warning("No search results found, proceeding to finalize")
            return "finalize_search"

        # Check if reranking is enabled and we have multiple results
        if self.enable_reranking and len(state["search_results"].sources) > 1:
            logger.info(f"Reranking enabled with {len(state['search_results'].sources)} results")
            return "rank_results"
        else:
            logger.info("Skipping reranking, proceeding to finalize")
            return "finalize_search"

    def _rank_results_node(self, state: SearchState, config: RunnableConfig) -> SearchState:
        """Rank the search results using LLM to improve relevance ordering.

        Prefers the rerank LLM's native ``rerank`` method when available,
        falling back to prompt-based reranking otherwise. Errors never
        propagate: the original ordering is kept on any failure.
        """
        try:
            if not state["search_results"] or not state["search_results"].sources:
                logger.warning("No search results to rank")
                return state

            # Create documents for reranking
            documents = []
            for source in state["search_results"].sources:
                # Create a text representation for ranking
                doc_text = f"Title: {source.title}\nURL: {source.url}"
                if source.content:
                    # Truncate content to avoid token limits
                    content_preview = source.content[:500] + "..." if len(source.content) > 500 else source.content
                    doc_text += f"\nContent: {content_preview}"
                documents.append(doc_text)

            # Use the query for reranking
            query = state.get("search_query", "")

            if not query:
                logger.warning("No query available for reranking")
                return state

            # Check if LLM has rerank capability
            if hasattr(self.rerank_llm, "rerank"):
                try:
                    # Use built-in rerank function if available
                    reranked_result = self.rerank_llm.rerank(query=query, chunks=documents)

                    if reranked_result:
                        # Check if rerank returns indices (List[int]) or reranked chunks (List[str])
                        if isinstance(reranked_result[0], int):
                            # Handle case where rerank returns indices
                            reranked_sources = [state["search_results"].sources[i] for i in reranked_result]
                        else:
                            # Handle case where rerank returns reranked chunks
                            # Create a mapping from documents to sources
                            doc_to_source = {
                                doc: source for doc, source in zip(documents, state["search_results"].sources)
                            }
                            reranked_sources = [
                                doc_to_source[chunk] for chunk in reranked_result if chunk in doc_to_source
                            ]

                        state["search_results"].sources = reranked_sources
                        logger.info(f"Reranked {len(reranked_sources)} results")

                except Exception as e:
                    logger.error(f"Built-in reranking failed: {e}")
                    # Fall back to LLM-based reranking
                    self._llm_based_reranking(state, query, documents)
            else:
                # Use LLM-based reranking
                self._llm_based_reranking(state, query, documents)

        except Exception as e:
            logger.error(f"Error in ranking results: {e}")

        return state

    def _llm_based_reranking(self, state: SearchState, query: str, documents: list) -> None:
        """Perform LLM-based reranking of search results.

        Asks the rerank LLM for a comma-separated permutation of result
        numbers and reorders ``state["search_results"].sources`` in place.
        Keeps the original order when the response is missing or invalid.
        """
        try:
            # Create reranking prompt
            docs_text = "\n\n".join([f"{i+1}. {doc}" for i, doc in enumerate(documents)])

            rerank_prompt = f"""
You are a search result ranking expert. Given a search query and a list of search results,
please rank them in order of relevance to the query.

Query: "{query}"

Search Results:
{docs_text}

Please provide the ranking as a comma-separated list of numbers (1-{len(documents)})
ordered from most relevant to least relevant. For example: 3,1,2,4

Only return the ranking numbers, nothing else.
"""

            ranking_response = self.rerank_llm.completion(
                messages=[{"role": "user", "content": rerank_prompt}], temperature=0.1, max_tokens=50
            )

            if ranking_response:
                # Parse the ranking response
                try:
                    ranking_str = ranking_response.strip()
                    ranking_indices = [int(x.strip()) - 1 for x in ranking_str.split(",")]

                    # Validate: right length, in range, and a true permutation.
                    if (
                        len(ranking_indices) == len(documents)
                        and all(0 <= i < len(documents) for i in ranking_indices)
                        and len(set(ranking_indices)) == len(ranking_indices)
                    ):

                        # Reorder sources based on LLM ranking
                        reranked_sources = [state["search_results"].sources[i] for i in ranking_indices]
                        state["search_results"].sources = reranked_sources
                        logger.info(f"LLM-based reranking completed for {len(reranked_sources)} results")
                    else:
                        logger.warning("Invalid ranking response, keeping original order")

                except (ValueError, IndexError) as e:
                    logger.warning(f"Failed to parse ranking response '{ranking_response}': {e}")

        except Exception as e:
            logger.error(f"LLM-based reranking failed: {e}")

    def _finalize_search_node(self, state: SearchState, config: RunnableConfig) -> SearchState:
        """Finalize the search results and prepare final response.

        Logs summary statistics, appends an AI summary message when the
        state tracks messages, and truncates very long crawled content.
        """
        try:
            # Ensure we have search results
            if "search_results" not in state or not state["search_results"]:
                logger.warning("No search results to finalize")
                return state

            # Log final statistics
            total_sources = len(state["search_results"].sources) if state["search_results"].sources else 0
            sources_with_content = sum(1 for source in (state["search_results"].sources or []) if source.content)

            logger.info(f"Search finalized: {total_sources} total sources, {sources_with_content} with crawled content")

            # Add final message to state if messages are being tracked
            if "messages" in state:
                from langchain_core.messages import AIMessage

                # Create summary message
                summary = f"Search completed for query: '{state.get('search_query', 'unknown')}'. "
                summary += f"Found {total_sources} results"
                if sources_with_content > 0:
                    summary += f", {sources_with_content} with detailed content"
                summary += "."

                # Add the summary as an AI message
                final_message = AIMessage(content=summary)
                if isinstance(state["messages"], list):
                    state["messages"].append(final_message)
                else:
                    state["messages"] = [final_message]

            # Optionally truncate very long content to avoid memory issues
            if state["search_results"].sources:
                for source in state["search_results"].sources:
                    if source.content and len(source.content) > 10000:
                        # Keep first 9000 chars and add truncation notice
                        source.content = source.content[:9000] + "\n\n[Content truncated for length]"

        except Exception as e:
            logger.error(f"Error finalizing search: {e}")

        return state
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from typing import TypedDict
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
from langgraph.graph import add_messages
|
|
5
|
+
|
|
6
|
+
LANGGRAPH_AVAILABLE = True
|
|
7
|
+
except ImportError:
|
|
8
|
+
add_messages = None
|
|
9
|
+
LANGGRAPH_AVAILABLE = False
|
|
10
|
+
|
|
11
|
+
from typing_extensions import Annotated
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
from wizsearch import SearchResult
|
|
15
|
+
|
|
16
|
+
WIZSEARCH_AVAILABLE = True
|
|
17
|
+
except ImportError:
|
|
18
|
+
SearchResult = None
|
|
19
|
+
WIZSEARCH_AVAILABLE = False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class SearchState(TypedDict):
    """Main state for the search workflow."""

    # Active query (possibly rewritten by LLM polishing).
    search_query: str
    # The original user query, preserved when polishing rewrites search_query.
    raw_query: str
    # Aggregated results from wizsearch (None until the search node runs).
    search_results: SearchResult
    # Conversation messages; merged by langgraph's add_messages reducer.
    messages: Annotated[list, add_messages]
|