noesium-0.1.0-py3-none-any.whl → noesium-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. noesium/agents/askura_agent/__init__.py +22 -0
  2. noesium/agents/askura_agent/askura_agent.py +480 -0
  3. noesium/agents/askura_agent/conversation.py +164 -0
  4. noesium/agents/askura_agent/extractor.py +175 -0
  5. noesium/agents/askura_agent/memory.py +14 -0
  6. noesium/agents/askura_agent/models.py +239 -0
  7. noesium/agents/askura_agent/prompts.py +202 -0
  8. noesium/agents/askura_agent/reflection.py +234 -0
  9. noesium/agents/askura_agent/summarizer.py +30 -0
  10. noesium/agents/askura_agent/utils.py +6 -0
  11. noesium/agents/deep_research/__init__.py +13 -0
  12. noesium/agents/deep_research/agent.py +398 -0
  13. noesium/agents/deep_research/prompts.py +84 -0
  14. noesium/agents/deep_research/schemas.py +42 -0
  15. noesium/agents/deep_research/state.py +54 -0
  16. noesium/agents/search/__init__.py +5 -0
  17. noesium/agents/search/agent.py +474 -0
  18. noesium/agents/search/state.py +28 -0
  19. noesium/core/__init__.py +1 -1
  20. noesium/core/agent/base.py +10 -2
  21. noesium/core/goalith/decomposer/llm_decomposer.py +1 -1
  22. noesium/core/llm/__init__.py +1 -1
  23. noesium/core/llm/base.py +2 -2
  24. noesium/core/llm/litellm.py +42 -21
  25. noesium/core/llm/llamacpp.py +25 -4
  26. noesium/core/llm/ollama.py +43 -22
  27. noesium/core/llm/openai.py +25 -5
  28. noesium/core/llm/openrouter.py +1 -1
  29. noesium/core/toolify/base.py +9 -2
  30. noesium/core/toolify/config.py +2 -2
  31. noesium/core/toolify/registry.py +21 -5
  32. noesium/core/tracing/opik_tracing.py +7 -7
  33. noesium/core/vector_store/__init__.py +2 -2
  34. noesium/core/vector_store/base.py +1 -1
  35. noesium/core/vector_store/pgvector.py +10 -13
  36. noesium/core/vector_store/weaviate.py +2 -1
  37. noesium/toolkits/__init__.py +1 -0
  38. noesium/toolkits/arxiv_toolkit.py +310 -0
  39. noesium/toolkits/audio_aliyun_toolkit.py +441 -0
  40. noesium/toolkits/audio_toolkit.py +370 -0
  41. noesium/toolkits/bash_toolkit.py +332 -0
  42. noesium/toolkits/document_toolkit.py +454 -0
  43. noesium/toolkits/file_edit_toolkit.py +552 -0
  44. noesium/toolkits/github_toolkit.py +395 -0
  45. noesium/toolkits/gmail_toolkit.py +575 -0
  46. noesium/toolkits/image_toolkit.py +425 -0
  47. noesium/toolkits/memory_toolkit.py +398 -0
  48. noesium/toolkits/python_executor_toolkit.py +334 -0
  49. noesium/toolkits/search_toolkit.py +451 -0
  50. noesium/toolkits/serper_toolkit.py +623 -0
  51. noesium/toolkits/tabular_data_toolkit.py +537 -0
  52. noesium/toolkits/user_interaction_toolkit.py +365 -0
  53. noesium/toolkits/video_toolkit.py +168 -0
  54. noesium/toolkits/wikipedia_toolkit.py +420 -0
  55. {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/METADATA +56 -48
  56. {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/RECORD +59 -23
  57. {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/licenses/LICENSE +1 -1
  58. {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/WHEEL +0 -0
  59. {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/top_level.txt +0 -0
noesium/agents/deep_research/prompts.py
@@ -0,0 +1,84 @@
+ """
+ Prompts for the DeepResearchAgent.
+ """
+
+ query_writer_instructions = """Your goal is to generate sophisticated and diverse web search queries for research. These queries are intended for an advanced automated web research tool capable of analyzing complex results, following links, and synthesizing information.
+
+ Instructions:
+ - Always prefer a single search query; only add another query if the original question requests multiple aspects or elements and one query is not enough.
+ - Each query should focus on one specific aspect of the original question.
+ - Don't produce more than {number_queries} queries.
+ - Each query should be LESS than 40 characters.
+ - Rewritten queries should be in the same language as the original query.
+ - Queries should be diverse; if the topic is broad, generate more than 1 query.
+ - Don't generate multiple similar queries, 1 is enough.
+ - Queries should ensure that the most current information is gathered. The current date is {current_date}.
+
+ Format:
+ - Format your response as a JSON object with these two exact keys:
+    - "rationale": Brief explanation of why these queries are relevant
+    - "query": A list of search queries
+
+ Example:
+
+ Topic: Research the latest developments in renewable energy
+ ```json
+ {{
+     "rationale": "To gather comprehensive information about renewable energy developments, we need current data on technological advances, market trends, and policy updates. These queries target the specific information needed for thorough research.",
+     "query": ["renewable energy development 2024", "solar power technology advances", "renewable energy market trends"]
+ }}
+ ```
+
+ Context: {research_topic}"""
+
+
+ reflection_instructions = """You are an expert research assistant analyzing summaries about "{research_topic}".
+
+ Instructions:
+ - Identify knowledge gaps or areas that need deeper exploration and generate one or more follow-up queries.
+ - If the provided summaries are sufficient to answer the user's question, don't generate a follow-up query.
+ - The follow-up query should be less than 40 characters.
+ - The follow-up query should be in the same language as the original query.
+ - Focus on gathering comprehensive and accurate information relevant to the research topic.
+
+ Output Format:
+ - Format your response as a JSON object with these exact keys:
+    - "is_sufficient": true or false
+    - "knowledge_gap": Describe what information is missing or needs clarification
+    - "follow_up_queries": Write a specific question to address this gap
+
+ Example:
+ ```json
+ {{
+     "is_sufficient": true, // or false
+     "knowledge_gap": "The summary lacks information about recent developments and current market conditions", // "" if is_sufficient is true
+     "follow_up_queries": ["example follow-up query"] // [] if is_sufficient is true
+ }}
+ ```
+
+ Reflect carefully on the Summaries to identify knowledge gaps and produce a follow-up query. Then, produce your output following this JSON format:
+
+ Summaries:
+ {summaries}
+ """
+
+
+ answer_instructions = """Generate a high-quality answer to the user's question based on the provided summaries.
+
+ Instructions:
+ - The current date is {current_date}.
+ - You are the final step of a multi-step research process; don't mention that you are the final step.
+ - You have access to all the information gathered from the previous steps.
+ - You have access to the user's question.
+ - Generate a high-quality answer to the user's question based on the provided summaries and the user's question.
+ - You MUST include all the citations (if available in the summaries) in the answer correctly.
+ - DO NOT mention summary indicators in the answer.
+ - Structure the answer logically and comprehensively.
+ - Include specific details, facts, and current information when available.
+ - Provide actionable insights and practical information when relevant.
+
+ User Context:
+ - {research_topic}
+
+ Summaries:
+ {summaries}"""
noesium/agents/deep_research/schemas.py
@@ -0,0 +1,42 @@
+ """
+ Pydantic schemas for structured LLM output in the DeepResearchAgent module.
+ Enhanced for use with the instructor library.
+ """
+
+ from typing import List
+
+ from pydantic import BaseModel, Field
+
+ try:
+     from instructor import OpenAISchema
+
+     INSTRUCTOR_AVAILABLE = True
+ except ImportError:
+     # Fallback: use BaseModel if instructor is not available
+     OpenAISchema = BaseModel
+     INSTRUCTOR_AVAILABLE = False
+
+
+ class SearchQueryList(OpenAISchema):
+     """Schema for search query generation using instructor."""
+
+     query: List[str] = Field(
+         description="A list of search queries to be used for web research. Each query should be specific and focused on one aspect of the research topic."
+     )
+     rationale: str = Field(
+         description="A brief explanation of why these queries are relevant to the research topic and how they will help gather comprehensive information."
+     )
+
+
+ class Reflection(OpenAISchema):
+     """Schema for reflection and evaluation using instructor."""
+
+     is_sufficient: bool = Field(
+         description="Whether the provided summaries are sufficient to answer the user's question comprehensively."
+     )
+     knowledge_gap: str = Field(
+         description="A detailed description of what information is missing or needs clarification to provide a complete answer."
+     )
+     follow_up_queries: List[str] = Field(
+         description="A list of specific follow-up queries to address the identified knowledge gap. Each query should be focused and actionable."
+     )
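Because these models are Pydantic classes (with `OpenAISchema` as the base when instructor is installed), they can serve as `response_model` targets for an instructor-patched client. The agent's actual call path is not shown in this diff; the sketch below assumes the `instructor` and `openai` packages are installed, an `OPENAI_API_KEY` is set, and the model name is arbitrary:

```python
# Sketch only: structured query generation via instructor + SearchQueryList.
import instructor
from openai import OpenAI

from noesium.agents.deep_research.schemas import SearchQueryList

# Patch the OpenAI client so responses are validated against a Pydantic model.
client = instructor.from_openai(OpenAI())

queries = client.chat.completions.create(
    model="gpt-4o-mini",                # illustrative model choice
    response_model=SearchQueryList,     # instructor parses/validates into this schema
    messages=[{"role": "user", "content": "Plan web searches about solid-state batteries"}],
)
print(queries.rationale)
print(queries.query)
```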
noesium/agents/deep_research/state.py
@@ -0,0 +1,54 @@
+ """
+ State definitions for the DeepResearchAgent.
+ """
+
+ from __future__ import annotations
+
+ import operator
+ from typing import Any, Dict, List, TypedDict
+
+ from langgraph.graph import add_messages
+ from typing_extensions import Annotated
+
+
+ class ResearchState(TypedDict):
+     """Main state for the research workflow."""
+
+     messages: Annotated[list, add_messages]
+     search_query: Annotated[list, operator.add]
+     search_summaries: Annotated[list, operator.add]
+     sources_gathered: Annotated[list, operator.add]
+     initial_search_query_count: int
+     max_research_loops: int
+     research_loop_count: int
+     context: Dict[str, Any]
+
+
+ class Query(TypedDict):
+     """Individual query with rationale."""
+
+     query: str
+     rationale: str
+
+
+ class QueryState(TypedDict):
+     """State for query generation."""
+
+     query_list: List[Query]
+
+
+ class WebSearchState(TypedDict):
+     """State for web search operations."""
+
+     search_query: str
+     id: str
+
+
+ class ReflectionState(TypedDict):
+     """State for reflection and evaluation."""
+
+     is_sufficient: bool
+     knowledge_gap: str
+     follow_up_queries: Annotated[list, operator.add]
+     research_loop_count: int
+     number_of_ran_queries: int
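The `Annotated[list, operator.add]` and `Annotated[list, add_messages]` fields are LangGraph reducers: partial updates returned by different nodes are concatenated into the existing lists rather than overwriting them, which is how summaries and sources accumulate across research loops. A minimal sketch of that behaviour (the `DemoState` and node names are illustrative, not part of the package):

```python
# Minimal sketch, assuming langgraph is installed: two nodes each return a
# partial update, and the operator.add reducer appends instead of replacing.
import operator
from typing import Annotated, TypedDict

from langgraph.graph import END, START, StateGraph


class DemoState(TypedDict):
    search_summaries: Annotated[list, operator.add]


def first_loop(state: DemoState) -> dict:
    return {"search_summaries": ["summary from loop 1"]}


def second_loop(state: DemoState) -> dict:
    return {"search_summaries": ["summary from loop 2"]}


graph = StateGraph(DemoState)
graph.add_node("first_loop", first_loop)
graph.add_node("second_loop", second_loop)
graph.add_edge(START, "first_loop")
graph.add_edge("first_loop", "second_loop")
graph.add_edge("second_loop", END)

result = graph.compile().invoke({"search_summaries": []})
print(result["search_summaries"])  # ['summary from loop 1', 'summary from loop 2']
```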
noesium/agents/search/__init__.py
@@ -0,0 +1,5 @@
+ from .agent import SearchAgent
+
+ __all__ = [
+     "SearchAgent",
+ ]
noesium/agents/search/agent.py
@@ -0,0 +1,474 @@
+ from typing import Dict, List, Optional, Type, override
+
+ try:
+     from langchain_core.runnables import RunnableConfig
+     from langgraph.graph import END, START, StateGraph
+
+     LANGCHAIN_AVAILABLE = True
+ except ImportError:
+     RunnableConfig = None
+     StateGraph = None
+     END = None
+     START = None
+     LANGCHAIN_AVAILABLE = False
+
+ try:
+     from wizsearch import PageCrawler, WizSearch, WizSearchConfig
+
+     WIZSEARCH_AVAILABLE = True
+ except ImportError:
+     PageCrawler = None
+     WizSearch = None
+     WizSearchConfig = None
+     WIZSEARCH_AVAILABLE = False
+
+ from noesium.core.agent import BaseGraphicAgent
+ from noesium.core.llm import BaseLLMClient
+ from noesium.core.utils.logging import get_logger
+
+ from .state import SearchState
+
+ # Configure logging
+ logger = get_logger(__name__)
+
+
+ class SearchAgent(BaseGraphicAgent):
+     """Web search agent with optional AI crawling."""
+
+     def __init__(
+         self,
+         polish_query: bool = False,
+         llm_provider: str = "openai",
+         rerank_results: bool = False,
+         rerank_llm: BaseLLMClient | None = None,
+         search_engines: List[str] = ["tavily", "duckduckgo"],
+         max_results_per_engine: int = 5,
+         search_timeout: int = 20,
+         crawl_content: bool = False,
+         content_format: str = "markdown",
+         adaptive_crawl: bool = False,
+         crawl_depth: int = 1,
+         crawl_external_links: bool = False,
+         **kwargs,
+     ):
+         # Initialize base class
+         super().__init__(llm_provider=llm_provider, **kwargs)
+         self.enable_query_polishing = polish_query
+         self.enable_reranking = rerank_results
+         self.rerank_llm = rerank_llm if rerank_llm else self.llm
+         self.search_engines = search_engines
+         self.max_results_per_engine = max_results_per_engine
+         self.search_timeout = search_timeout
+         self.crawl_content = crawl_content
+         self.content_format = content_format
+         self.adaptive_crawl = adaptive_crawl
+         self.crawl_depth = crawl_depth
+         self.crawl_external_links = crawl_external_links
+
+         # Build the graph
+         self.graph = self._build_graph()
+
+     @override
+     async def run(
+         self, user_message: str, context: Optional[Dict] = None, config: Optional[RunnableConfig] = None
+     ) -> str:
+         """
+         Run the SearchAgent with a user message as the search query.
+
+         Args:
+             user_message: The search query to execute
+             context: Optional context dictionary (unused currently)
+             config: Optional RunnableConfig for graph execution
+
+         Returns:
+             str: Formatted search results as a string
+         """
+         try:
+             # Create initial state for the search workflow
+             initial_state = {
+                 "search_query": user_message.strip(),
+                 "raw_query": "",
+                 "search_results": None,
+                 "messages": [],
+             }
+
+             logger.info(f"Starting search workflow for query: '{user_message}'")
+
+             # Execute the search workflow
+             if config:
+                 result = await self.graph.ainvoke(initial_state, config=config)
+             else:
+                 result = await self.graph.ainvoke(initial_state)
+
+             # Format and return the results
+             return self._format_search_results(result)
+
+         except Exception as e:
+             error_msg = f"Search workflow failed: {str(e)}"
+             logger.error(error_msg)
+             return f"❌ {error_msg}"
+
+     def _format_search_results(self, result: dict) -> str:
+         """
+         Format search results into a readable string.
+
+         Args:
+             result: The result dictionary from the graph execution
+
+         Returns:
+             str: Formatted search results
+         """
+         if not result or "search_results" not in result or not result["search_results"]:
+             return "❌ No search results found."
+
+         search_results = result["search_results"]
+         output_lines = []
+
+         # Header with query information
+         query = result.get("search_query", "unknown")
+         raw_query = result.get("raw_query", "")
+
+         output_lines.append(f"🔍 Search Results for: '{query}'")
+         if raw_query and raw_query != query:
+             output_lines.append(f"📝 Original query: '{raw_query}'")
+
+         output_lines.append("")
+
+         # Search statistics
+         total_sources = len(search_results.sources) if search_results.sources else 0
+         sources_with_content = sum(1 for source in (search_results.sources or []) if source.content)
+
+         output_lines.append(f"📊 Found {total_sources} results")
+         if self.crawl_content and sources_with_content > 0:
+             output_lines.append(f"📄 {sources_with_content} sources with crawled content")
+
+         # Response time if available
+         if search_results.response_time:
+             output_lines.append(f"⏱️ Response time: {search_results.response_time:.2f}s")
+
+         output_lines.append("")
+
+         # Direct answer if available
+         if search_results.answer:
+             output_lines.append("💡 Direct Answer:")
+             output_lines.append(search_results.answer)
+             output_lines.append("")
+
+         # Search results
+         if search_results.sources:
+             output_lines.append("📋 Sources:")
+             for i, source in enumerate(search_results.sources, 1):
+                 output_lines.append(f"\n{i}. **{source.title}**")
+                 output_lines.append(f"   🔗 {source.url}")
+
+                 if source.score:
+                     output_lines.append(f"   ⭐ Score: {source.score:.3f}")
+
+                 if not source.content and self.crawl_content:
+                     output_lines.append("   📝 Content: [Crawling failed]")
+
+         # Features used
+         features = []
+         if self.enable_query_polishing:
+             features.append("Query Polishing")
+         if self.enable_reranking:
+             features.append("Result Reranking")
+         if self.crawl_content:
+             features.append(f"Content Crawling ({self.content_format})")
+
+         if features:
+             output_lines.append(f"\n🔧 Features used: {', '.join(features)}")
+
+         return "\n".join(output_lines)
+
+     @override
+     def get_state_class(self) -> Type:
+         """
+         Get the state class for this search agent.
+         Override this method in subclasses for specialized state.
+
+         Returns:
+             The state class to use for the search workflow
+         """
+         return SearchState
+
+     @override
+     def _build_graph(self) -> StateGraph:
+         """Create the LangGraph search workflow."""
+         state_class = self.get_state_class()
+         workflow = StateGraph(state_class)
+
+         # Add nodes
+         workflow.add_node("polish_query", self._polish_query_node)
+         workflow.add_node("web_search", self._web_search_node)
+         workflow.add_node("crawl_web", self._crawl_web_node)
+         workflow.add_node("rank_results", self._rank_results_node)
+         workflow.add_node("finalize_search", self._finalize_search_node)
+
+         # Set entry point
+         workflow.add_edge(START, "polish_query")
+
+         # Add conditional edges
+         workflow.add_edge("polish_query", "web_search")
+         workflow.add_edge("web_search", "crawl_web")
+         workflow.add_conditional_edges("crawl_web", self._evaluate_crawl_web, ["rank_results", "finalize_search"])
+         workflow.add_edge("rank_results", "finalize_search")
+         workflow.add_edge("finalize_search", END)
+
+         return workflow.compile()
+
+     def _polish_query_node(self, state: SearchState, config: RunnableConfig) -> SearchState:
+         """Polish the query using the LLM to improve search effectiveness."""
+         if self.enable_query_polishing:
+             try:
+                 # Save original query
+                 state["raw_query"] = state["search_query"]
+
+                 # Create a prompt to polish the query
+                 polish_prompt = f"""
+                 You are a search query optimization expert. Your task is to improve the given search query to make it more effective for web search engines.
+
+                 Original query: "{state['search_query']}"
+
+                 Please provide an improved version that:
+                 1. Uses more specific and relevant keywords
+                 2. Removes unnecessary words or ambiguity
+                 3. Maintains the original intent
+                 4. Is optimized for search engines
+                 5. Is less than 40 characters
+
+                 Return only the improved query, nothing else.
+                 """
+
+                 # Polish the query using the LLM
+                 polished_query = self.llm.completion(
+                     messages=[{"role": "user", "content": polish_prompt}], temperature=0.3, max_tokens=100
+                 )
+
+                 # Update the search query if polishing was successful
+                 if polished_query and polished_query.strip():
+                     state["search_query"] = polished_query.strip()
+                     logger.info(f"Query polished from '{state['raw_query']}' to '{state['search_query']}'")
+                 else:
+                     logger.warning("Query polishing failed, using original query")
+
+             except Exception as e:
+                 logger.error(f"Error polishing query: {e}")
+                 # Keep original query if polishing fails
+                 state["raw_query"] = state["search_query"]
+
+         return state
+
+     async def _web_search_node(self, state: SearchState, config: RunnableConfig) -> SearchState:
+         """Perform a web search."""
+         try:
+             config = WizSearchConfig(
+                 enabled_engines=self.search_engines,
+                 max_results_per_engine=self.max_results_per_engine,
+                 timeout=self.search_timeout,
+             )
+             omnisearch = WizSearch(config=config)
+             result = await omnisearch.search(query=state["search_query"])
+             state["search_results"] = result
+             return state
+         except Exception as e:
+             logger.error(f"Failed to search: {e}")
+             raise
+
+     async def _crawl_web_node(self, state: SearchState, config: RunnableConfig) -> SearchState:
+         """Crawl the content of each search result."""
+         if self.crawl_content:
+             for source in state["search_results"].sources:
+                 crawler = PageCrawler(
+                     url=source.url,
+                     external_links=self.crawl_external_links,
+                     content_format=self.content_format,
+                     adaptive_crawl=self.adaptive_crawl,
+                     depth=self.crawl_depth,
+                 )
+                 content = await crawler.crawl()
+                 source.content = content
+         return state
+
+     def _evaluate_crawl_web(self, state: SearchState, config: RunnableConfig) -> str:
+         """Evaluate the crawl results and decide the next step.
+
+         If there are valid search results and reranking is enabled, rank the results.
+         Otherwise, finalize the search.
+
+         Args:
+             state: The state of the search.
+             config: The config of the search.
+
+         Returns:
+             str: The next node to execute ('rank_results' or 'finalize_search')
+         """
+         # Check if we have search results
+         if "search_results" not in state or not state["search_results"] or not state["search_results"].sources:
+             logger.warning("No search results found, proceeding to finalize")
+             return "finalize_search"
+
+         # Check if reranking is enabled and we have multiple results
+         if self.enable_reranking and len(state["search_results"].sources) > 1:
+             logger.info(f"Reranking enabled with {len(state['search_results'].sources)} results")
+             return "rank_results"
+         else:
+             logger.info("Skipping reranking, proceeding to finalize")
+             return "finalize_search"
+
+     def _rank_results_node(self, state: SearchState, config: RunnableConfig) -> SearchState:
+         """Rank the search results using the LLM to improve relevance ordering."""
+         try:
+             if not state["search_results"] or not state["search_results"].sources:
+                 logger.warning("No search results to rank")
+                 return state
+
+             # Create documents for reranking
+             documents = []
+             for i, source in enumerate(state["search_results"].sources):
+                 # Create a text representation for ranking
+                 doc_text = f"Title: {source.title}\nURL: {source.url}"
+                 if source.content:
+                     # Truncate content to avoid token limits
+                     content_preview = source.content[:500] + "..." if len(source.content) > 500 else source.content
+                     doc_text += f"\nContent: {content_preview}"
+                 documents.append(doc_text)
+
+             # Use the query for reranking
+             query = state.get("search_query", "")
+
+             if not query:
+                 logger.warning("No query available for reranking")
+                 return state
+
+             # Check if the LLM has rerank capability
+             if hasattr(self.rerank_llm, "rerank"):
+                 try:
+                     # Use built-in rerank function if available
+                     reranked_result = self.rerank_llm.rerank(query=query, chunks=documents)
+
+                     if reranked_result:
+                         # Check if rerank returns indices (List[int]) or reranked chunks (List[str])
+                         if isinstance(reranked_result[0], int):
+                             # Handle case where rerank returns indices
+                             reranked_sources = [state["search_results"].sources[i] for i in reranked_result]
+                         else:
+                             # Handle case where rerank returns reranked chunks
+                             # Create a mapping from documents to sources
+                             doc_to_source = {
+                                 doc: source for doc, source in zip(documents, state["search_results"].sources)
+                             }
+                             reranked_sources = [
+                                 doc_to_source[chunk] for chunk in reranked_result if chunk in doc_to_source
+                             ]
+
+                         state["search_results"].sources = reranked_sources
+                         logger.info(f"Reranked {len(reranked_sources)} results")
+
+                 except Exception as e:
+                     logger.error(f"Built-in reranking failed: {e}")
+                     # Fall back to LLM-based reranking
+                     self._llm_based_reranking(state, query, documents)
+             else:
+                 # Use LLM-based reranking
+                 self._llm_based_reranking(state, query, documents)
+
+         except Exception as e:
+             logger.error(f"Error in ranking results: {e}")
+
+         return state
+
+     def _llm_based_reranking(self, state: SearchState, query: str, documents: list) -> None:
+         """Perform LLM-based reranking of search results."""
+         try:
+             # Create reranking prompt
+             docs_text = "\n\n".join([f"{i+1}. {doc}" for i, doc in enumerate(documents)])
+
+             rerank_prompt = f"""
+             You are a search result ranking expert. Given a search query and a list of search results,
+             please rank them in order of relevance to the query.
+
+             Query: "{query}"
+
+             Search Results:
+             {docs_text}
+
+             Please provide the ranking as a comma-separated list of numbers (1-{len(documents)})
+             ordered from most relevant to least relevant. For example: 3,1,2,4
+
+             Only return the ranking numbers, nothing else.
+             """
+
+             ranking_response = self.rerank_llm.completion(
+                 messages=[{"role": "user", "content": rerank_prompt}], temperature=0.1, max_tokens=50
+             )
+
+             if ranking_response:
+                 # Parse the ranking response
+                 try:
+                     ranking_str = ranking_response.strip()
+                     ranking_indices = [int(x.strip()) - 1 for x in ranking_str.split(",")]
+
+                     # Validate indices
+                     if (
+                         len(ranking_indices) == len(documents)
+                         and all(0 <= i < len(documents) for i in ranking_indices)
+                         and len(set(ranking_indices)) == len(ranking_indices)
+                     ):
+
+                         # Reorder sources based on LLM ranking
+                         reranked_sources = [state["search_results"].sources[i] for i in ranking_indices]
+                         state["search_results"].sources = reranked_sources
+                         logger.info(f"LLM-based reranking completed for {len(reranked_sources)} results")
+                     else:
+                         logger.warning("Invalid ranking response, keeping original order")
+
+                 except (ValueError, IndexError) as e:
+                     logger.warning(f"Failed to parse ranking response '{ranking_response}': {e}")
+
+         except Exception as e:
+             logger.error(f"LLM-based reranking failed: {e}")
+
+     def _finalize_search_node(self, state: SearchState, config: RunnableConfig) -> SearchState:
+         """Finalize the search results and prepare the final response."""
+         try:
+             # Ensure we have search results
+             if "search_results" not in state or not state["search_results"]:
+                 logger.warning("No search results to finalize")
+                 return state
+
+             # Log final statistics
+             total_sources = len(state["search_results"].sources) if state["search_results"].sources else 0
+             sources_with_content = sum(1 for source in (state["search_results"].sources or []) if source.content)
+
+             logger.info(f"Search finalized: {total_sources} total sources, {sources_with_content} with crawled content")
+
+             # Add final message to state if messages are being tracked
+             if "messages" in state:
+                 from langchain_core.messages import AIMessage
+
+                 # Create summary message
+                 summary = f"Search completed for query: '{state.get('search_query', 'unknown')}'. "
+                 summary += f"Found {total_sources} results"
+                 if sources_with_content > 0:
+                     summary += f", {sources_with_content} with detailed content"
+                 summary += "."
+
+                 # Add the summary as an AI message
+                 final_message = AIMessage(content=summary)
+                 if isinstance(state["messages"], list):
+                     state["messages"].append(final_message)
+                 else:
+                     state["messages"] = [final_message]
+
+             # Optionally truncate very long content to avoid memory issues
+             if state["search_results"].sources:
+                 for source in state["search_results"].sources:
+                     if source.content and len(source.content) > 10000:
+                         # Keep first 9000 chars and add truncation notice
+                         source.content = source.content[:9000] + "\n\n[Content truncated for length]"
+
+         except Exception as e:
+             logger.error(f"Error finalizing search: {e}")
+
+         return state
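The workflow above runs polish_query → web_search → crawl_web, then either rank_results or finalize_search depending on whether reranking is enabled and results exist. A hedged usage sketch (not from the package docs) of driving the agent end to end; it assumes wizsearch, langgraph, and credentials for the configured LLM provider (e.g. OPENAI_API_KEY for the default llm_provider="openai") are available:

```python
# Usage sketch: instantiate SearchAgent and run a single query.
import asyncio

from noesium.agents.search import SearchAgent


async def main() -> None:
    agent = SearchAgent(
        polish_query=True,        # rewrite the query with the LLM before searching
        rerank_results=True,      # rerank sources via rerank() or the LLM fallback
        crawl_content=False,      # set True to fetch page content with PageCrawler
        search_engines=["tavily", "duckduckgo"],
        max_results_per_engine=5,
    )
    report = await agent.run("noesium python agent framework")
    print(report)  # formatted results string built by _format_search_results


asyncio.run(main())
```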
noesium/agents/search/state.py
@@ -0,0 +1,28 @@
+ from typing import TypedDict
+
+ try:
+     from langgraph.graph import add_messages
+
+     LANGGRAPH_AVAILABLE = True
+ except ImportError:
+     add_messages = None
+     LANGGRAPH_AVAILABLE = False
+
+ from typing_extensions import Annotated
+
+ try:
+     from wizsearch import SearchResult
+
+     WIZSEARCH_AVAILABLE = True
+ except ImportError:
+     SearchResult = None
+     WIZSEARCH_AVAILABLE = False
+
+
+ class SearchState(TypedDict):
+     """Main state for the search workflow."""
+
+     search_query: str
+     raw_query: str
+     search_results: SearchResult
+     messages: Annotated[list, add_messages]