noesium 0.1.0-py3-none-any.whl → 0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. noesium/agents/askura_agent/__init__.py +22 -0
  2. noesium/agents/askura_agent/askura_agent.py +480 -0
  3. noesium/agents/askura_agent/conversation.py +164 -0
  4. noesium/agents/askura_agent/extractor.py +175 -0
  5. noesium/agents/askura_agent/memory.py +14 -0
  6. noesium/agents/askura_agent/models.py +239 -0
  7. noesium/agents/askura_agent/prompts.py +202 -0
  8. noesium/agents/askura_agent/reflection.py +234 -0
  9. noesium/agents/askura_agent/summarizer.py +30 -0
  10. noesium/agents/askura_agent/utils.py +6 -0
  11. noesium/agents/deep_research/__init__.py +13 -0
  12. noesium/agents/deep_research/agent.py +398 -0
  13. noesium/agents/deep_research/prompts.py +84 -0
  14. noesium/agents/deep_research/schemas.py +42 -0
  15. noesium/agents/deep_research/state.py +54 -0
  16. noesium/agents/search/__init__.py +5 -0
  17. noesium/agents/search/agent.py +474 -0
  18. noesium/agents/search/state.py +28 -0
  19. noesium/core/__init__.py +1 -1
  20. noesium/core/agent/base.py +10 -2
  21. noesium/core/goalith/decomposer/llm_decomposer.py +1 -1
  22. noesium/core/llm/__init__.py +1 -1
  23. noesium/core/llm/base.py +2 -2
  24. noesium/core/llm/litellm.py +42 -21
  25. noesium/core/llm/llamacpp.py +25 -4
  26. noesium/core/llm/ollama.py +43 -22
  27. noesium/core/llm/openai.py +25 -5
  28. noesium/core/llm/openrouter.py +1 -1
  29. noesium/core/toolify/base.py +9 -2
  30. noesium/core/toolify/config.py +2 -2
  31. noesium/core/toolify/registry.py +21 -5
  32. noesium/core/tracing/opik_tracing.py +7 -7
  33. noesium/core/vector_store/__init__.py +2 -2
  34. noesium/core/vector_store/base.py +1 -1
  35. noesium/core/vector_store/pgvector.py +10 -13
  36. noesium/core/vector_store/weaviate.py +2 -1
  37. noesium/toolkits/__init__.py +1 -0
  38. noesium/toolkits/arxiv_toolkit.py +310 -0
  39. noesium/toolkits/audio_aliyun_toolkit.py +441 -0
  40. noesium/toolkits/audio_toolkit.py +370 -0
  41. noesium/toolkits/bash_toolkit.py +332 -0
  42. noesium/toolkits/document_toolkit.py +454 -0
  43. noesium/toolkits/file_edit_toolkit.py +552 -0
  44. noesium/toolkits/github_toolkit.py +395 -0
  45. noesium/toolkits/gmail_toolkit.py +575 -0
  46. noesium/toolkits/image_toolkit.py +425 -0
  47. noesium/toolkits/memory_toolkit.py +398 -0
  48. noesium/toolkits/python_executor_toolkit.py +334 -0
  49. noesium/toolkits/search_toolkit.py +451 -0
  50. noesium/toolkits/serper_toolkit.py +623 -0
  51. noesium/toolkits/tabular_data_toolkit.py +537 -0
  52. noesium/toolkits/user_interaction_toolkit.py +365 -0
  53. noesium/toolkits/video_toolkit.py +168 -0
  54. noesium/toolkits/wikipedia_toolkit.py +420 -0
  55. noesium-0.2.1.dist-info/METADATA +253 -0
  56. {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/RECORD +59 -23
  57. {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/licenses/LICENSE +1 -1
  58. noesium-0.1.0.dist-info/METADATA +0 -525
  59. {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/WHEEL +0 -0
  60. {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/top_level.txt +0 -0
noesium/toolkits/search_toolkit.py
@@ -0,0 +1,451 @@
+ """
+ Search toolkit for web search and content retrieval.
+
+ Provides tools for Google search, web content extraction, web-based Q&A,
+ Tavily search, and Google AI search functionality.
+ """
+
+ import asyncio
+ import re
+ from typing import Callable, Dict, Optional
+
+ import aiohttp
+
+ try:
+     from wizsearch import SearchResult
+
+     WIZSEARCH_AVAILABLE = True
+ except ImportError:
+     SearchResult = None
+     WIZSEARCH_AVAILABLE = False
+
+ from noesium.core.toolify.base import AsyncBaseToolkit
+ from noesium.core.toolify.config import ToolkitConfig
+ from noesium.core.toolify.registry import register_toolkit
+ from noesium.core.utils.logging import get_logger
+
+ logger = get_logger(__name__)
+
+ # Banned sites that should be filtered from search results
+ BANNED_SITES = ("https://huggingface.co/", "https://grok.com/share/", "https://modelscope.cn/datasets/")
+ # re.escape keeps the literal dots in the URLs from matching arbitrary characters
+ RE_BANNED_SITES = re.compile(r"^(" + "|".join(re.escape(site) for site in BANNED_SITES) + r")")
+
+
+ @register_toolkit("search")
+ class SearchToolkit(AsyncBaseToolkit):
+     """
+     Toolkit for web search and content retrieval.
+
+     Provides functionality for:
+     - Google search via Serper API
+     - Web content extraction via Jina Reader API
+     - Web-based question answering
+
+     Required configuration:
+     - JINA_API_KEY: API key for Jina Reader service
+     - SERPER_API_KEY: API key for Serper Google search service
+     """
+
+     def __init__(self, config: Optional[ToolkitConfig] = None):
+         """
+         Initialize the SearchToolkit.
+
+         Args:
+             config: Toolkit configuration containing API keys and settings
+         """
+         super().__init__(config)
+
+         # API configuration
+         self.jina_url_template = "https://r.jina.ai/{url}"
+         self.serper_url = "https://google.serper.dev/search"
+
+         # Get API keys from config
+         jina_api_key = self.config.config.get("JINA_API_KEY")
+         serper_api_key = self.config.config.get("SERPER_API_KEY")
+
+         if not jina_api_key:
+             self.logger.warning("JINA_API_KEY not found in config - web content extraction may fail")
+         if not serper_api_key:
+             self.logger.warning("SERPER_API_KEY not found in config - Google search may fail")
+
+         self.jina_headers = {"Authorization": f"Bearer {jina_api_key}"} if jina_api_key else {}
+         self.serper_headers = (
+             {"X-API-KEY": serper_api_key, "Content-Type": "application/json"} if serper_api_key else {}
+         )
+
+         # Configuration
+         self.summary_token_limit = self.config.config.get("summary_token_limit", 1000)
+
+     async def search_google_api(self, query: str, num_results: int = 5) -> str:
+         """
+         Perform a web search using Google via Serper API.
+
+         Tips for effective searching:
+         1. Use concrete, specific queries rather than vague or overly long ones
+         2. Utilize Google search operators when needed:
+            - Use quotes ("") for exact phrase matching
+            - Use minus (-) to exclude terms
+            - Use asterisk (*) as wildcard
+            - Use filetype: to search for specific file types
+            - Use site: to search within specific sites
+            - Use before:/after: for date-based filtering (YYYY-MM-DD format)
+
+         Args:
+             query: The search query string
+             num_results: Maximum number of results to return (default: 5)
+
+         Returns:
+             Formatted search results as a string
+         """
+         self.logger.info(f"Searching Google for: {query}")
+
+         if not self.serper_headers.get("X-API-KEY"):
+             raise ValueError("SERPER_API_KEY not configured")
+
+         # Prepare search parameters
+         params = {
+             "q": query,
+             "gl": "us",  # Geographic location
+             "hl": "en",  # Language
+             "num": min(num_results * 2, 100),  # Get more results to filter
+         }
+
+         try:
+             async with aiohttp.ClientSession() as session:
+                 async with session.post(self.serper_url, headers=self.serper_headers, json=params) as response:
+                     response.raise_for_status()
+                     results = await response.json()
+
+             # Filter and format results
+             organic_results = results.get("organic", [])
+             filtered_results = self._filter_search_results(organic_results, num_results)
+
+             formatted_results = []
+             for i, result in enumerate(filtered_results, 1):
+                 entry = f"{i}. {result['title']} ({result['link']})"
+
+                 if "snippet" in result:
+                     entry += f"\n {result['snippet']}"
+
+                 if "sitelinks" in result:
+                     sitelinks = ", ".join([sl.get("title", "") for sl in result["sitelinks"][:3]])
+                     entry += f"\n Related: {sitelinks}"
+
+                 formatted_results.append(entry)
+
+             result_text = "\n\n".join(formatted_results)
+             self.logger.info(f"Found {len(filtered_results)} search results")
+             return result_text
+
+         except Exception as e:
+             self.logger.error(f"Google search failed: {e}")
+             return f"Error performing Google search: {str(e)}"
+
+     def _filter_search_results(self, results: list, limit: int) -> list:
+         """
+         Filter search results to remove banned sites and limit count.
+
+         Args:
+             results: Raw search results from API
+             limit: Maximum number of results to return
+
+         Returns:
+             Filtered list of search results
+         """
+         filtered = []
+         for result in results:
+             link = result.get("link", "")
+             if not RE_BANNED_SITES.match(link):
+                 filtered.append(result)
+                 if len(filtered) >= limit:
+                     break
+         return filtered
+
+     async def get_web_content(self, url: str) -> str:
+         """
+         Extract readable content from a web page using Jina Reader API.
+
+         Args:
+             url: The URL to extract content from
+
+         Returns:
+             Extracted text content from the web page
+         """
+         self.logger.info(f"Extracting content from: {url}")
+
+         if not self.jina_headers.get("Authorization"):
+             raise ValueError("JINA_API_KEY not configured")
+
+         try:
+             jina_url = self.jina_url_template.format(url=url)
+             async with aiohttp.ClientSession() as session:
+                 async with session.get(jina_url, headers=self.jina_headers) as response:
+                     response.raise_for_status()
+                     content = await response.text()
+
+             self.logger.info(f"Extracted {len(content)} characters from {url}")
+             return content
+
+         except Exception as e:
+             self.logger.error(f"Content extraction failed for {url}: {e}")
+             return f"Error extracting content from {url}: {str(e)}"
+
+     async def web_qa(self, url: str, question: str) -> str:
+         """
+         Ask a question about a specific web page.
+
+         This tool extracts content from the given URL and uses the LLM to answer
+         the provided question based on that content. It also attempts to find
+         related links within the content.
+
+         Use cases:
+         - Gather specific information from a webpage
+         - Ask detailed questions about web content
+         - Get summaries of web articles
+
+         Args:
+             url: The URL of the webpage to analyze
+             question: The question to ask about the webpage content
+
+         Returns:
+             Answer to the question with related links
+         """
+         self.logger.info(f"Performing web Q&A for {url} with question: {question}")
+
+         try:
+             # Extract content from the webpage
+             content = await self.get_web_content(url)
+
+             if not content.strip():
+                 return f"Could not extract readable content from {url}"
+
+             # Use default question if none provided
+             if not question.strip():
+                 question = "Summarize the main content and key points of this webpage."
+
+             # Prepare tasks for parallel execution
+             qa_task = self._answer_question(content, question)
+             links_task = self._extract_related_links(url, content, question)
+
+             # Execute both tasks concurrently
+             answer, related_links = await asyncio.gather(qa_task, links_task)
+
+             result = f"Answer: {answer}"
+             if related_links.strip():
+                 result += f"\n\nRelated Links: {related_links}"
+
+             return result
+
+         except Exception as e:
+             self.logger.error(f"Web Q&A failed for {url}: {e}")
+             return f"Error processing {url}: {str(e)}"
+
+     async def _answer_question(self, content: str, question: str) -> str:
+         """
+         Use LLM to answer a question based on web content.
+
+         Args:
+             content: Web page content
+             question: Question to answer
+
+         Returns:
+             LLM-generated answer
+         """
+         # Truncate content if it's too long
+         if len(content) > self.summary_token_limit * 4:  # Rough token estimation
+             content = content[: self.summary_token_limit * 4] + "..."
+
+         prompt = f"""Based on the following web content, please answer the question.
+
+ Web Content:
+ {content}
+
+ Question: {question}
+
+ Please provide a clear, concise answer based on the content above. If the content doesn't contain enough information to answer the question, please state that clearly."""
+
+         try:
+             response = self.llm_client.completion(
+                 messages=[{"role": "user", "content": prompt}], temperature=0.1, max_tokens=500
+             )
+             return response.strip()
+         except Exception as e:
+             self.logger.error(f"LLM question answering failed: {e}")
+             return f"Could not generate answer: {str(e)}"
+
+     async def _extract_related_links(self, url: str, content: str, question: str) -> str:
+         """
+         Extract related links from web content based on the question.
+
+         Args:
+             url: Original URL
+             content: Web page content
+             question: Original question for context
+
+         Returns:
+             Formatted list of related links
+         """
+         prompt = f"""From the following web content, extract any relevant links that might be related to this question: "{question}"
+
+ Original URL: {url}
+ Content: {content[:2000]}...
+
+ Please list any URLs, links, or references mentioned in the content that could provide additional information related to the question. Format as a simple list, one per line. If no relevant links are found, respond with "No related links found."
+ """
+
+         try:
+             response = self.llm_client.completion(
+                 messages=[{"role": "user", "content": prompt}], temperature=0.1, max_tokens=200
+             )
+             return response.strip()
+         except Exception as e:
+             self.logger.error(f"Link extraction failed: {e}")
+             return "Could not extract related links"
+
+     async def tavily_search(
+         self,
+         query: str,
+         max_results: Optional[int] = 10,
+         search_depth: Optional[str] = "advanced",
+         include_answer: Optional[bool] = False,
+         include_raw_content: Optional[bool] = False,
+     ) -> SearchResult:
+         """
+         Search the web using Tavily Search API to find relevant information and sources.
+
+         This tool performs comprehensive web searches and can optionally generate AI-powered
+         summaries of the search results. It's ideal for research tasks, fact-checking,
+         and gathering current information from across the internet.
+
+         Args:
+             query: The search query to execute. Be specific and descriptive for better results.
+                 Examples: "latest news on AI developments", "best restaurants in Paris 2024"
+             max_results: Maximum number of search results to return (1-50).
+                 Higher numbers provide more comprehensive coverage but may be slower.
+             search_depth: Search depth level - "basic" for quick results or "advanced" for
+                 more thorough, comprehensive search with better source quality.
+             include_answer: When True, generates an AI-powered summary of the search results.
+                 Useful for getting a quick overview of the findings.
+             include_raw_content: When True, includes the full content of web pages in results.
+                 Increases response size but provides more detailed information.
+
+         Returns:
+             SearchResult object containing:
+             - sources: List of web sources with titles, URLs, and content snippets
+             - answer: AI-generated summary (if include_answer=True)
+             - query: The original search query for reference
+
+         Example usage:
+             - Research current events: Use "advanced" depth with include_answer=True
+             - Quick fact-checking: Use "basic" depth with max_results=5
+             - Comprehensive research: Use "advanced" depth with max_results=20 and include_raw_content=True
+         """
+         if not WIZSEARCH_AVAILABLE:
+             raise ImportError("wizsearch package is not installed. Install it with: pip install 'noesium[tools]'")
+
+         self.logger.info(f"Performing Tavily search for query: {query}")
+
+         try:
+             from wizsearch import TavilySearch
+
+             # Initialize Tavily Search client with configuration
+             config_kwargs = {
+                 "max_results": max_results,
+                 "search_depth": search_depth,
+                 "include_answer": include_answer,
+                 "include_raw_content": include_raw_content,
+             }
+
+             tavily_search = TavilySearch(**config_kwargs)
+
+             # Perform search
+             result = tavily_search.search(query=query)
+
+             # Generate summary if requested
+             # TODO: add a more sophisticated summary generation
+             summary = None
+             if include_answer and result.answer:
+                 summary = result.answer
+             elif len(result.sources) > 0:
+                 # Create a basic summary from the top results
+                 top_results = result.sources[:3]
+                 summary = f"Found {len(result.sources)} results for '{query}'. Top results include: " + ", ".join(
+                     [f"'{s.title}'" for s in top_results]
+                 )
+
+             return SearchResult(query=query, sources=result.sources, answer=summary)
+
+         except Exception as e:
+             self.logger.error(f"Error in Tavily search: {e}")
+             raise RuntimeError(f"Tavily search failed: {str(e)}")
+
+     async def google_ai_search(
+         self,
+         query: str,
+         model: Optional[str] = "gemini-2.5-flash",
+         temperature: Optional[float] = 0.0,
+     ) -> SearchResult:
+         """
+         Search the web using Google AI Search powered by Gemini models.
+
+         This tool leverages Google's advanced AI search capabilities to find information,
+         generate comprehensive research summaries, and provide detailed citations. It's
+         particularly effective for academic research, detailed analysis, and tasks requiring
+         well-sourced information with proper attribution.
+
+         Args:
+             query: The search query to execute. Be specific and detailed for best results.
+                 Examples: "climate change impact on agriculture 2024", "machine learning trends in healthcare"
+             model: The Gemini model to use for search and content generation.
+                 - "gemini-2.5-flash": Fast, efficient model (recommended)
+                 - "gemini-2.0-flash-exp": Experimental version with latest features
+             temperature: Controls creativity vs accuracy in the generated content (0.0-1.0).
+                 - 0.0: Most factual and consistent (recommended for research)
+                 - Higher values: More creative but potentially less accurate
+
+         Returns:
+             SearchResult object containing:
+             - sources: List of cited sources with URLs and metadata
+             - answer: Comprehensive research summary with inline citations
+             - query: The original search query for reference
+
+         Key features:
+             - Automatic citation generation with source links
+             - Comprehensive research summaries
+             - High-quality source selection
+             - Factual accuracy with proper attribution
+
+         Example usage:
+             - Academic research: Use temperature=0.0 for maximum accuracy
+             - Creative exploration: Use temperature=0.3-0.7 for more varied perspectives
+             - Fact-checking: Use temperature=0.0 with specific, detailed queries
+         """
+         self.logger.info(f"Performing Google AI search for query: {query}")
+
+         try:
+             from wizsearch import GoogleAISearch
+
+             # Initialize Google AI Search client
+             google = GoogleAISearch()
+
+             # Perform search
+             return google.search(query=query, model=model, temperature=temperature)
+
+         except Exception as e:
+             self.logger.error(f"Error in Google AI search: {e}")
+             raise RuntimeError(f"Google AI search failed: {str(e)}")
+
+     async def get_tools_map(self) -> Dict[str, Callable]:
+         """
+         Get the mapping of tool names to their implementation functions.
+
+         Returns:
+             Dictionary mapping tool names to callable functions
+         """
+         return {
+             "search_google_api": self.search_google_api,
+             "get_web_content": self.get_web_content,
+             "web_qa": self.web_qa,
+             "tavily_search": self.tavily_search,
+             "google_ai_search": self.google_ai_search,
+         }