noesium-0.1.0-py3-none-any.whl → noesium-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. noesium/agents/askura_agent/__init__.py +22 -0
  2. noesium/agents/askura_agent/askura_agent.py +480 -0
  3. noesium/agents/askura_agent/conversation.py +164 -0
  4. noesium/agents/askura_agent/extractor.py +175 -0
  5. noesium/agents/askura_agent/memory.py +14 -0
  6. noesium/agents/askura_agent/models.py +239 -0
  7. noesium/agents/askura_agent/prompts.py +202 -0
  8. noesium/agents/askura_agent/reflection.py +234 -0
  9. noesium/agents/askura_agent/summarizer.py +30 -0
  10. noesium/agents/askura_agent/utils.py +6 -0
  11. noesium/agents/deep_research/__init__.py +13 -0
  12. noesium/agents/deep_research/agent.py +398 -0
  13. noesium/agents/deep_research/prompts.py +84 -0
  14. noesium/agents/deep_research/schemas.py +42 -0
  15. noesium/agents/deep_research/state.py +54 -0
  16. noesium/agents/search/__init__.py +5 -0
  17. noesium/agents/search/agent.py +474 -0
  18. noesium/agents/search/state.py +28 -0
  19. noesium/core/__init__.py +1 -1
  20. noesium/core/agent/base.py +10 -2
  21. noesium/core/goalith/decomposer/llm_decomposer.py +1 -1
  22. noesium/core/llm/__init__.py +1 -1
  23. noesium/core/llm/base.py +2 -2
  24. noesium/core/llm/litellm.py +42 -21
  25. noesium/core/llm/llamacpp.py +25 -4
  26. noesium/core/llm/ollama.py +43 -22
  27. noesium/core/llm/openai.py +25 -5
  28. noesium/core/llm/openrouter.py +1 -1
  29. noesium/core/toolify/base.py +9 -2
  30. noesium/core/toolify/config.py +2 -2
  31. noesium/core/toolify/registry.py +21 -5
  32. noesium/core/tracing/opik_tracing.py +7 -7
  33. noesium/core/vector_store/__init__.py +2 -2
  34. noesium/core/vector_store/base.py +1 -1
  35. noesium/core/vector_store/pgvector.py +10 -13
  36. noesium/core/vector_store/weaviate.py +2 -1
  37. noesium/toolkits/__init__.py +1 -0
  38. noesium/toolkits/arxiv_toolkit.py +310 -0
  39. noesium/toolkits/audio_aliyun_toolkit.py +441 -0
  40. noesium/toolkits/audio_toolkit.py +370 -0
  41. noesium/toolkits/bash_toolkit.py +332 -0
  42. noesium/toolkits/document_toolkit.py +454 -0
  43. noesium/toolkits/file_edit_toolkit.py +552 -0
  44. noesium/toolkits/github_toolkit.py +395 -0
  45. noesium/toolkits/gmail_toolkit.py +575 -0
  46. noesium/toolkits/image_toolkit.py +425 -0
  47. noesium/toolkits/memory_toolkit.py +398 -0
  48. noesium/toolkits/python_executor_toolkit.py +334 -0
  49. noesium/toolkits/search_toolkit.py +451 -0
  50. noesium/toolkits/serper_toolkit.py +623 -0
  51. noesium/toolkits/tabular_data_toolkit.py +537 -0
  52. noesium/toolkits/user_interaction_toolkit.py +365 -0
  53. noesium/toolkits/video_toolkit.py +168 -0
  54. noesium/toolkits/wikipedia_toolkit.py +420 -0
  55. {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/METADATA +56 -48
  56. {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/RECORD +59 -23
  57. {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/licenses/LICENSE +1 -1
  58. {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/WHEEL +0 -0
  59. {noesium-0.1.0.dist-info → noesium-0.2.0.dist-info}/top_level.txt +0 -0
noesium/toolkits/document_toolkit.py ADDED
@@ -0,0 +1,454 @@
+ """
+ Document processing toolkit for parsing and analyzing various document formats.
+
+ Provides tools for document parsing, content extraction, and Q&A capabilities,
+ supporting multiple backends including Chunkr and PyMuPDF.
+ """
+
+ import hashlib
+ import os
+ from pathlib import Path
+ from typing import Callable, Dict, Optional
+ from urllib.parse import urlparse
+
+ import aiohttp
+
+ from noesium.core.toolify.base import AsyncBaseToolkit
+ from noesium.core.toolify.config import ToolkitConfig
+ from noesium.core.toolify.registry import register_toolkit
+ from noesium.core.utils.logging import get_logger
+
+ logger = get_logger(__name__)
+
+ # Document processing backends
+ try:
+     import fitz  # PyMuPDF
+
+     PYMUPDF_AVAILABLE = True
+ except ImportError:
+     fitz = None
+     PYMUPDF_AVAILABLE = False
+
+ # Chunkr would be imported dynamically if configured
+
+
+ @register_toolkit("document")
+ class DocumentToolkit(AsyncBaseToolkit):
+     """
+     Toolkit for document processing and analysis.
+
+     This toolkit provides capabilities for:
+     - Multi-format document parsing (PDF, DOCX, PPTX, XLSX, etc.)
+     - Content extraction and text processing
+     - Document Q&A using LLM analysis
+     - Support for multiple parsing backends
+     - URL and local file processing
+
+     Features:
+     - Multiple backend support (Chunkr, PyMuPDF)
+     - Automatic format detection
+     - Content size limiting and chunking
+     - LLM-powered document analysis
+     - Caching for repeated processing
+     - Comprehensive error handling
+
+     Supported Formats:
+     - PDF documents
+     - Microsoft Office (DOCX, PPTX, XLSX, XLS, PPT, DOC)
+     - Text-based formats
+     - Web URLs to documents
+
+     Backends:
+     - **Chunkr**: Advanced document parsing with layout understanding
+     - **PyMuPDF**: Fast PDF processing with text and metadata extraction
+
+     Required configuration:
+     - parser: Backend to use ("chunkr" or "pymupdf")
+     - Backend-specific configuration (API keys, etc.)
+     """
+
+     def __init__(self, config: Optional[ToolkitConfig] = None):
+         """
+         Initialize the document toolkit.
+
+         Args:
+             config: Toolkit configuration containing parser settings
+         """
+         super().__init__(config)
+
+         # Configuration
+         self.parser_type = self.config.config.get("parser", "pymupdf")
+         self.text_limit = self.config.config.get("text_limit", 100000)
+         self.cache_dir = Path(self.config.config.get("cache_dir", "./document_cache"))
+         self.download_dir = Path(self.config.config.get("download_dir", "./document_downloads"))
+
+         # Create directories
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         self.download_dir.mkdir(parents=True, exist_ok=True)
+
+         # Initialize parser
+         self.parser = None
+         self._init_parser()
+
+         # Cache for MD5 to file path mapping
+         self.md5_to_path = {}
+
+         self.logger.info(f"Document toolkit initialized with {self.parser_type} parser")
+
+     def _init_parser(self):
+         """Initialize the document parser based on configuration."""
+         if self.parser_type == "chunkr":
+             try:
+                 self.parser = ChunkrParser(self.config.config)
+                 self.logger.info("Chunkr parser initialized")
+             except Exception as e:
+                 self.logger.error(f"Failed to initialize Chunkr parser: {e}")
+                 self._fallback_to_pymupdf()
+
+         elif self.parser_type == "pymupdf":
+             if PYMUPDF_AVAILABLE:
+                 self.parser = PyMuPDFParser(self.config.config)
+                 self.logger.info("PyMuPDF parser initialized")
+             else:
+                 self.logger.error("PyMuPDF not available, install with: pip install PyMuPDF")
+                 self.parser = None
+         else:
+             self.logger.error(f"Unknown parser type: {self.parser_type}")
+             self._fallback_to_pymupdf()
+
+     def _fallback_to_pymupdf(self):
+         """Fall back to the PyMuPDF parser if available."""
+         if PYMUPDF_AVAILABLE:
+             self.parser = PyMuPDFParser(self.config.config)
+             self.parser_type = "pymupdf"
+             self.logger.info("Fell back to PyMuPDF parser")
+         else:
+             self.parser = None
+             self.logger.error("No document parser available")
+
+     def _get_file_md5(self, file_path: str) -> str:
+         """Calculate the MD5 hash of a file."""
+         hash_md5 = hashlib.md5()
+         with open(file_path, "rb") as f:
+             for chunk in iter(lambda: f.read(4096), b""):
+                 hash_md5.update(chunk)
+         return hash_md5.hexdigest()
+
+     def _is_url(self, path: str) -> bool:
+         """Check if the path is a URL."""
+         try:
+             result = urlparse(path)
+             return all([result.scheme, result.netloc])
+         except Exception:
+             return False
+
+     def _get_file_extension(self, path: str) -> str:
+         """Get the file extension from a path or URL, without the leading dot."""
+         parsed = urlparse(path)
+         suffix = Path(parsed.path).suffix.lower()
+         # Remove the leading dot if present
+         return suffix[1:] if suffix.startswith(".") else suffix
+
+     async def _download_document(self, url: str, output_path: Path) -> Path:
+         """Download a document from a URL."""
+         self.logger.info(f"Downloading document from: {url}")
+
+         try:
+             async with aiohttp.ClientSession() as session:
+                 async with session.get(url, timeout=30) as response:
+                     response.raise_for_status()
+
+                     with open(output_path, "wb") as f:
+                         async for chunk in response.content.iter_chunked(8192):
+                             f.write(chunk)
+
+             self.logger.info(f"Document downloaded to: {output_path}")
+             return output_path
+
+         except Exception as e:
+             self.logger.error(f"Failed to download document: {e}")
+             raise
+
+     async def _handle_document_path(self, document_path: str) -> str:
+         """
+         Handle a document path: download if it is a URL, calculate MD5, and cache.
+
+         Args:
+             document_path: Path or URL to document
+
+         Returns:
+             MD5 hash of the document file
+         """
+         if self._is_url(document_path):
+             # Generate a filename based on the URL; _get_file_extension returns
+             # the extension without the leading dot, so add it back here
+             ext = self._get_file_extension(document_path) or "pdf"
+             url_hash = hashlib.md5(document_path.encode()).hexdigest()[:8]
+             local_path = self.download_dir / f"{url_hash}.{ext}"
+
+             # Download if not already cached
+             if not local_path.exists():
+                 await self._download_document(document_path, local_path)
+
+             file_path = str(local_path)
+         else:
+             # Local file
+             if not os.path.exists(document_path):
+                 raise FileNotFoundError(f"Document file not found: {document_path}")
+             file_path = document_path
+
+         # Calculate MD5 and cache the mapping
+         md5_hash = self._get_file_md5(file_path)
+         self.md5_to_path[md5_hash] = file_path
+
+         return md5_hash
+
+     async def _parse_document(self, md5_hash: str) -> str:
+         """
+         Parse document using the configured parser.
+
+         Args:
+             md5_hash: MD5 hash of the document
+
+         Returns:
+             Parsed document content as markdown/text
+         """
+         if not self.parser:
+             raise ValueError("No document parser available")
+
+         # Check cache first
+         cache_file = self.cache_dir / f"{md5_hash}.txt"
+         if cache_file.exists():
+             with open(cache_file, "r", encoding="utf-8") as f:
+                 return f.read()
+
+         # Get file path
+         if md5_hash not in self.md5_to_path:
+             raise ValueError(f"Document with MD5 {md5_hash} not found in cache")
+
+         file_path = self.md5_to_path[md5_hash]
+
+         try:
+             # Parse the document
+             self.logger.info(f"Parsing document: {file_path}")
+             content = await self.parser.parse(file_path)
+
+             # Cache the result
+             with open(cache_file, "w", encoding="utf-8") as f:
+                 f.write(content)
+
+             self.logger.info(f"Document parsed successfully ({len(content)} characters)")
+             return content
+
+         except Exception as e:
+             self.logger.error(f"Document parsing failed: {e}")
+             raise
+
+     async def document_qa(self, document_path: str, question: Optional[str] = None) -> str:
+         """
+         Analyze a document and answer questions about its content.
+
+         This tool processes various document formats and uses LLM analysis to
+         answer questions about the content or provide summaries. It supports
+         multiple document types and provides intelligent content analysis.
+
+         Supported file types:
+         - **PDF**: Portable Document Format files
+         - **Microsoft Office**: DOCX, PPTX, XLSX, XLS, PPT, DOC
+         - **Text formats**: TXT, MD, RTF
+         - **Web URLs**: Direct links to documents
+
+         Features:
+         - Automatic format detection and parsing
+         - Content extraction with layout preservation
+         - Intelligent summarization when no question provided
+         - Context-aware question answering
+         - Large document handling with chunking
+
+         Args:
+             document_path: Local path or URL to the document
+             question: Specific question about the document (optional)
+
+         Returns:
+             Answer to the question or document summary
+
+         Examples:
+             - document_qa("report.pdf", "What are the key findings?")
+             - document_qa("presentation.pptx", "Summarize the main points")
+             - document_qa("data.xlsx")  # Returns summary
+             - document_qa("https://example.com/doc.pdf", "What is the conclusion?")
+         """
+         self.logger.info(f"Processing document Q&A for: {document_path}")
+         if question:
+             self.logger.info(f"Question: {question}")
+
+         try:
+             # Handle document path and get MD5
+             md5_hash = await self._handle_document_path(document_path)
+
+             # Parse the document
+             document_content = await self._parse_document(md5_hash)
+
+             if not document_content.strip():
+                 return "No content could be extracted from the document."
+
+             # Limit content size for LLM processing
+             if len(document_content) > self.text_limit:
+                 document_content = document_content[: self.text_limit] + "\n..."
+                 self.logger.info(f"Content truncated to {self.text_limit} characters")
+
+             # Prepare LLM prompt
+             if question:
+                 prompt = f"""Based on the following document content, please answer the question.
+
+ Document: {document_path}
+ Content:
+ {document_content}
+
+ Question: {question}
+
+ Please provide a detailed answer based on the document content above."""
+             else:
+                 prompt = f"""Please provide a comprehensive summary of the following document.
+
+ Document: {document_path}
+ Content:
+ {document_content}
+
+ Please summarize the key points, main topics, and important information from this document."""
+
+             # Use LLM for analysis
+             response = await self.llm_client.completion(
+                 messages=[
+                     {
+                         "role": "system",
+                         "content": "You are a helpful assistant specializing in document analysis. Provide clear, accurate responses based on the provided document content.",
+                     },
+                     {"role": "user", "content": prompt},
+                 ],
+                 temperature=0.1,
+                 max_tokens=1000,
+             )
+
+             # Format response
+             if not question:
+                 response = f"Document summary for {document_path}:\n\n{response}"
+
+             return response.strip()
+
+         except Exception as e:
+             error_msg = f"Document analysis failed: {str(e)}"
+             self.logger.error(error_msg)
+             return error_msg
+
+     async def extract_text(self, document_path: str) -> str:
+         """
+         Extract raw text content from a document.
+
+         Args:
+             document_path: Path to local document file or URL
+
+         Returns:
+             Extracted text content
+         """
+         try:
+             md5_hash = await self._handle_document_path(document_path)
+             content = await self._parse_document(md5_hash)
+             return content
+
+         except Exception as e:
+             return f"Text extraction failed: {str(e)}"
+
+     async def get_document_info(self, document_path: str) -> Dict:
+         """
+         Get information about a document file.
+
+         Args:
+             document_path: Path to document file or URL
+
+         Returns:
+             Dictionary with document metadata
+         """
+         try:
+             md5_hash = await self._handle_document_path(document_path)
+             file_path = self.md5_to_path[md5_hash]
+
+             # Get basic file info
+             file_stat = os.stat(file_path)
+             file_size = file_stat.st_size
+
+             # Get content info
+             content = await self._parse_document(md5_hash)
+
+             return {
+                 "path": document_path,
+                 "local_path": file_path,
+                 "file_size_bytes": file_size,
+                 "file_size_mb": round(file_size / (1024 * 1024), 2),
+                 "extension": Path(file_path).suffix,
+                 "md5_hash": md5_hash,
+                 "content_length": len(content),
+                 "parser_used": self.parser_type,
+                 "word_count": len(content.split()) if content else 0,
+             }
+
+         except Exception as e:
+             return {"error": f"Failed to get document info: {str(e)}"}
+
+     async def get_tools_map(self) -> Dict[str, Callable]:
+         """
+         Get the mapping of tool names to their implementation functions.
+
+         Returns:
+             Dictionary mapping tool names to callable functions
+         """
+         return {
+             "document_qa": self.document_qa,
+             "extract_text": self.extract_text,
+             "get_document_info": self.get_document_info,
+         }
+
+
+ class PyMuPDFParser:
+     """Simple PDF parser using PyMuPDF."""
+
+     def __init__(self, config: Dict):
+         self.config = config
+
+     async def parse(self, file_path: str) -> str:
+         """Parse PDF file and extract text."""
+         if not PYMUPDF_AVAILABLE:
+             raise ImportError("PyMuPDF not available")
+
+         try:
+             doc = fitz.open(file_path)
+             text_content = []
+
+             for page_num in range(doc.page_count):
+                 page = doc[page_num]
+                 text = page.get_text()
+                 if text.strip():
+                     text_content.append(f"## Page {page_num + 1}\n\n{text}")
+
+             doc.close()
+             return "\n\n".join(text_content)
+
+         except Exception as e:
+             raise ValueError(f"Failed to parse PDF: {str(e)}")
+
+
+ class ChunkrParser:
+     """Document parser using Chunkr service."""
+
+     def __init__(self, config: Dict):
+         self.config = config
+         self.api_key = config.get("chunkr_api_key")
+         self.base_url = config.get("chunkr_base_url", "https://api.chunkr.ai")
+
+     async def parse(self, file_path: str) -> str:
+         """Parse document using Chunkr API."""
+         if not self.api_key:
+             raise ValueError("Chunkr API key not configured")
+
+         # This would implement the actual Chunkr API integration
+         # For now, return a placeholder
+         return f"Chunkr parsing not fully implemented for: {file_path}"