noesium-0.1.0-py3-none-any.whl → noesium-0.2.1-py3-none-any.whl
This diff shows the changes between package versions that have been publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- noesium/agents/askura_agent/__init__.py +22 -0
- noesium/agents/askura_agent/askura_agent.py +480 -0
- noesium/agents/askura_agent/conversation.py +164 -0
- noesium/agents/askura_agent/extractor.py +175 -0
- noesium/agents/askura_agent/memory.py +14 -0
- noesium/agents/askura_agent/models.py +239 -0
- noesium/agents/askura_agent/prompts.py +202 -0
- noesium/agents/askura_agent/reflection.py +234 -0
- noesium/agents/askura_agent/summarizer.py +30 -0
- noesium/agents/askura_agent/utils.py +6 -0
- noesium/agents/deep_research/__init__.py +13 -0
- noesium/agents/deep_research/agent.py +398 -0
- noesium/agents/deep_research/prompts.py +84 -0
- noesium/agents/deep_research/schemas.py +42 -0
- noesium/agents/deep_research/state.py +54 -0
- noesium/agents/search/__init__.py +5 -0
- noesium/agents/search/agent.py +474 -0
- noesium/agents/search/state.py +28 -0
- noesium/core/__init__.py +1 -1
- noesium/core/agent/base.py +10 -2
- noesium/core/goalith/decomposer/llm_decomposer.py +1 -1
- noesium/core/llm/__init__.py +1 -1
- noesium/core/llm/base.py +2 -2
- noesium/core/llm/litellm.py +42 -21
- noesium/core/llm/llamacpp.py +25 -4
- noesium/core/llm/ollama.py +43 -22
- noesium/core/llm/openai.py +25 -5
- noesium/core/llm/openrouter.py +1 -1
- noesium/core/toolify/base.py +9 -2
- noesium/core/toolify/config.py +2 -2
- noesium/core/toolify/registry.py +21 -5
- noesium/core/tracing/opik_tracing.py +7 -7
- noesium/core/vector_store/__init__.py +2 -2
- noesium/core/vector_store/base.py +1 -1
- noesium/core/vector_store/pgvector.py +10 -13
- noesium/core/vector_store/weaviate.py +2 -1
- noesium/toolkits/__init__.py +1 -0
- noesium/toolkits/arxiv_toolkit.py +310 -0
- noesium/toolkits/audio_aliyun_toolkit.py +441 -0
- noesium/toolkits/audio_toolkit.py +370 -0
- noesium/toolkits/bash_toolkit.py +332 -0
- noesium/toolkits/document_toolkit.py +454 -0
- noesium/toolkits/file_edit_toolkit.py +552 -0
- noesium/toolkits/github_toolkit.py +395 -0
- noesium/toolkits/gmail_toolkit.py +575 -0
- noesium/toolkits/image_toolkit.py +425 -0
- noesium/toolkits/memory_toolkit.py +398 -0
- noesium/toolkits/python_executor_toolkit.py +334 -0
- noesium/toolkits/search_toolkit.py +451 -0
- noesium/toolkits/serper_toolkit.py +623 -0
- noesium/toolkits/tabular_data_toolkit.py +537 -0
- noesium/toolkits/user_interaction_toolkit.py +365 -0
- noesium/toolkits/video_toolkit.py +168 -0
- noesium/toolkits/wikipedia_toolkit.py +420 -0
- noesium-0.2.1.dist-info/METADATA +253 -0
- {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/RECORD +59 -23
- {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/licenses/LICENSE +1 -1
- noesium-0.1.0.dist-info/METADATA +0 -525
- {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/WHEEL +0 -0
- {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/top_level.txt +0 -0
noesium/toolkits/document_toolkit.py (new file)

@@ -0,0 +1,454 @@

```python
"""
Document processing toolkit for parsing and analyzing various document formats.

Provides tools for document parsing, content extraction, and Q&A capabilities
supporting multiple backends including Chunkr and PyMuPDF.
"""

import hashlib
import os
from pathlib import Path
from typing import Callable, Dict, Optional
from urllib.parse import urlparse

import aiohttp

from noesium.core.toolify.base import AsyncBaseToolkit
from noesium.core.toolify.config import ToolkitConfig
from noesium.core.toolify.registry import register_toolkit
from noesium.core.utils.logging import get_logger

logger = get_logger(__name__)

# Document processing backends
try:
    import fitz  # PyMuPDF

    PYMUPDF_AVAILABLE = True
except ImportError:
    fitz = None
    PYMUPDF_AVAILABLE = False

# Chunkr would be imported dynamically if configured


@register_toolkit("document")
class DocumentToolkit(AsyncBaseToolkit):
    """
    Toolkit for document processing and analysis.

    This toolkit provides capabilities for:
    - Multi-format document parsing (PDF, DOCX, PPTX, XLSX, etc.)
    - Content extraction and text processing
    - Document Q&A using LLM analysis
    - Support for multiple parsing backends
    - URL and local file processing

    Features:
    - Multiple backend support (Chunkr, PyMuPDF)
    - Automatic format detection
    - Content size limiting and chunking
    - LLM-powered document analysis
    - Caching for repeated processing
    - Comprehensive error handling

    Supported Formats:
    - PDF documents
    - Microsoft Office (DOCX, PPTX, XLSX, XLS, PPT, DOC)
    - Text-based formats
    - Web URLs to documents

    Backends:
    - **Chunkr**: Advanced document parsing with layout understanding
    - **PyMuPDF**: Fast PDF processing with text and metadata extraction

    Required configuration:
    - parser: Backend to use ("chunkr" or "pymupdf")
    - Backend-specific configuration (API keys, etc.)
    """

    def __init__(self, config: ToolkitConfig = None):
        """
        Initialize the document toolkit.

        Args:
            config: Toolkit configuration containing parser settings
        """
        super().__init__(config)

        # Configuration
        self.parser_type = self.config.config.get("parser", "pymupdf")
        self.text_limit = self.config.config.get("text_limit", 100000)
        self.cache_dir = Path(self.config.config.get("cache_dir", "./document_cache"))
        self.download_dir = Path(self.config.config.get("download_dir", "./document_downloads"))

        # Create directories
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.download_dir.mkdir(parents=True, exist_ok=True)

        # Initialize parser
        self.parser = None
        self._init_parser()

        # Cache for MD5 to file path mapping
        self.md5_to_path = {}

        self.logger.info(f"Document toolkit initialized with {self.parser_type} parser")

    def _init_parser(self):
        """Initialize the document parser based on configuration."""
        if self.parser_type == "chunkr":
            try:
                self.parser = ChunkrParser(self.config.config)
                self.logger.info("Chunkr parser initialized")
            except Exception as e:
                self.logger.error(f"Failed to initialize Chunkr parser: {e}")
                self._fallback_to_pymupdf()

        elif self.parser_type == "pymupdf":
            if PYMUPDF_AVAILABLE:
                self.parser = PyMuPDFParser(self.config.config)
                self.logger.info("PyMuPDF parser initialized")
            else:
                self.logger.error("PyMuPDF not available, install with: pip install PyMuPDF")
                self.parser = None
        else:
            self.logger.error(f"Unknown parser type: {self.parser_type}")
            self._fallback_to_pymupdf()

    def _fallback_to_pymupdf(self):
        """Fall back to the PyMuPDF parser if available."""
        if PYMUPDF_AVAILABLE:
            self.parser = PyMuPDFParser(self.config.config)
            self.parser_type = "pymupdf"
            self.logger.info("Fell back to PyMuPDF parser")
        else:
            self.parser = None
            self.logger.error("No document parser available")

    def _get_file_md5(self, file_path: str) -> str:
        """Calculate the MD5 hash of a file."""
        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def _is_url(self, path: str) -> bool:
        """Check if the path is a URL."""
        try:
            result = urlparse(path)
            return all([result.scheme, result.netloc])
        except Exception:
            return False

    def _get_file_extension(self, path: str) -> str:
        """Get the file extension (including the leading dot) from a path or URL."""
        parsed = urlparse(path)
        suffix = Path(parsed.path).suffix.lower()
        # Keep the leading dot so the suffix can be appended directly to a filename
        return suffix

    async def _download_document(self, url: str, output_path: Path) -> Path:
        """Download a document from a URL."""
        self.logger.info(f"Downloading document from: {url}")

        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=30) as response:
                    response.raise_for_status()

                    with open(output_path, "wb") as f:
                        async for chunk in response.content.iter_chunked(8192):
                            f.write(chunk)

            self.logger.info(f"Document downloaded to: {output_path}")
            return output_path

        except Exception as e:
            self.logger.error(f"Failed to download document: {e}")
            raise

    async def _handle_document_path(self, document_path: str) -> str:
        """
        Handle a document path - download if URL, calculate MD5, and cache.

        Args:
            document_path: Path or URL to document

        Returns:
            MD5 hash of the document file
        """
        if self._is_url(document_path):
            # Generate filename based on URL
            ext = self._get_file_extension(document_path) or ".pdf"
            url_hash = hashlib.md5(document_path.encode()).hexdigest()[:8]
            local_path = self.download_dir / f"{url_hash}{ext}"

            # Download if not already cached
            if not local_path.exists():
                await self._download_document(document_path, local_path)

            file_path = str(local_path)
        else:
            # Local file
            if not os.path.exists(document_path):
                raise FileNotFoundError(f"Document file not found: {document_path}")
            file_path = document_path

        # Calculate MD5 and cache the mapping
        md5_hash = self._get_file_md5(file_path)
        self.md5_to_path[md5_hash] = file_path

        return md5_hash

    async def _parse_document(self, md5_hash: str) -> str:
        """
        Parse a document using the configured parser.

        Args:
            md5_hash: MD5 hash of the document

        Returns:
            Parsed document content as markdown/text
        """
        if not self.parser:
            raise ValueError("No document parser available")

        # Check cache first
        cache_file = self.cache_dir / f"{md5_hash}.txt"
        if cache_file.exists():
            with open(cache_file, "r", encoding="utf-8") as f:
                return f.read()

        # Get file path
        if md5_hash not in self.md5_to_path:
            raise ValueError(f"Document with MD5 {md5_hash} not found in cache")

        file_path = self.md5_to_path[md5_hash]

        try:
            # Parse the document
            self.logger.info(f"Parsing document: {file_path}")
            content = await self.parser.parse(file_path)

            # Cache the result
            with open(cache_file, "w", encoding="utf-8") as f:
                f.write(content)

            self.logger.info(f"Document parsed successfully ({len(content)} characters)")
            return content

        except Exception as e:
            self.logger.error(f"Document parsing failed: {e}")
            raise

    async def document_qa(self, document_path: str, question: Optional[str] = None) -> str:
        """
        Analyze a document and answer questions about its content.

        This tool processes various document formats and uses LLM analysis to
        answer questions about the content or provide summaries. It supports
        multiple document types and provides intelligent content analysis.

        Supported file types:
        - **PDF**: Portable Document Format files
        - **Microsoft Office**: DOCX, PPTX, XLSX, XLS, PPT, DOC
        - **Text formats**: TXT, MD, RTF
        - **Web URLs**: Direct links to documents

        Features:
        - Automatic format detection and parsing
        - Content extraction with layout preservation
        - Intelligent summarization when no question is provided
        - Context-aware question answering
        - Large document handling with chunking

        Args:
            document_path: Local path or URL to the document
            question: Specific question about the document (optional)

        Returns:
            Answer to the question or document summary

        Examples:
            - document_qa("report.pdf", "What are the key findings?")
            - document_qa("presentation.pptx", "Summarize the main points")
            - document_qa("data.xlsx")  # Returns summary
            - document_qa("https://example.com/doc.pdf", "What is the conclusion?")
        """
        self.logger.info(f"Processing document Q&A for: {document_path}")
        if question:
            self.logger.info(f"Question: {question}")

        try:
            # Handle document path and get MD5
            md5_hash = await self._handle_document_path(document_path)

            # Parse the document
            document_content = await self._parse_document(md5_hash)

            if not document_content.strip():
                return "No content could be extracted from the document."

            # Limit content size for LLM processing
            if len(document_content) > self.text_limit:
                document_content = document_content[: self.text_limit] + "\n..."
                self.logger.info(f"Content truncated to {self.text_limit} characters")

            # Prepare LLM prompt
            if question:
                prompt = f"""Based on the following document content, please answer the question.

Document: {document_path}
Content:
{document_content}

Question: {question}

Please provide a detailed answer based on the document content above."""
            else:
                prompt = f"""Please provide a comprehensive summary of the following document.

Document: {document_path}
Content:
{document_content}

Please summarize the key points, main topics, and important information from this document."""

            # Use LLM for analysis
            response = await self.llm_client.completion(
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant specializing in document analysis. Provide clear, accurate responses based on the provided document content.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,
                max_tokens=1000,
            )

            # Format response
            if not question:
                response = f"Document summary for {document_path}:\n\n{response}"

            return response.strip()

        except Exception as e:
            error_msg = f"Document analysis failed: {str(e)}"
            self.logger.error(error_msg)
            return error_msg

    async def extract_text(self, document_path: str) -> str:
        """
        Extract raw text content from a document.

        Args:
            document_path: Path to local document file or URL

        Returns:
            Extracted text content
        """
        try:
            md5_hash = await self._handle_document_path(document_path)
            content = await self._parse_document(md5_hash)
            return content

        except Exception as e:
            return f"Text extraction failed: {str(e)}"

    async def get_document_info(self, document_path: str) -> Dict:
        """
        Get information about a document file.

        Args:
            document_path: Path to document file or URL

        Returns:
            Dictionary with document metadata
        """
        try:
            md5_hash = await self._handle_document_path(document_path)
            file_path = self.md5_to_path[md5_hash]

            # Get basic file info
            file_stat = os.stat(file_path)
            file_size = file_stat.st_size

            # Get content info
            content = await self._parse_document(md5_hash)

            return {
                "path": document_path,
                "local_path": file_path,
                "file_size_bytes": file_size,
                "file_size_mb": round(file_size / (1024 * 1024), 2),
                "extension": Path(file_path).suffix,
                "md5_hash": md5_hash,
                "content_length": len(content),
                "parser_used": self.parser_type,
                "word_count": len(content.split()) if content else 0,
            }

        except Exception as e:
            return {"error": f"Failed to get document info: {str(e)}"}

    async def get_tools_map(self) -> Dict[str, Callable]:
        """
        Get the mapping of tool names to their implementation functions.

        Returns:
            Dictionary mapping tool names to callable functions
        """
        return {
            "document_qa": self.document_qa,
            "extract_text": self.extract_text,
            "get_document_info": self.get_document_info,
        }


class PyMuPDFParser:
    """Simple PDF parser using PyMuPDF."""

    def __init__(self, config: Dict):
        self.config = config

    async def parse(self, file_path: str) -> str:
        """Parse a PDF file and extract its text."""
        if not PYMUPDF_AVAILABLE:
            raise ImportError("PyMuPDF not available")

        try:
            doc = fitz.open(file_path)
            text_content = []

            for page_num in range(doc.page_count):
                page = doc[page_num]
                text = page.get_text()
                if text.strip():
                    text_content.append(f"## Page {page_num + 1}\n\n{text}")

            doc.close()
            return "\n\n".join(text_content)

        except Exception as e:
            raise ValueError(f"Failed to parse PDF: {str(e)}")


class ChunkrParser:
    """Document parser using the Chunkr service."""

    def __init__(self, config: Dict):
        self.config = config
        self.api_key = config.get("chunkr_api_key")
        self.base_url = config.get("chunkr_base_url", "https://api.chunkr.ai")

    async def parse(self, file_path: str) -> str:
        """Parse a document using the Chunkr API."""
        if not self.api_key:
            raise ValueError("Chunkr API key not configured")

        # This would implement the actual Chunkr API integration
        # For now, return a placeholder
        return f"Chunkr parsing not fully implemented for: {file_path}"
```
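For orientation, a minimal usage sketch follows. It is not taken from the package: it assumes that `ToolkitConfig` accepts its settings as a `config` dict (the toolkit reads them through `self.config.config` above) and that `AsyncBaseToolkit` wires up the `llm_client` that `document_qa` calls; neither constructor is shown in this diff.

```python
import asyncio

from noesium.core.toolify.config import ToolkitConfig
from noesium.toolkits.document_toolkit import DocumentToolkit


async def main():
    # Assumption: ToolkitConfig takes its settings as a `config` dict,
    # matching the self.config.config.get(...) reads in __init__ above.
    config = ToolkitConfig(config={"parser": "pymupdf", "text_limit": 100000})
    toolkit = DocumentToolkit(config)

    # No question -> summary; with a question -> targeted answer.
    print(await toolkit.document_qa("report.pdf"))
    print(await toolkit.document_qa("report.pdf", "What are the key findings?"))


asyncio.run(main())
```

Whatever the exact wiring, the caching behavior is visible in the code itself: parsed text is stored in `cache_dir` under the file's MD5 hash, so the second call reuses the extracted text instead of re-parsing the PDF.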