arionxiv 1.0.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arionxiv/__init__.py +40 -0
- arionxiv/__main__.py +10 -0
- arionxiv/arxiv_operations/__init__.py +0 -0
- arionxiv/arxiv_operations/client.py +225 -0
- arionxiv/arxiv_operations/fetcher.py +173 -0
- arionxiv/arxiv_operations/searcher.py +122 -0
- arionxiv/arxiv_operations/utils.py +293 -0
- arionxiv/cli/__init__.py +4 -0
- arionxiv/cli/commands/__init__.py +1 -0
- arionxiv/cli/commands/analyze.py +587 -0
- arionxiv/cli/commands/auth.py +365 -0
- arionxiv/cli/commands/chat.py +714 -0
- arionxiv/cli/commands/daily.py +482 -0
- arionxiv/cli/commands/fetch.py +217 -0
- arionxiv/cli/commands/library.py +295 -0
- arionxiv/cli/commands/preferences.py +426 -0
- arionxiv/cli/commands/search.py +254 -0
- arionxiv/cli/commands/settings_unified.py +1407 -0
- arionxiv/cli/commands/trending.py +41 -0
- arionxiv/cli/commands/welcome.py +168 -0
- arionxiv/cli/main.py +407 -0
- arionxiv/cli/ui/__init__.py +1 -0
- arionxiv/cli/ui/global_theme_manager.py +173 -0
- arionxiv/cli/ui/logo.py +127 -0
- arionxiv/cli/ui/splash.py +89 -0
- arionxiv/cli/ui/theme.py +32 -0
- arionxiv/cli/ui/theme_system.py +391 -0
- arionxiv/cli/utils/__init__.py +54 -0
- arionxiv/cli/utils/animations.py +522 -0
- arionxiv/cli/utils/api_client.py +583 -0
- arionxiv/cli/utils/api_config.py +505 -0
- arionxiv/cli/utils/command_suggestions.py +147 -0
- arionxiv/cli/utils/db_config_manager.py +254 -0
- arionxiv/github_actions_runner.py +206 -0
- arionxiv/main.py +23 -0
- arionxiv/prompts/__init__.py +9 -0
- arionxiv/prompts/prompts.py +247 -0
- arionxiv/rag_techniques/__init__.py +8 -0
- arionxiv/rag_techniques/basic_rag.py +1531 -0
- arionxiv/scheduler_daemon.py +139 -0
- arionxiv/server.py +1000 -0
- arionxiv/server_main.py +24 -0
- arionxiv/services/__init__.py +73 -0
- arionxiv/services/llm_client.py +30 -0
- arionxiv/services/llm_inference/__init__.py +58 -0
- arionxiv/services/llm_inference/groq_client.py +469 -0
- arionxiv/services/llm_inference/llm_utils.py +250 -0
- arionxiv/services/llm_inference/openrouter_client.py +564 -0
- arionxiv/services/unified_analysis_service.py +872 -0
- arionxiv/services/unified_auth_service.py +457 -0
- arionxiv/services/unified_config_service.py +456 -0
- arionxiv/services/unified_daily_dose_service.py +823 -0
- arionxiv/services/unified_database_service.py +1633 -0
- arionxiv/services/unified_llm_service.py +366 -0
- arionxiv/services/unified_paper_service.py +604 -0
- arionxiv/services/unified_pdf_service.py +522 -0
- arionxiv/services/unified_prompt_service.py +344 -0
- arionxiv/services/unified_scheduler_service.py +589 -0
- arionxiv/services/unified_user_service.py +954 -0
- arionxiv/utils/__init__.py +51 -0
- arionxiv/utils/api_helpers.py +200 -0
- arionxiv/utils/file_cleanup.py +150 -0
- arionxiv/utils/ip_helper.py +96 -0
- arionxiv-1.0.32.dist-info/METADATA +336 -0
- arionxiv-1.0.32.dist-info/RECORD +69 -0
- arionxiv-1.0.32.dist-info/WHEEL +5 -0
- arionxiv-1.0.32.dist-info/entry_points.txt +4 -0
- arionxiv-1.0.32.dist-info/licenses/LICENSE +21 -0
- arionxiv-1.0.32.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,522 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified PDF Processing Service for ArionXiv
|
|
3
|
+
Consolidates pdf_processor.py and advanced_pdf_processor.py
|
|
4
|
+
Supports basic text extraction, OCR, table extraction, image analysis, and metadata extraction
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import asyncio
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Dict, List, Any, Optional, Tuple
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
import tempfile
|
|
13
|
+
import base64
|
|
14
|
+
from io import BytesIO
|
|
15
|
+
|
|
16
|
+
import PyPDF2
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
import fitz
|
|
20
|
+
PYMUPDF_AVAILABLE = True
|
|
21
|
+
except ImportError:
|
|
22
|
+
PYMUPDF_AVAILABLE = False
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
import pdfplumber
|
|
26
|
+
PDFPLUMBER_AVAILABLE = True
|
|
27
|
+
except ImportError:
|
|
28
|
+
PDFPLUMBER_AVAILABLE = False
|
|
29
|
+
|
|
30
|
+
from PIL import Image
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
import pytesseract
|
|
34
|
+
OCR_AVAILABLE = True
|
|
35
|
+
except ImportError:
|
|
36
|
+
OCR_AVAILABLE = False
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
import tabula
|
|
40
|
+
TABULA_AVAILABLE = True
|
|
41
|
+
except ImportError:
|
|
42
|
+
TABULA_AVAILABLE = False
|
|
43
|
+
|
|
44
|
+
logger = logging.getLogger(__name__)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class UnifiedPDFProcessor:
    """
    Comprehensive PDF processor supporting both basic and advanced operations:
    - Basic text extraction (PyPDF2)
    - Advanced text and layout analysis (PyMuPDF, pdfplumber)
    - OCR for scanned documents (Tesseract)
    - Table extraction (tabula-py)
    - Image extraction and analysis
    - Metadata extraction

    Optional backends are detected at import time; ``self.features`` reports
    which capabilities are usable in the current environment.
    """

    def __init__(self):
        # Scratch space for extracted images and other intermediate artifacts.
        self.temp_dir = Path(tempfile.gettempdir()) / "arionxiv_pdf_processing"
        # parents=True keeps this safe even if an intermediate directory is missing.
        self.temp_dir.mkdir(parents=True, exist_ok=True)

        # Capability map derived from the optional imports at module load time.
        self.features = {
            "basic_extraction": True,  # Always available with PyPDF2
            "advanced_extraction": PYMUPDF_AVAILABLE and PDFPLUMBER_AVAILABLE,
            "ocr": OCR_AVAILABLE,
            "table_extraction": TABULA_AVAILABLE,
            "image_extraction": PYMUPDF_AVAILABLE
        }

        logger.info(f"UnifiedPDFProcessor initialized: features={self.features}")

    # ====================
    # BASIC PDF PROCESSING (from pdf_processor.py)
    # ====================

    async def extract_text_basic(self, pdf_path: str) -> str:
        """Extract text from PDF using PyPDF2 (basic method).

        Returns the concatenated page text, or a string starting with
        ``"Error extracting text:"`` on failure (callers check that prefix).
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                text = ""
                for page in reader.pages:
                    # extract_text() can return None for pages without a
                    # text layer; coerce to "" so concatenation never fails.
                    text += page.extract_text() or ""
                return text
        except Exception as e:
            logger.error(f"Basic text extraction failed: path={pdf_path}, error={str(e)}")
            return f"Error extracting text: {str(e)}"

    async def extract_metadata_basic(self, pdf_path: str) -> Dict[str, Any]:
        """Extract metadata from PDF using PyPDF2.

        Returns common document-info fields plus the page count, or
        ``{"error": ...}`` on failure.
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # reader.metadata is None for PDFs without an Info dictionary;
                # fall back to an empty mapping so .get() below stays safe
                # (the original raised AttributeError in that case).
                metadata = reader.metadata or {}
                return {
                    "title": metadata.get("/Title", "Unknown"),
                    "author": metadata.get("/Author", "Unknown"),
                    "subject": metadata.get("/Subject", "Unknown"),
                    "creator": metadata.get("/Creator", "Unknown"),
                    "producer": metadata.get("/Producer", "Unknown"),
                    "creation_date": str(metadata.get("/CreationDate", "Unknown")),
                    "modification_date": str(metadata.get("/ModDate", "Unknown")),
                    "pages": len(reader.pages)
                }
        except Exception as e:
            logger.error(f"Basic metadata extraction failed: path={pdf_path}, error={str(e)}")
            return {"error": f"Metadata extraction failed: {str(e)}"}

    # ====================
    # ADVANCED PDF PROCESSING (from advanced_pdf_processor.py)
    # ====================

    async def extract_text_advanced(self, pdf_path: str, ocr_fallback: bool = True) -> Dict[str, Any]:
        """
        Advanced text extraction with multiple fallback methods.

        Tries PyMuPDF, then pdfplumber, then PyPDF2, then OCR (when
        ``ocr_fallback`` is True and OCR is available). Returns a dict with
        ``success``, ``text``, ``method``, ``pages`` and ``error`` keys.
        """
        result = {
            "success": False,
            "text": "",
            "method": "",
            "pages": 0,
            "error": None
        }

        try:
            # Method 1: Try PyMuPDF first (fastest and most accurate for text PDFs)
            if PYMUPDF_AVAILABLE:
                try:
                    doc = fitz.open(pdf_path)
                    text = ""
                    page_count = len(doc)  # capture before close: closed docs reject len()
                    for page in doc:
                        text += page.get_text()
                    doc.close()

                    if text.strip():  # Check if we got meaningful text
                        result.update({
                            "success": True,
                            "text": text,
                            "method": "pymupdf",
                            "pages": page_count
                        })
                        return result
                except Exception as e:
                    logger.warning(f"PyMuPDF extraction failed: {str(e)}")

            # Method 2: Try pdfplumber (better for complex layouts)
            if PDFPLUMBER_AVAILABLE:
                try:
                    with pdfplumber.open(pdf_path) as pdf:
                        text = ""
                        for page in pdf.pages:
                            page_text = page.extract_text()
                            if page_text:
                                text += page_text + "\n"

                        if text.strip():
                            result.update({
                                "success": True,
                                "text": text,
                                "method": "pdfplumber",
                                "pages": len(pdf.pages)
                            })
                            return result
                except Exception as e:
                    logger.warning(f"pdfplumber extraction failed: {str(e)}")

            # Method 3: Fallback to basic PyPDF2
            text = await self.extract_text_basic(pdf_path)
            if text and not text.startswith("Error"):
                # Use a context manager so the file handle is released
                # (the original opened the file here without ever closing it).
                with open(pdf_path, 'rb') as f:
                    page_count = len(PyPDF2.PdfReader(f).pages)
                result.update({
                    "success": True,
                    "text": text,
                    "method": "pypdf2",
                    "pages": page_count
                })
                return result

            # Method 4: OCR as last resort (for scanned PDFs)
            if ocr_fallback and OCR_AVAILABLE:
                ocr_result = await self.extract_text_with_ocr(pdf_path)
                if ocr_result["success"]:
                    result.update({
                        "success": True,
                        "text": ocr_result["text"],
                        "method": "ocr",
                        "pages": ocr_result.get("pages", 0)
                    })
                    return result

            result["error"] = "All text extraction methods failed"
            return result

        except Exception as e:
            logger.error(f"Advanced text extraction failed: path={pdf_path}, error={str(e)}")
            result["error"] = f"Extraction failed: {str(e)}"
            return result

    async def extract_text_with_ocr(self, pdf_path: str) -> Dict[str, Any]:
        """
        Extract text using OCR for scanned PDFs.

        Renders each page to a PNG at 2x resolution via PyMuPDF, then runs
        Tesseract on the image. Requires both pytesseract and PyMuPDF.
        """
        if not OCR_AVAILABLE:
            return {
                "success": False,
                "error": "OCR not available (pytesseract not installed)"
            }

        try:
            if not PYMUPDF_AVAILABLE:
                return {
                    "success": False,
                    "error": "PyMuPDF required for OCR processing"
                }

            doc = fitz.open(pdf_path)
            full_text = ""
            # Capture the page count up front: the original evaluated
            # len(doc) *after* doc.close(), which raises on a closed document.
            page_count = len(doc)

            for page_num in range(page_count):
                page = doc[page_num]

                # Convert page to image at 2x resolution for better OCR accuracy
                mat = fitz.Matrix(2.0, 2.0)
                pix = page.get_pixmap(matrix=mat)
                img_data = pix.tobytes("png")

                # Convert to PIL Image
                image = Image.open(BytesIO(img_data))

                # Perform OCR
                page_text = pytesseract.image_to_string(image, lang='eng')
                full_text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"

            doc.close()

            return {
                "success": True,
                "text": full_text,
                "method": "ocr",
                "pages": page_count
            }

        except Exception as e:
            logger.error(f"OCR text extraction failed: path={pdf_path}, error={str(e)}")
            return {
                "success": False,
                "error": f"OCR failed: {str(e)}"
            }

    async def extract_tables(self, pdf_path: str) -> Dict[str, Any]:
        """
        Extract tables from PDF using tabula-py.

        Returns ``{"success": True, "tables": [...], "count": n}`` where each
        table entry carries its columns, rows, and shape.
        """
        if not TABULA_AVAILABLE:
            return {
                "success": False,
                "error": "Table extraction not available (tabula-py not installed)"
            }

        try:
            # Extract all tables from all pages
            tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)

            table_data = []
            for i, table in enumerate(tables):
                table_dict = {
                    "table_id": i + 1,  # 1-based for human-facing output
                    "columns": table.columns.tolist(),
                    "rows": table.values.tolist(),
                    "shape": table.shape
                }
                table_data.append(table_dict)

            return {
                "success": True,
                "tables": table_data,
                "count": len(tables)
            }

        except Exception as e:
            logger.error(f"Table extraction failed: path={pdf_path}, error={str(e)}")
            return {
                "success": False,
                "error": f"Table extraction failed: {str(e)}"
            }

    async def extract_images(self, pdf_path: str, save_images: bool = False) -> Dict[str, Any]:
        """
        Extract images from PDF.

        When ``save_images`` is True, images are written as PNGs under
        ``self.temp_dir`` and each entry carries ``saved_path``; otherwise
        each entry carries the PNG bytes as ``base64``.
        """
        if not PYMUPDF_AVAILABLE:
            return {
                "success": False,
                "error": "Image extraction not available (PyMuPDF not installed)"
            }

        try:
            doc = fitz.open(pdf_path)
            images = []

            for page_num in range(len(doc)):
                page = doc[page_num]
                image_list = page.get_images()

                for img_index, img in enumerate(image_list):
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)

                    # Only handle GRAY/RGB pixmaps; CMYK and other deep
                    # colorspaces are skipped rather than converted.
                    if pix.n - pix.alpha < 4:
                        img_data = {
                            "page": page_num + 1,
                            "image_index": img_index,
                            "width": pix.width,
                            "height": pix.height,
                            "colorspace": pix.colorspace.name if pix.colorspace else "Unknown"
                        }

                        if save_images:
                            # Save image to temp directory
                            img_filename = f"page_{page_num + 1}_img_{img_index}.png"
                            img_path = self.temp_dir / img_filename
                            pix.save(str(img_path))
                            img_data["saved_path"] = str(img_path)
                        else:
                            # Convert to base64 for embedding
                            img_bytes = pix.tobytes("png")
                            img_base64 = base64.b64encode(img_bytes).decode()
                            img_data["base64"] = img_base64

                        images.append(img_data)

                    pix = None  # Free pixmap memory promptly

            doc.close()

            return {
                "success": True,
                "images": images,
                "count": len(images)
            }

        except Exception as e:
            logger.error(f"Image extraction failed: path={pdf_path}, error={str(e)}")
            return {
                "success": False,
                "error": f"Image extraction failed: {str(e)}"
            }

    async def get_document_structure(self, pdf_path: str) -> Dict[str, Any]:
        """
        Analyze document structure and extract metadata.

        Combines PyPDF2 metadata with (when PyMuPDF is available) the table
        of contents and per-page geometry/content statistics.
        """
        try:
            # Get basic metadata first
            basic_metadata = await self.extract_metadata_basic(pdf_path)

            structure = {
                "metadata": basic_metadata,
                "structure": {},
                "features": self.features
            }

            if PYMUPDF_AVAILABLE:
                doc = fitz.open(pdf_path)

                # Get document outline/bookmarks
                outline = doc.get_toc()
                structure["structure"]["outline"] = outline

                # Analyze pages
                pages_info = []
                for page_num in range(len(doc)):
                    page = doc[page_num]
                    page_info = {
                        "page_number": page_num + 1,
                        "width": page.rect.width,
                        "height": page.rect.height,
                        "rotation": page.rotation,
                        "has_text": bool(page.get_text().strip()),
                        "image_count": len(page.get_images()),
                        # Page.annots() yields annotations lazily, so count by
                        # iterating (the original's len(page.annots()) raises
                        # TypeError on a generator).
                        "annotation_count": sum(1 for _ in page.annots())
                    }
                    pages_info.append(page_info)

                structure["structure"]["pages"] = pages_info
                doc.close()

            return {
                "success": True,
                "data": structure
            }

        except Exception as e:
            logger.error(f"Document structure analysis failed: path={pdf_path}, error={str(e)}")
            return {
                "success": False,
                "error": f"Structure analysis failed: {str(e)}"
            }

    # ====================
    # UNIFIED INTERFACE
    # ====================

    async def process_pdf(self, pdf_path: str, options: Optional[Dict[str, bool]] = None) -> Dict[str, Any]:
        """
        Process PDF with all available methods based on options.

        Recognized option keys (all bool): ``extract_text``,
        ``extract_tables``, ``extract_images``, ``extract_metadata``,
        ``use_ocr``, ``analyze_structure``. Per-feature failures are
        collected in ``result["errors"]`` rather than aborting.
        """
        if options is None:
            options = {
                "extract_text": True,
                "extract_tables": False,
                "extract_images": False,
                "extract_metadata": True,
                "use_ocr": False,
                "analyze_structure": False
            }

        result = {
            "success": True,
            "file": pdf_path,
            "features_used": [],
            "errors": []
        }

        try:
            # Extract text
            if options.get("extract_text", True):
                if self.features["advanced_extraction"]:
                    text_result = await self.extract_text_advanced(pdf_path, options.get("use_ocr", False))
                    result["text_extraction"] = text_result
                    result["features_used"].append("advanced_text_extraction")
                    # Record failures consistently with the other features below.
                    if not text_result["success"] and text_result.get("error"):
                        result["errors"].append(text_result["error"])
                else:
                    text = await self.extract_text_basic(pdf_path)
                    result["text_extraction"] = {
                        "success": not text.startswith("Error"),
                        "text": text,
                        "method": "basic"
                    }
                    result["features_used"].append("basic_text_extraction")

            # Extract metadata
            if options.get("extract_metadata", True):
                metadata = await self.extract_metadata_basic(pdf_path)
                result["metadata"] = metadata
                result["features_used"].append("metadata_extraction")

            # Extract tables
            if options.get("extract_tables", False):
                tables_result = await self.extract_tables(pdf_path)
                result["tables"] = tables_result
                result["features_used"].append("table_extraction")
                if not tables_result["success"]:
                    result["errors"].append(tables_result["error"])

            # Extract images
            if options.get("extract_images", False):
                images_result = await self.extract_images(pdf_path)
                result["images"] = images_result
                result["features_used"].append("image_extraction")
                if not images_result["success"]:
                    result["errors"].append(images_result["error"])

            # Analyze structure
            if options.get("analyze_structure", False):
                structure_result = await self.get_document_structure(pdf_path)
                result["structure"] = structure_result
                result["features_used"].append("structure_analysis")
                if not structure_result["success"]:
                    result["errors"].append(structure_result["error"])

            return result

        except Exception as e:
            logger.error(f"PDF processing failed: path={pdf_path}, error={str(e)}")
            result["success"] = False
            result["error"] = f"Processing failed: {str(e)}"
            return result

    # ====================
    # BACKWARDS COMPATIBILITY
    # ====================

    async def extract_text(self, pdf_path: str) -> str:
        """Backwards compatible text extraction method."""
        if self.features["advanced_extraction"]:
            result = await self.extract_text_advanced(pdf_path)
            return result.get("text", "")
        else:
            return await self.extract_text_basic(pdf_path)

    async def extract_metadata(self, pdf_path: str) -> Dict[str, Any]:
        """Backwards compatible metadata extraction method."""
        return await self.extract_metadata_basic(pdf_path)
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
# Global instance shared across the application (module import creates it).
unified_pdf_processor = UnifiedPDFProcessor()

# Backwards compatibility: names used by the pre-consolidation modules
# (pdf_processor.py and advanced_pdf_processor.py) point at the same instance.
pdf_processor = unified_pdf_processor
advanced_pdf_processor = unified_pdf_processor

# Export commonly used bound methods at module level for convenience.
extract_text = unified_pdf_processor.extract_text
extract_metadata = unified_pdf_processor.extract_metadata
process_pdf = unified_pdf_processor.process_pdf

# Additional aliases for compatibility
pdf_service = unified_pdf_processor

# Public API of this module.
__all__ = [
    'UnifiedPDFProcessor',
    'unified_pdf_processor',
    'pdf_processor',
    'pdf_service',
    'advanced_pdf_processor',
    'extract_text',
    'extract_metadata',
    'process_pdf'
]
|