arionxiv 1.0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. arionxiv/__init__.py +40 -0
  2. arionxiv/__main__.py +10 -0
  3. arionxiv/arxiv_operations/__init__.py +0 -0
  4. arionxiv/arxiv_operations/client.py +225 -0
  5. arionxiv/arxiv_operations/fetcher.py +173 -0
  6. arionxiv/arxiv_operations/searcher.py +122 -0
  7. arionxiv/arxiv_operations/utils.py +293 -0
  8. arionxiv/cli/__init__.py +4 -0
  9. arionxiv/cli/commands/__init__.py +1 -0
  10. arionxiv/cli/commands/analyze.py +587 -0
  11. arionxiv/cli/commands/auth.py +365 -0
  12. arionxiv/cli/commands/chat.py +714 -0
  13. arionxiv/cli/commands/daily.py +482 -0
  14. arionxiv/cli/commands/fetch.py +217 -0
  15. arionxiv/cli/commands/library.py +295 -0
  16. arionxiv/cli/commands/preferences.py +426 -0
  17. arionxiv/cli/commands/search.py +254 -0
  18. arionxiv/cli/commands/settings_unified.py +1407 -0
  19. arionxiv/cli/commands/trending.py +41 -0
  20. arionxiv/cli/commands/welcome.py +168 -0
  21. arionxiv/cli/main.py +407 -0
  22. arionxiv/cli/ui/__init__.py +1 -0
  23. arionxiv/cli/ui/global_theme_manager.py +173 -0
  24. arionxiv/cli/ui/logo.py +127 -0
  25. arionxiv/cli/ui/splash.py +89 -0
  26. arionxiv/cli/ui/theme.py +32 -0
  27. arionxiv/cli/ui/theme_system.py +391 -0
  28. arionxiv/cli/utils/__init__.py +54 -0
  29. arionxiv/cli/utils/animations.py +522 -0
  30. arionxiv/cli/utils/api_client.py +583 -0
  31. arionxiv/cli/utils/api_config.py +505 -0
  32. arionxiv/cli/utils/command_suggestions.py +147 -0
  33. arionxiv/cli/utils/db_config_manager.py +254 -0
  34. arionxiv/github_actions_runner.py +206 -0
  35. arionxiv/main.py +23 -0
  36. arionxiv/prompts/__init__.py +9 -0
  37. arionxiv/prompts/prompts.py +247 -0
  38. arionxiv/rag_techniques/__init__.py +8 -0
  39. arionxiv/rag_techniques/basic_rag.py +1531 -0
  40. arionxiv/scheduler_daemon.py +139 -0
  41. arionxiv/server.py +1000 -0
  42. arionxiv/server_main.py +24 -0
  43. arionxiv/services/__init__.py +73 -0
  44. arionxiv/services/llm_client.py +30 -0
  45. arionxiv/services/llm_inference/__init__.py +58 -0
  46. arionxiv/services/llm_inference/groq_client.py +469 -0
  47. arionxiv/services/llm_inference/llm_utils.py +250 -0
  48. arionxiv/services/llm_inference/openrouter_client.py +564 -0
  49. arionxiv/services/unified_analysis_service.py +872 -0
  50. arionxiv/services/unified_auth_service.py +457 -0
  51. arionxiv/services/unified_config_service.py +456 -0
  52. arionxiv/services/unified_daily_dose_service.py +823 -0
  53. arionxiv/services/unified_database_service.py +1633 -0
  54. arionxiv/services/unified_llm_service.py +366 -0
  55. arionxiv/services/unified_paper_service.py +604 -0
  56. arionxiv/services/unified_pdf_service.py +522 -0
  57. arionxiv/services/unified_prompt_service.py +344 -0
  58. arionxiv/services/unified_scheduler_service.py +589 -0
  59. arionxiv/services/unified_user_service.py +954 -0
  60. arionxiv/utils/__init__.py +51 -0
  61. arionxiv/utils/api_helpers.py +200 -0
  62. arionxiv/utils/file_cleanup.py +150 -0
  63. arionxiv/utils/ip_helper.py +96 -0
  64. arionxiv-1.0.32.dist-info/METADATA +336 -0
  65. arionxiv-1.0.32.dist-info/RECORD +69 -0
  66. arionxiv-1.0.32.dist-info/WHEEL +5 -0
  67. arionxiv-1.0.32.dist-info/entry_points.txt +4 -0
  68. arionxiv-1.0.32.dist-info/licenses/LICENSE +21 -0
  69. arionxiv-1.0.32.dist-info/top_level.txt +1 -0
@@ -0,0 +1,522 @@
1
+ """
2
+ Unified PDF Processing Service for ArionXiv
3
+ Consolidates pdf_processor.py and advanced_pdf_processor.py
4
+ Supports basic text extraction, OCR, table extraction, image analysis, and metadata extraction
5
+ """
6
+
7
+ import os
8
+ import asyncio
9
+ import logging
10
+ from typing import Dict, List, Any, Optional, Tuple
11
+ from pathlib import Path
12
+ import tempfile
13
+ import base64
14
+ from io import BytesIO
15
+
16
+ import PyPDF2
17
+
18
+ try:
19
+ import fitz
20
+ PYMUPDF_AVAILABLE = True
21
+ except ImportError:
22
+ PYMUPDF_AVAILABLE = False
23
+
24
+ try:
25
+ import pdfplumber
26
+ PDFPLUMBER_AVAILABLE = True
27
+ except ImportError:
28
+ PDFPLUMBER_AVAILABLE = False
29
+
30
+ from PIL import Image
31
+
32
+ try:
33
+ import pytesseract
34
+ OCR_AVAILABLE = True
35
+ except ImportError:
36
+ OCR_AVAILABLE = False
37
+
38
+ try:
39
+ import tabula
40
+ TABULA_AVAILABLE = True
41
+ except ImportError:
42
+ TABULA_AVAILABLE = False
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+
47
class UnifiedPDFProcessor:
    """
    Comprehensive PDF processor supporting both basic and advanced operations:
    - Basic text extraction (PyPDF2)
    - Advanced text and layout analysis (PyMuPDF, pdfplumber)
    - OCR for scanned documents (Tesseract)
    - Table extraction (tabula-py)
    - Image extraction and analysis
    - Metadata extraction

    Optional backends are detected through the module-level ``*_AVAILABLE``
    flags. Methods report failures through their return value (an error
    string or a ``{"success": False, "error": ...}`` dict) rather than by
    raising.
    """

    # Prefix of the sentinel string extract_text_basic() returns on failure.
    # Matching the full prefix (not just "Error") keeps documents whose real
    # text happens to start with the word "Error" from being misclassified.
    _BASIC_ERROR_PREFIX = "Error extracting text: "

    def __init__(self):
        """Create the scratch directory and record which features are usable."""
        self.temp_dir = Path(tempfile.gettempdir()) / "arionxiv_pdf_processing"
        self.temp_dir.mkdir(parents=True, exist_ok=True)

        # Check available features
        self.features = {
            "basic_extraction": True,  # Always available with PyPDF2
            "advanced_extraction": PYMUPDF_AVAILABLE and PDFPLUMBER_AVAILABLE,
            "ocr": OCR_AVAILABLE,
            "table_extraction": TABULA_AVAILABLE,
            "image_extraction": PYMUPDF_AVAILABLE
        }

        logger.info(f"UnifiedPDFProcessor initialized: features={self.features}")

    # ====================
    # BASIC PDF PROCESSING (from pdf_processor.py)
    # ====================

    async def extract_text_basic(self, pdf_path: str) -> str:
        """Extract text from PDF using PyPDF2 (basic method).

        Returns:
            The concatenated text of all pages, or a string starting with
            ``"Error extracting text: "`` when extraction fails.
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # A page without extractable text may yield None; treat it as
                # empty instead of letting the concatenation raise TypeError.
                return "".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:
            logger.error(f"Basic text extraction failed: path={pdf_path}, error={str(e)}")
            return f"Error extracting text: {str(e)}"

    async def extract_metadata_basic(self, pdf_path: str) -> Dict[str, Any]:
        """Extract metadata from PDF using PyPDF2.

        Returns:
            A dict with title/author/subject/creator/producer, creation and
            modification dates (as strings) and the page count, with
            ``"Unknown"`` for missing fields; or ``{"error": ...}`` on failure.
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # PDFs without an info dictionary expose metadata as None;
                # fall back to an empty mapping so the page count is still
                # reported instead of failing the whole call.
                metadata = reader.metadata or {}
                return {
                    "title": metadata.get("/Title", "Unknown"),
                    "author": metadata.get("/Author", "Unknown"),
                    "subject": metadata.get("/Subject", "Unknown"),
                    "creator": metadata.get("/Creator", "Unknown"),
                    "producer": metadata.get("/Producer", "Unknown"),
                    "creation_date": str(metadata.get("/CreationDate", "Unknown")),
                    "modification_date": str(metadata.get("/ModDate", "Unknown")),
                    "pages": len(reader.pages)
                }
        except Exception as e:
            logger.error(f"Basic metadata extraction failed: path={pdf_path}, error={str(e)}")
            return {"error": f"Metadata extraction failed: {str(e)}"}

    # ====================
    # ADVANCED PDF PROCESSING (from advanced_pdf_processor.py)
    # ====================

    async def extract_text_advanced(self, pdf_path: str, ocr_fallback: bool = True) -> Dict[str, Any]:
        """
        Advanced text extraction with multiple fallback methods.

        Backends are tried in order (PyMuPDF, pdfplumber, PyPDF2, then OCR
        when ``ocr_fallback`` is true and OCR is available); the first one
        that yields non-blank text wins.

        Returns:
            A dict with keys ``success``, ``text``, ``method``, ``pages`` and
            ``error``. ``error`` is set only when every method failed.
        """
        result = {
            "success": False,
            "text": "",
            "method": "",
            "pages": 0,
            "error": None
        }

        try:
            # Method 1: Try PyMuPDF first (fastest and most accurate for text PDFs)
            if PYMUPDF_AVAILABLE:
                try:
                    doc = fitz.open(pdf_path)
                    try:
                        page_count = len(doc)  # Must be read before closing
                        text = "".join(page.get_text() for page in doc)
                    finally:
                        # Release the document even if a page fails to parse.
                        doc.close()

                    if text.strip():  # Check if we got meaningful text
                        result.update({
                            "success": True,
                            "text": text,
                            "method": "pymupdf",
                            "pages": page_count
                        })
                        return result
                except Exception as e:
                    logger.warning(f"PyMuPDF extraction failed: {str(e)}")

            # Method 2: Try pdfplumber (better for complex layouts)
            if PDFPLUMBER_AVAILABLE:
                try:
                    with pdfplumber.open(pdf_path) as pdf:
                        text = ""
                        for page in pdf.pages:
                            page_text = page.extract_text()
                            if page_text:
                                text += page_text + "\n"

                        if text.strip():
                            result.update({
                                "success": True,
                                "text": text,
                                "method": "pdfplumber",
                                "pages": len(pdf.pages)
                            })
                            return result
                except Exception as e:
                    logger.warning(f"pdfplumber extraction failed: {str(e)}")

            # Method 3: Fallback to basic PyPDF2
            text = await self.extract_text_basic(pdf_path)
            # Blank output is treated as "no text" (as for the backends above)
            # so that scanned PDFs can still reach the OCR fallback.
            if text.strip() and not text.startswith(self._BASIC_ERROR_PREFIX):
                with open(pdf_path, 'rb') as file:
                    page_count = len(PyPDF2.PdfReader(file).pages)
                result.update({
                    "success": True,
                    "text": text,
                    "method": "pypdf2",
                    "pages": page_count
                })
                return result

            # Method 4: OCR as last resort (for scanned PDFs)
            if ocr_fallback and OCR_AVAILABLE:
                ocr_result = await self.extract_text_with_ocr(pdf_path)
                if ocr_result["success"]:
                    result.update({
                        "success": True,
                        "text": ocr_result["text"],
                        "method": "ocr",
                        "pages": ocr_result.get("pages", 0)
                    })
                    return result

            result["error"] = "All text extraction methods failed"
            return result

        except Exception as e:
            logger.error(f"Advanced text extraction failed: path={pdf_path}, error={str(e)}")
            result["error"] = f"Extraction failed: {str(e)}"
            return result

    async def extract_text_with_ocr(self, pdf_path: str) -> Dict[str, Any]:
        """
        Extract text using OCR for scanned PDFs.

        Each page is rendered to a PNG with PyMuPDF at 2x scale and passed to
        Tesseract (English). Page texts are joined with ``--- Page N ---``
        separators.

        Returns:
            ``{"success": True, "text", "method": "ocr", "pages"}`` on
            success, otherwise ``{"success": False, "error"}``.
        """
        if not OCR_AVAILABLE:
            return {
                "success": False,
                "error": "OCR not available (pytesseract not installed)"
            }

        try:
            if not PYMUPDF_AVAILABLE:
                return {
                    "success": False,
                    "error": "PyMuPDF required for OCR processing"
                }

            doc = fitz.open(pdf_path)
            try:
                # Capture the page count while the document is still open;
                # querying it after close() raises and would turn every
                # successful OCR run into a reported failure.
                page_count = len(doc)
                mat = fitz.Matrix(2.0, 2.0)  # Increase resolution for better OCR
                page_chunks = []

                for page_num, page in enumerate(doc, start=1):
                    # Convert page to image
                    pix = page.get_pixmap(matrix=mat)
                    img_data = pix.tobytes("png")

                    # Convert to PIL Image
                    image = Image.open(BytesIO(img_data))

                    # Perform OCR
                    page_text = pytesseract.image_to_string(image, lang='eng')
                    page_chunks.append(f"\n--- Page {page_num} ---\n{page_text}\n")
            finally:
                doc.close()

            return {
                "success": True,
                "text": "".join(page_chunks),
                "method": "ocr",
                "pages": page_count
            }

        except Exception as e:
            logger.error(f"OCR text extraction failed: path={pdf_path}, error={str(e)}")
            return {
                "success": False,
                "error": f"OCR failed: {str(e)}"
            }

    async def extract_tables(self, pdf_path: str) -> Dict[str, Any]:
        """
        Extract tables from PDF using tabula-py.

        Returns:
            ``{"success": True, "tables": [...], "count": n}`` where each
            table entry holds ``table_id`` (1-based), ``columns``, ``rows``
            and ``shape``; or ``{"success": False, "error"}`` on failure.
        """
        if not TABULA_AVAILABLE:
            return {
                "success": False,
                "error": "Table extraction not available (tabula-py not installed)"
            }

        try:
            # Extract all tables from all pages
            tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)

            table_data = [
                {
                    "table_id": table_id,
                    "columns": table.columns.tolist(),
                    "rows": table.values.tolist(),
                    "shape": table.shape
                }
                for table_id, table in enumerate(tables, start=1)
            ]

            return {
                "success": True,
                "tables": table_data,
                "count": len(tables)
            }

        except Exception as e:
            logger.error(f"Table extraction failed: path={pdf_path}, error={str(e)}")
            return {
                "success": False,
                "error": f"Table extraction failed: {str(e)}"
            }

    async def extract_images(self, pdf_path: str, save_images: bool = False) -> Dict[str, Any]:
        """
        Extract images from PDF.

        Args:
            pdf_path: Path of the PDF to scan.
            save_images: When true, each image is written as a PNG into
                ``self.temp_dir`` and its path is returned under
                ``saved_path``; otherwise the PNG bytes are returned
                base64-encoded under ``base64``.

        Returns:
            ``{"success": True, "images": [...], "count": n}`` or
            ``{"success": False, "error"}``.
        """
        if not PYMUPDF_AVAILABLE:
            return {
                "success": False,
                "error": "Image extraction not available (PyMuPDF not installed)"
            }

        try:
            doc = fitz.open(pdf_path)
            images = []

            try:
                for page_num in range(len(doc)):
                    page = doc[page_num]
                    image_list = page.get_images()

                    for img_index, img in enumerate(image_list):
                        xref = img[0]
                        pix = fitz.Pixmap(doc, xref)

                        # Pixmaps with 4+ colour components are skipped.
                        if pix.n - pix.alpha < 4:  # GRAY or RGB
                            img_data = {
                                "page": page_num + 1,
                                "image_index": img_index,
                                "width": pix.width,
                                "height": pix.height,
                                "colorspace": pix.colorspace.name if pix.colorspace else "Unknown"
                            }

                            if save_images:
                                # Save image to temp directory.
                                # NOTE: the file name depends only on page and
                                # image index, so extracting from another PDF
                                # overwrites earlier files of the same name.
                                img_filename = f"page_{page_num + 1}_img_{img_index}.png"
                                img_path = self.temp_dir / img_filename
                                pix.save(str(img_path))
                                img_data["saved_path"] = str(img_path)
                            else:
                                # Convert to base64 for embedding
                                img_bytes = pix.tobytes("png")
                                img_base64 = base64.b64encode(img_bytes).decode()
                                img_data["base64"] = img_base64

                            images.append(img_data)

                        pix = None  # Free memory
            finally:
                # Close the document even if an image fails to decode.
                doc.close()

            return {
                "success": True,
                "images": images,
                "count": len(images)
            }

        except Exception as e:
            logger.error(f"Image extraction failed: path={pdf_path}, error={str(e)}")
            return {
                "success": False,
                "error": f"Image extraction failed: {str(e)}"
            }

    async def get_document_structure(self, pdf_path: str) -> Dict[str, Any]:
        """
        Analyze document structure and extract metadata.

        Returns:
            ``{"success": True, "data": {"metadata", "structure", "features"}}``
            where ``structure`` holds the outline and per-page details when
            PyMuPDF is available (otherwise it is empty); or
            ``{"success": False, "error"}`` on failure.
        """
        try:
            # Get basic metadata first
            basic_metadata = await self.extract_metadata_basic(pdf_path)

            structure = {
                "metadata": basic_metadata,
                "structure": {},
                "features": self.features
            }

            if PYMUPDF_AVAILABLE:
                doc = fitz.open(pdf_path)
                try:
                    # Get document outline/bookmarks
                    structure["structure"]["outline"] = doc.get_toc()

                    # Analyze pages
                    pages_info = []
                    for page_num in range(len(doc)):
                        page = doc[page_num]
                        pages_info.append({
                            "page_number": page_num + 1,
                            "width": page.rect.width,
                            "height": page.rect.height,
                            "rotation": page.rotation,
                            "has_text": bool(page.get_text().strip()),
                            "image_count": len(page.get_images()),
                            # annots() yields annotations lazily, so len()
                            # cannot be applied to it; count by iterating.
                            "annotation_count": sum(1 for _ in page.annots())
                        })

                    structure["structure"]["pages"] = pages_info
                finally:
                    doc.close()

            return {
                "success": True,
                "data": structure
            }

        except Exception as e:
            logger.error(f"Document structure analysis failed: path={pdf_path}, error={str(e)}")
            return {
                "success": False,
                "error": f"Structure analysis failed: {str(e)}"
            }

    # ====================
    # UNIFIED INTERFACE
    # ====================

    async def process_pdf(self, pdf_path: str, options: Optional[Dict[str, bool]] = None) -> Dict[str, Any]:
        """
        Process PDF with all available methods based on options.

        Args:
            pdf_path: Path of the PDF to process.
            options: Flags selecting the steps to run. Recognised keys are
                ``extract_text`` and ``extract_metadata`` (default on) and
                ``extract_tables``, ``extract_images``, ``use_ocr`` and
                ``analyze_structure`` (default off). Missing keys take their
                default.

        Returns:
            A dict with ``success``, ``file``, ``features_used`` and
            ``errors``, plus one entry per executed step
            (``text_extraction``, ``metadata``, ``tables``, ``images``,
            ``structure``). A failing step adds its message to ``errors``;
            ``success`` becomes False only on an unexpected exception, in
            which case ``error`` is set too.
        """
        if options is None:
            options = {
                "extract_text": True,
                "extract_tables": False,
                "extract_images": False,
                "extract_metadata": True,
                "use_ocr": False,
                "analyze_structure": False
            }

        result = {
            "success": True,
            "file": pdf_path,
            "features_used": [],
            "errors": []
        }

        try:
            # Extract text
            if options.get("extract_text", True):
                if self.features["advanced_extraction"]:
                    text_result = await self.extract_text_advanced(pdf_path, options.get("use_ocr", False))
                    result["text_extraction"] = text_result
                    result["features_used"].append("advanced_text_extraction")
                    # Report text failures like every other step does.
                    if not text_result["success"]:
                        result["errors"].append(text_result.get("error") or "Text extraction failed")
                else:
                    text = await self.extract_text_basic(pdf_path)
                    text_ok = not text.startswith(self._BASIC_ERROR_PREFIX)
                    result["text_extraction"] = {
                        "success": text_ok,
                        "text": text,
                        "method": "basic"
                    }
                    result["features_used"].append("basic_text_extraction")
                    if not text_ok:
                        # On failure the returned string is the error message.
                        result["errors"].append(text)

            # Extract metadata
            if options.get("extract_metadata", True):
                metadata = await self.extract_metadata_basic(pdf_path)
                result["metadata"] = metadata
                result["features_used"].append("metadata_extraction")
                if "error" in metadata:
                    result["errors"].append(metadata["error"])

            # Extract tables
            if options.get("extract_tables", False):
                tables_result = await self.extract_tables(pdf_path)
                result["tables"] = tables_result
                result["features_used"].append("table_extraction")
                if not tables_result["success"]:
                    result["errors"].append(tables_result["error"])

            # Extract images
            if options.get("extract_images", False):
                images_result = await self.extract_images(pdf_path)
                result["images"] = images_result
                result["features_used"].append("image_extraction")
                if not images_result["success"]:
                    result["errors"].append(images_result["error"])

            # Analyze structure
            if options.get("analyze_structure", False):
                structure_result = await self.get_document_structure(pdf_path)
                result["structure"] = structure_result
                result["features_used"].append("structure_analysis")
                if not structure_result["success"]:
                    result["errors"].append(structure_result["error"])

            return result

        except Exception as e:
            logger.error(f"PDF processing failed: path={pdf_path}, error={str(e)}")
            result["success"] = False
            result["error"] = f"Processing failed: {str(e)}"
            return result

    # ====================
    # BACKWARDS COMPATIBILITY
    # ====================

    async def extract_text(self, pdf_path: str) -> str:
        """Backwards compatible text extraction method.

        Uses the advanced pipeline when available (returning its ``text``
        field, which is empty on failure), otherwise the PyPDF2 method.
        """
        if self.features["advanced_extraction"]:
            result = await self.extract_text_advanced(pdf_path)
            return result.get("text", "")
        else:
            return await self.extract_text_basic(pdf_path)

    async def extract_metadata(self, pdf_path: str) -> Dict[str, Any]:
        """Backwards compatible metadata extraction method."""
        return await self.extract_metadata_basic(pdf_path)
496
+
497
+
498
# Shared module-level processor; every name below refers to this one object.
unified_pdf_processor = UnifiedPDFProcessor()

# Backwards-compatible aliases for the shared processor.
pdf_processor = advanced_pdf_processor = pdf_service = unified_pdf_processor

# Bound methods of the shared processor, exposed as module-level callables.
process_pdf = unified_pdf_processor.process_pdf
extract_metadata = unified_pdf_processor.extract_metadata
extract_text = unified_pdf_processor.extract_text

__all__ = [
    'UnifiedPDFProcessor',
    'unified_pdf_processor',
    'pdf_processor',
    'pdf_service',
    'advanced_pdf_processor',
    'extract_text',
    'extract_metadata',
    'process_pdf'
]