arionxiv-1.0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. arionxiv/__init__.py +40 -0
  2. arionxiv/__main__.py +10 -0
  3. arionxiv/arxiv_operations/__init__.py +0 -0
  4. arionxiv/arxiv_operations/client.py +225 -0
  5. arionxiv/arxiv_operations/fetcher.py +173 -0
  6. arionxiv/arxiv_operations/searcher.py +122 -0
  7. arionxiv/arxiv_operations/utils.py +293 -0
  8. arionxiv/cli/__init__.py +4 -0
  9. arionxiv/cli/commands/__init__.py +1 -0
  10. arionxiv/cli/commands/analyze.py +587 -0
  11. arionxiv/cli/commands/auth.py +365 -0
  12. arionxiv/cli/commands/chat.py +714 -0
  13. arionxiv/cli/commands/daily.py +482 -0
  14. arionxiv/cli/commands/fetch.py +217 -0
  15. arionxiv/cli/commands/library.py +295 -0
  16. arionxiv/cli/commands/preferences.py +426 -0
  17. arionxiv/cli/commands/search.py +254 -0
  18. arionxiv/cli/commands/settings_unified.py +1407 -0
  19. arionxiv/cli/commands/trending.py +41 -0
  20. arionxiv/cli/commands/welcome.py +168 -0
  21. arionxiv/cli/main.py +407 -0
  22. arionxiv/cli/ui/__init__.py +1 -0
  23. arionxiv/cli/ui/global_theme_manager.py +173 -0
  24. arionxiv/cli/ui/logo.py +127 -0
  25. arionxiv/cli/ui/splash.py +89 -0
  26. arionxiv/cli/ui/theme.py +32 -0
  27. arionxiv/cli/ui/theme_system.py +391 -0
  28. arionxiv/cli/utils/__init__.py +54 -0
  29. arionxiv/cli/utils/animations.py +522 -0
  30. arionxiv/cli/utils/api_client.py +583 -0
  31. arionxiv/cli/utils/api_config.py +505 -0
  32. arionxiv/cli/utils/command_suggestions.py +147 -0
  33. arionxiv/cli/utils/db_config_manager.py +254 -0
  34. arionxiv/github_actions_runner.py +206 -0
  35. arionxiv/main.py +23 -0
  36. arionxiv/prompts/__init__.py +9 -0
  37. arionxiv/prompts/prompts.py +247 -0
  38. arionxiv/rag_techniques/__init__.py +8 -0
  39. arionxiv/rag_techniques/basic_rag.py +1531 -0
  40. arionxiv/scheduler_daemon.py +139 -0
  41. arionxiv/server.py +1000 -0
  42. arionxiv/server_main.py +24 -0
  43. arionxiv/services/__init__.py +73 -0
  44. arionxiv/services/llm_client.py +30 -0
  45. arionxiv/services/llm_inference/__init__.py +58 -0
  46. arionxiv/services/llm_inference/groq_client.py +469 -0
  47. arionxiv/services/llm_inference/llm_utils.py +250 -0
  48. arionxiv/services/llm_inference/openrouter_client.py +564 -0
  49. arionxiv/services/unified_analysis_service.py +872 -0
  50. arionxiv/services/unified_auth_service.py +457 -0
  51. arionxiv/services/unified_config_service.py +456 -0
  52. arionxiv/services/unified_daily_dose_service.py +823 -0
  53. arionxiv/services/unified_database_service.py +1633 -0
  54. arionxiv/services/unified_llm_service.py +366 -0
  55. arionxiv/services/unified_paper_service.py +604 -0
  56. arionxiv/services/unified_pdf_service.py +522 -0
  57. arionxiv/services/unified_prompt_service.py +344 -0
  58. arionxiv/services/unified_scheduler_service.py +589 -0
  59. arionxiv/services/unified_user_service.py +954 -0
  60. arionxiv/utils/__init__.py +51 -0
  61. arionxiv/utils/api_helpers.py +200 -0
  62. arionxiv/utils/file_cleanup.py +150 -0
  63. arionxiv/utils/ip_helper.py +96 -0
  64. arionxiv-1.0.32.dist-info/METADATA +336 -0
  65. arionxiv-1.0.32.dist-info/RECORD +69 -0
  66. arionxiv-1.0.32.dist-info/WHEEL +5 -0
  67. arionxiv-1.0.32.dist-info/entry_points.txt +4 -0
  68. arionxiv-1.0.32.dist-info/licenses/LICENSE +21 -0
  69. arionxiv-1.0.32.dist-info/top_level.txt +1 -0
arionxiv/services/unified_daily_dose_service.py
@@ -0,0 +1,823 @@
+ """
+ Unified Daily Dose Service for ArionXiv
+ Handles the complete daily dose workflow:
+ - Fetching papers based on user keywords via arXiv search
+ - Extracting text and generating embeddings
+ - Creating thorough analysis for each paper
+ - Storing results in MongoDB against user_id
+ - Replacing previous day's analysis when cron runs again
+ """
+
+ import asyncio
+ import logging
+ import os
+ from datetime import datetime, timedelta
+ from typing import Dict, Any, List, Optional
+ import time
+
+ from .unified_database_service import unified_database_service
+ from .unified_config_service import unified_config_service
+ from .unified_pdf_service import unified_pdf_processor
+ from ..arxiv_operations.client import arxiv_client
+ from ..prompts import format_prompt
+
+ # Use OpenRouter as primary LLM provider (FREE tier available)
+ # Falls back to Groq if OpenRouter unavailable
+ def _get_llm_client():
+     """Get the appropriate LLM client based on environment."""
+     provider = os.getenv("RAG_LLM_PROVIDER", "openrouter").lower()
+
+     if provider == "openrouter" or os.getenv("OPENROUTER_API_KEY"):
+         from .llm_inference.openrouter_client import openrouter_client
+         return openrouter_client
+     else:
+         from .llm_client import llm_client
+         return llm_client
+
+ logger = logging.getLogger(__name__)
+
+
+ # Rate limiting constants
+ ARXIV_REQUEST_DELAY = 3.0 # seconds between arXiv requests
+ LLM_REQUEST_DELAY = 1.0 # seconds between LLM requests
+ MAX_PAPERS_V1 = 10 # Maximum papers for version 1
+
+
+ class UnifiedDailyDoseService:
+     """
+     Service for managing daily dose paper recommendations and analysis.
+
+     Features:
+     - Fetches papers based on user-saved keywords from DB
+     - Respects rate limits for arXiv and LLM APIs
+     - Generates embeddings and thorough analysis for each paper
+     - Stores everything in MongoDB against user_id
+     - Replaces previous day's analysis on each cron run
+     """
+
+     def __init__(self):
+         self.max_papers = MAX_PAPERS_V1
+         self.arxiv_delay = ARXIV_REQUEST_DELAY
+         self.llm_delay = LLM_REQUEST_DELAY
+         logger.info("UnifiedDailyDoseService initialized")
+
+     async def get_user_daily_dose_settings(self, user_id: str) -> Dict[str, Any]:
+         """
+         Get user's daily dose settings from the database or API.
+
+         Returns settings including:
+         - keywords: List of search keywords
+         - max_papers: Number of papers to fetch (max 10)
+         - scheduled_time: Time for cron job (HH:MM format)
+         - enabled: Whether daily dose is enabled
+         """
+         try:
+             # Try API first for hosted users (no local MongoDB)
+             try:
+                 from ..cli.utils.api_client import api_client
+                 if api_client.is_authenticated():
+                     result = await api_client.get_settings()
+                     if result.get("success"):
+                         settings = result.get("settings", {})
+                         daily_dose = settings.get("daily_dose", {})
+                         preferences = settings.get("preferences", {})
+                         return {
+                             "success": True,
+                             "settings": {
+                                 "keywords": daily_dose.get("keywords", preferences.get("keywords", [])),
+                                 "max_papers": min(daily_dose.get("max_papers", 5), self.max_papers),
+                                 "scheduled_time": daily_dose.get("scheduled_time", None),
+                                 "enabled": daily_dose.get("enabled", False),
+                                 "categories": preferences.get("categories", ["cs.AI", "cs.LG"])
+                             }
+                         }
+             except Exception as api_err:
+                 logger.debug(f"API settings fetch failed, trying local DB: {api_err}")
+
+             # Fall back to local MongoDB
+             if unified_database_service.db is None:
+                 try:
+                     await unified_database_service.connect_mongodb()
+                 except Exception as db_err:
+                     logger.debug(f"Local MongoDB not available: {db_err}")
+                     return {
+                         "success": False,
+                         "message": "No database connection available. Please ensure you're logged in.",
+                         "settings": self._get_default_settings()
+                     }
+
+             # Get user preferences from local DB
+             user = await unified_database_service.find_one("users", {"_id": user_id})
+             if not user:
+                 # Try alternate lookup
+                 from bson import ObjectId
+                 try:
+                     user = await unified_database_service.find_one("users", {"_id": ObjectId(user_id)})
+                 except Exception:
+                     pass
+
+             if not user:
+                 return {
+                     "success": False,
+                     "message": "User not found",
+                     "settings": self._get_default_settings()
+                 }
+
+             preferences = user.get("preferences", {})
+             daily_dose_settings = preferences.get("daily_dose", {})
+
+             return {
+                 "success": True,
+                 "settings": {
+                     "keywords": daily_dose_settings.get("keywords", preferences.get("keywords", [])),
+                     "max_papers": min(daily_dose_settings.get("max_papers", 5), self.max_papers),
+                     "scheduled_time": daily_dose_settings.get("scheduled_time", None),
+                     "enabled": daily_dose_settings.get("enabled", False),
+                     "categories": preferences.get("categories", ["cs.AI", "cs.LG"])
+                 }
+             }
+
+         except Exception as e:
+             logger.error(f"Failed to get daily dose settings for user {user_id}: {e}")
+             return {
+                 "success": False,
+                 "message": str(e),
+                 "settings": self._get_default_settings()
+             }
+
+     def _get_default_settings(self) -> Dict[str, Any]:
+         """Return default daily dose settings."""
+         return {
+             "keywords": [],
+             "max_papers": 5,
+             "scheduled_time": None,
+             "enabled": False,
+             "categories": ["cs.AI", "cs.LG"]
+         }
+
+     async def update_user_daily_dose_settings(
+         self,
+         user_id: str,
+         keywords: Optional[List[str]] = None,
+         max_papers: Optional[int] = None,
+         scheduled_time: Optional[str] = None,
+         enabled: Optional[bool] = None
+     ) -> Dict[str, Any]:
+         """
+         Update user's daily dose settings in the database.
+         """
+         try:
+             if unified_database_service.db is None:
+                 await unified_database_service.connect_mongodb()
+
+             # Build update dict
+             updates = {}
+             if keywords is not None:
+                 updates["preferences.daily_dose.keywords"] = keywords
+                 updates["preferences.keywords"] = keywords # Also update main keywords
+             if max_papers is not None:
+                 updates["preferences.daily_dose.max_papers"] = min(max_papers, self.max_papers)
+             if scheduled_time is not None:
+                 updates["preferences.daily_dose.scheduled_time"] = scheduled_time
+             if enabled is not None:
+                 updates["preferences.daily_dose.enabled"] = enabled
+
+             updates["preferences.daily_dose.updated_at"] = datetime.utcnow()
+
+             # Update user document
+             from bson import ObjectId
+             try:
+                 filter_query = {"_id": ObjectId(user_id)}
+             except Exception:
+                 filter_query = {"_id": user_id}
+
+             result = await unified_database_service.update_one(
+                 "users",
+                 filter_query,
+                 {"$set": updates}
+             )
+
+             if result:
+                 logger.info(f"Updated daily dose settings for user {user_id}")
+                 return {"success": True, "message": "Settings updated successfully"}
+             else:
+                 return {"success": False, "message": "Failed to update settings"}
+
+         except Exception as e:
+             logger.error(f"Failed to update daily dose settings: {e}")
+             return {"success": False, "message": str(e)}
+
+     async def execute_daily_dose(self, user_id: str, progress_callback=None) -> Dict[str, Any]:
+         """
+         Execute the daily dose workflow for a user.
+
+         Steps:
+         1. Get user's keywords from DB
+         2. Search arXiv for matching papers (with rate limiting)
+         3. Extract text from each paper
+         4. Generate embeddings for each paper
+         5. Create thorough analysis for each paper
+         6. Store everything in DB, replacing previous day's data
+
+         Args:
+             user_id: User ID to run daily dose for
+             progress_callback: Optional callback function(step: str, detail: str) for progress updates
+
+         Returns:
+             Dict with success status, papers processed, and analysis_id
+         """
+         def log_progress(step: str, detail: str = ""):
+             """Log progress both to logger and callback"""
+             logger.info(f"{step}: {detail}" if detail else step)
+             if progress_callback:
+                 progress_callback(step, detail)
+
+         start_time = datetime.utcnow()
+         log_progress("Starting daily dose", f"User: {user_id}")
+
+         try:
+             # Step 1: Get user settings
+             log_progress("Loading settings", "Fetching user preferences...")
+             settings_result = await self.get_user_daily_dose_settings(user_id)
+             if not settings_result["success"]:
+                 return {
+                     "success": False,
+                     "message": f"Failed to get user settings: {settings_result['message']}",
+                     "papers_count": 0
+                 }
+
+             settings = settings_result["settings"]
+             keywords = settings["keywords"]
+             categories = settings["categories"]
+             max_papers = settings["max_papers"]
+
+             if not keywords and not categories:
+                 return {
+                     "success": False,
+                     "message": "No keywords or categories configured. Please set up your preferences in settings.",
+                     "papers_count": 0
+                 }
+
+             log_progress("Settings loaded", f"Keywords: {len(keywords)}, Categories: {len(categories)}, Max papers: {max_papers}")
+
+             # Step 2: Search arXiv for papers
+             log_progress("Searching arXiv", "Finding papers matching your keywords...")
+             papers = await self._fetch_papers_from_arxiv(keywords, categories, max_papers)
+
+             if not papers:
+                 return {
+                     "success": True,
+                     "message": "No new papers found matching your keywords.",
+                     "papers_count": 0
+                 }
+
+             log_progress("Papers found", f"Found {len(papers)} papers matching your criteria")
+
+             # Step 3-5: Process each paper (text extraction, embeddings, analysis)
+             processed_papers = []
+             for i, paper in enumerate(papers):
+                 paper_title = paper.get('title', 'Unknown')[:50]
+                 log_progress(f"Analyzing paper {i+1}/{len(papers)}", paper_title)
+
+                 try:
+                     processed_paper = await self._process_paper(paper, user_id)
+                     processed_papers.append(processed_paper)
+                     log_progress(f"Paper {i+1} analyzed", f"Score: {processed_paper.get('analysis', {}).get('relevance_score', 'N/A')}/10")
+                 except Exception as e:
+                     logger.error(f"Failed to process paper {paper.get('arxiv_id')}: {e}")
+                     # Continue with other papers
+                     processed_papers.append({
+                         "paper": paper,
+                         "analysis": None,
+                         "error": str(e)
+                     })
+                     log_progress(f"Paper {i+1} failed", str(e)[:50])
+
+                 # Rate limiting between papers
+                 if i < len(papers) - 1:
+                     await asyncio.sleep(self.llm_delay)
+
+             # Step 6: Store in DB, replacing previous day's data
+             log_progress("Saving to database", "Storing analysis results...")
+             analysis_result = await self._store_daily_analysis(user_id, processed_papers, start_time)
+
+             execution_time = (datetime.utcnow() - start_time).total_seconds()
+             log_progress("Complete", f"Saved {len(processed_papers)} papers in {execution_time:.1f}s")
+
+             # Build dose object for return
+             successful_papers = [p for p in processed_papers if p.get("analysis") and not p.get("error")]
+
+             # Calculate summary statistics
+             avg_relevance = 0
+             if successful_papers:
+                 scores = [p["analysis"].get("relevance_score", 5) for p in successful_papers]
+                 avg_relevance = sum(scores) / len(scores)
+
+             dose = {
+                 "papers": [
+                     {
+                         "arxiv_id": p["paper"]["arxiv_id"],
+                         "title": p["paper"]["title"],
+                         "authors": p["paper"]["authors"],
+                         "abstract": p["paper"]["abstract"],
+                         "categories": p["paper"]["categories"],
+                         "published": p["paper"]["published"],
+                         "pdf_url": p["paper"]["pdf_url"],
+                         "analysis": p["analysis"],
+                         "relevance_score": p["analysis"].get("relevance_score", 5) if p["analysis"] else 0
+                     }
+                     for p in processed_papers if p.get("analysis")
+                 ],
+                 "summary": {
+                     "total_papers": len(processed_papers),
+                     "successful_analyses": len(successful_papers),
+                     "avg_relevance_score": round(avg_relevance, 2),
+                 },
+                 "generated_at": datetime.utcnow().isoformat()
+             }
+
+             return {
+                 "success": True,
+                 "message": "Daily dose generated successfully",
+                 "papers_count": len(processed_papers),
+                 "analysis_id": analysis_result.get("analysis_id"),
+                 "execution_time": execution_time,
+                 "dose": dose
+             }
+
+         except Exception as e:
+             logger.error(f"Daily dose execution failed for user {user_id}: {e}")
+             return {
+                 "success": False,
+                 "message": f"Daily dose execution failed: {str(e)}",
+                 "papers_count": 0
+             }
+
+     async def _fetch_papers_from_arxiv(
+         self,
+         keywords: List[str],
+         categories: List[str],
+         max_papers: int
+     ) -> List[Dict[str, Any]]:
+         """
+         Fetch papers from arXiv based on keywords and categories.
+         Uses Atlas-style search combining keywords with category filters.
+         """
+         try:
+             # Normalize keywords - split any space-separated strings into individual keywords
+             normalized_keywords = []
+             for kw in keywords:
+                 if isinstance(kw, str):
+                     # Split by common separators and filter empty strings
+                     # Handle both comma-separated and space-separated keywords
+                     if ',' in kw:
+                         # Comma-separated: "DPO, alignment, RL"
+                         parts = [p.strip() for p in kw.split(',') if p.strip()]
+                     else:
+                         # Space-separated: split on whitespace, so each word becomes its own keyword
+                         # (multi-word phrases like "test time scaling" survive only as comma-separated items)
+                         parts = [p.strip() for p in kw.split() if p.strip()]
+                     normalized_keywords.extend(parts)
+                 else:
+                     normalized_keywords.append(str(kw))
+
+             # Remove duplicates while preserving order
+             seen = set()
+             unique_keywords = []
+             for kw in normalized_keywords:
+                 kw_lower = kw.lower()
+                 if kw_lower not in seen:
+                     seen.add(kw_lower)
+                     unique_keywords.append(kw)
+
+             logger.info(f"Normalized keywords: {unique_keywords}")
+
+             # Build search query
+             query_parts = []
+
+             # Add category filter
+             if categories:
+                 cat_query = " OR ".join([f"cat:{cat}" for cat in categories])
+                 query_parts.append(f"({cat_query})")
+
+             # Add keyword filter - don't use quotes for single words
+             if unique_keywords:
+                 # For single words, just use the word; for phrases, use quotes
+                 kw_parts = []
+                 for kw in unique_keywords:
+                     if ' ' in kw:
+                         kw_parts.append(f'"{kw}"') # Phrase
+                     else:
+                         kw_parts.append(kw) # Single word
+                 kw_query = " OR ".join(kw_parts)
+                 query_parts.append(f"({kw_query})")
+
+             # Combine with AND
+             if query_parts:
+                 full_query = " AND ".join(query_parts)
+             else:
+                 # Fallback to general CS papers
+                 full_query = "cat:cs.AI OR cat:cs.LG"
+
+             logger.info(f"Searching arXiv with query: {full_query}")
+
+             # Rate limit before API call
+             await asyncio.sleep(self.arxiv_delay)
+
+             # Search arXiv - sort by SubmittedDate (descending) to get most recent papers
+             import arxiv
+             papers = arxiv_client.search_papers(
+                 query=full_query,
+                 max_results=min(max_papers * 2, 20), # Fetch extra to allow filtering
+                 sort_by=arxiv.SortCriterion.SubmittedDate # Most recent first!
+             )
+
+             # Filter to most recent papers and limit
+             papers = papers[:max_papers]
+
+             return papers
+
+         except Exception as e:
+             logger.error(f"Failed to fetch papers from arXiv: {e}")
+             return []
+
+     async def _process_paper(self, paper: Dict[str, Any], user_id: str) -> Dict[str, Any]:
+         """
+         Process a single paper: extract text, generate embeddings, create analysis.
+         """
+         arxiv_id = paper.get("arxiv_id", "unknown")
+
+         # Use abstract as primary text source (faster than PDF extraction)
+         paper_text = f"Title: {paper.get('title', '')}\n\n"
+         paper_text += f"Authors: {', '.join(paper.get('authors', []))}\n\n"
+         paper_text += f"Abstract: {paper.get('abstract', '')}\n\n"
+         paper_text += f"Categories: {', '.join(paper.get('categories', []))}"
+
+         # Generate thorough analysis using LLM
+         analysis = await self._generate_paper_analysis(paper)
+
+         # Generate embeddings for the paper
+         embeddings = await self._generate_embeddings(paper_text)
+
+         return {
+             "paper": {
+                 "arxiv_id": arxiv_id,
+                 "title": paper.get("title", ""),
+                 "authors": paper.get("authors", []),
+                 "abstract": paper.get("abstract", ""),
+                 "categories": paper.get("categories", []),
+                 "published": paper.get("published"),
+                 "pdf_url": paper.get("pdf_url"),
+                 "entry_id": paper.get("entry_id")
+             },
+             "text_content": paper_text,
+             "embeddings": embeddings,
+             "analysis": analysis,
+             "processed_at": datetime.utcnow().isoformat()
+         }
+
+     async def _generate_paper_analysis(self, paper: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Generate thorough analysis for a paper using LLM.
+         """
+         try:
+             title = paper.get("title", "")
+             abstract = paper.get("abstract", "")
+             categories = paper.get("categories", [])
+             authors = paper.get("authors", [])
+
+             # Use centralized prompt from prompts module
+             prompt = format_prompt(
+                 "daily_dose_analysis",
+                 title=title,
+                 authors=', '.join(authors[:5]) + ('...' if len(authors) > 5 else ''),
+                 categories=', '.join(categories),
+                 abstract=abstract
+             )
+
+             # Rate limit before LLM call
+             await asyncio.sleep(self.llm_delay)
+
+             # Get LLM response using the appropriate client
+             client = _get_llm_client()
+             response = await client.get_completion(prompt)
+
+             if not response or response.startswith("Error"):
+                 logger.warning(f"LLM analysis failed for paper: {paper.get('arxiv_id')}")
+                 return self._get_fallback_analysis(paper)
+
+             # Parse the response into structured format
+             analysis = self._parse_analysis_response(response)
+             analysis["raw_response"] = response
+
+             return analysis
+
+         except Exception as e:
+             logger.error(f"Failed to generate analysis for paper: {e}")
+             return self._get_fallback_analysis(paper)
+
+     def _parse_analysis_response(self, response: str) -> Dict[str, Any]:
+         """Parse LLM response into structured analysis with robust section detection."""
+         import re
+
+         sections = {
+             "summary": "",
+             "key_findings": [],
+             "methodology": "",
+             "significance": "",
+             "limitations": "",
+             "relevance_score": 5
+         }
+
+         try:
+             # Define section patterns - order matters for matching priority
+             # Handles: "1. SUMMARY:", "SUMMARY:", "**SUMMARY**:", "Summary:", etc.
+             section_patterns = [
+                 (r'(?:^|\n)\s*(?:\d+\.\s*)?(?:\*\*)?SUMMARY(?:\*\*)?[:\s]*', 'summary'),
+                 (r'(?:^|\n)\s*(?:\d+\.\s*)?(?:\*\*)?KEY\s*FINDINGS?(?:\*\*)?[:\s]*', 'key_findings'),
+                 (r'(?:^|\n)\s*(?:\d+\.\s*)?(?:\*\*)?METHODOLOGY(?:\*\*)?[:\s]*', 'methodology'),
+                 (r'(?:^|\n)\s*(?:\d+\.\s*)?(?:\*\*)?SIGNIFICANCE(?:\*\*)?[:\s]*', 'significance'),
+                 (r'(?:^|\n)\s*(?:\d+\.\s*)?(?:\*\*)?LIMITATIONS?(?:\*\*)?[:\s]*', 'limitations'),
+                 (r'(?:^|\n)\s*(?:\d+\.\s*)?(?:\*\*)?RELEVANCE\s*SCORE(?:\*\*)?[:\s]*', 'relevance_score'),
+             ]
+
+             # Find all section positions, keeping only earliest match per section
+             best_positions = {}
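+             # Each stored value is a tuple of (content_start, section_name, header_start)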
+             for pattern, section_name in section_patterns:
+                 for match in re.finditer(pattern, response, re.IGNORECASE | re.MULTILINE):
+                     start = match.start()
+                     end = match.end()
+                     existing = best_positions.get(section_name)
+                     if existing is None or start < existing[2]:
+                         best_positions[section_name] = (end, section_name, start)
+
+             section_positions = list(best_positions.values())
+             # Sort by position in text
+             section_positions.sort(key=lambda x: x[0])
+
+             # Extract content for each section
+             for i, (start_pos, section_name, header_start) in enumerate(section_positions):
+                 # Find end position (start of next section or end of text)
+                 if i + 1 < len(section_positions):
+                     end_pos = section_positions[i + 1][2] # header_start of next section
+                 else:
+                     end_pos = len(response)
+
+                 content = response[start_pos:end_pos].strip()
+
+                 if section_name == 'key_findings':
+                     # Parse key findings as list - handle numbered items and bullet points
+                     findings = []
+                     # Split by numbered items (1., 2., etc.) or bullet points
+                     finding_pattern = r'(?:^|\n)\s*(?:\d+[\.\)]\s*|[-•*]\s*)'
+                     raw_items = re.split(finding_pattern, content)
+                     # Filter out empty or whitespace-only items explicitly
+                     items = [item for item in raw_items if item and item.strip()]
+                     for item in items:
+                         cleaned = item.strip()
+                         # Skip items that look like section headers
+                         if not re.match(r'^(?:METHODOLOGY|SIGNIFICANCE|LIMITATIONS?|RELEVANCE)', cleaned, re.IGNORECASE):
+                             findings.append(cleaned)
+                     sections['key_findings'] = findings if findings else [content] if content else []
+
+                 elif section_name == 'relevance_score':
+                     # Extract numeric score
+                     score_match = re.search(r'(\d+)', content)
+                     if score_match:
+                         sections['relevance_score'] = min(10, max(1, int(score_match.group(1))))
+
+                 elif section_name == 'limitations':
+                     # Handle limitations - always store as string for consistency
+                     # Clean up any list formatting markers but keep as single text block
+                     cleaned_content = re.sub(r'(?:^|\n)\s*(?:\d+[\.\)]\s*|[-•*]\s*)', ' ', content)
+                     cleaned_content = ' '.join(cleaned_content.split()) # Normalize whitespace
+                     sections['limitations'] = cleaned_content.strip() if cleaned_content.strip() else content
+                 else:
+                     # Store as plain text for other sections
+                     sections[section_name] = content
+
+             # If no sections were found, try fallback parsing
+             if not sections["summary"] and not sections["key_findings"]:
+                 sections["summary"] = response[:500] + "..." if len(response) > 500 else response
+
+         except Exception as e:
+             logger.error(f"Failed to parse analysis response: {e}")
+             sections["summary"] = response[:500] + "..." if len(response) > 500 else response
+
+         return sections
+
+     def _get_fallback_analysis(self, paper: Dict[str, Any]) -> Dict[str, Any]:
+         """Return a fallback analysis when LLM fails."""
+         abstract = paper.get("abstract", "")
+         return {
+             "summary": abstract[:300] + "..." if len(abstract) > 300 else abstract,
+             "key_findings": ["Analysis unavailable - see abstract for details"],
+             "methodology": "See paper for methodology details",
+             "significance": "Please review the paper for significance",
+             "limitations": "Unable to determine",
+             "relevance_score": 5,
+             "error": "LLM analysis failed"
+         }
+
+     async def _generate_embeddings(self, text: str) -> Optional[List[float]]:
+         """Generate embeddings for the paper text."""
+         try:
+             from .unified_analysis_service import unified_analysis_service
+             embedding = await unified_analysis_service.get_single_embedding(text[:4000]) # Limit text length
+             return embedding
+         except Exception as e:
+             logger.warning(f"Failed to generate embeddings: {e}")
+             return None
+
+     async def _store_daily_analysis(
+         self,
+         user_id: str,
+         processed_papers: List[Dict[str, Any]],
+         start_time: datetime
+     ) -> Dict[str, Any]:
+         """
+         Store daily analysis in MongoDB, replacing any existing analysis for today.
+         """
+         try:
+             if unified_database_service.db is None:
+                 await unified_database_service.connect_mongodb()
+
+             # Get today's date boundaries
+             today = datetime.utcnow().date()
+             start_of_day = datetime.combine(today, datetime.min.time())
+             end_of_day = datetime.combine(today, datetime.max.time())
+
+             # Delete any existing daily dose for this user (replace logic)
+             await unified_database_service.db.daily_dose.delete_many({
+                 "user_id": user_id
+             })
+
+             # Prepare the daily dose document
+             successful_papers = [p for p in processed_papers if p.get("analysis") and not p.get("error")]
+
+             # Calculate summary statistics
+             avg_relevance = 0
+             if successful_papers:
+                 scores = [p["analysis"].get("relevance_score", 5) for p in successful_papers]
+                 avg_relevance = sum(scores) / len(scores)
+
+             # Get all categories covered
+             categories_covered = set()
+             for p in successful_papers:
+                 categories_covered.update(p["paper"].get("categories", []))
+
+             # Get top keywords from papers
+             all_text = " ".join([p["paper"].get("title", "") for p in successful_papers])
+             top_keywords = self._extract_top_keywords(all_text)
+
+             daily_dose_doc = {
+                 "user_id": user_id,
+                 "generated_at": datetime.utcnow(),
+                 "date": today.isoformat(),
+                 "papers": [
+                     {
+                         "arxiv_id": p["paper"]["arxiv_id"],
+                         "title": p["paper"]["title"],
+                         "authors": p["paper"]["authors"],
+                         "abstract": p["paper"]["abstract"],
+                         "categories": p["paper"]["categories"],
+                         "published": p["paper"]["published"],
+                         "pdf_url": p["paper"]["pdf_url"],
+                         "analysis": p["analysis"],
+                         "relevance_score": p["analysis"].get("relevance_score", 5) if p["analysis"] else 0
+                     }
+                     for p in processed_papers
+                 ],
+                 "summary": {
+                     "total_papers": len(processed_papers),
+                     "successful_analyses": len(successful_papers),
+                     "avg_relevance_score": round(avg_relevance, 2),
+                     "categories_covered": list(categories_covered),
+                     "top_keywords": top_keywords
+                 },
+                 "execution_time_seconds": (datetime.utcnow() - start_time).total_seconds(),
+                 "created_at": datetime.utcnow()
+             }
+
+             # Insert new daily dose
+             result = await unified_database_service.db.daily_dose.insert_one(daily_dose_doc)
+
+             logger.info(f"Stored daily dose for user {user_id}, analysis_id: {result.inserted_id}")
+
+             return {
+                 "success": True,
+                 "analysis_id": str(result.inserted_id)
+             }
+
+         except Exception as e:
+             logger.error(f"Failed to store daily analysis: {e}")
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     def _extract_top_keywords(self, text: str, max_keywords: int = 10) -> List[str]:
+         """Extract top keywords from text (simple frequency-based)."""
+         import re
+
+         # Simple keyword extraction
+         words = re.findall(r'\b[a-zA-Z]{4,}\b', text.lower())
+
+         # Filter common words
+         stopwords = {
+             "with", "from", "this", "that", "have", "been", "were", "their",
+             "which", "when", "where", "what", "will", "would", "could", "should",
+             "using", "based", "approach", "method", "paper", "model", "learning",
+             "neural", "network", "data", "results", "show", "propose", "proposed"
+         }
+
+         word_counts = {}
+         for word in words:
+             if word not in stopwords:
+                 word_counts[word] = word_counts.get(word, 0) + 1
+
+         # Sort by frequency and return top keywords
+         sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
+         return [word for word, count in sorted_words[:max_keywords]]
+
+     async def get_user_daily_dose(self, user_id: str) -> Dict[str, Any]:
+         """
+         Get the latest daily dose for a user from the database.
+         """
+         try:
+             if unified_database_service.db is None:
+                 await unified_database_service.connect_mongodb()
+
+             # Get the most recent daily dose
+             daily_dose = await unified_database_service.db.daily_dose.find_one(
+                 {"user_id": user_id},
+                 sort=[("generated_at", -1)]
+             )
+
+             if not daily_dose:
+                 return {
+                     "success": False,
+                     "message": "No daily dose found. Generate one using 'arionxiv daily --run'",
+                     "data": None
+                 }
+
+             # Convert ObjectId to string
+             daily_dose["_id"] = str(daily_dose["_id"])
+
+             return {
+                 "success": True,
+                 "data": daily_dose
+             }
+
+         except Exception as e:
+             logger.error(f"Failed to get daily dose for user {user_id}: {e}")
+             return {
+                 "success": False,
+                 "message": str(e),
+                 "data": None
+             }
+
+     async def get_paper_analysis(self, user_id: str, arxiv_id: str) -> Dict[str, Any]:
+         """
+         Get the stored analysis for a specific paper from the daily dose.
+         """
+         try:
+             daily_dose_result = await self.get_user_daily_dose(user_id)
+
+             if not daily_dose_result["success"]:
+                 return daily_dose_result
+
+             papers = daily_dose_result["data"].get("papers", [])
+
+             for paper in papers:
+                 if paper.get("arxiv_id") == arxiv_id:
+                     return {
+                         "success": True,
+                         "paper": paper,
+                         "analysis": paper.get("analysis")
+                     }
+
+             return {
+                 "success": False,
+                 "message": f"Paper {arxiv_id} not found in daily dose"
+             }
+
+         except Exception as e:
+             logger.error(f"Failed to get paper analysis: {e}")
+             return {
+                 "success": False,
+                 "message": str(e)
+             }
+
+
+ # Global instance
+ unified_daily_dose_service = UnifiedDailyDoseService()
+
+ # Backwards compatibility
+ daily_dose_service = unified_daily_dose_service
+
+ __all__ = [
+     'UnifiedDailyDoseService',
+     'unified_daily_dose_service',
+     'daily_dose_service'
+ ]
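
For orientation, here is a minimal sketch of how the exported singleton above might be driven from an async context. The import path follows the package layout listed earlier; the user id is a placeholder, and a reachable MongoDB instance (or an authenticated API session) plus valid LLM API keys are assumed:

import asyncio

from arionxiv.services.unified_daily_dose_service import unified_daily_dose_service


async def main() -> None:
    user_id = "example-user-id"  # hypothetical; the CLI normally supplies this after login

    def on_progress(step: str, detail: str = "") -> None:
        # Mirrors the callback signature documented on execute_daily_dose
        print(f"{step}: {detail}" if detail else step)

    # Run the fetch -> analyze -> store workflow described in the module docstring
    result = await unified_daily_dose_service.execute_daily_dose(user_id, progress_callback=on_progress)
    print(result["message"], f"({result['papers_count']} papers)")

    # Read back the stored dose, if any
    latest = await unified_daily_dose_service.get_user_daily_dose(user_id)
    if latest["success"]:
        print(latest["data"]["summary"])


if __name__ == "__main__":
    asyncio.run(main())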