arionxiv-1.0.32-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arionxiv/__init__.py +40 -0
- arionxiv/__main__.py +10 -0
- arionxiv/arxiv_operations/__init__.py +0 -0
- arionxiv/arxiv_operations/client.py +225 -0
- arionxiv/arxiv_operations/fetcher.py +173 -0
- arionxiv/arxiv_operations/searcher.py +122 -0
- arionxiv/arxiv_operations/utils.py +293 -0
- arionxiv/cli/__init__.py +4 -0
- arionxiv/cli/commands/__init__.py +1 -0
- arionxiv/cli/commands/analyze.py +587 -0
- arionxiv/cli/commands/auth.py +365 -0
- arionxiv/cli/commands/chat.py +714 -0
- arionxiv/cli/commands/daily.py +482 -0
- arionxiv/cli/commands/fetch.py +217 -0
- arionxiv/cli/commands/library.py +295 -0
- arionxiv/cli/commands/preferences.py +426 -0
- arionxiv/cli/commands/search.py +254 -0
- arionxiv/cli/commands/settings_unified.py +1407 -0
- arionxiv/cli/commands/trending.py +41 -0
- arionxiv/cli/commands/welcome.py +168 -0
- arionxiv/cli/main.py +407 -0
- arionxiv/cli/ui/__init__.py +1 -0
- arionxiv/cli/ui/global_theme_manager.py +173 -0
- arionxiv/cli/ui/logo.py +127 -0
- arionxiv/cli/ui/splash.py +89 -0
- arionxiv/cli/ui/theme.py +32 -0
- arionxiv/cli/ui/theme_system.py +391 -0
- arionxiv/cli/utils/__init__.py +54 -0
- arionxiv/cli/utils/animations.py +522 -0
- arionxiv/cli/utils/api_client.py +583 -0
- arionxiv/cli/utils/api_config.py +505 -0
- arionxiv/cli/utils/command_suggestions.py +147 -0
- arionxiv/cli/utils/db_config_manager.py +254 -0
- arionxiv/github_actions_runner.py +206 -0
- arionxiv/main.py +23 -0
- arionxiv/prompts/__init__.py +9 -0
- arionxiv/prompts/prompts.py +247 -0
- arionxiv/rag_techniques/__init__.py +8 -0
- arionxiv/rag_techniques/basic_rag.py +1531 -0
- arionxiv/scheduler_daemon.py +139 -0
- arionxiv/server.py +1000 -0
- arionxiv/server_main.py +24 -0
- arionxiv/services/__init__.py +73 -0
- arionxiv/services/llm_client.py +30 -0
- arionxiv/services/llm_inference/__init__.py +58 -0
- arionxiv/services/llm_inference/groq_client.py +469 -0
- arionxiv/services/llm_inference/llm_utils.py +250 -0
- arionxiv/services/llm_inference/openrouter_client.py +564 -0
- arionxiv/services/unified_analysis_service.py +872 -0
- arionxiv/services/unified_auth_service.py +457 -0
- arionxiv/services/unified_config_service.py +456 -0
- arionxiv/services/unified_daily_dose_service.py +823 -0
- arionxiv/services/unified_database_service.py +1633 -0
- arionxiv/services/unified_llm_service.py +366 -0
- arionxiv/services/unified_paper_service.py +604 -0
- arionxiv/services/unified_pdf_service.py +522 -0
- arionxiv/services/unified_prompt_service.py +344 -0
- arionxiv/services/unified_scheduler_service.py +589 -0
- arionxiv/services/unified_user_service.py +954 -0
- arionxiv/utils/__init__.py +51 -0
- arionxiv/utils/api_helpers.py +200 -0
- arionxiv/utils/file_cleanup.py +150 -0
- arionxiv/utils/ip_helper.py +96 -0
- arionxiv-1.0.32.dist-info/METADATA +336 -0
- arionxiv-1.0.32.dist-info/RECORD +69 -0
- arionxiv-1.0.32.dist-info/WHEEL +5 -0
- arionxiv-1.0.32.dist-info/entry_points.txt +4 -0
- arionxiv-1.0.32.dist-info/licenses/LICENSE +21 -0
- arionxiv-1.0.32.dist-info/top_level.txt +1 -0
arionxiv/__init__.py
ADDED
@@ -0,0 +1,40 @@
"""
ArionXiv - AI-Powered Research Paper Analysis and Management

A comprehensive tool for discovering, analyzing, and managing research papers
from arXiv with AI-powered insights and organizational features.
"""

__version__ = "1.0.29"
__author__ = "Arion Das"
__email__ = "ariondasad@gmail.com"
__description__ = "AI-Powered Research Paper Analysis and Management"

# Lazy imports to avoid requiring fastapi for CLI/GitHub Actions usage
# Services are imported on-demand when accessed
def __getattr__(name):
    """Lazy import of services to avoid loading fastapi for CLI usage."""
    if name == "config":
        from .services.unified_config_service import config
        return config
    elif name == "database_service":
        from .services.unified_database_service import database_service
        return database_service
    elif name == "paper_service":
        from .services.unified_paper_service import paper_service
        return paper_service
    elif name == "analysis_service":
        from .services.unified_analysis_service import analysis_service
        return analysis_service
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

__all__ = [
    "__version__",
    "__author__",
    "__email__",
    "__description__",
    "config",
    "database_service",
    "paper_service",
    "analysis_service"
]
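The module-level __getattr__ above uses PEP 562 lazy attribute loading, so importing arionxiv does not pull in the fastapi-backed services until one is accessed. A minimal usage sketch, assuming the package and the relevant service dependencies are installed:

import arionxiv

print(arionxiv.__version__)   # plain metadata; no service import is triggered

# Attribute access below is routed through arionxiv.__getattr__, which imports
# unified_paper_service on demand; this is the first point heavier deps load.
papers_api = arionxiv.paper_service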
arionxiv/__main__.py
ADDED
arionxiv/arxiv_operations/__init__.py
ADDED
File without changes
arionxiv/arxiv_operations/client.py
ADDED
@@ -0,0 +1,225 @@
# Arxiv API client for fetching papers
import arxiv
import logging
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta

logger = logging.getLogger(__name__)

class ArxivClient:
    """Client for interacting with Arxiv API"""

    def __init__(self):
        self.client = arxiv.Client()
        self.default_page_size = 100
        self.max_results = 100

    # Short words to skip in title searches (arXiv doesn't index these well)
    SKIP_WORDS = {'a', 'an', 'the', 'is', 'are', 'be', 'to', 'of', 'in', 'on',
                  'at', 'by', 'for', 'and', 'or', 'but', 'not', 'all', 'you',
                  'it', 'its', 'as', 'so', 'if', 'do', 'no', 'up', 'we', 'my'}

    def search_papers(self, query: str, max_results: int = None, sort_by=arxiv.SortCriterion.Relevance) -> List[Dict[str, Any]]:
        """Search for papers on Arxiv with relevance scoring"""
        try:
            max_results = max_results or self.default_page_size

            # If query already contains arXiv operators (cat:, au:, ti:, abs:, AND, OR), use as-is
            has_operators = any(op in query for op in ['cat:', 'au:', 'ti:', 'abs:', ' AND ', ' OR '])

            if has_operators:
                # Query already formatted with operators
                search_query = query
            else:
                # Build title search - skip short common words that arXiv doesn't handle well
                words = [w.strip() for w in query.split() if w.strip()]
                content_words = [w for w in words if w.lower() not in self.SKIP_WORDS]

                if content_words:
                    title_parts = [f"ti:{word.title()}" for word in content_words]
                    search_query = " AND ".join(title_parts)
                else:
                    # All words were skipped, use plain query
                    search_query = query

            # Fetch more results than requested so we can filter/sort better
            fetch_count = min(max_results * 3, self.max_results) if not has_operators else max_results

            search = arxiv.Search(
                query=search_query,
                max_results=min(fetch_count, self.max_results),
                sort_by=sort_by
            )

            papers = []
            for result in self.client.results(search):
                paper_data = {
                    "arxiv_id": result.entry_id.split('/')[-1],
                    "title": result.title,
                    "abstract": result.summary,
                    "authors": [str(author) for author in result.authors],
                    "published": result.published.isoformat() if result.published else None,
                    "updated": result.updated.isoformat() if result.updated else None,
                    "categories": result.categories,
                    "primary_category": result.primary_category,
                    "pdf_url": result.pdf_url,
                    "entry_id": result.entry_id,
                    "doi": result.doi,
                    "journal_ref": result.journal_ref,
                    "comment": result.comment,
                    "links": [{"href": link.href, "title": link.title, "rel": link.rel} for link in result.links]
                }
                papers.append(paper_data)

            # Re-score and sort papers by title match quality, then limit to requested count
            if not has_operators and papers:
                papers = self._score_and_sort_papers(papers, query)[:max_results]

            logger.info(f"Found {len(papers)} papers for query: {query}")
            return papers
        except Exception as e:
            logger.error(f"Error searching papers: {str(e)}")
            return []

    def _score_and_sort_papers(self, papers: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
        """Score papers by how well their title matches the query and sort by score"""
        query_lower = query.lower().strip()
        query_words = set(query_lower.split())

        scored_papers = []
        for paper in papers:
            title_lower = paper['title'].lower()
            score = 0

            # Exact title match (highest priority)
            if title_lower == query_lower:
                score += 1000
            # Title contains the exact query phrase
            elif query_lower in title_lower:
                score += 500

            # Count matching words in title
            title_words = set(title_lower.split())
            matching_words = query_words & title_words
            score += len(matching_words) * 50

            # Bonus for shorter titles (more likely to be exact match)
            if len(title_words) <= len(query_words) + 2:
                score += 100

            # Bonus for title starting with query words
            if title_lower.startswith(query_lower.split()[0]):
                score += 75

            scored_papers.append((score, paper))

        # Sort by score descending
        scored_papers.sort(key=lambda x: x[0], reverse=True)
        return [paper for score, paper in scored_papers]

    def get_paper_by_id(self, arxiv_id: str) -> Optional[Dict[str, Any]]:
        """Get a specific paper by its Arxiv ID"""
        try:
            search = arxiv.Search(id_list=[arxiv_id])

            for result in self.client.results(search):
                paper_data = {
                    "arxiv_id": result.entry_id.split('/')[-1],
                    "title": result.title,
                    "abstract": result.summary,
                    "authors": [str(author) for author in result.authors],
                    "published": result.published.isoformat() if result.published else None,
                    "updated": result.updated.isoformat() if result.updated else None,
                    "categories": result.categories,
                    "primary_category": result.primary_category,
                    "pdf_url": result.pdf_url,
                    "entry_id": result.entry_id,
                    "doi": result.doi,
                    "journal_ref": result.journal_ref,
                    "comment": result.comment,
                    "links": [{"href": link.href, "title": link.title, "rel": link.rel} for link in result.links]
                }
                return paper_data

            return None
        except Exception as e:
            logger.error(f"Error fetching paper {arxiv_id}: {str(e)}")
            return None

    def get_recent_papers(self, category: str = None, days: int = 7, max_results: int = 50) -> List[Dict[str, Any]]:
        """Get recent papers from the last N days"""
        try:
            # Build query for recent papers
            query_parts = []

            if category:
                query_parts.append(f"cat:{category}")

            # Date filter (Arxiv doesn't support date ranges directly, so we'll filter results)
            cutoff_date = datetime.now() - timedelta(days=days)

            query = " AND ".join(query_parts) if query_parts else "all:machine learning"

            search = arxiv.Search(
                query=query,
                max_results=max_results,
                sort_by=arxiv.SortCriterion.SubmittedDate
            )

            papers = []
            for result in self.client.results(search):
                # Filter by date
                if result.published and result.published.replace(tzinfo=None) >= cutoff_date:
                    paper_data = {
                        "arxiv_id": result.entry_id.split('/')[-1],
                        "title": result.title,
                        "abstract": result.summary,
                        "authors": [str(author) for author in result.authors],
                        "published": result.published.isoformat() if result.published else None,
                        "updated": result.updated.isoformat() if result.updated else None,
                        "categories": result.categories,
                        "primary_category": result.primary_category,
                        "pdf_url": result.pdf_url,
                        "entry_id": result.entry_id,
                        "doi": result.doi,
                        "journal_ref": result.journal_ref,
                        "comment": result.comment
                    }
                    papers.append(paper_data)

            logger.info(f"Found {len(papers)} recent papers in category: {category}")
            return papers
        except Exception as e:
            logger.error(f"Error fetching recent papers: {str(e)}")
            return []

    def get_papers_by_category(self, category: str, max_results: int = 20) -> List[Dict[str, Any]]:
        """Get papers by category"""
        try:
            query = f"cat:{category}"
            return self.search_papers(query, max_results)
        except Exception as e:
            logger.error(f"Error fetching papers by category {category}: {str(e)}")
            return []

    def get_papers_by_author(self, author: str, max_results: int = 20) -> List[Dict[str, Any]]:
        """Get papers by author"""
        try:
            query = f"au:{author}"
            return self.search_papers(query, max_results)
        except Exception as e:
            logger.error(f"Error fetching papers by author {author}: {str(e)}")
            return []

    def get_trending_papers(self, category: str = None, days: int = 30) -> List[Dict[str, Any]]:
        """Get trending papers (most recent with high engagement indicators)"""
        try:
            # For now, we'll use recent papers as a proxy for trending
            # In a full implementation, this could consider download counts, citations, etc.
            return self.get_recent_papers(category=category, days=days, max_results=30)
        except Exception as e:
            logger.error(f"Error fetching trending papers: {str(e)}")
            return []

# Global instance
arxiv_client = ArxivClient()
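For orientation, a minimal usage sketch of the client above, assuming the package is installed and arXiv is reachable (the arXiv ID below is only illustrative):

from arionxiv.arxiv_operations.client import arxiv_client

# Free-text queries are rewritten into ti:... AND ti:... terms and re-scored by
# title match; queries containing cat:/au:/ti:/abs:/AND/OR pass through as-is.
results = arxiv_client.search_papers("attention is all you need", max_results=5)
for paper in results:
    print(paper["arxiv_id"], paper["title"])

single = arxiv_client.get_paper_by_id("1706.03762")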
arionxiv/arxiv_operations/fetcher.py
ADDED
@@ -0,0 +1,173 @@
import requests
import aiohttp
import asyncio
import os
import logging
from pathlib import Path
from typing import Optional, Dict, Any

from ..services.unified_pdf_service import pdf_processor

logger = logging.getLogger(__name__)

class ArxivFetcher:
    """Fetches and processes papers from Arxiv"""

    def __init__(self):
        self.session = None
        self.download_dir = "downloads"
        os.makedirs(self.download_dir, exist_ok=True)

    async def _get_session(self):
        """Get or create aiohttp session"""
        if self.session is None:
            self.session = aiohttp.ClientSession()
        return self.session

    async def fetch_paper_pdf(self, arxiv_id: str, pdf_url: str) -> Optional[str]:
        """Fetch PDF for a paper"""
        try:
            session = await self._get_session()

            # Clean arxiv_id for filename
            safe_id = arxiv_id.replace('/', '_').replace(':', '_')
            pdf_path = os.path.join(self.download_dir, f"{safe_id}.pdf")

            # Check if already downloaded
            if os.path.exists(pdf_path):
                logger.info(f"PDF already exists: {pdf_path}")
                return pdf_path

            # Download PDF
            async with session.get(pdf_url) as response:
                if response.status == 200:
                    content = await response.read()
                    with open(pdf_path, 'wb') as f:
                        f.write(content)
                    logger.info(f"Downloaded PDF: {pdf_path}")
                    return pdf_path
                else:
                    logger.error(f"Failed to download PDF: {response.status}")
                    return None
        except Exception as e:
            logger.error(f"Error fetching PDF for {arxiv_id}: {str(e)}")
            return None

    async def fetch_and_process_paper(self, paper_data: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch and process a complete paper"""
        try:
            arxiv_id = paper_data.get("arxiv_id")
            pdf_url = paper_data.get("pdf_url")

            logger.info(f"Processing paper: {arxiv_id}")

            if not arxiv_id or not pdf_url:
                logger.warning(f"Missing arxiv_id or pdf_url for paper")
                return {"error": "Missing arxiv_id or pdf_url"}

            # Fetch PDF
            logger.debug(f"Fetching PDF from: {pdf_url}")
            pdf_path = await self.fetch_paper_pdf(arxiv_id, pdf_url)
            if not pdf_path:
                logger.error(f"Failed to download PDF for {arxiv_id}")
                return {"error": "Failed to download PDF"}

            # Process PDF
            logger.debug(f"Processing PDF: {pdf_path}")
            processing_result = await pdf_processor.process_pdf(pdf_path)

            logger.info(f"Successfully processed paper: {arxiv_id}")

            # Combine paper metadata with processed content
            result = {
                **paper_data,
                "pdf_path": pdf_path,
                "processed_content": processing_result,
                "fetch_timestamp": asyncio.get_event_loop().time()
            }

            return result
        except Exception as e:
            logger.error(f"Error processing paper: {str(e)}")
            return {"error": str(e)}

    async def batch_fetch_papers(self, papers: list) -> list:
        """Fetch multiple papers concurrently"""
        try:
            logger.info(f"Starting batch fetch for {len(papers)} papers")
            tasks = []
            for paper in papers:
                task = self.fetch_and_process_paper(paper)
                tasks.append(task)

            # Limit concurrent downloads
            semaphore = asyncio.Semaphore(3)

            async def bounded_fetch(paper):
                async with semaphore:
                    return await self.fetch_and_process_paper(paper)

            bounded_tasks = [bounded_fetch(paper) for paper in papers]
            results = await asyncio.gather(*bounded_tasks, return_exceptions=True)

            # Filter out exceptions
            successful_results = []
            for result in results:
                if isinstance(result, Exception):
                    logger.error(f"Batch fetch error: {str(result)}", exc_info=True)
                else:
                    successful_results.append(result)

            logger.info(f"Batch fetch completed: {len(successful_results)}/{len(papers)} successful")
            return successful_results
        except Exception as e:
            logger.error(f"Batch fetch error: {str(e)}", exc_info=True)
            return []

    def fetch_paper_sync(self, arxiv_id: str, pdf_url: str) -> Optional[str]:
        """Synchronous version of PDF fetch"""
        try:
            safe_id = arxiv_id.replace('/', '_').replace(':', '_')
            pdf_path = os.path.join(self.download_dir, f"{safe_id}.pdf")

            if os.path.exists(pdf_path):
                return pdf_path

            response = requests.get(pdf_url, timeout=30)
            if response.status_code == 200:
                with open(pdf_path, 'wb') as f:
                    f.write(response.content)
                logger.info(f"Downloaded PDF: {pdf_path}")
                return pdf_path
            else:
                logger.error(f"Failed to download PDF: {response.status_code}")
                return None
        except Exception as e:
            logger.error(f"Error fetching PDF sync for {arxiv_id}: {str(e)}")
            return None

    async def cleanup_downloads(self, max_age_days: int = 7):
        """Clean up old downloaded files"""
        try:
            import time
            current_time = time.time()
            max_age_seconds = max_age_days * 24 * 60 * 60

            for filename in os.listdir(self.download_dir):
                file_path = os.path.join(self.download_dir, filename)
                if os.path.isfile(file_path):
                    file_age = current_time - os.path.getmtime(file_path)
                    if file_age > max_age_seconds:
                        os.remove(file_path)
                        logger.info(f"Cleaned up old file: {filename}")
        except Exception as e:
            logger.error(f"Error during cleanup: {str(e)}")

    async def close(self):
        """Close the session"""
        if self.session:
            await self.session.close()
            self.session = None

# Global instance
arxiv_fetcher = ArxivFetcher()
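A short, hypothetical driver for the async fetcher above, assuming network access, a writable downloads/ directory, and the package's unified_pdf_service dependencies; batch_fetch_papers bounds concurrency with a Semaphore(3):

import asyncio
from arionxiv.arxiv_operations.fetcher import arxiv_fetcher

async def main():
    # Paper dicts normally come from ArxivClient.search_papers; this one is illustrative.
    papers = [{"arxiv_id": "1706.03762", "pdf_url": "https://arxiv.org/pdf/1706.03762"}]
    processed = await arxiv_fetcher.batch_fetch_papers(papers)
    print(f"{len(processed)}/{len(papers)} papers fetched and processed")
    await arxiv_fetcher.close()  # release the shared aiohttp session

asyncio.run(main())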
arionxiv/arxiv_operations/searcher.py
ADDED
@@ -0,0 +1,122 @@
# Simple text-based search for arXiv papers
from typing import List, Dict, Any, Optional
import logging

from .client import arxiv_client

logger = logging.getLogger(__name__)


class ArxivSearcher:
    """Simple text-based search for arXiv papers"""

    def __init__(self):
        self.client = arxiv_client

        # Common categories for reference
        self.categories = {
            "cs.AI": "Artificial Intelligence",
            "cs.LG": "Machine Learning",
            "cs.CV": "Computer Vision",
            "cs.CL": "Computation and Language",
            "cs.RO": "Robotics",
            "stat.ML": "Machine Learning (Statistics)",
            "cs.DC": "Distributed Computing",
            "cs.DB": "Databases",
            "cs.IR": "Information Retrieval",
            "math.OC": "Optimization and Control",
        }

    async def search(self, query: str, max_results: int = 10) -> Dict[str, Any]:
        """
        Simple text search that returns the closest matching papers.

        Args:
            query: Search text
            max_results: Number of results to return (default 10)

        Returns:
            Dict with success status and list of papers
        """
        try:
            logger.info(f"Searching arXiv: query='{query}', max_results={max_results}")
            # Direct search via arXiv API (uses relevance sorting by default)
            papers = self.client.search_papers(query=query, max_results=max_results)

            logger.info(f"Search completed: found {len(papers)} papers")
            return {
                "success": True,
                "papers": papers,
                "count": len(papers),
                "query": query
            }

        except Exception as e:
            logger.error(f"Search failed: {str(e)}", exc_info=True)
            return {"success": False, "error": str(e), "papers": []}

    async def search_by_category(self, query: str, category: str, max_results: int = 10) -> Dict[str, Any]:
        """
        Search within a specific category.

        Args:
            query: Search text
            category: arXiv category (e.g., cs.LG, cs.AI)
            max_results: Number of results to return

        Returns:
            Dict with success status and list of papers
        """
        try:
            # Combine query with category filter
            full_query = f"{query} AND cat:{category}" if query else f"cat:{category}"
            logger.info(f"Searching by category: query='{full_query}', category={category}")
            papers = self.client.search_papers(query=full_query, max_results=max_results)

            logger.info(f"Category search completed: found {len(papers)} papers in {category}")
            return {
                "success": True,
                "papers": papers,
                "count": len(papers),
                "query": query,
                "category": category
            }

        except Exception as e:
            logger.error(f"Category search failed: {str(e)}", exc_info=True)
            return {"success": False, "error": str(e), "papers": []}

    async def search_by_author(self, author: str, max_results: int = 10) -> Dict[str, Any]:
        """
        Search papers by author name.

        Args:
            author: Author name
            max_results: Number of results to return

        Returns:
            Dict with success status and list of papers
        """
        try:
            logger.info(f"Searching by author: author='{author}', max_results={max_results}")
            papers = self.client.get_papers_by_author(author=author, max_results=max_results)

            logger.info(f"Author search completed: found {len(papers)} papers by {author}")
            return {
                "success": True,
                "papers": papers,
                "count": len(papers),
                "author": author
            }

        except Exception as e:
            logger.error(f"Author search failed: {str(e)}", exc_info=True)
            return {"success": False, "error": str(e), "papers": []}

    def get_available_categories(self) -> Dict[str, str]:
        """Get available paper categories"""
        return self.categories.copy()


# Global instance
arxiv_searcher = ArxivSearcher()
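Finally, a minimal sketch of calling the searcher above; its async methods are thin wrappers over the synchronous ArxivClient calls, so they are awaited but do not parallelize anything themselves:

import asyncio
from arionxiv.arxiv_operations.searcher import arxiv_searcher

async def demo():
    result = await arxiv_searcher.search("retrieval augmented generation", max_results=3)
    if result["success"]:
        for paper in result["papers"]:
            print(paper["title"])
    print(arxiv_searcher.get_available_categories()["cs.IR"])  # "Information Retrieval"

asyncio.run(demo())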