scholarx 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scholarx/__init__.py +59 -0
- scholarx/__main__.py +7 -0
- scholarx/agent_server.py +94 -0
- scholarx/api_client.py +253 -0
- scholarx/cli.py +897 -0
- scholarx/deduplication.py +195 -0
- scholarx/kg_integration.py +251 -0
- scholarx/main_agent.json +15 -0
- scholarx/mcp_config.json +25 -0
- scholarx/mcp_server.py +370 -0
- scholarx/models.py +223 -0
- scholarx/paper_storage.py +155 -0
- scholarx/providers/__init__.py +21 -0
- scholarx/providers/arxiv.py +269 -0
- scholarx/providers/base.py +148 -0
- scholarx/providers/biorxiv.py +147 -0
- scholarx/providers/osf.py +110 -0
- scholarx/providers/pmc.py +189 -0
- scholarx/providers/rss.py +304 -0
- scholarx/providers/semantic_scholar.py +96 -0
- scholarx/scanner.py +953 -0
- scholarx-0.6.0.dist-info/METADATA +370 -0
- scholarx-0.6.0.dist-info/RECORD +27 -0
- scholarx-0.6.0.dist-info/WHEEL +5 -0
- scholarx-0.6.0.dist-info/entry_points.txt +4 -0
- scholarx-0.6.0.dist-info/licenses/LICENSE +21 -0
- scholarx-0.6.0.dist-info/top_level.txt +1 -0
scholarx/__init__.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""ScholarX — Universal Research Paper API.
|
|
3
|
+
|
|
4
|
+
A single entry point for querying research papers from arXiv, PMC,
|
|
5
|
+
bioRxiv, medRxiv, PsyArXiv, OSF, and Semantic Scholar.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import importlib
|
|
9
|
+
import inspect
|
|
10
|
+
|
|
11
|
+
# Public names of the package; populated below as modules are imported.
__all__: list[str] = []

# Modules whose public classes and functions are re-exported from the
# package namespace. Order matters: later modules overwrite earlier ones
# when they expose the same name.
CORE_MODULES = [
    f"scholarx.{stem}"
    for stem in (
        "models",
        "api_client",
        "deduplication",
        "paper_storage",
        "scanner",
    )
]

# Modules that are re-exported only if they import cleanly. Maps the dotted
# module path to the short name used to build its ``_<NAME>_AVAILABLE`` flag.
OPTIONAL_MODULES = {
    f"scholarx.{stem}": stem
    for stem in (
        "agent_server",
        "mcp_server",
    )
}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _import_module_safely(module_name: str):
    """Import ``module_name`` and return it, or ``None`` if it cannot be imported.

    Only ``ImportError`` (which includes ``ModuleNotFoundError``) is treated
    as "not available". Any other exception raised while the module body
    executes propagates to the caller.

    Args:
        module_name: Absolute dotted module path, e.g. ``"scholarx.models"``.

    Returns:
        The imported module object, or ``None`` when the import raised
        ``ImportError``.
    """
    try:
        return importlib.import_module(module_name)
    except ImportError as exc:
        # Still best-effort, but leave a trace: without it a module that is
        # missing a dependency silently disappears from the package namespace.
        # Imported locally so this block does not depend on a file-level import.
        import logging

        logging.getLogger(__name__).debug(
            "Module %s is not available: %s", module_name, exc
        )
        return None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _expose_members(module):
    """Copy a module's public classes and functions into this package.

    Every member of ``module`` that is a class or a plain function and whose
    name does not start with an underscore is bound in this module's globals
    and listed in ``__all__``. Members that ``module`` merely imported from
    elsewhere are included too, because ``inspect.getmembers`` does not
    distinguish them.

    A name that is already listed in ``__all__`` is rebound to the new object
    but not appended a second time, so ``__all__`` stays free of duplicates
    when several modules share a member.

    Args:
        module: An imported module object.
    """
    namespace = globals()
    for name, obj in inspect.getmembers(module):
        if name.startswith("_"):
            continue
        if not (inspect.isclass(obj) or inspect.isfunction(obj)):
            continue
        namespace[name] = obj
        if name not in __all__:
            __all__.append(name)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Re-export the public members of every core module that imports cleanly;
# a core module that fails to import is skipped.
for _module_name in CORE_MODULES:
    _module = _import_module_safely(_module_name)
    if _module is None:
        continue
    _expose_members(_module)

# Optional modules are handled the same way, and each one additionally gets a
# ``_<NAME>_AVAILABLE`` boolean (e.g. ``_MCP_SERVER_AVAILABLE``) recording
# whether its import succeeded.
for _module_name, _extra_name in OPTIONAL_MODULES.items():
    _module = _import_module_safely(_module_name)
    if _module is not None:
        _expose_members(_module)
    globals()[f"_{_extra_name.upper()}_AVAILABLE"] = _module is not None

# Shorter aliases for the flags set above; False if a flag was never set.
_MCP_AVAILABLE = globals().get("_MCP_SERVER_AVAILABLE", False)
_AGENT_AVAILABLE = globals().get("_AGENT_SERVER_AVAILABLE", False)

__all__.extend(["_MCP_AVAILABLE", "_AGENT_AVAILABLE"])
|
scholarx/__main__.py
ADDED
scholarx/agent_server.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
"""ScholarX Agent Server.
|
|
3
|
+
|
|
4
|
+
Standard graph agent server following the genius-agent pattern.
|
|
5
|
+
Uses create_graph_agent_server from agent-utilities for graph orchestration.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
import sys
|
|
11
|
+
import warnings
|
|
12
|
+
|
|
13
|
+
# Filter warnings early, before the third-party imports further down run.
#
# ``warnings.catch_warnings()`` restores the previous filter list when the
# ``with`` block exits, so it is used here only to silence anything emitted
# while ``requests`` itself is being imported. The persistent filter for
# RequestsDependencyWarning must be registered outside that context; when it
# was registered inside, it was discarded as soon as the block ended.
try:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from requests.exceptions import RequestsDependencyWarning
except ImportError:
    # requests is not installed, so there is no such warning to filter.
    pass
else:
    warnings.filterwarnings("ignore", category=RequestsDependencyWarning)

# Message- and module-based filters that stay active for the whole process.
warnings.filterwarnings("ignore", message=".*urllib3.*or chardet.*")
warnings.filterwarnings("ignore", message=".*urllib3.*or charset_normalizer.*")
warnings.filterwarnings("ignore", category=DeprecationWarning, module="fastmcp")
|
|
26
|
+
|
|
27
|
+
from agent_utilities import (
|
|
28
|
+
build_system_prompt_from_workspace,
|
|
29
|
+
create_agent_parser,
|
|
30
|
+
create_graph_agent_server,
|
|
31
|
+
initialize_workspace,
|
|
32
|
+
load_identity,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
__version__ = "0.6.0"

# Configure the root logger at import time: INFO and above, written through a
# default StreamHandler with a timestamped format.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Both helpers come from agent_utilities and run as import-time side effects.
# ``meta`` is read below with ``.get`` for the keys "name", "description" and
# "content".
initialize_workspace()
meta = load_identity()

# Agent identity. Each value is taken from its environment variable when set,
# otherwise from the loaded identity metadata, otherwise from a literal default.
DEFAULT_AGENT_NAME = os.environ.get(
    "DEFAULT_AGENT_NAME",
    meta.get("name", "ScholarX Agent"),
)
DEFAULT_AGENT_DESCRIPTION = os.environ.get(
    "AGENT_DESCRIPTION",
    meta.get("description", "ScholarX — Universal research paper discovery and analysis."),
)
# The fallback is evaluated eagerly: when the identity has no "content", the
# prompt is built from the workspace even if AGENT_SYSTEM_PROMPT is set.
DEFAULT_AGENT_SYSTEM_PROMPT = os.environ.get(
    "AGENT_SYSTEM_PROMPT",
    meta.get("content") or build_system_prompt_from_workspace(),
)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def agent_server():
    """Command-line entry point that launches the ScholarX graph agent server.

    Prints a name/version banner to stderr, parses command-line arguments with
    the parser from ``create_agent_parser()``, switches the root logger to
    DEBUG when ``--debug`` is set, and then hands every parsed option to
    ``create_graph_agent_server``.
    """
    print(f"{DEFAULT_AGENT_NAME} v{__version__}", file=sys.stderr)
    logger.info("Application startup complete")

    args = create_agent_parser().parse_args()

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.debug("Debug mode enabled")

    server_options = {
        "mcp_url": args.mcp_url,
        # Fall back to the default config file name when none was given.
        "mcp_config": args.mcp_config or "mcp_config.json",
        "host": args.host,
        "port": args.port,
        "provider": args.provider,
        # The same model id is used for the router and for the agent.
        "model_id": args.model_id,
        "router_model": args.model_id,
        "agent_model": args.model_id,
        "base_url": args.base_url,
        "api_key": args.api_key,
        "custom_skills_directory": args.custom_skills_directory,
        "enable_web_ui": args.web,
        "enable_otel": args.otel,
        "otel_endpoint": args.otel_endpoint,
        "otel_headers": args.otel_headers,
        "otel_public_key": args.otel_public_key,
        "otel_secret_key": args.otel_secret_key,
        "otel_protocol": args.otel_protocol,
        "debug": args.debug,
    }
    create_graph_agent_server(**server_options)


if __name__ == "__main__":
    agent_server()
|
scholarx/api_client.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
"""ScholarX Unified API Client.
|
|
3
|
+
|
|
4
|
+
The single entry point for all paper operations. Users interact with
|
|
5
|
+
ScholarXClient — never with providers directly. The client handles
|
|
6
|
+
fan-out to configured providers, cross-source deduplication, and
|
|
7
|
+
result aggregation.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import logging
|
|
14
|
+
|
|
15
|
+
from .deduplication import deduplicate_papers
|
|
16
|
+
from .models import (
|
|
17
|
+
DEFAULT_SOURCE_CONFIGS,
|
|
18
|
+
Paper,
|
|
19
|
+
PaperSource,
|
|
20
|
+
SearchQuery,
|
|
21
|
+
SearchResult,
|
|
22
|
+
SourceConfig,
|
|
23
|
+
SourceStatus,
|
|
24
|
+
)
|
|
25
|
+
from .paper_storage import PaperStorage
|
|
26
|
+
from .providers.base import PaperProvider
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _create_provider(source: PaperSource, config: SourceConfig) -> PaperProvider:
    """Instantiate the provider class registered for ``source``.

    The provider modules are imported when this function is called, not when
    the enclosing module is imported.

    Args:
        source: The paper source to build a provider for.
        config: Configuration passed to the provider's constructor.

    Returns:
        A new provider instance for ``source``.

    Raises:
        ValueError: If no provider class is registered for ``source``.
    """
    from .providers.arxiv import ArxivProvider
    from .providers.biorxiv import BiorxivProvider, MedrxivProvider
    from .providers.osf import OSFProvider, PsyarxivProvider
    from .providers.pmc import PMCProvider
    from .providers.semantic_scholar import SemanticScholarProvider

    registry: dict[PaperSource, type[PaperProvider]] = {
        PaperSource.ARXIV: ArxivProvider,
        PaperSource.PMC: PMCProvider,
        PaperSource.BIORXIV: BiorxivProvider,
        PaperSource.MEDRXIV: MedrxivProvider,
        PaperSource.PSYARXIV: PsyarxivProvider,
        PaperSource.OSF: OSFProvider,
        PaperSource.SEMANTIC_SCHOLAR: SemanticScholarProvider,
    }

    if source not in registry:
        raise ValueError(f"Unknown paper source: {source}")
    return registry[source](config)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class ScholarXClient:
    """Universal research paper client.

    Fans queries out to all configured sources, deduplicates the results,
    and presents a unified view. Users never need to know about individual
    provider implementations.

    Usage::

        client = ScholarXClient()
        result = await client.search(SearchQuery(query="multi-agent systems"))
        for paper in result.papers:
            print(f"{paper.title} ({paper.source})")
    """

    def __init__(
        self,
        sources: list[PaperSource] | None = None,
        configs: dict[PaperSource, SourceConfig] | None = None,
        storage_dir: str | None = None,
    ):
        """Initialize the client.

        A provider is created for each requested source whose configuration
        exists and is enabled. A provider that fails to construct is logged
        as a warning and left out; it does not abort initialization.

        Args:
            sources: List of sources to enable (default: all).
            configs: Optional per-source configuration overrides.
            storage_dir: Optional custom paper storage directory.
        """
        self._configs = configs or dict(DEFAULT_SOURCE_CONFIGS)
        enabled_sources = sources or list(PaperSource)

        self._providers: dict[PaperSource, PaperProvider] = {}
        for source in enabled_sources:
            # Fall back to the default config when no override covers the source.
            config = self._configs.get(source, DEFAULT_SOURCE_CONFIGS.get(source))
            if not (config and config.enabled):
                continue
            try:
                self._providers[source] = _create_provider(source, config)
            except Exception as e:
                logger.warning(f"Failed to initialize provider {source}: {e}")

        self.storage = PaperStorage(storage_dir)
        logger.info(
            f"ScholarX initialized with {len(self._providers)} sources: {', '.join(s.value for s in self._providers)}"
        )

    @property
    def enabled_sources(self) -> list[PaperSource]:
        """List of currently enabled sources."""
        return list(self._providers.keys())

    async def _fan_out(self, sources, call, label):
        """Run one provider operation against several sources concurrently.

        Shared by ``search`` and ``get_recent_papers``.

        Args:
            sources: Sources to query; each must be a key of ``self._providers``.
            call: Callable that takes a provider and returns an awaitable
                resolving to a list of papers.
            label: Operation name used as the prefix of the error log message.

        Returns:
            A ``(papers, failures)`` tuple. ``papers`` concatenates the lists
            returned by the providers, in the order of ``sources``.
            ``failures`` holds one ``"<source value>: <error>"`` string per
            source whose call raised, also in the order of ``sources``.
        """

        async def _guarded(source):
            # Turn a provider failure into a value so one bad source cannot
            # abort the whole fan-out.
            try:
                return await call(self._providers[source])
            except Exception as e:
                logger.error(f"{label} failed for {source.value}: {e}")
                return e

        outcomes = await asyncio.gather(
            *(_guarded(source) for source in sources),
            return_exceptions=True,
        )

        papers: list[Paper] = []
        failures: list[str] = []
        for source, outcome in zip(sources, outcomes):
            if isinstance(outcome, BaseException):
                # BaseException rather than Exception: a cancelled child task
                # is returned by gather() as CancelledError and must not be
                # dropped silently.
                failures.append(f"{source.value}: {outcome}")
            elif isinstance(outcome, list):
                papers.extend(outcome)
            # Any other return value is ignored.
        return papers, failures

    async def search(self, query: SearchQuery) -> SearchResult:
        """Search across all configured sources with deduplication.

        If ``query.sources`` is empty, ``None``, or names no enabled source,
        every enabled source is queried.

        Args:
            query: Unified search query.

        Returns:
            Aggregated, deduplicated SearchResult. ``papers`` is truncated to
            ``query.max_results``; ``total_count`` is the size before
            truncation.
        """
        # Filter to requested sources; tolerate a missing source list.
        requested = query.sources or ()
        target_sources = [s for s in requested if s in self._providers]
        if not target_sources:
            target_sources = list(self._providers)

        all_papers, sources_failed = await self._fan_out(
            target_sources,
            lambda provider: provider.search(query),
            "Search",
        )

        deduped, dup_count = deduplicate_papers(all_papers)

        if query.sort_by == "date":
            # Papers without a date sort last (newest first).
            deduped.sort(key=lambda p: p.published_date or "", reverse=True)

        return SearchResult(
            papers=deduped[: query.max_results],
            total_count=len(deduped),
            sources_queried=list(target_sources),
            sources_failed=sources_failed,
            deduplicated_count=dup_count,
            query=query.query,
        )

    async def get_paper(self, source: PaperSource, paper_id: str) -> Paper | None:
        """Retrieve a single paper from a specific source.

        Args:
            source: The paper source to query.
            paper_id: Source-specific paper identifier.

        Returns:
            Paper or None if not found. None is also returned, after logging
            an error, when ``source`` has no enabled provider.
        """
        provider = self._providers.get(source)
        if not provider:
            logger.error(f"Source not configured: {source}")
            return None
        return await provider.get_paper(paper_id)

    async def get_recent_papers(
        self,
        categories: list[str] | None = None,
        days: int = 1,
        sources: list[PaperSource] | None = None,
    ) -> SearchResult:
        """Retrieve recently published papers.

        Args:
            categories: Optional category filters.
            days: Number of days to look back.
            sources: Optional source filter (default: all). Sources without
                an enabled provider are skipped.

        Returns:
            Aggregated, deduplicated SearchResult, sorted newest first.
        """
        target_sources = [s for s in (sources or list(self._providers.keys())) if s in self._providers]

        all_papers, sources_failed = await self._fan_out(
            target_sources,
            lambda provider: provider.get_recent(categories, days),
            "get_recent",
        )

        deduped, dup_count = deduplicate_papers(all_papers)
        deduped.sort(key=lambda p: p.published_date or "", reverse=True)

        return SearchResult(
            papers=deduped,
            total_count=len(deduped),
            sources_queried=target_sources,
            sources_failed=sources_failed,
            deduplicated_count=dup_count,
            query="recent papers",
        )

    async def download_paper(self, paper: Paper) -> str | None:
        """Download a paper's full PDF.

        Args:
            paper: Paper to download.

        Returns:
            Local file path string, or None on failure.
        """
        path = await self.storage.download_paper(paper)
        return str(path) if path else None

    async def get_source_status(self) -> list[SourceStatus]:
        """Get the status of all configured sources.

        Every enabled provider is reported with ``available=True``; no live
        check against the source is made here.
        """
        return [SourceStatus(source=source, available=True) for source in self._providers]

    async def list_categories(self, source: PaperSource | None = None) -> dict[str, list[dict]]:
        """List available categories for each source.

        Args:
            source: Restrict the listing to this source. If it is None or has
                no enabled provider, every enabled source is listed.

        Returns:
            Mapping of source value to its category list. A source whose
            lookup fails is logged as a warning and mapped to an empty list.
        """
        result: dict[str, list[dict]] = {}
        targets = [source] if source and source in self._providers else list(self._providers.keys())
        for s in targets:
            try:
                cats = await self._providers[s].get_categories()
                result[s.value] = cats
            except Exception as e:
                logger.warning(f"Failed to get categories for {s.value}: {e}")
                result[s.value] = []
        return result
|