scholarx 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scholarx/__init__.py ADDED
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python
2
+ """ScholarX — Universal Research Paper API.
3
+
4
+ A single entry point for querying research papers from arXiv, PMC,
5
+ bioRxiv, medRxiv, PsyArXiv, OSF, and Semantic Scholar.
6
+ """
7
+
8
+ import importlib
9
+ import inspect
10
+
11
# Public names of the package; filled in while the submodules are scanned.
__all__: list[str] = []

# Submodules whose public classes and functions are re-exported from the
# package root, listed in the order in which they are imported.
CORE_MODULES = [
    f"scholarx.{_stem}"
    for _stem in ("models", "api_client", "deduplication", "paper_storage", "scanner")
]

# Submodules that may fail to import. Maps the dotted module name to the short
# name from which the ``_<NAME>_AVAILABLE`` flag is derived.
OPTIONAL_MODULES = {
    f"scholarx.{_stem}": _stem
    for _stem in ("agent_server", "mcp_server")
}
25
+
26
+
27
def _import_module_safely(module_name: str):
    """Import ``module_name`` and hand back the module object.

    Args:
        module_name: Dotted name passed to ``importlib.import_module``.

    Returns:
        The imported module, or ``None`` when the import raises
        ``ImportError``. Any other exception propagates to the caller.
    """
    try:
        loaded = importlib.import_module(module_name)
    except ImportError:
        loaded = None
    return loaded
33
+
34
+
35
def _expose_members(module):
    """Re-export a module's public classes and functions from this package.

    Every class or plain function found on ``module`` whose name does not
    start with an underscore is bound in this module's globals and listed in
    ``__all__``.

    The same object is frequently visible from more than one scanned module
    (a class defined in one submodule and imported by another), so a name is
    appended to ``__all__`` only the first time it is seen. The global binding
    is still refreshed on every call, so the most recently scanned module
    wins, exactly as before.

    Args:
        module: An imported module object to scan.
    """
    namespace = globals()
    for name, obj in inspect.getmembers(module):
        if name.startswith("_"):
            continue
        if not (inspect.isclass(obj) or inspect.isfunction(obj)):
            continue
        namespace[name] = obj
        if name not in __all__:
            __all__.append(name)
41
+
42
+
43
# Core modules: re-export the public members of every module that imported.
# A module that fails with ImportError is skipped without an error.
for _module_name in CORE_MODULES:
    _module = _import_module_safely(_module_name)
    if _module is None:
        continue
    _expose_members(_module)

# Optional modules: same re-export, plus a ``_<NAME>_AVAILABLE`` flag in the
# module globals recording whether the import succeeded.
for _module_name, _extra_name in OPTIONAL_MODULES.items():
    _module = _import_module_safely(_module_name)
    if _module is not None:
        _expose_members(_module)
    globals()[f"_{_extra_name.upper()}_AVAILABLE"] = _module is not None

# Short aliases for the two flags set by the loop above.
_MCP_AVAILABLE = globals().get("_MCP_SERVER_AVAILABLE", False)
_AGENT_AVAILABLE = globals().get("_AGENT_SERVER_AVAILABLE", False)

__all__.extend(["_MCP_AVAILABLE", "_AGENT_AVAILABLE"])
scholarx/__main__.py ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env python
2
+ """ScholarX CLI entry point."""
3
+
4
+ from scholarx.cli import cli
5
+
6
# Invoke the CLI only when this file is executed as the main module
# (for a package ``__main__.py`` that is ``python -m scholarx``), not on import.
if __name__ == "__main__":
    cli()
scholarx/agent_server.py ADDED
@@ -0,0 +1,94 @@
1
+ #!/usr/bin/python
2
+ """ScholarX Agent Server.
3
+
4
+ Standard graph agent server following the genius-agent pattern.
5
+ Uses create_graph_agent_server from agent-utilities for graph orchestration.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import sys
11
+ import warnings
12
+
13
+ # Filter warnings early
14
+ with warnings.catch_warnings():
15
+ warnings.simplefilter("ignore")
16
+ try:
17
+ from requests.exceptions import RequestsDependencyWarning
18
+
19
+ warnings.filterwarnings("ignore", category=RequestsDependencyWarning)
20
+ except ImportError:
21
+ pass
22
+
23
+ warnings.filterwarnings("ignore", message=".*urllib3.*or chardet.*")
24
+ warnings.filterwarnings("ignore", message=".*urllib3.*or charset_normalizer.*")
25
+ warnings.filterwarnings("ignore", category=DeprecationWarning, module="fastmcp")
26
+
27
+ from agent_utilities import (
28
+ build_system_prompt_from_workspace,
29
+ create_agent_parser,
30
+ create_graph_agent_server,
31
+ initialize_workspace,
32
+ load_identity,
33
+ )
34
+
35
__version__ = "0.6.0"

# Configure the root logger at import time: INFO level, timestamped records,
# one StreamHandler (which writes to stderr by default). basicConfig does
# nothing if the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)

# Import-time side effects: the workspace is initialised before the identity
# metadata is loaded.
# NOTE(review): presumably load_identity() depends on what
# initialize_workspace() sets up — confirm before reordering these calls.
initialize_workspace()
meta = load_identity()  # only used through .get() below; presumably a dict — confirm

# Each default comes from an environment variable when it is set, otherwise
# from the identity metadata, otherwise from the fallback given here.
# NOTE(review): the name override is read from DEFAULT_AGENT_NAME while the
# other two use the AGENT_* prefix — confirm this asymmetry is intended.
DEFAULT_AGENT_NAME = os.getenv("DEFAULT_AGENT_NAME", meta.get("name", "ScholarX Agent"))
DEFAULT_AGENT_DESCRIPTION = os.getenv(
    "AGENT_DESCRIPTION",
    meta.get("description", "ScholarX — Universal research paper discovery and analysis."),
)
# The fallback expression is evaluated even when AGENT_SYSTEM_PROMPT is set, so
# build_system_prompt_from_workspace() runs whenever meta has no truthy
# "content" entry.
DEFAULT_AGENT_SYSTEM_PROMPT = os.getenv(
    "AGENT_SYSTEM_PROMPT",
    meta.get("content") or build_system_prompt_from_workspace(),
)
56
+
57
+
58
def agent_server():
    """Command-line entry point for the ScholarX agent server.

    Prints a name/version banner to stderr, parses the command line with the
    parser returned by ``create_agent_parser()``, raises the root logger to
    DEBUG when ``args.debug`` is set, and passes the parsed options to
    ``create_graph_agent_server``.
    """
    banner = f"{DEFAULT_AGENT_NAME} v{__version__}"
    print(banner, file=sys.stderr)
    logger.info("Application startup complete")

    opts = create_agent_parser().parse_args()

    if opts.debug:
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.DEBUG)
        logger.debug("Debug mode enabled")

    server_options = {
        "mcp_url": opts.mcp_url,
        # Fall back to a config file in the working directory when none is given.
        "mcp_config": opts.mcp_config or "mcp_config.json",
        "host": opts.host,
        "port": opts.port,
        "provider": opts.provider,
        # The single model id option drives all three model roles.
        "model_id": opts.model_id,
        "router_model": opts.model_id,
        "agent_model": opts.model_id,
        "base_url": opts.base_url,
        "api_key": opts.api_key,
        "custom_skills_directory": opts.custom_skills_directory,
        "enable_web_ui": opts.web,
        "enable_otel": opts.otel,
        "otel_endpoint": opts.otel_endpoint,
        "otel_headers": opts.otel_headers,
        "otel_public_key": opts.otel_public_key,
        "otel_secret_key": opts.otel_secret_key,
        "otel_protocol": opts.otel_protocol,
        "debug": opts.debug,
    }
    create_graph_agent_server(**server_options)


if __name__ == "__main__":
    agent_server()
scholarx/api_client.py ADDED
@@ -0,0 +1,253 @@
1
+ #!/usr/bin/python
2
+ """ScholarX Unified API Client.
3
+
4
+ The single entry point for all paper operations. Users interact with
5
+ ScholarXClient — never with providers directly. The client handles
6
+ fan-out to configured providers, cross-source deduplication, and
7
+ result aggregation.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import asyncio
13
+ import logging
14
+
15
+ from .deduplication import deduplicate_papers
16
+ from .models import (
17
+ DEFAULT_SOURCE_CONFIGS,
18
+ Paper,
19
+ PaperSource,
20
+ SearchQuery,
21
+ SearchResult,
22
+ SourceConfig,
23
+ SourceStatus,
24
+ )
25
+ from .paper_storage import PaperStorage
26
+ from .providers.base import PaperProvider
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
def _create_provider(source: PaperSource, config: SourceConfig) -> PaperProvider:
    """Build the provider object that serves ``source``.

    The provider classes are imported inside the function, so the imports run
    when a provider is created rather than when this module is imported.

    Args:
        source: The paper source to build a provider for.
        config: Configuration handed to the provider's constructor.

    Returns:
        A new provider instance for ``source``.

    Raises:
        ValueError: If ``source`` has no registered provider class.
    """
    from .providers.arxiv import ArxivProvider
    from .providers.biorxiv import BiorxivProvider, MedrxivProvider
    from .providers.osf import OSFProvider, PsyarxivProvider
    from .providers.pmc import PMCProvider
    from .providers.semantic_scholar import SemanticScholarProvider

    registry: dict[PaperSource, type[PaperProvider]] = {
        PaperSource.ARXIV: ArxivProvider,
        PaperSource.PMC: PMCProvider,
        PaperSource.BIORXIV: BiorxivProvider,
        PaperSource.MEDRXIV: MedrxivProvider,
        PaperSource.PSYARXIV: PsyarxivProvider,
        PaperSource.OSF: OSFProvider,
        PaperSource.SEMANTIC_SCHOLAR: SemanticScholarProvider,
    }

    if source not in registry:
        raise ValueError(f"Unknown paper source: {source}")

    build = registry[source]
    return build(config)
53
+
54
+
55
class ScholarXClient:
    """Universal research paper client.

    Fans queries out to all configured sources, deduplicates the results,
    and presents a unified view. Users never need to know about the
    individual provider implementations.

    Usage::

        client = ScholarXClient()
        result = await client.search(SearchQuery(query="multi-agent systems"))
        for paper in result.papers:
            print(f"{paper.title} ({paper.source})")
    """

    def __init__(
        self,
        sources: list[PaperSource] | None = None,
        configs: dict[PaperSource, SourceConfig] | None = None,
        storage_dir: str | None = None,
    ):
        """Initialize the client.

        A provider is created for every requested source whose configuration
        exists and is enabled. A provider that fails to construct is logged
        and left out; it does not abort initialization.

        Args:
            sources: List of sources to enable. ``None`` or an empty list
                enables every member of ``PaperSource``.
            configs: Optional per-source configuration overrides. Sources
                missing from it fall back to ``DEFAULT_SOURCE_CONFIGS``.
            storage_dir: Optional custom paper storage directory.
        """
        self._configs = configs or dict(DEFAULT_SOURCE_CONFIGS)
        enabled_sources = sources or list(PaperSource)

        self._providers: dict[PaperSource, PaperProvider] = {}
        for source in enabled_sources:
            config = self._configs.get(source, DEFAULT_SOURCE_CONFIGS.get(source))
            if not (config and config.enabled):
                continue
            try:
                self._providers[source] = _create_provider(source, config)
            except Exception as e:
                # Best effort: one broken provider must not disable the rest.
                logger.warning(f"Failed to initialize provider {source}: {e}")

        self.storage = PaperStorage(storage_dir)
        logger.info(
            f"ScholarX initialized with {len(self._providers)} sources: {', '.join(s.value for s in self._providers)}"
        )

    @property
    def enabled_sources(self) -> list[PaperSource]:
        """List of currently enabled sources."""
        return list(self._providers)

    async def _fan_out(self, sources: list[PaperSource], fetch, label: str) -> tuple[list[Paper], list[str]]:
        """Run one provider call per source concurrently and collect the outcome.

        Args:
            sources: Sources to query; each must be a key of ``self._providers``.
            fetch: Callable that takes a provider and returns an awaitable
                yielding a list of papers.
            label: Operation name used as the prefix of the error log message.

        Returns:
            ``(papers, failures)``. ``papers`` concatenates the lists returned
            by the providers in ``sources`` order. ``failures`` holds one
            ``"<source value>: <error>"`` entry per source that did not return
            normally, also in ``sources`` order.
        """

        async def _run(source: PaperSource):
            try:
                return await fetch(self._providers[source])
            except Exception as exc:
                logger.error(f"{label} failed for {source.value}: {exc}")
                # Return the error so it is recorded in source order below.
                return exc

        # return_exceptions=True also delivers non-Exception failures (for
        # example a cancelled child task) as results instead of raising.
        outcomes = await asyncio.gather(
            *(_run(s) for s in sources),
            return_exceptions=True,
        )

        papers: list[Paper] = []
        failures: list[str] = []
        for source, outcome in zip(sources, outcomes):
            if isinstance(outcome, list):
                papers.extend(outcome)
            elif isinstance(outcome, BaseException):
                # BaseException, not Exception: a cancelled provider call must
                # be reported as failed rather than silently dropped.
                failures.append(f"{source.value}: {outcome}")
        return papers, failures

    async def search(self, query: SearchQuery) -> SearchResult:
        """Search across all configured sources with deduplication.

        Only sources named in ``query.sources`` that have a provider are
        queried. When that leaves nothing (none requested, or none of the
        requested ones is configured) every configured source is queried.

        Args:
            query: Unified search query.

        Returns:
            Aggregated, deduplicated SearchResult. ``papers`` is truncated to
            ``query.max_results``; ``total_count`` is the count before
            truncation.
        """
        # Filter to requested sources
        target_sources = [s for s in query.sources if s in self._providers]
        if not target_sources:
            target_sources = list(self._providers)

        # Fan out to all providers concurrently
        all_papers, sources_failed = await self._fan_out(
            target_sources,
            lambda provider: provider.search(query),
            "Search",
        )

        # Deduplicate
        deduped, dup_count = deduplicate_papers(all_papers)

        # Sort newest first; other sort_by values keep the deduplicated order.
        # NOTE(review): the "" fallback assumes published_date is a string or
        # None — confirm against the Paper model.
        if query.sort_by == "date":
            deduped.sort(key=lambda p: p.published_date or "", reverse=True)

        return SearchResult(
            papers=deduped[: query.max_results],
            total_count=len(deduped),
            sources_queried=list(target_sources),
            sources_failed=sources_failed,
            deduplicated_count=dup_count,
            query=query.query,
        )

    async def get_paper(self, source: PaperSource, paper_id: str) -> Paper | None:
        """Retrieve a single paper from a specific source.

        Args:
            source: The paper source to query.
            paper_id: Source-specific paper identifier.

        Returns:
            Whatever the provider returns for ``paper_id``, or None (after
            logging an error) when ``source`` has no configured provider.
        """
        provider = self._providers.get(source)
        if not provider:
            logger.error(f"Source not configured: {source}")
            return None
        return await provider.get_paper(paper_id)

    async def get_recent_papers(
        self,
        categories: list[str] | None = None,
        days: int = 1,
        sources: list[PaperSource] | None = None,
    ) -> SearchResult:
        """Retrieve recently published papers.

        Args:
            categories: Optional category filters.
            days: Number of days to look back.
            sources: Optional source filter (default: all). Sources without a
                configured provider are ignored.

        Returns:
            Aggregated, deduplicated SearchResult sorted by ``published_date``
            in descending order, with the query recorded as "recent papers".
        """
        requested = sources or list(self._providers)
        target_sources = [s for s in requested if s in self._providers]

        all_papers, sources_failed = await self._fan_out(
            target_sources,
            lambda provider: provider.get_recent(categories, days),
            "get_recent",
        )

        deduped, dup_count = deduplicate_papers(all_papers)
        deduped.sort(key=lambda p: p.published_date or "", reverse=True)

        return SearchResult(
            papers=deduped,
            total_count=len(deduped),
            sources_queried=target_sources,
            sources_failed=sources_failed,
            deduplicated_count=dup_count,
            query="recent papers",
        )

    async def download_paper(self, paper: Paper) -> str | None:
        """Download a paper's full PDF.

        Args:
            paper: Paper to download.

        Returns:
            Local file path string, or None when storage returns a falsy path.
        """
        path = await self.storage.download_paper(paper)
        return str(path) if path else None

    async def get_source_status(self) -> list[SourceStatus]:
        """Get the status of all configured sources.

        Every configured source is reported as available; no provider is
        contacted to check.
        """
        return [SourceStatus(source=source, available=True) for source in self._providers]

    async def list_categories(self, source: PaperSource | None = None) -> dict[str, list[dict]]:
        """List available categories for each source.

        Args:
            source: Restrict the listing to this source. When it is None or
                has no configured provider, all configured sources are listed.

        Returns:
            Mapping from source value to that provider's categories. A source
            whose provider raises is logged and mapped to an empty list.
        """
        if source and source in self._providers:
            targets = [source]
        else:
            targets = list(self._providers)

        result: dict[str, list[dict]] = {}
        for s in targets:
            try:
                result[s.value] = await self._providers[s].get_categories()
            except Exception as e:
                logger.warning(f"Failed to get categories for {s.value}: {e}")
                result[s.value] = []
        return result