scholarx 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. scholarx-0.6.0/LICENSE +21 -0
  2. scholarx-0.6.0/PKG-INFO +370 -0
  3. scholarx-0.6.0/README.md +343 -0
  4. scholarx-0.6.0/pyproject.toml +55 -0
  5. scholarx-0.6.0/scholarx/__init__.py +59 -0
  6. scholarx-0.6.0/scholarx/__main__.py +7 -0
  7. scholarx-0.6.0/scholarx/agent_server.py +94 -0
  8. scholarx-0.6.0/scholarx/api_client.py +253 -0
  9. scholarx-0.6.0/scholarx/cli.py +897 -0
  10. scholarx-0.6.0/scholarx/deduplication.py +195 -0
  11. scholarx-0.6.0/scholarx/kg_integration.py +251 -0
  12. scholarx-0.6.0/scholarx/main_agent.json +15 -0
  13. scholarx-0.6.0/scholarx/mcp_config.json +25 -0
  14. scholarx-0.6.0/scholarx/mcp_server.py +370 -0
  15. scholarx-0.6.0/scholarx/models.py +223 -0
  16. scholarx-0.6.0/scholarx/paper_storage.py +155 -0
  17. scholarx-0.6.0/scholarx/providers/__init__.py +21 -0
  18. scholarx-0.6.0/scholarx/providers/arxiv.py +269 -0
  19. scholarx-0.6.0/scholarx/providers/base.py +148 -0
  20. scholarx-0.6.0/scholarx/providers/biorxiv.py +147 -0
  21. scholarx-0.6.0/scholarx/providers/osf.py +110 -0
  22. scholarx-0.6.0/scholarx/providers/pmc.py +189 -0
  23. scholarx-0.6.0/scholarx/providers/rss.py +304 -0
  24. scholarx-0.6.0/scholarx/providers/semantic_scholar.py +96 -0
  25. scholarx-0.6.0/scholarx/scanner.py +953 -0
  26. scholarx-0.6.0/scholarx.egg-info/PKG-INFO +370 -0
  27. scholarx-0.6.0/scholarx.egg-info/SOURCES.txt +33 -0
  28. scholarx-0.6.0/scholarx.egg-info/dependency_links.txt +1 -0
  29. scholarx-0.6.0/scholarx.egg-info/entry_points.txt +4 -0
  30. scholarx-0.6.0/scholarx.egg-info/requires.txt +20 -0
  31. scholarx-0.6.0/scholarx.egg-info/top_level.txt +1 -0
  32. scholarx-0.6.0/setup.cfg +4 -0
  33. scholarx-0.6.0/tests/test_cli.py +198 -0
  34. scholarx-0.6.0/tests/test_concept_parity.py +79 -0
  35. scholarx-0.6.0/tests/test_models.py +83 -0
scholarx-0.6.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Knuckles Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,370 @@
1
+ Metadata-Version: 2.4
2
+ Name: scholarx
3
+ Version: 0.6.0
4
+ Summary: Universal Research Paper API — single entry point for arXiv, PMC, bioRxiv, medRxiv, PsyArXiv, OSF, and Semantic Scholar
5
+ Author-email: Audel Rouhi <knucklessg1@gmail.com>
6
+ License: MIT
7
+ Requires-Python: <3.14,>=3.11
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: agent-utilities>=0.9.0
11
+ Requires-Dist: httpx>=0.27
12
+ Requires-Dist: python-dotenv
13
+ Requires-Dist: Levenshtein>=0.26
14
+ Requires-Dist: pypdf>=5.0
15
+ Requires-Dist: rich>=13.0
16
+ Provides-Extra: mcp
17
+ Requires-Dist: agent-utilities[mcp]>=0.9.0; extra == "mcp"
18
+ Provides-Extra: agent
19
+ Requires-Dist: agent-utilities[agent,logfire]>=0.9.0; extra == "agent"
20
+ Provides-Extra: all
21
+ Requires-Dist: scholarx[agent,mcp]; extra == "all"
22
+ Provides-Extra: test
23
+ Requires-Dist: pytest; extra == "test"
24
+ Requires-Dist: pytest-asyncio; extra == "test"
25
+ Requires-Dist: respx; extra == "test"
26
+ Dynamic: license-file
27
+
28
+ # ScholarX 📚 - API | MCP | AgentOS
29
+
30
+ ![PyPI - Version](https://img.shields.io/pypi/v/scholarx)
31
+ ![MCP Server](https://badge.mcpx.dev?type=server 'MCP Server')
32
+ ![PyPI - Downloads](https://img.shields.io/pypi/dd/scholarx)
33
+ ![GitHub Repo stars](https://img.shields.io/github/stars/Knuckles-Team/scholarx)
34
+ ![GitHub forks](https://img.shields.io/github/forks/Knuckles-Team/scholarx)
35
+ ![GitHub contributors](https://img.shields.io/github/contributors/Knuckles-Team/scholarx)
36
+ ![PyPI - License](https://img.shields.io/pypi/l/scholarx)
37
+ ![GitHub](https://img.shields.io/github/license/Knuckles-Team/scholarx)
38
+
39
+ ![GitHub last commit (by committer)](https://img.shields.io/github/last-commit/Knuckles-Team/scholarx)
40
+ ![GitHub pull requests](https://img.shields.io/github/issues-pr/Knuckles-Team/scholarx)
41
+ ![GitHub closed pull requests](https://img.shields.io/github/issues-pr-closed/Knuckles-Team/scholarx)
42
+ ![GitHub issues](https://img.shields.io/github/issues/Knuckles-Team/scholarx)
43
+
44
+ ![GitHub top language](https://img.shields.io/github/languages/top/Knuckles-Team/scholarx)
45
+ ![GitHub language count](https://img.shields.io/github/languages/count/Knuckles-Team/scholarx)
46
+ ![GitHub repo size](https://img.shields.io/github/repo-size/Knuckles-Team/scholarx)
47
+ ![GitHub repo file count (file type)](https://img.shields.io/github/directory-file-count/Knuckles-Team/scholarx)
48
+ ![PyPI - Wheel](https://img.shields.io/pypi/wheel/scholarx)
49
+ ![PyPI - Implementation](https://img.shields.io/pypi/implementation/scholarx)
50
+
51
+ *Version: 0.6.0*
52
+
53
+ **Universal Research Paper API** — a single entry point for querying, downloading, and ingesting research papers from all major preprint and academic repositories.
54
+
55
+ Version: 0.6.0
56
+
57
+ ## Overview
58
+
59
+ ScholarX provides a unified interface to search across **7 paper sources** simultaneously, with automatic cross-source deduplication, full PDF downloads, and Knowledge Graph integration. It is registered as an **Agent OS subsystem** in the genius-agent ecosystem.
60
+
61
+ ### Supported Sources
62
+
63
+ | Source | API | Auth | Rate Limit |
64
+ |--------|-----|------|------------|
65
+ | **arXiv** | Atom/OpenSearch | Free | 1 req/3s |
66
+ | **PubMed Central** | NCBI E-utilities | Optional `NCBI_API_KEY` | 3 req/s (10 with key) |
67
+ | **bioRxiv** | bioRxiv REST | Free | 1 req/s |
68
+ | **medRxiv** | bioRxiv REST | Free | 1 req/s |
69
+ | **PsyArXiv** | OSF v2 | `OSF_TOKEN` | 1 req/s |
70
+ | **OSF** | OSF v2 | `OSF_TOKEN` | 1 req/s |
71
+ | **Semantic Scholar** | Academic Graph v1 | Optional `S2_API_KEY` | 100 req/min |
72
+
73
+ ### Key Features
74
+
75
+ - **Unified Search** — Single `SearchQuery` model works across all sources
76
+ - **3-Tier Deduplication** — DOI exact match → cross-ID mapping → fuzzy title+author (Levenshtein ≥ 0.90)
77
+ - **Full Paper Download** — Download and store complete PDFs locally (`~/.scholarx/papers/`)
78
+ - **Knowledge Graph Integration** — Ingest papers via existing `KBIngestionEngine` (ArticleNode, SourceNode, PersonNode)
79
+ - **RLM Auto-Trigger** — Large papers (>50K chars) automatically route through Recursive Language Model decomposition
80
+ - **Per-Source Rate Limiting** — Token-bucket rate limiter in the abstract provider base class
81
+ - **Configurable Watchlists** — Register custom research topics as MaintenanceCron tasks
82
+
83
+ ## Installation
84
+
85
+ ```bash
86
+ # Core (API client only)
87
+ pip install scholarx
88
+
89
+ # With MCP server
90
+ pip install scholarx[mcp]
91
+
92
+ # With agent server
93
+ pip install scholarx[agent]
94
+
95
+ # Everything
96
+ pip install scholarx[all]
97
+ ```
98
+
99
+ ## Quick Start
100
+
101
+ ### Python API
102
+
103
+ ```python
104
+ import asyncio
105
+ from scholarx.api_client import ScholarXClient
106
+ from scholarx.models import SearchQuery, PaperSource
107
+
108
+ async def main():
109
+ client = ScholarXClient()
110
+
111
+ # Search across all sources
112
+ result = await client.search(SearchQuery(
113
+ query="multi-agent orchestration",
114
+ categories=["cs.AI", "cs.MA"],
115
+ max_results=10,
116
+ ))
117
+
118
+ for paper in result.papers:
119
+ print(f"[{paper.source}] {paper.title}")
120
+ print(f" Authors: {', '.join(paper.authors[:3])}")
121
+ print(f" DOI: {paper.doi}")
122
+ print()
123
+
124
+ # Download a paper
125
+ if result.papers:
126
+ path = await client.download_paper(result.papers[0])
127
+ print(f"Downloaded to: {path}")
128
+
129
+ asyncio.run(main())
130
+ ```
131
+
132
+ ### CLI
133
+
134
+ ScholarX includes a rich CLI with progress bars for paper discovery, relevance scoring, and PDF downloads.
135
+
136
+ ```bash
137
+ # Scan for recent AI papers across 7 CS categories
138
+ scholarx scan --query "artificial intelligence" --output-dir ./papers
139
+
140
+ # Customize categories and result count
141
+ scholarx scan --categories cs.AI,cs.LG,cs.CL --max-results 30 --output-dir ./papers
142
+
143
+ # Use a custom relevance taxonomy
144
+ scholarx scan --query "knowledge graphs" --taxonomy custom_taxonomy.json --output-dir ./papers
145
+
146
+ # Auto-trigger comparative analysis on high-confidence papers
147
+ scholarx scan --analyze --output-dir ./papers
148
+
149
+ # Show stored paper library status
150
+ scholarx status
151
+ ```
152
+
153
+ #### Relevance Scoring
154
+
155
+ The CLI scores each paper's abstract against a 9-domain weighted keyword taxonomy:
156
+
157
+ | Domain | Weight | Focus |
158
+ |--------|--------|-------|
159
+ | Orchestration | 3.0 | Multi-agent, workflow, task decomposition |
160
+ | Knowledge Graph | 3.0 | Ontology, OWL, entity relations, graph reasoning |
161
+ | Planning & Reasoning | 2.5 | Chain-of-thought, MCTS, deliberation |
162
+ | Memory & Retrieval | 2.5 | RAG, episodic memory, continual learning |
163
+ | Tool Use | 2.0 | Function calling, MCP, code generation |
164
+ | Evaluation & Safety | 2.0 | Benchmarks, red teaming, hallucination |
165
+ | Swarm & Evolution | 2.0 | Evolutionary methods, stigmergy, biomimicry |
166
+ | LLM Architecture | 1.5 | Transformers, MoE, distillation |
167
+ | Human-AI | 1.0 | Human-in-the-loop, decision support |
168
+
169
+ Papers are classified into three tiers:
170
+ - **✅ Relevant** (score ≥ 3.0) — Direct value for the target domain
171
+ - **🟡 Marginal** (1.0 ≤ score < 3.0) — Potential indirect value
172
+ - **❌ Irrelevant** (score < 1.0) — Filtered out
173
+
174
+ #### Deduplication
175
+
176
+ ScholarX prevents duplicate downloads through two mechanisms:
177
+
178
+ 1. **Cross-source deduplication** (`deduplication.py`): 3-tier matching removes duplicates when the same paper appears across multiple sources:
179
+ - **Tier 1**: DOI exact match
180
+ - **Tier 2**: Cross-ID mapping (arXiv ID ↔ S2 corpus ID via metadata)
181
+ - **Tier 3**: Normalized title + first-author last name (Levenshtein ≥ 0.90)
182
+
183
+ 2. **Storage deduplication** (`paper_storage.py`): Before downloading, `PaperStorage.download_paper()` checks if the paper ID's metadata hash already exists in `~/.scholarx/papers/.metadata/`. Already-downloaded papers are skipped instantly.
184
+
185
+ ### MCP Server
186
+
187
+ ```bash
188
+ # Start in stdio mode (for agent integration)
189
+ scholarx-mcp --transport stdio
190
+
191
+ # Start in HTTP mode
192
+ scholarx-mcp --transport streamable-http --host 0.0.0.0 --port 9600
193
+ ```
194
+
195
+ ### MCP Tools
196
+
197
+ | Tool | Description |
198
+ |------|-------------|
199
+ | `search_papers` | Multi-source search with deduplication |
200
+ | `get_paper` | Single paper by source + ID |
201
+ | `search_by_author` | Author-based search |
202
+ | `get_recent_papers` | Papers from last N days |
203
+ | `list_sources` | Available sources and status |
204
+ | `list_categories` | Categories per source |
205
+ | `download_paper` | Download full PDF |
206
+ | `get_stored_papers` | List locally stored papers |
207
+
208
+ ### MCP Prompts
209
+
210
+ | Prompt | Purpose |
211
+ |--------|---------|
212
+ | `agent_utilities_enhancement_scan` | Scan CS/AI papers for AU concept enhancement opportunities |
213
+ | `biomimicry_innovation_scan` | Scan biology papers for biomimetic agent patterns |
214
+
215
+ ## Docker
216
+
217
+ ```bash
218
+ # Build and run
219
+ docker compose up -d
220
+
221
+ # Debug mode (mounts local source)
222
+ docker compose -f compose.yml up --build
223
+ ```
224
+
225
+ ## Environment Variables
226
+
227
+ ```bash
228
+ # API Keys (all optional for basic functionality)
229
+ OSF_TOKEN= # OSF/PsyArXiv API token
230
+ S2_API_KEY= # Semantic Scholar (higher rate limits)
231
+ NCBI_API_KEY= # PubMed Central (higher rate limits)
232
+
233
+ # MCP Server
234
+ TRANSPORT=stdio # stdio | streamable-http
235
+ HOST=0.0.0.0
236
+ PORT=9600
237
+
238
+ # Tool Toggles
239
+ SEARCHTOOL=True
240
+ DISCOVERYTOOL=True
241
+ STORAGETOOL=True
242
+
243
+ # Paper Storage
244
+ SCHOLARX_STORAGE_DIR= # Default: ~/.scholarx/papers/
245
+ ```
246
+
247
+ ## Architecture
248
+
249
+ ```
250
+ User/Agent
251
+
252
+
253
+ ┌─────────────────────────┐
254
+ │ ScholarX MCP Server │ 12 tools + prompts
255
+ │ (mcp_server.py) │
256
+ └────────┬────────────────┘
257
+
258
+
259
+ ┌─────────────────────────┐
260
+ │ ScholarXClient │ Unified API
261
+ │ (api_client.py) │
262
+ └────────┬────────────────┘
263
+
264
+ ┌────┼────┬────┬────┬────┬────┐
265
+ ▼ ▼ ▼ ▼ ▼ ▼ ▼
266
+ arXiv PMC bioRx medRx PsyAr OSF S2 ← Per-source rate limiting
267
+ │ │ │ │ │ │ │
268
+ └────┴────┴────┴────┴────┴────┘
269
+
270
+
271
+ ┌─────────────────────────┐
272
+ │ Deduplication Engine │ DOI → cross-ID → fuzzy title
273
+ │ (deduplication.py) │
274
+ └────────┬────────────────┘
275
+
276
+
277
+ ┌─────────────────────────┐
278
+ │ Paper Storage │ Full PDF download
279
+ │ (~/.scholarx/papers/) │
280
+ │ │ │
281
+ │ ▼ │
282
+ │ KBIngestionEngine │ → ArticleNode + PersonNode
283
+ │ (KG auto-ingest) │ + SourceNode + KBConceptNode
284
+ │ │ │
285
+ │ RLM (AU-007) │ Auto-triggers for >50K char papers
286
+ └─────────────────────────┘
287
+ ```
288
+
289
+ ## Agent OS Subsystem
290
+
291
+ ScholarX is registered as an Agent OS subsystem alongside:
292
+
293
+ | Subsystem | Role |
294
+ |-----------|------|
295
+ | `container-manager-mcp` | Infrastructure provisioning |
296
+ | `systems-manager` | Host/OS operations |
297
+ | `tunnel-manager` | Network tunneling |
298
+ | `repository-manager` | Git/repo operations |
299
+ | **`scholarx`** | **Research intelligence** |
300
+
301
+ ## Maintenance Cron
302
+
303
+ A `SIX_HOURLY` maintenance task (`scholarx_paper_discovery`) automatically:
304
+ 1. Checks for new papers across configured categories
305
+ 2. Evaluates relevance to Knowledge Graph concepts
306
+ 3. Ingests high-relevance papers (score > 0.6)
307
+ 4. Produces actionable research digests
308
+
309
+ Custom watchlists can be added via `MaintenanceCron.add_task()` or the `create_research_watchlist` MCP tool.
310
+
311
+ ## License
312
+
313
+ MIT
314
+
315
+
316
+ ## MCP Configuration Examples
317
+
318
+ ### 1. Standard IO (stdio) Deployment
319
+
320
+ ```json
321
+ {
322
+ "mcpServers": {
323
+ "scholarx": {
324
+ "command": "uv",
325
+ "args": [
326
+ "run",
327
+ "scholarx-mcp"
328
+ ],
329
+ "env": {
330
+ "AGENT_DESCRIPTION": "<YOUR_AGENT_DESCRIPTION>",
331
+ "AGENT_SYSTEM_PROMPT": "<YOUR_AGENT_SYSTEM_PROMPT>",
332
+ "DEFAULT_AGENT_NAME": "<YOUR_DEFAULT_AGENT_NAME>",
333
+ "DISCOVERYTOOL": "True",
334
+ "SEARCHTOOL": "True",
335
+ "STORAGETOOL": "True"
336
+ }
337
+ }
338
+ }
339
+ }
340
+ ```
341
+
342
+ ### 2. Streamable HTTP (SSE) Deployment
343
+
344
+ ```json
345
+ {
346
+ "mcpServers": {
347
+ "scholarx": {
348
+ "command": "uv",
349
+ "args": [
350
+ "run",
351
+ "scholarx-mcp",
352
+ "--transport",
353
+ "streamable-http",
354
+ "--host",
355
+ "0.0.0.0",
356
+ "--port",
357
+ "8000"
358
+ ],
359
+ "env": {
360
+ "AGENT_DESCRIPTION": "<YOUR_AGENT_DESCRIPTION>",
361
+ "AGENT_SYSTEM_PROMPT": "<YOUR_AGENT_SYSTEM_PROMPT>",
362
+ "DEFAULT_AGENT_NAME": "<YOUR_DEFAULT_AGENT_NAME>",
363
+ "DISCOVERYTOOL": "True",
364
+ "SEARCHTOOL": "True",
365
+ "STORAGETOOL": "True"
366
+ }
367
+ }
368
+ }
369
+ }
370
+ ```