papermind 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. papermind-1.3.0/.github/workflows/ci.yml +35 -0
  2. papermind-1.3.0/.github/workflows/publish.yml +33 -0
  3. papermind-1.3.0/.gitignore +8 -0
  4. papermind-1.3.0/LICENSE +21 -0
  5. papermind-1.3.0/LICENSE_THIRD_PARTY.md +108 -0
  6. papermind-1.3.0/PKG-INFO +256 -0
  7. papermind-1.3.0/README.md +223 -0
  8. papermind-1.3.0/docs/next-steps.md +64 -0
  9. papermind-1.3.0/docs/plans/v1.0-overnight.md +186 -0
  10. papermind-1.3.0/pyproject.toml +44 -0
  11. papermind-1.3.0/src/papermind/__init__.py +3 -0
  12. papermind-1.3.0/src/papermind/catalog/__init__.py +5 -0
  13. papermind-1.3.0/src/papermind/catalog/index.py +157 -0
  14. papermind-1.3.0/src/papermind/catalog/render.py +69 -0
  15. papermind-1.3.0/src/papermind/cli/__init__.py +0 -0
  16. papermind-1.3.0/src/papermind/cli/catalog.py +159 -0
  17. papermind-1.3.0/src/papermind/cli/discover.py +157 -0
  18. papermind-1.3.0/src/papermind/cli/doctor.py +98 -0
  19. papermind-1.3.0/src/papermind/cli/download.py +266 -0
  20. papermind-1.3.0/src/papermind/cli/ingest.py +229 -0
  21. papermind-1.3.0/src/papermind/cli/init.py +115 -0
  22. papermind-1.3.0/src/papermind/cli/main.py +288 -0
  23. papermind-1.3.0/src/papermind/cli/related.py +201 -0
  24. papermind-1.3.0/src/papermind/cli/search.py +123 -0
  25. papermind-1.3.0/src/papermind/cli/utils.py +38 -0
  26. papermind-1.3.0/src/papermind/config.py +111 -0
  27. papermind-1.3.0/src/papermind/discovery/__init__.py +14 -0
  28. papermind-1.3.0/src/papermind/discovery/base.py +45 -0
  29. papermind-1.3.0/src/papermind/discovery/downloader.py +172 -0
  30. papermind-1.3.0/src/papermind/discovery/exa.py +82 -0
  31. papermind-1.3.0/src/papermind/discovery/openalex.py +125 -0
  32. papermind-1.3.0/src/papermind/discovery/orchestrator.py +270 -0
  33. papermind-1.3.0/src/papermind/discovery/providers.py +41 -0
  34. papermind-1.3.0/src/papermind/discovery/semantic_scholar.py +136 -0
  35. papermind-1.3.0/src/papermind/discovery/unpaywall.py +69 -0
  36. papermind-1.3.0/src/papermind/ingestion/__init__.py +1 -0
  37. papermind-1.3.0/src/papermind/ingestion/codebase.py +310 -0
  38. papermind-1.3.0/src/papermind/ingestion/codebase_render.py +57 -0
  39. papermind-1.3.0/src/papermind/ingestion/common.py +106 -0
  40. papermind-1.3.0/src/papermind/ingestion/glm_ocr.py +293 -0
  41. papermind-1.3.0/src/papermind/ingestion/package.py +340 -0
  42. papermind-1.3.0/src/papermind/ingestion/paper.py +303 -0
  43. papermind-1.3.0/src/papermind/ingestion/validation.py +49 -0
  44. papermind-1.3.0/src/papermind/mcp_server.py +297 -0
  45. papermind-1.3.0/src/papermind/query/__init__.py +7 -0
  46. papermind-1.3.0/src/papermind/query/fallback.py +160 -0
  47. papermind-1.3.0/src/papermind/query/qmd.py +106 -0
  48. papermind-1.3.0/tests/__init__.py +0 -0
  49. papermind-1.3.0/tests/conftest.py +1 -0
  50. papermind-1.3.0/tests/test_catalog/__init__.py +0 -0
  51. papermind-1.3.0/tests/test_catalog/test_index.py +114 -0
  52. papermind-1.3.0/tests/test_catalog/test_render.py +54 -0
  53. papermind-1.3.0/tests/test_cli_catalog.py +235 -0
  54. papermind-1.3.0/tests/test_cli_doctor.py +250 -0
  55. papermind-1.3.0/tests/test_cli_ingest.py +137 -0
  56. papermind-1.3.0/tests/test_cli_init.py +56 -0
  57. papermind-1.3.0/tests/test_cli_related.py +189 -0
  58. papermind-1.3.0/tests/test_config.py +93 -0
  59. papermind-1.3.0/tests/test_discovery/__init__.py +0 -0
  60. papermind-1.3.0/tests/test_discovery/test_citations.py +87 -0
  61. papermind-1.3.0/tests/test_discovery/test_downloader.py +499 -0
  62. papermind-1.3.0/tests/test_discovery/test_exa.py +291 -0
  63. papermind-1.3.0/tests/test_discovery/test_orchestrator.py +597 -0
  64. papermind-1.3.0/tests/test_discovery/test_semantic_scholar.py +400 -0
  65. papermind-1.3.0/tests/test_discovery/test_unpaywall.py +122 -0
  66. papermind-1.3.0/tests/test_e2e.py +621 -0
  67. papermind-1.3.0/tests/test_ingestion/__init__.py +0 -0
  68. papermind-1.3.0/tests/test_ingestion/test_codebase.py +81 -0
  69. papermind-1.3.0/tests/test_ingestion/test_codebase_render.py +115 -0
  70. papermind-1.3.0/tests/test_ingestion/test_common.py +73 -0
  71. papermind-1.3.0/tests/test_ingestion/test_glm_ocr.py +154 -0
  72. papermind-1.3.0/tests/test_ingestion/test_package.py +25 -0
  73. papermind-1.3.0/tests/test_ingestion/test_package_ingest.py +290 -0
  74. papermind-1.3.0/tests/test_ingestion/test_paper.py +457 -0
  75. papermind-1.3.0/tests/test_ingestion/test_paper_batch.py +455 -0
  76. papermind-1.3.0/tests/test_ingestion/test_validation.py +50 -0
  77. papermind-1.3.0/tests/test_mcp_server.py +244 -0
  78. papermind-1.3.0/tests/test_offline.py +40 -0
  79. papermind-1.3.0/tests/test_query/__init__.py +1 -0
  80. papermind-1.3.0/tests/test_query/test_fallback.py +69 -0
  81. papermind-1.3.0/tests/test_query/test_qmd.py +233 -0
  82. papermind-1.3.0/tests/test_reindex.py +228 -0
  83. papermind-1.3.0/uv.lock +1945 -0
@@ -0,0 +1,35 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.11", "3.12"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Install uv
20
+ uses: astral-sh/setup-uv@v4
21
+
22
+ - name: Set up Python
23
+ run: uv python install ${{ matrix.python-version }}
24
+
25
+ - name: Install dependencies
26
+ run: uv sync --extra dev
27
+
28
+ - name: Lint
29
+ run: uv run ruff check src/
30
+
31
+ - name: Format check
32
+ run: uv run ruff format --check src/
33
+
34
+ - name: Tests
35
+ run: uv run python -m pytest tests/ -q --tb=short
@@ -0,0 +1,33 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags: ["v*"]
6
+
7
+ jobs:
8
+ publish:
9
+ runs-on: ubuntu-latest
10
+ environment: pypi
11
+ permissions:
12
+ id-token: write # OIDC trusted publishing
13
+
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - name: Install uv
18
+ uses: astral-sh/setup-uv@v4
19
+
20
+ - name: Set up Python
21
+ run: uv python install 3.11
22
+
23
+ - name: Install dependencies
24
+ run: uv sync --extra dev
25
+
26
+ - name: Run tests
27
+ run: uv run pytest tests/ -q --tb=short
28
+
29
+ - name: Build package
30
+ run: uv build
31
+
32
+ - name: Publish to PyPI
33
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,8 @@
1
+ __pycache__/
2
+ *.egg-info/
3
+ dist/
4
+ build/
5
+ .venv/
6
+ .ruff_cache/
7
+ *.pyc
8
+ demo_kb/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 HydroFound Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,108 @@
1
+ # Third-Party Licenses
2
+
3
+ This document lists the runtime, optional, external-tool, and development dependencies of PaperMind and their licenses.
4
+
5
+ ---
6
+
7
+ ## Python Runtime Dependencies
8
+
9
+ ### typer
10
+ - **License**: MIT
11
+ - **URL**: https://github.com/tiangolo/typer
12
+ - **Used for**: CLI framework
13
+
14
+ ### httpx
15
+ - **License**: BSD-3-Clause
16
+ - **URL**: https://github.com/encode/httpx
17
+ - **Used for**: HTTP client for paper downloads and API calls
18
+
19
+ ### griffe
20
+ - **License**: ISC
21
+ - **URL**: https://github.com/mkdocstrings/griffe
22
+ - **Used for**: Python package API extraction (introspection without import)
23
+
24
+ ### PyYAML
25
+ - **License**: MIT
26
+ - **URL**: https://github.com/yaml/pyyaml
27
+ - **Used for**: YAML parsing in config and frontmatter
28
+
29
+ ### python-frontmatter
30
+ - **License**: MIT
31
+ - **URL**: https://github.com/eyeseast/python-frontmatter
32
+ - **Used for**: Reading and writing YAML frontmatter in markdown files
33
+
34
+ ### rich
35
+ - **License**: MIT
36
+ - **URL**: https://github.com/Textualize/rich
37
+ - **Used for**: Terminal output formatting
38
+
39
+ ### Jinja2
40
+ - **License**: BSD-3-Clause
41
+ - **URL**: https://github.com/pallets/jinja
42
+ - **Used for**: Markdown rendering templates for package API docs
43
+
44
+ ### mcp
45
+ - **License**: MIT
46
+ - **URL**: https://github.com/modelcontextprotocol/python-sdk
47
+ - **Used for**: MCP server implementation (stdio transport)
48
+
49
+ ---
50
+
51
+ ## Optional Python Dependencies
52
+
53
+ ### playwright
54
+ - **License**: Apache-2.0
55
+ - **URL**: https://github.com/microsoft/playwright-python
56
+ - **Used for**: Browser-based ingestion of JavaScript-rendered package documentation
57
+ - **Install**: `pip install "papermind[browser]"`
58
+
59
+ ---
60
+
61
+ ## Optional OCR Dependencies (papermind[ocr])
62
+
63
+ ### transformers
64
+ - **License**: Apache-2.0
65
+ - **URL**: https://github.com/huggingface/transformers
66
+ - **Used for**: Loading and running GLM-OCR model for PDF conversion
67
+
68
+ ### torch (PyTorch)
69
+ - **License**: BSD-3-Clause
70
+ - **URL**: https://github.com/pytorch/pytorch
71
+ - **Used for**: Model inference backend
72
+
73
+ ### pymupdf
74
+ - **License**: AGPL-3.0 (with commercial license available)
75
+ - **URL**: https://github.com/pymupdf/PyMuPDF
76
+ - **Used for**: PDF page rendering to images for OCR
77
+
78
+ ### GLM-OCR (model weights)
79
+ - **License**: MIT
80
+ - **URL**: https://huggingface.co/zai-org/GLM-OCR
81
+ - **Used for**: PDF to markdown OCR conversion
82
+ - **Note**: Model weights downloaded from HuggingFace on first use
83
+
84
+ ## External Tools (subprocess)
85
+
86
+ ### qmd
87
+ - **License**: See upstream project for current license
88
+ - **URL**: https://github.com/tobi/qmd
89
+ - **Used for**: Semantic vector search over the knowledge base
90
+ - **Note**: Optional. PaperMind falls back to grep-based search when qmd is not available.
91
+
92
+ ---
93
+
94
+ ## Development Dependencies
95
+
96
+ These are only required during development and testing, not at runtime.
97
+
98
+ ### pytest
99
+ - **License**: MIT
100
+ - **URL**: https://github.com/pytest-dev/pytest
101
+
102
+ ### pytest-asyncio
103
+ - **License**: Apache-2.0
104
+ - **URL**: https://github.com/pytest-dev/pytest-asyncio
105
+
106
+ ### ruff
107
+ - **License**: MIT
108
+ - **URL**: https://github.com/astral-sh/ruff
@@ -0,0 +1,256 @@
1
+ Metadata-Version: 2.4
2
+ Name: papermind
3
+ Version: 1.3.0
4
+ Summary: Scientific knowledge base — papers, packages, codebases → queryable markdown
5
+ Project-URL: Homepage, https://github.com/dmbrmv/papermind
6
+ Project-URL: Repository, https://github.com/dmbrmv/papermind
7
+ Project-URL: Issues, https://github.com/dmbrmv/papermind/issues
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ License-File: LICENSE_THIRD_PARTY.md
11
+ Requires-Python: >=3.11
12
+ Requires-Dist: griffe>=1.0
13
+ Requires-Dist: httpx>=0.27
14
+ Requires-Dist: jinja2>=3.1
15
+ Requires-Dist: mcp>=1.0
16
+ Requires-Dist: python-frontmatter>=1.0
17
+ Requires-Dist: pyyaml>=6.0
18
+ Requires-Dist: rich>=13.0
19
+ Requires-Dist: typer>=0.9
20
+ Provides-Extra: browser
21
+ Requires-Dist: playwright>=1.40; extra == 'browser'
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
24
+ Requires-Dist: pytest>=8.0; extra == 'dev'
25
+ Requires-Dist: ruff>=0.4; extra == 'dev'
26
+ Provides-Extra: ocr
27
+ Requires-Dist: accelerate>=1.0; extra == 'ocr'
28
+ Requires-Dist: pymupdf>=1.24; extra == 'ocr'
29
+ Requires-Dist: torch>=2.0; extra == 'ocr'
30
+ Requires-Dist: torchvision>=0.20; extra == 'ocr'
31
+ Requires-Dist: transformers>=4.48; extra == 'ocr'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # PaperMind
35
+
36
+ Scientific knowledge base: papers, packages, and codebases → queryable markdown.
37
+
38
+ PaperMind ingests heterogeneous scientific sources — PDFs, PyPI packages, and source trees — into a portable, plain-text knowledge base. A CLI manages ingestion, search, and discovery. An MCP server exposes the KB as tools to any AI assistant that speaks the Model Context Protocol.
39
+
40
+ ## Install
41
+
42
+ **Minimum (no PDF or browser support):**
43
+
44
+ ```bash
45
+ pip install papermind
46
+ ```
47
+
48
+ **With PDF ingestion (GLM-OCR — requires GPU):**
49
+
50
+ ```bash
51
+ pip install "papermind[ocr]"
52
+ ```
53
+
54
+ > **Note:** GLM-OCR requires a recent transformers build with GLM-OCR support.
55
+ > If you get a model loading error after running `pip install "papermind[ocr]"`, install the transformers dev branch:
56
+ > `pip install "transformers @ git+https://github.com/huggingface/transformers.git"`
57
+
58
+ **With semantic search (qmd):**
59
+
60
+ ```bash
61
+ npm install -g @tobilu/qmd
62
+ ```
63
+
64
+ **With browser-based package docs:**
65
+
66
+ ```bash
67
+ pip install "papermind[browser]"
68
+ playwright install chromium
69
+ ```
70
+
71
+ **Requirements:** Python 3.11+, GPU recommended for PDF ingestion
72
+
73
+ ## Quick Start
74
+
75
+ ```bash
76
+ # 1. Create a knowledge base
77
+ papermind --kb ~/kb init
78
+
79
+ # 2. Fetch papers (search + download + OCR + ingest in one step)
80
+ papermind --kb ~/kb fetch "SWAT+ calibration machine learning" -n 10 -t swat_ml
81
+
82
+ # 3. Ingest a local PDF
83
+ papermind --kb ~/kb ingest paper path/to/paper.pdf --topic hydrology
84
+
85
+ # 4. Ingest a Python package's API docs
86
+ papermind --kb ~/kb ingest package numpy
87
+
88
+ # 5. Ingest a codebase (Python, Fortran, C, Rust)
89
+ papermind --kb ~/kb ingest codebase ~/src/myproject --name myproject
90
+
91
+ # 6. Search
92
+ papermind --kb ~/kb search "evapotranspiration calibration"
93
+ papermind --kb ~/kb search "SWAT" --topic swat_ml
94
+
95
+ # 7. Check what's in the KB
96
+ papermind --kb ~/kb catalog show
97
+ ```
98
+
99
+ ## CLI Reference
100
+
101
+ All commands take `--kb <path>` as a global option. Pass `--offline` to disable all network access.
102
+
103
+ | Command | Description |
104
+ |---------|-------------|
105
+ | `init` | Initialize a new knowledge base directory |
106
+ | `fetch <query>` | Search + download + OCR + ingest papers in one step |
107
+ | `ingest paper <path>` | Add a paper (PDF) via GLM-OCR |
108
+ | `ingest package <name>` | Extract a PyPI package's API and docs |
109
+ | `ingest codebase <path>` | Walk a source tree (Python, Fortran, C) |
110
+ | `search <query>` | Search the KB (semantic via qmd, or grep fallback) |
111
+ | `catalog show` | List all KB entries (`--json` for machine-readable) |
112
+ | `catalog stats` | Summary statistics by type and topic |
113
+ | `remove <id>` | Remove an entry from the KB |
114
+ | `discover <query>` | Find papers via OpenAlex / Semantic Scholar / Exa |
115
+ | `download <url\|doi>` | Download a paper PDF |
116
+ | `export-bibtex` | Export paper citations as BibTeX |
117
+ | `doctor` | Check installed dependencies and tool availability |
118
+ | `reindex` | Rebuild `catalog.json` and `catalog.md` from filesystem |
119
+ | `serve` | Start the MCP server (stdio transport) |
120
+ | `version` | Print version |
121
+
122
+ ### Examples
123
+
124
+ ```bash
125
+ # Fetch 10 papers on a topic, auto-download and ingest
126
+ papermind --kb ~/kb fetch "differentiable hydrology neural ODE" -n 10 -t diff_hydro
127
+
128
+ # Preview what fetch would do (no download/ingest)
129
+ papermind --kb ~/kb fetch "SWAT calibration" -n 5 --dry-run
130
+
131
+ # Ingest multiple papers from a directory
132
+ papermind --kb ~/kb ingest paper papers/ --topic swat
133
+
134
+ # Export citations for reference managers
135
+ papermind --kb ~/kb export-bibtex > references.bib
136
+
137
+ # Machine-readable catalog
138
+ papermind --kb ~/kb catalog show --json
139
+
140
+ # Search with topic filter
141
+ papermind --kb ~/kb search "calibration" --topic swat_ml
142
+
143
+ # Run fully offline (no network calls at all)
144
+ papermind --kb ~/kb --offline search "groundwater recharge"
145
+
146
+ # Check tool health
147
+ papermind --kb ~/kb doctor
148
+ ```
149
+
150
+ ## Paper Discovery
151
+
152
+ PaperMind searches three academic APIs in parallel (OpenAlex, Semantic Scholar, Exa) and uses Unpaywall as a DOI→PDF fallback:
153
+
154
+ - **[OpenAlex](https://openalex.org/)** — free, no API key, direct PDF URLs for open-access papers
155
+ - **[Semantic Scholar](https://www.semanticscholar.org/)** — structured metadata, citation counts (optional API key for higher rate limits)
156
+ - **[Exa](https://exa.ai/)** — broad web search (requires API key)
157
+ - **[Unpaywall](https://unpaywall.org/)** — DOI→PDF resolver fallback (free, no key)
158
+
159
+ ## PDF OCR
160
+
161
+ PaperMind uses [GLM-OCR](https://huggingface.co/zai-org/GLM-OCR) (MIT, 0.9B params, #1 OmniDocBench) for PDF→markdown conversion. Features:
162
+
163
+ - Runs locally on GPU (RTX 3060+ recommended, ~2GB VRAM)
164
+ - Outputs structured markdown with LaTeX equations
165
+ - Auto-detects section headings (numbered sections, ALL-CAPS)
166
+ - Extracts embedded figures as PNG files alongside the markdown
167
+ - Source PDF copied next to markdown for easy comparison
168
+
169
+ Install with `pip install "papermind[ocr]"`. Model downloaded from HuggingFace on first use (~2GB, cached).
170
+
171
+ ## MCP Server
172
+
173
+ PaperMind exposes your KB to AI assistants via the [Model Context Protocol](https://modelcontextprotocol.io/).
174
+
175
+ **Claude Code (`.claude/mcp.json`):**
176
+
177
+ ```json
178
+ {
179
+ "mcpServers": {
180
+ "papermind": {
181
+ "command": "papermind",
182
+ "args": ["--kb", "/path/to/kb", "serve"]
183
+ }
184
+ }
185
+ }
186
+ ```
187
+
188
+ **Available MCP tools:**
189
+
190
+ | Tool | Description |
191
+ |------|-------------|
192
+ | `query` | Search the KB; optional `scope`, `topic`, `limit` |
193
+ | `get` | Read a single document by relative path |
194
+ | `multi_get` | Read multiple documents in one call |
195
+ | `catalog_stats` | KB statistics (counts by type and topic) |
196
+ | `list_topics` | All topics in the KB |
197
+ | `discover_papers` | Search academic APIs |
198
+
199
+ ## Search
200
+
201
+ Two search backends:
202
+
203
+ - **[qmd](https://github.com/tobi/qmd)** — hybrid search (BM25 + vector embeddings + LLM reranking). Install: `npm install -g @tobilu/qmd`, then `qmd collection add ~/kb --name my-kb`
204
+ - **Built-in fallback** — grep-based term matching (zero dependencies)
205
+
206
+ ## Configuration
207
+
208
+ Each KB has a `.papermind/config.toml`. All keys are optional.
209
+
210
+ ```toml
211
+ [search]
212
+ qmd_path = "qmd"
213
+ fallback_search = true
214
+
215
+ [apis]
216
+ semantic_scholar_key = ""
217
+ exa_key = ""
218
+
219
+ [ingestion]
220
+ ocr_model = "zai-org/GLM-OCR"
221
+ ocr_dpi = 150
222
+ default_paper_topic = "uncategorized"
223
+
224
+ [firecrawl]
225
+ api_key = ""
226
+
227
+ [privacy]
228
+ offline_only = false
229
+ ```
230
+
231
+ **Environment variables** override config file values:
232
+
233
+ | Variable | Purpose |
234
+ |----------|---------|
235
+ | `PAPERMIND_EXA_KEY` | Exa search API key |
236
+ | `PAPERMIND_SEMANTIC_SCHOLAR_KEY` | Semantic Scholar API key |
237
+ | `PAPERMIND_FIRECRAWL_KEY` | Firecrawl API key |
238
+ | `HF_TOKEN` | HuggingFace token (faster model downloads) |
239
+
240
+ ## Contributing
241
+
242
+ ```bash
243
+ git clone https://github.com/dmbrmv/papermind
244
+ cd papermind
245
+ pip install -e ".[dev]"
246
+ uv run pytest tests/ -v
247
+ uv run ruff check src/
248
+ ```
249
+
250
+ The test suite is fully offline — no network calls, no external tools required.
251
+
252
+ ## License
253
+
254
+ MIT — see [LICENSE](LICENSE).
255
+
256
+ Third-party dependency licenses: [LICENSE_THIRD_PARTY.md](LICENSE_THIRD_PARTY.md).
@@ -0,0 +1,223 @@
1
+ # PaperMind
2
+
3
+ Scientific knowledge base: papers, packages, and codebases → queryable markdown.
4
+
5
+ PaperMind ingests heterogeneous scientific sources — PDFs, PyPI packages, and source trees — into a portable, plain-text knowledge base. A CLI manages ingestion, search, and discovery. An MCP server exposes the KB as tools to any AI assistant that speaks the Model Context Protocol.
6
+
7
+ ## Install
8
+
9
+ **Minimum (no PDF or browser support):**
10
+
11
+ ```bash
12
+ pip install papermind
13
+ ```
14
+
15
+ **With PDF ingestion (GLM-OCR — requires GPU):**
16
+
17
+ ```bash
18
+ pip install "papermind[ocr]"
19
+ ```
20
+
21
+ > **Note:** GLM-OCR requires a recent transformers build with GLM-OCR support.
22
+ > If you get a model loading error after running `pip install "papermind[ocr]"`, install the transformers dev branch:
23
+ > `pip install "transformers @ git+https://github.com/huggingface/transformers.git"`
24
+
25
+ **With semantic search (qmd):**
26
+
27
+ ```bash
28
+ npm install -g @tobilu/qmd
29
+ ```
30
+
31
+ **With browser-based package docs:**
32
+
33
+ ```bash
34
+ pip install "papermind[browser]"
35
+ playwright install chromium
36
+ ```
37
+
38
+ **Requirements:** Python 3.11+, GPU recommended for PDF ingestion
39
+
40
+ ## Quick Start
41
+
42
+ ```bash
43
+ # 1. Create a knowledge base
44
+ papermind --kb ~/kb init
45
+
46
+ # 2. Fetch papers (search + download + OCR + ingest in one step)
47
+ papermind --kb ~/kb fetch "SWAT+ calibration machine learning" -n 10 -t swat_ml
48
+
49
+ # 3. Ingest a local PDF
50
+ papermind --kb ~/kb ingest paper path/to/paper.pdf --topic hydrology
51
+
52
+ # 4. Ingest a Python package's API docs
53
+ papermind --kb ~/kb ingest package numpy
54
+
55
+ # 5. Ingest a codebase (Python, Fortran, C, Rust)
56
+ papermind --kb ~/kb ingest codebase ~/src/myproject --name myproject
57
+
58
+ # 6. Search
59
+ papermind --kb ~/kb search "evapotranspiration calibration"
60
+ papermind --kb ~/kb search "SWAT" --topic swat_ml
61
+
62
+ # 7. Check what's in the KB
63
+ papermind --kb ~/kb catalog show
64
+ ```
65
+
66
+ ## CLI Reference
67
+
68
+ All commands take `--kb <path>` as a global option. Pass `--offline` to disable all network access.
69
+
70
+ | Command | Description |
71
+ |---------|-------------|
72
+ | `init` | Initialize a new knowledge base directory |
73
+ | `fetch <query>` | Search + download + OCR + ingest papers in one step |
74
+ | `ingest paper <path>` | Add a paper (PDF) via GLM-OCR |
75
+ | `ingest package <name>` | Extract a PyPI package's API and docs |
76
+ | `ingest codebase <path>` | Walk a source tree (Python, Fortran, C) |
77
+ | `search <query>` | Search the KB (semantic via qmd, or grep fallback) |
78
+ | `catalog show` | List all KB entries (`--json` for machine-readable) |
79
+ | `catalog stats` | Summary statistics by type and topic |
80
+ | `remove <id>` | Remove an entry from the KB |
81
+ | `discover <query>` | Find papers via OpenAlex / Semantic Scholar / Exa |
82
+ | `download <url\|doi>` | Download a paper PDF |
83
+ | `export-bibtex` | Export paper citations as BibTeX |
84
+ | `doctor` | Check installed dependencies and tool availability |
85
+ | `reindex` | Rebuild `catalog.json` and `catalog.md` from filesystem |
86
+ | `serve` | Start the MCP server (stdio transport) |
87
+ | `version` | Print version |
88
+
89
+ ### Examples
90
+
91
+ ```bash
92
+ # Fetch 10 papers on a topic, auto-download and ingest
93
+ papermind --kb ~/kb fetch "differentiable hydrology neural ODE" -n 10 -t diff_hydro
94
+
95
+ # Preview what fetch would do (no download/ingest)
96
+ papermind --kb ~/kb fetch "SWAT calibration" -n 5 --dry-run
97
+
98
+ # Ingest multiple papers from a directory
99
+ papermind --kb ~/kb ingest paper papers/ --topic swat
100
+
101
+ # Export citations for reference managers
102
+ papermind --kb ~/kb export-bibtex > references.bib
103
+
104
+ # Machine-readable catalog
105
+ papermind --kb ~/kb catalog show --json
106
+
107
+ # Search with topic filter
108
+ papermind --kb ~/kb search "calibration" --topic swat_ml
109
+
110
+ # Run fully offline (no network calls at all)
111
+ papermind --kb ~/kb --offline search "groundwater recharge"
112
+
113
+ # Check tool health
114
+ papermind --kb ~/kb doctor
115
+ ```
116
+
117
+ ## Paper Discovery
118
+
119
+ PaperMind searches three academic APIs in parallel (OpenAlex, Semantic Scholar, Exa) and uses Unpaywall as a DOI→PDF fallback:
120
+
121
+ - **[OpenAlex](https://openalex.org/)** — free, no API key, direct PDF URLs for open-access papers
122
+ - **[Semantic Scholar](https://www.semanticscholar.org/)** — structured metadata, citation counts (optional API key for higher rate limits)
123
+ - **[Exa](https://exa.ai/)** — broad web search (requires API key)
124
+ - **[Unpaywall](https://unpaywall.org/)** — DOI→PDF resolver fallback (free, no key)
125
+
126
+ ## PDF OCR
127
+
128
+ PaperMind uses [GLM-OCR](https://huggingface.co/zai-org/GLM-OCR) (MIT, 0.9B params, #1 OmniDocBench) for PDF→markdown conversion. Features:
129
+
130
+ - Runs locally on GPU (RTX 3060+ recommended, ~2GB VRAM)
131
+ - Outputs structured markdown with LaTeX equations
132
+ - Auto-detects section headings (numbered sections, ALL-CAPS)
133
+ - Extracts embedded figures as PNG files alongside the markdown
134
+ - Source PDF copied next to markdown for easy comparison
135
+
136
+ Install with `pip install "papermind[ocr]"`. Model downloaded from HuggingFace on first use (~2GB, cached).
137
+
138
+ ## MCP Server
139
+
140
+ PaperMind exposes your KB to AI assistants via the [Model Context Protocol](https://modelcontextprotocol.io/).
141
+
142
+ **Claude Code (`.claude/mcp.json`):**
143
+
144
+ ```json
145
+ {
146
+ "mcpServers": {
147
+ "papermind": {
148
+ "command": "papermind",
149
+ "args": ["--kb", "/path/to/kb", "serve"]
150
+ }
151
+ }
152
+ }
153
+ ```
154
+
155
+ **Available MCP tools:**
156
+
157
+ | Tool | Description |
158
+ |------|-------------|
159
+ | `query` | Search the KB; optional `scope`, `topic`, `limit` |
160
+ | `get` | Read a single document by relative path |
161
+ | `multi_get` | Read multiple documents in one call |
162
+ | `catalog_stats` | KB statistics (counts by type and topic) |
163
+ | `list_topics` | All topics in the KB |
164
+ | `discover_papers` | Search academic APIs |
165
+
166
+ ## Search
167
+
168
+ Two search backends:
169
+
170
+ - **[qmd](https://github.com/tobi/qmd)** — hybrid search (BM25 + vector embeddings + LLM reranking). Install: `npm install -g @tobilu/qmd`, then `qmd collection add ~/kb --name my-kb`
171
+ - **Built-in fallback** — grep-based term matching (zero dependencies)
172
+
173
+ ## Configuration
174
+
175
+ Each KB has a `.papermind/config.toml`. All keys are optional.
176
+
177
+ ```toml
178
+ [search]
179
+ qmd_path = "qmd"
180
+ fallback_search = true
181
+
182
+ [apis]
183
+ semantic_scholar_key = ""
184
+ exa_key = ""
185
+
186
+ [ingestion]
187
+ ocr_model = "zai-org/GLM-OCR"
188
+ ocr_dpi = 150
189
+ default_paper_topic = "uncategorized"
190
+
191
+ [firecrawl]
192
+ api_key = ""
193
+
194
+ [privacy]
195
+ offline_only = false
196
+ ```
197
+
198
+ **Environment variables** override config file values:
199
+
200
+ | Variable | Purpose |
201
+ |----------|---------|
202
+ | `PAPERMIND_EXA_KEY` | Exa search API key |
203
+ | `PAPERMIND_SEMANTIC_SCHOLAR_KEY` | Semantic Scholar API key |
204
+ | `PAPERMIND_FIRECRAWL_KEY` | Firecrawl API key |
205
+ | `HF_TOKEN` | HuggingFace token (faster model downloads) |
206
+
207
+ ## Contributing
208
+
209
+ ```bash
210
+ git clone https://github.com/dmbrmv/papermind
211
+ cd papermind
212
+ pip install -e ".[dev]"
213
+ uv run pytest tests/ -v
214
+ uv run ruff check src/
215
+ ```
216
+
217
+ The test suite is fully offline — no network calls, no external tools required.
218
+
219
+ ## License
220
+
221
+ MIT — see [LICENSE](LICENSE).
222
+
223
+ Third-party dependency licenses: [LICENSE_THIRD_PARTY.md](LICENSE_THIRD_PARTY.md).