data-aggregator-mcp 0.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. data_aggregator_mcp-0.11.0/.github/workflows/ci.yml +16 -0
  2. data_aggregator_mcp-0.11.0/.github/workflows/publish.yml +34 -0
  3. data_aggregator_mcp-0.11.0/.gitignore +5 -0
  4. data_aggregator_mcp-0.11.0/CHANGELOG.md +223 -0
  5. data_aggregator_mcp-0.11.0/LICENSE +21 -0
  6. data_aggregator_mcp-0.11.0/PKG-INFO +223 -0
  7. data_aggregator_mcp-0.11.0/PUBLISH.md +61 -0
  8. data_aggregator_mcp-0.11.0/README.md +195 -0
  9. data_aggregator_mcp-0.11.0/examples/_demo_stdio.py +156 -0
  10. data_aggregator_mcp-0.11.0/examples/assets/demo.svg +1 -0
  11. data_aggregator_mcp-0.11.0/pyproject.toml +65 -0
  12. data_aggregator_mcp-0.11.0/server.json +29 -0
  13. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/__init__.py +3 -0
  14. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/__main__.py +6 -0
  15. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/_eutils.py +133 -0
  16. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/_http.py +207 -0
  17. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/_merge.py +22 -0
  18. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/archive.py +85 -0
  19. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/citation.py +79 -0
  20. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/datacite.py +183 -0
  21. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/dataverse.py +52 -0
  22. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/dryad.py +46 -0
  23. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/ena.py +67 -0
  24. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/errors.py +43 -0
  25. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/fetch.py +135 -0
  26. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/figshare.py +46 -0
  27. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/fulltext.py +119 -0
  28. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/geo.py +49 -0
  29. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/idconv.py +47 -0
  30. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/literature.py +59 -0
  31. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/models.py +100 -0
  32. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/omics.py +186 -0
  33. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/openaire.py +128 -0
  34. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/osf.py +46 -0
  35. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/pubmed.py +133 -0
  36. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/router.py +218 -0
  37. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/scholix.py +77 -0
  38. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/server.py +329 -0
  39. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/taxonomy.py +71 -0
  40. data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/zenodo.py +102 -0
  41. data_aggregator_mcp-0.11.0/tests/__init__.py +0 -0
  42. data_aggregator_mcp-0.11.0/tests/conftest.py +3 -0
  43. data_aggregator_mcp-0.11.0/tests/test__eutils.py +133 -0
  44. data_aggregator_mcp-0.11.0/tests/test__http.py +123 -0
  45. data_aggregator_mcp-0.11.0/tests/test__merge.py +12 -0
  46. data_aggregator_mcp-0.11.0/tests/test_archive.py +71 -0
  47. data_aggregator_mcp-0.11.0/tests/test_citation.py +94 -0
  48. data_aggregator_mcp-0.11.0/tests/test_datacite.py +252 -0
  49. data_aggregator_mcp-0.11.0/tests/test_dataverse.py +86 -0
  50. data_aggregator_mcp-0.11.0/tests/test_dryad.py +67 -0
  51. data_aggregator_mcp-0.11.0/tests/test_ena.py +70 -0
  52. data_aggregator_mcp-0.11.0/tests/test_entrypoint_smoke.py +20 -0
  53. data_aggregator_mcp-0.11.0/tests/test_errors.py +26 -0
  54. data_aggregator_mcp-0.11.0/tests/test_fetch.py +204 -0
  55. data_aggregator_mcp-0.11.0/tests/test_fetch_gate.py +225 -0
  56. data_aggregator_mcp-0.11.0/tests/test_figshare.py +74 -0
  57. data_aggregator_mcp-0.11.0/tests/test_fulltext.py +135 -0
  58. data_aggregator_mcp-0.11.0/tests/test_geo.py +130 -0
  59. data_aggregator_mcp-0.11.0/tests/test_idconv.py +73 -0
  60. data_aggregator_mcp-0.11.0/tests/test_literature.py +68 -0
  61. data_aggregator_mcp-0.11.0/tests/test_models.py +142 -0
  62. data_aggregator_mcp-0.11.0/tests/test_omics.py +285 -0
  63. data_aggregator_mcp-0.11.0/tests/test_openaire.py +281 -0
  64. data_aggregator_mcp-0.11.0/tests/test_osf.py +74 -0
  65. data_aggregator_mcp-0.11.0/tests/test_packaging.py +68 -0
  66. data_aggregator_mcp-0.11.0/tests/test_pubmed.py +354 -0
  67. data_aggregator_mcp-0.11.0/tests/test_router.py +635 -0
  68. data_aggregator_mcp-0.11.0/tests/test_scholix.py +82 -0
  69. data_aggregator_mcp-0.11.0/tests/test_server.py +197 -0
  70. data_aggregator_mcp-0.11.0/tests/test_taxonomy.py +140 -0
  71. data_aggregator_mcp-0.11.0/tests/test_workflows.py +39 -0
  72. data_aggregator_mcp-0.11.0/tests/test_zenodo.py +137 -0
  73. data_aggregator_mcp-0.11.0/uv.lock +1037 -0
@@ -0,0 +1,16 @@
1
+ name: CI
2
+ on: [push, pull_request]
3
+ jobs:
4
+ test:
5
+ runs-on: ubuntu-latest
6
+ strategy:
7
+ matrix:
8
+ python-version: ["3.11", "3.12"]
9
+ steps:
10
+ - uses: actions/checkout@v4
11
+ - uses: astral-sh/setup-uv@v5
12
+ with:
13
+ python-version: ${{ matrix.python-version }}
14
+ - run: uv sync --extra dev
15
+ - run: uv run ruff check .
16
+ - run: uv run pytest -q
@@ -0,0 +1,34 @@
1
+ name: Publish
2
+ on:
3
+ release:
4
+ types: [published]
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+ steps:
9
+ - uses: actions/checkout@v4
10
+ - uses: astral-sh/setup-uv@v5
11
+ - name: Verify tag matches package version
12
+ run: |
13
+ TAG="${GITHUB_REF_NAME#v}"
14
+ VER=$(python3 -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
15
+ if [ "$TAG" != "$VER" ]; then
16
+ echo "Release tag '$TAG' != pyproject version '$VER'"; exit 1
17
+ fi
18
+ - run: uv build
19
+ - uses: actions/upload-artifact@v4
20
+ with:
21
+ name: dist
22
+ path: dist/
23
+ pypi-publish:
24
+ needs: build
25
+ runs-on: ubuntu-latest
26
+ environment: pypi
27
+ permissions:
28
+ id-token: write
29
+ steps:
30
+ - uses: actions/download-artifact@v4
31
+ with:
32
+ name: dist
33
+ path: dist/
34
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,5 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.egg-info/
4
+ dist/
5
+ build/
@@ -0,0 +1,223 @@
1
+ # Changelog
2
+
3
+ ## [0.11.0] - 2026-05-29
4
+
5
+ ### Added
6
+
7
+ - `fetch(extract=true)` — opt-in unpacking of downloaded zip/tar archives,
8
+ guarded against path-traversal and runaway extracted size.
9
+ - `fetch` integrity check — an unverified `pdf`/`xml` download whose body is
10
+ HTML (a login/paywall page) now fails loud instead of saving a bogus file.
11
+ - `resolve` of a Zenodo DOI via DataCite now populates `files[]` (delegates to
12
+ the native Zenodo adapter); such ids are fetchable.
13
+ - BioProject `resolve` attaches `links[]` to its SRA runs.
14
+ - PubMed `resolve` populates the article abstract (`description`) and, for
15
+ PMC open-access records, `access`/`license` (from EuropePMC/Unpaywall).
16
+ - `list_sources` reports per-source fetchability, id examples, and the
17
+ `organism` filter.
18
+
19
+ ### Fixed
20
+
21
+ - HTTP boundary now fully honors the fail-loud contract: transport-level errors
22
+ (connect/read/timeout) and malformed HTTP-200 bodies (NCBI throttle envelopes)
23
+ are retried and, if they still fail, surface as a typed `DataAggregatorError`, on both the
24
+ search/resolve path (`_http`) and the `fetch` streaming path.
25
+
26
+ ## [0.10.0] - 2026-05-29
27
+
28
+ ### Added
29
+
30
+ - Literature `resolve` (`pubmed:`/`openaire:`) attaches an open-access full-text
31
+ file via an EuropePMC `fullTextXML` → Unpaywall `url_for_pdf` cascade (first
32
+ hit wins; `FileEntry.source` labels the origin). Enrichment — fails soft.
33
+ - `DataResource.identifiers` — normalized `{pmid, pmcid, doi}` cross-identifiers.
34
+ PubMed gets them free from esummary; OpenAIRE via the NCBI ID Converter.
35
+ - `FileEntry.source` — provenance label for an attached file.
36
+ - `pubmed:`/`openaire:` are now fetchable: `fetch` streams open-access full text
37
+ (unverified — no upstream checksum, like GEO). Fails loud when a paper has no
38
+ open full text.
39
+
40
+ ### Changed
41
+
42
+ - New env var `UNPAYWALL_EMAIL` enables the Unpaywall fallback leg (the EuropePMC
43
+ leg needs no key). `NCBI_EMAIL`/`UNPAYWALL_EMAIL` is forwarded to NCBI idconv.
44
+
45
+ ### Notes
46
+
47
+ - Cascade deviates from the umbrella spec's literal "PMC → EuropePMC → Unpaywall":
48
+ PMC's machine download is tgz-over-FTP (not HTTPS-fetchable), and EuropePMC
49
+ already serves the PMC OA subset as HTTPS XML — so the dedicated PMC leg is
50
+ dropped. Honors the spec's intent (open full text, first hit wins).
51
+ - No MeSH (ceded to the openalex MCP). Full text is open-access only; paywalled
52
+ content is never bypassed.
53
+
54
+ ## [0.9.0] - 2026-05-29
55
+
56
+ ### Added
57
+
58
+ - `resolve(id, cite=<format>)` renders a citation onto the record — `bibtex`,
59
+ `ris`, `csl-json`, or any CSL style name (`apa`, `mla`, `vancouver`, …). DOI
60
+ records use DOI content negotiation (CrossRef + DataCite); non-DOI records
61
+ produce CSL-JSON from metadata. Default off; failures degrade quietly.
62
+ - `DataResource.access` — normalized access status
63
+ (`open`/`embargoed`/`restricted`/`closed`/`unknown`), populated from Zenodo
64
+ `access_right`, OpenAIRE `bestAccessRight`, and an open-license signal on
65
+ DataCite rights.
66
+ - `DataResource.citation` — holds the rendered citation when `cite=` is used.
67
+
68
+ ### Changed
69
+
70
+ - OpenAIRE records now carry `license` (from the deposit instance) and `access`.
71
+
72
+ ### Notes
73
+
74
+ - PMC license/access for `pubmed:` records is deferred to Phase 9 (bundled with
75
+ PMC full-text retrieval). GEO/SRA/BioProject expose no rights → `access` stays
76
+ null honestly.
77
+
78
+ ## [0.8.0] - 2026-05-29
79
+
80
+ ### Added
81
+
82
+ - DataCite-repo fetch: resolving a DataCite DOI now attaches `files[]` from the
83
+ host repo's native API — **Figshare** (md5), **Dataverse** (Harvard default,
84
+ `DATAVERSE_BASE_URL` override; md5), **OSF** (osfstorage, paginated; md5), all
85
+ fetchable and checksum-verified. **Dryad** is manifest-only (names/sizes/
86
+ sha-256) — its downloads are token/bot-challenge gated, so it is excluded from
87
+ the fetch allowlist and fetching a Dryad DOI fails loud.
88
+ - New per-repo resolver modules: `figshare.py`, `dataverse.py`, `osf.py`, `dryad.py`.
89
+
90
+ ### Changed
91
+
92
+ - The fetch allowlist accepts `datacite:` ids; fetchability is then decided
93
+ post-resolve from the detected host repo (`_DATACITE_FETCHABLE`).
94
+
95
+ ### Fixed
96
+
97
+ - DataCite source detection now recognizes Harvard Dataverse (client id
98
+ `gdcc.harvard-dv`, which contains no "dataverse" substring).
99
+
100
+ ## [0.7.0] - 2026-05-29
101
+
102
+ ### Added
103
+
104
+ - Omics fetch: `fetch` now downloads SRA FASTQ files (via the ENA manifest,
105
+ md5-verified) and GEO supplementary files (parsed from the GEO `suppl/`
106
+ directory index; unverified — NCBI exposes no checksums there).
107
+ - New `geo.py` supplementary-file resolver; `geo:` resolves now populate
108
+ `files[]`. A GEO record with no `suppl/` directory (HTTP 404) degrades to
109
+ `files=[]` rather than failing.
110
+
111
+ ### Changed
112
+
113
+ - The `fetch` tool resolves through `router.resolve` (source-agnostic) instead
114
+ of a hardcoded Zenodo path; the `_FETCHABLE_SOURCES` allowlist now includes
115
+ `sra:` and `geo:`.
116
+
117
+ ## [0.6.0] - 2026-05-29
118
+
119
+ ### Added
120
+
121
+ - Packaging for publication: `python -m data_aggregator_mcp` entry point,
122
+ complete `[project.urls]` + `keywords`, Beta classifier.
123
+ - `server.json` for the official MCP registry
124
+ (`io.github.musharna/data-aggregator-mcp`) + the `mcp-name:` ownership marker
125
+ in the README.
126
+ - GitHub Actions: `ci.yml` (pytest + ruff, Python 3.11/3.12) and `publish.yml`
127
+ (Release-triggered PyPI upload via OIDC trusted publishing — no stored token).
128
+ - `PUBLISH.md` runbook and user-facing install/use docs (`uvx`, `pip`,
129
+ `claude mcp add`).
130
+
131
+ ### Notes
132
+
133
+ - Prepare-to-the-gate: the public GitHub repo, the real PyPI upload, and the
134
+ registry submission are documented manual steps, not executed here.
135
+ - HTTP transport remains deferred — distribution is local stdio via PyPI/`uvx`.
136
+
137
+ ## [0.5.0] - 2026-05-28
138
+
139
+ ### Added
140
+
141
+ - Unifying layer: NCBI-Taxonomy-backed **synonym expansion** on `search`. New
142
+ optional `organism` param — resolves to a taxid and ANDs the query with the
143
+ canonical name + synonyms (e.g. `Orobanche aegyptiaca` also matches
144
+ `Phelipanche aegyptiaca`, taxid 99112). The expansion is echoed in
145
+ `SearchResult.taxon_expansion`.
146
+ - **Organism normalization**: results/resolved records gain `taxa[]`
147
+ (`{taxid, name}`) derived from raw `organism[]` via NCBI Taxonomy; raw strings
148
+ are preserved.
149
+ - **Cross-links**: a `described_in` → `plant-genomics:taxid:<n>` link is attached
150
+ for Viridiplantae (plant) taxa, the seam to the sibling `plant-genomics-mcp`.
151
+
152
+ ### Notes
153
+
154
+ - No new search source and no new tool — Phase 5 is a taxonomy module plus a
155
+ post-merge enrichment pass. `fetch` is unchanged (Zenodo-only).
156
+ - Enrichment incurs zero taxonomy calls for records without an organism. A
157
+ taxonomy outage surfaces in `errors["taxonomy"]` on `search` (never silently
158
+ dropped) and degrades gracefully on `resolve`.
159
+
160
+ ## [0.4.0] - 2026-05-28
161
+
162
+ ### Added
163
+
164
+ - Unified `literature` source: PubMed + OpenAIRE publication discovery, fanned
165
+ out in parallel and merged. Registered as a fourth `search` source.
166
+ - Resolve-time paper→data links: resolving a `pubmed:` id attaches `links[]` to
167
+ `sra:`/`geo:`/`bioproject:` ids via NCBI elink; resolving an `openaire:` id
168
+ attaches `datacite:` links via the ScholeXplorer Scholix API. Publication↔
169
+ publication citation edges are dropped — that is the standalone openalex MCP's
170
+ job, not ours.
171
+
172
+ ### Notes
173
+
174
+ - Literature is discovery-only — `fetch` stays Zenodo-only and fails loud for
175
+ `pubmed:`/`openaire:` ids.
176
+ - OpenAIRE paper→dataset Scholix links are sparse (most paper edges are
177
+ citations, which are dropped); the PubMed→GEO/SRA elink path is the reliable
178
+ paper→data bridge. OpenAIRE's contribution is discovery breadth.
179
+
180
+ ## [0.3.0] - 2026-05-28
181
+
182
+ ### Added
183
+
184
+ - Unified NCBI omics source: GEO + SRA + BioProject discovery via E-utilities,
185
+ fanned out internally and merged. Registered as a third `search` source.
186
+ - ENA filereport FASTQ manifest attached on `resolve` of an `sra:` id (direct
187
+ https URLs).
188
+ - Optional `NCBI_API_KEY` env var to raise the NCBI rate limit (3→10 req/s).
189
+ - Shared round-robin `_merge.interleave` (extracted from the router) so the
190
+ omics fan-out reuses fair merging.
191
+
192
+ ### Notes
193
+
194
+ - Omics fetch is deferred — `fetch` remains Zenodo-only and fails loud for omics
195
+ ids. GEO/BioProject are discovery-only (no file manifest in this phase).
196
+
197
+ ## [0.2.0] - 2026-05-28
198
+
199
+ ### Added
200
+
201
+ - DataCite discovery adapter — one query spans every DataCite client (Dryad,
202
+ Figshare, Dataverse, OSF, Mendeley, …); metadata-only, so resources carry no
203
+ file manifest.
204
+ - Multi-source router: `search` fans out across Zenodo + DataCite in parallel,
205
+ round-robin merges results so the page limit never starves a later source,
206
+ dedups by DOI (native fetch backends win over DataCite metadata), and surfaces
207
+ per-source failures in `errors{}` instead of silently dropping a backend.
208
+ - `search` `sources` filter to restrict fan-out (e.g. `["datacite"]`).
209
+ - Shared `compact()` helper in `models` (extracted from the Zenodo adapter).
210
+
211
+ ### Changed
212
+
213
+ - `resolve` routes by id shape (`zenodo:` / bare id / `datacite:` / bare DOI).
214
+ - `fetch` is Zenodo-only in Phase 2 and fails loud (`FetchNotSupportedError`)
215
+ for discovery-only sources; per-repo fetch adapters come in a later phase.
216
+
217
+ ## [0.1.0] - 2026-05-28
218
+
219
+ ### Added
220
+
221
+ - Initial MCP server: search/resolve/fetch/list_sources over Zenodo.
222
+ - Normalized DataResource model; stream-to-disk fetch with max_bytes guard,
223
+ checksum verification, and provenance sidecar.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jaret Arnold
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,223 @@
1
+ Metadata-Version: 2.4
2
+ Name: data-aggregator-mcp
3
+ Version: 0.11.0
4
+ Summary: Research-data acquisition MCP — find and fetch datasets across archives, omics registries, and literature
5
+ Project-URL: Homepage, https://github.com/musharna/data-aggregator-mcp
6
+ Project-URL: Repository, https://github.com/musharna/data-aggregator-mcp
7
+ Project-URL: Issues, https://github.com/musharna/data-aggregator-mcp/issues
8
+ Author-email: Jaret Arnold <mjarnold1998@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: bioinformatics,datacite,datasets,geo,mcp,model-context-protocol,ncbi,research-data,sra,zenodo
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
16
+ Requires-Python: >=3.11
17
+ Requires-Dist: httpx>=0.27
18
+ Requires-Dist: mcp>=1.0
19
+ Requires-Dist: pydantic>=2.6
20
+ Provides-Extra: dev
21
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
22
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
23
+ Requires-Dist: pytest-httpx>=0.30; extra == 'dev'
24
+ Requires-Dist: pytest>=8.0; extra == 'dev'
25
+ Requires-Dist: pyyaml>=6.0; extra == 'dev'
26
+ Requires-Dist: ruff>=0.3; extra == 'dev'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # 🔎 data-aggregator-mcp
30
+
31
+ **One MCP server to find and fetch research data across archives, omics
32
+ registries, and literature — behind a single normalized model.**
33
+
34
+ [![PyPI](https://img.shields.io/pypi/v/data-aggregator-mcp.svg)](https://pypi.org/project/data-aggregator-mcp/)
35
+ [![Python](https://img.shields.io/pypi/pyversions/data-aggregator-mcp.svg)](https://pypi.org/project/data-aggregator-mcp/)
36
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
37
+ [![CI](https://github.com/musharna/data-aggregator-mcp/actions/workflows/ci.yml/badge.svg)](https://github.com/musharna/data-aggregator-mcp/actions/workflows/ci.yml)
38
+
39
+ `search` one query across **Zenodo, DataCite** (Dryad / Figshare / Dataverse /
40
+ OSF / Mendeley), **NCBI omics** (GEO / SRA / BioProject), and **literature**
41
+ (PubMed / OpenAIRE) — deduplicated, normalized, and cross-linked. `resolve` any
42
+ hit to its file manifest, citation, and the data it points at. `fetch` it to
43
+ disk with checksum verification.
44
+
45
+ mcp-name: io.github.musharna/data-aggregator-mcp
46
+
47
+ <p align="center">
48
+ <img src="examples/assets/demo.svg"
49
+ alt="data-aggregator-mcp stdio demo — initialize, tools/list (search, resolve, fetch, list_sources), and a live list_sources call showing the four wired sources"
50
+ width="820">
51
+ </p>
52
+
53
+ ## ✨ Why this
54
+
55
+ Most data MCPs wrap a single source. This one **unifies** them behind four tools
56
+ and one `DataResource` model, so an agent searches once and gets back comparable
57
+ records:
58
+
59
+ - **Multi-domain, one model** — generalist archives + raw omics + literature,
60
+ deduplicated by DOI (the fetchable record wins over bare metadata).
61
+ - **Taxonomy synonym expansion** — `organism="Orobanche aegyptiaca"` also matches
62
+ `Phelipanche aegyptiaca` (NCBI Taxonomy), so a species rename doesn't cost you
63
+ results.
64
+ - **Paper → data bridge** — resolve a paper and get links to the GEO / SRA /
65
+ BioProject / DataCite records it produced.
66
+ - **Verified fetch** — streams to disk with md5 verification where the source
67
+ exposes a checksum, optional archive unpacking, and a fail-loud integrity
68
+ sniff that rejects an HTML paywall page served as a "PDF".
69
+ - **Citations, access & full text** — render a citation in any CSL style, get
70
+ normalized access/license, and pull open-access full text — all in one
71
+ `resolve`.
72
+
73
+ ## ⚡ Quickstart
74
+
75
+ Run with no install:
76
+
77
+ ```bash
78
+ uvx data-aggregator-mcp
79
+ ```
80
+
81
+ Register with Claude Code:
82
+
83
+ ```bash
84
+ claude mcp add data-aggregator -- uvx data-aggregator-mcp
85
+ ```
86
+
87
+ A typical agent flow:
88
+
89
+ ```text
90
+ search("drought stress RNA-seq", organism="Sorghum bicolor")
91
+ → [ geo:GSE..., sra:SRX..., zenodo:..., pubmed:... ] # deduped, taxa-normalized
92
+
93
+ resolve("sra:SRX079566")
94
+ → DataResource{ files: [ENA FASTQ urls…], access: "open", taxa: [...] }
95
+
96
+ fetch("sra:SRX079566", dest="./data")
97
+ → ["./data/SRX079566_1.fastq.gz", …] # md5-verified
98
+ ```
99
+
100
+ <details>
101
+ <summary>Other ways to run (pip, python -m, raw client config)</summary>
102
+
103
+ ```bash
104
+ pip install data-aggregator-mcp
105
+ data-aggregator-mcp # or: python -m data_aggregator_mcp
106
+ ```
107
+
108
+ Add to a client's MCP config (e.g. Claude Desktop `claude_desktop_config.json`):
109
+
110
+ ```json
111
+ {
112
+ "mcpServers": {
113
+ "data-aggregator": {
114
+ "command": "uvx",
115
+ "args": ["data-aggregator-mcp"],
116
+ "env": { "NCBI_API_KEY": "your-optional-key" }
117
+ }
118
+ }
119
+ }
120
+ ```
121
+
122
+ </details>
123
+
124
+ ## 🗂️ Sources
125
+
126
+ | Source | Discover | Fetch | Checksum |
127
+ | ---------------------------- | :------: | :---------------: | :--------------: |
128
+ | Zenodo | ✅ | ✅ | md5 |
129
+ | DataCite → Figshare | ✅ | ✅ | md5 |
130
+ | DataCite → Dataverse | ✅ | ✅ | md5 |
131
+ | DataCite → OSF | ✅ | ✅ | md5 |
132
+ | DataCite → Dryad | ✅ | manifest only¹ | sha-256 (listed) |
133
+ | DataCite → Mendeley & others | ✅ | — | — |
134
+ | NCBI SRA | ✅ | ✅ (ENA FASTQ) | md5 |
135
+ | NCBI GEO | ✅ | ✅ (`suppl/`) | none² |
136
+ | NCBI BioProject | ✅ | → SRA links | — |
137
+ | PubMed / OpenAIRE | ✅ | ✅ (OA full text) | none² |
138
+
139
+ ¹ Dryad downloads are token / bot-challenge gated, so `fetch` fails loud;
140
+ `resolve` still lists the files.
141
+ ² No upstream checksum — `fetch` verifies content-type instead (rejects an HTML
142
+ page served in place of a binary).
143
+
144
+ ## 🛠️ Tools
145
+
146
+ ### `search(query, size?, sources?, organism?)`
147
+
148
+ Fan out across all wired sources in parallel and return compact `DataResource`
149
+ records, deduped by DOI. Per-source failures land in `errors{}` — never silently
150
+ dropped.
151
+
152
+ - `organism` — expand the query with NCBI-Taxonomy synonyms; the expansion is
153
+ echoed in `taxon_expansion`, and results carry normalized `taxa[]`
154
+ (`{taxid, name}`) plus a `described_in` link to plant-genomics-mcp for plant
155
+ taxa.
156
+ - `sources` — restrict the fan-out, e.g. `["omics"]`.
157
+ - `size` — max results (1–50).
158
+
159
+ ### `resolve(id)`
160
+
161
+ Full record + files manifest. Routes by id shape — `zenodo:7654321`, a bare DOI,
162
+ `datacite:10.5061/dryad.x`, an omics id (`sra:SRX079566`, `geo:GSE332789`,
163
+ `bioproject:PRJNA1468572`), or a literature id (`pubmed:34320281`,
164
+ `openaire:<id>`). Attaches, where available:
165
+
166
+ - **`files[]`** — ENA FASTQ manifest (SRA), GEO `suppl/`, or the host repo's
167
+ native manifest (Figshare / Dataverse / OSF / Dryad).
168
+ - **`links[]`** — paper → data: `pubmed:` → `sra:` / `geo:` / `bioproject:` (NCBI
169
+ elink); `openaire:` → `datacite:` (ScholeXplorer Scholix).
170
+ - **`access` / `license`** — normalized status
171
+ (`open` / `embargoed` / `restricted` / `closed` / `unknown`) and license where
172
+ the source exposes it.
173
+ - **`identifiers`** — normalized `{pmid, pmcid, doi}`, plus an open-access
174
+ full-text `FileEntry` (EuropePMC XML, or an Unpaywall PDF fallback) for papers.
175
+ - **`citation`** — pass `cite=<format>`: `bibtex`, `ris`, `csl-json`, or any CSL
176
+ style name (`apa`, `mla`, `vancouver`, …). DOI records use content
177
+ negotiation; others render CSL-JSON from metadata. Off by default; failures
178
+ degrade quietly.
179
+
180
+ ### `fetch(id, dest?, files?, max_bytes?, force?, extract?)`
181
+
182
+ Download files to disk and return their paths. Streams under a `max_bytes` guard
183
+ (`force` to override) with md5 verification wherever a checksum exists.
184
+
185
+ - `files` — restrict to a subset of the resolved manifest.
186
+ - `extract` — unpack downloaded zip / tar archives in place, guarded against
187
+ path traversal and runaway extracted size. Off by default.
188
+ - Unverified fetches (GEO `suppl/`, literature full text) get a content-type
189
+ sniff that fails loud if a declared binary is actually an HTML page.
190
+ - Fetchable: **Zenodo**, **SRA**, **GEO**, DataCite-hosted **Figshare** /
191
+ **Dataverse** / **OSF**, and **literature** open-access full text. **Dryad**
192
+ and other DataCite repos are discovery-only and raise
193
+ `FetchNotSupportedError`.
194
+
195
+ ### `list_sources()`
196
+
197
+ Wired sources with their capabilities — layer, kinds, supported filters,
198
+ fetchability, id examples, auth, and rate limits.
199
+
200
+ ## ⚙️ Configuration
201
+
202
+ Both optional, set via environment variables:
203
+
204
+ - `NCBI_API_KEY` — raises the NCBI E-utilities rate limit (3 → 10 req/s) used by
205
+ the omics, literature, and taxonomy lookups.
206
+ - `UNPAYWALL_EMAIL` — enables the Unpaywall fallback leg of literature full-text
207
+ retrieval (the EuropePMC leg works without it).
208
+
209
+ ## 🧪 Develop
210
+
211
+ ```bash
212
+ uv venv && uv pip install -e ".[dev]"
213
+ uv run pytest -q
214
+ uv run ruff check src tests
215
+ DATA_AGGREGATOR_MCP_LIVE=1 uv run pytest -k live -q # real-API probes
216
+ ```
217
+
218
+ The README demo (`examples/assets/demo.svg`) is recorded network-free from
219
+ `examples/_demo_stdio.py` — see the header of that file to re-record.
220
+
221
+ ## License
222
+
223
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,61 @@
1
+ # Publishing data-aggregator-mcp
2
+
3
+ Every step below is **outward-facing and irreversible** (PyPI versions cannot be
4
+ re-uploaded, even after deletion; the PyPI project name and the registry name are
5
+ permanent). The repo is prepared to the gate — nothing here has been executed.
6
+ Run these manually when ready to ship a release.
7
+
8
+ ## One-time setup
9
+
10
+ ### 1. Create the public GitHub repo
11
+
12
+ ```bash
13
+ gh repo create musharna/data-aggregator-mcp --public --source=. --remote=origin --push
14
+ ```
15
+
16
+ ### 2. Configure PyPI trusted publisher (no token needed)
17
+
18
+ On https://pypi.org → your account → _Publishing_ → _Add a pending trusted publisher_:
19
+
20
+ - PyPI Project Name: `data-aggregator-mcp`
21
+ - Owner: `musharna`
22
+ - Repository name: `data-aggregator-mcp`
23
+ - Workflow name: `publish.yml`
24
+ - Environment name: `pypi`
25
+
26
+ The first OIDC publish (step 3) creates the project automatically.
27
+
28
+ ## Per release
29
+
30
+ ### 3. Cut the release (fires `.github/workflows/publish.yml`)
31
+
32
+ Set the version in **all three** places to the release value —
33
+ `pyproject.toml`, `src/data_aggregator_mcp/__init__.py`, and `server.json`
34
+ (both top-level `version` and `packages[0].version`). The tree is already at
35
+ `0.11.0` for the first release, so no bump is needed there — just tag. Then:
36
+
37
+ ```bash
38
+ git tag v0.11.0
39
+ git push origin main --tags
40
+ gh release create v0.11.0 --title v0.11.0 --notes-from-tag
41
+ ```
42
+
43
+ The publish workflow verifies the tag matches the package version, builds the
44
+ wheel + sdist, and uploads to PyPI via OIDC trusted publishing.
45
+
46
+ ### 4. Submit to the official MCP registry (after the PyPI release is live)
47
+
48
+ ```bash
49
+ # install the publisher CLI (see modelcontextprotocol/registry releases)
50
+ mcp-publisher login github # GitHub OAuth device flow; grants the io.github.musharna/* namespace
51
+ mcp-publisher publish # reads server.json; validates the README mcp-name marker
52
+ ```
53
+
54
+ The registry fetches `https://pypi.org/pypi/data-aggregator-mcp/json` and
55
+ confirms the `mcp-name: io.github.musharna/data-aggregator-mcp` marker is present
56
+ in the published description — so the PyPI release in step 3 must land first.
57
+
58
+ ## Future enhancement (not built)
59
+
60
+ Registry submission can be automated in GitHub Actions via OIDC (a separate
61
+ `mcp-publisher` CI step). Left manual here by design.