data-aggregator-mcp 0.11.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_aggregator_mcp-0.11.0/.github/workflows/ci.yml +16 -0
- data_aggregator_mcp-0.11.0/.github/workflows/publish.yml +34 -0
- data_aggregator_mcp-0.11.0/.gitignore +5 -0
- data_aggregator_mcp-0.11.0/CHANGELOG.md +223 -0
- data_aggregator_mcp-0.11.0/LICENSE +21 -0
- data_aggregator_mcp-0.11.0/PKG-INFO +223 -0
- data_aggregator_mcp-0.11.0/PUBLISH.md +61 -0
- data_aggregator_mcp-0.11.0/README.md +195 -0
- data_aggregator_mcp-0.11.0/examples/_demo_stdio.py +156 -0
- data_aggregator_mcp-0.11.0/examples/assets/demo.svg +1 -0
- data_aggregator_mcp-0.11.0/pyproject.toml +65 -0
- data_aggregator_mcp-0.11.0/server.json +29 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/__init__.py +3 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/__main__.py +6 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/_eutils.py +133 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/_http.py +207 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/_merge.py +22 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/archive.py +85 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/citation.py +79 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/datacite.py +183 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/dataverse.py +52 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/dryad.py +46 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/ena.py +67 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/errors.py +43 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/fetch.py +135 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/figshare.py +46 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/fulltext.py +119 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/geo.py +49 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/idconv.py +47 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/literature.py +59 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/models.py +100 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/omics.py +186 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/openaire.py +128 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/osf.py +46 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/pubmed.py +133 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/router.py +218 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/scholix.py +77 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/server.py +329 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/taxonomy.py +71 -0
- data_aggregator_mcp-0.11.0/src/data_aggregator_mcp/zenodo.py +102 -0
- data_aggregator_mcp-0.11.0/tests/__init__.py +0 -0
- data_aggregator_mcp-0.11.0/tests/conftest.py +3 -0
- data_aggregator_mcp-0.11.0/tests/test__eutils.py +133 -0
- data_aggregator_mcp-0.11.0/tests/test__http.py +123 -0
- data_aggregator_mcp-0.11.0/tests/test__merge.py +12 -0
- data_aggregator_mcp-0.11.0/tests/test_archive.py +71 -0
- data_aggregator_mcp-0.11.0/tests/test_citation.py +94 -0
- data_aggregator_mcp-0.11.0/tests/test_datacite.py +252 -0
- data_aggregator_mcp-0.11.0/tests/test_dataverse.py +86 -0
- data_aggregator_mcp-0.11.0/tests/test_dryad.py +67 -0
- data_aggregator_mcp-0.11.0/tests/test_ena.py +70 -0
- data_aggregator_mcp-0.11.0/tests/test_entrypoint_smoke.py +20 -0
- data_aggregator_mcp-0.11.0/tests/test_errors.py +26 -0
- data_aggregator_mcp-0.11.0/tests/test_fetch.py +204 -0
- data_aggregator_mcp-0.11.0/tests/test_fetch_gate.py +225 -0
- data_aggregator_mcp-0.11.0/tests/test_figshare.py +74 -0
- data_aggregator_mcp-0.11.0/tests/test_fulltext.py +135 -0
- data_aggregator_mcp-0.11.0/tests/test_geo.py +130 -0
- data_aggregator_mcp-0.11.0/tests/test_idconv.py +73 -0
- data_aggregator_mcp-0.11.0/tests/test_literature.py +68 -0
- data_aggregator_mcp-0.11.0/tests/test_models.py +142 -0
- data_aggregator_mcp-0.11.0/tests/test_omics.py +285 -0
- data_aggregator_mcp-0.11.0/tests/test_openaire.py +281 -0
- data_aggregator_mcp-0.11.0/tests/test_osf.py +74 -0
- data_aggregator_mcp-0.11.0/tests/test_packaging.py +68 -0
- data_aggregator_mcp-0.11.0/tests/test_pubmed.py +354 -0
- data_aggregator_mcp-0.11.0/tests/test_router.py +635 -0
- data_aggregator_mcp-0.11.0/tests/test_scholix.py +82 -0
- data_aggregator_mcp-0.11.0/tests/test_server.py +197 -0
- data_aggregator_mcp-0.11.0/tests/test_taxonomy.py +140 -0
- data_aggregator_mcp-0.11.0/tests/test_workflows.py +39 -0
- data_aggregator_mcp-0.11.0/tests/test_zenodo.py +137 -0
- data_aggregator_mcp-0.11.0/uv.lock +1037 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
on: [push, pull_request]
|
|
3
|
+
jobs:
|
|
4
|
+
test:
|
|
5
|
+
runs-on: ubuntu-latest
|
|
6
|
+
strategy:
|
|
7
|
+
matrix:
|
|
8
|
+
python-version: ["3.11", "3.12"]
|
|
9
|
+
steps:
|
|
10
|
+
- uses: actions/checkout@v4
|
|
11
|
+
- uses: astral-sh/setup-uv@v5
|
|
12
|
+
with:
|
|
13
|
+
python-version: ${{ matrix.python-version }}
|
|
14
|
+
- run: uv sync --extra dev
|
|
15
|
+
- run: uv run ruff check .
|
|
16
|
+
- run: uv run pytest -q
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
on:
|
|
3
|
+
release:
|
|
4
|
+
types: [published]
|
|
5
|
+
jobs:
|
|
6
|
+
build:
|
|
7
|
+
runs-on: ubuntu-latest
|
|
8
|
+
steps:
|
|
9
|
+
- uses: actions/checkout@v4
|
|
10
|
+
- uses: astral-sh/setup-uv@v5
|
|
11
|
+
- name: Verify tag matches package version
|
|
12
|
+
run: |
|
|
13
|
+
TAG="${GITHUB_REF_NAME#v}"
|
|
14
|
+
VER=$(python3 -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
|
|
15
|
+
if [ "$TAG" != "$VER" ]; then
|
|
16
|
+
echo "Release tag '$TAG' != pyproject version '$VER'"; exit 1
|
|
17
|
+
fi
|
|
18
|
+
- run: uv build
|
|
19
|
+
- uses: actions/upload-artifact@v4
|
|
20
|
+
with:
|
|
21
|
+
name: dist
|
|
22
|
+
path: dist/
|
|
23
|
+
pypi-publish:
|
|
24
|
+
needs: build
|
|
25
|
+
runs-on: ubuntu-latest
|
|
26
|
+
environment: pypi
|
|
27
|
+
permissions:
|
|
28
|
+
id-token: write
|
|
29
|
+
steps:
|
|
30
|
+
- uses: actions/download-artifact@v4
|
|
31
|
+
with:
|
|
32
|
+
name: dist
|
|
33
|
+
path: dist/
|
|
34
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.11.0] - 2026-05-29
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- `fetch(extract=true)` — opt-in unpacking of downloaded zip/tar archives,
|
|
8
|
+
guarded against path-traversal and runaway extracted size.
|
|
9
|
+
- `fetch` integrity check — an unverified `pdf`/`xml` download whose body is
|
|
10
|
+
HTML (a login/paywall page) now fails loud instead of saving a bogus file.
|
|
11
|
+
- `resolve` of a Zenodo DOI via DataCite now populates `files[]` (delegates to
|
|
12
|
+
the native Zenodo adapter); such ids are fetchable.
|
|
13
|
+
- BioProject `resolve` attaches `links[]` to its SRA runs.
|
|
14
|
+
- PubMed `resolve` populates the article abstract (`description`) and, for
|
|
15
|
+
PMC open-access records, `access`/`license` (from EuropePMC/Unpaywall).
|
|
16
|
+
- `list_sources` reports per-source fetchability, id examples, and the
|
|
17
|
+
`organism` filter.
|
|
18
|
+
|
|
19
|
+
### Fixed
|
|
20
|
+
|
|
21
|
+
- HTTP boundary now fully honors the fail-loud contract: transport-level errors
|
|
22
|
+
(connect/read/timeout) and malformed HTTP-200 bodies (NCBI throttle envelopes)
|
|
23
|
+
are retried and surface as a typed `DataAggregatorError`, on both the
|
|
24
|
+
search/resolve path (`_http`) and the `fetch` streaming path.
|
|
25
|
+
|
|
26
|
+
## [0.10.0] - 2026-05-29
|
|
27
|
+
|
|
28
|
+
### Added
|
|
29
|
+
|
|
30
|
+
- Literature `resolve` (`pubmed:`/`openaire:`) attaches an open-access full-text
|
|
31
|
+
file via an EuropePMC `fullTextXML` → Unpaywall `url_for_pdf` cascade (first
|
|
32
|
+
hit wins; `FileEntry.source` labels the origin). Enrichment — fails soft.
|
|
33
|
+
- `DataResource.identifiers` — normalized `{pmid, pmcid, doi}` cross-identifiers.
|
|
34
|
+
PubMed gets them free from esummary; OpenAIRE via the NCBI ID Converter.
|
|
35
|
+
- `FileEntry.source` — provenance label for an attached file.
|
|
36
|
+
- `pubmed:`/`openaire:` are now fetchable: `fetch` streams open-access full text
|
|
37
|
+
(unverified — no upstream checksum, like GEO). Fails loud when a paper has no
|
|
38
|
+
open full text.
|
|
39
|
+
|
|
40
|
+
### Changed
|
|
41
|
+
|
|
42
|
+
- New env var `UNPAYWALL_EMAIL` enables the Unpaywall fallback leg (the EuropePMC
|
|
43
|
+
leg needs no key). `NCBI_EMAIL`/`UNPAYWALL_EMAIL` is forwarded to NCBI idconv.
|
|
44
|
+
|
|
45
|
+
### Notes
|
|
46
|
+
|
|
47
|
+
- Cascade deviates from the umbrella spec's literal "PMC → EuropePMC → Unpaywall":
|
|
48
|
+
PMC's machine download is tgz-over-FTP (not HTTPS-fetchable), and EuropePMC
|
|
49
|
+
already serves the PMC OA subset as HTTPS XML — so the dedicated PMC leg is
|
|
50
|
+
dropped. Honors the spec's intent (open full text, first hit wins).
|
|
51
|
+
- No MeSH (ceded to the openalex MCP). Full text is open-access only; paywalled
|
|
52
|
+
content is never bypassed.
|
|
53
|
+
|
|
54
|
+
## [0.9.0] - 2026-05-29
|
|
55
|
+
|
|
56
|
+
### Added
|
|
57
|
+
|
|
58
|
+
- `resolve(id, cite=<format>)` renders a citation onto the record — `bibtex`,
|
|
59
|
+
`ris`, `csl-json`, or any CSL style name (`apa`, `mla`, `vancouver`, …). DOI
|
|
60
|
+
records use DOI content negotiation (CrossRef + DataCite); non-DOI records
|
|
61
|
+
produce CSL-JSON from metadata. Default off; failures degrade quietly.
|
|
62
|
+
- `DataResource.access` — normalized access status
|
|
63
|
+
(`open`/`embargoed`/`restricted`/`closed`/`unknown`), populated from Zenodo
|
|
64
|
+
`access_right`, OpenAIRE `bestAccessRight`, and an open-license signal on
|
|
65
|
+
DataCite rights.
|
|
66
|
+
- `DataResource.citation` — holds the rendered citation when `cite=` is used.
|
|
67
|
+
|
|
68
|
+
### Changed
|
|
69
|
+
|
|
70
|
+
- OpenAIRE records now carry `license` (from the deposit instance) and `access`.
|
|
71
|
+
|
|
72
|
+
### Notes
|
|
73
|
+
|
|
74
|
+
- PMC license/access for `pubmed:` records is deferred to Phase 9 (bundled with
|
|
75
|
+
PMC full-text retrieval). GEO/SRA/BioProject expose no rights → `access` stays
|
|
76
|
+
null honestly.
|
|
77
|
+
|
|
78
|
+
## [0.8.0] - 2026-05-29
|
|
79
|
+
|
|
80
|
+
### Added
|
|
81
|
+
|
|
82
|
+
- DataCite-repo fetch: resolving a DataCite DOI now attaches `files[]` from the
|
|
83
|
+
host repo's native API — **Figshare** (md5), **Dataverse** (Harvard default,
|
|
84
|
+
`DATAVERSE_BASE_URL` override; md5), **OSF** (osfstorage, paginated; md5), all
|
|
85
|
+
fetchable and checksum-verified. **Dryad** is manifest-only (names/sizes/
|
|
86
|
+
sha-256) — its downloads are token/bot-challenge gated, so it is excluded from
|
|
87
|
+
the fetch allowlist and fetching a Dryad DOI fails loud.
|
|
88
|
+
- New per-repo resolver modules: `figshare.py`, `dataverse.py`, `osf.py`, `dryad.py`.
|
|
89
|
+
|
|
90
|
+
### Changed
|
|
91
|
+
|
|
92
|
+
- The fetch allowlist accepts `datacite:` ids; fetchability is then decided
|
|
93
|
+
post-resolve from the detected host repo (`_DATACITE_FETCHABLE`).
|
|
94
|
+
|
|
95
|
+
### Fixed
|
|
96
|
+
|
|
97
|
+
- DataCite source detection now recognizes Harvard Dataverse (client id
|
|
98
|
+
`gdcc.harvard-dv`, which contains no "dataverse" substring).
|
|
99
|
+
|
|
100
|
+
## [0.7.0] - 2026-05-29
|
|
101
|
+
|
|
102
|
+
### Added
|
|
103
|
+
|
|
104
|
+
- Omics fetch: `fetch` now downloads SRA FASTQ files (via the ENA manifest,
|
|
105
|
+
md5-verified) and GEO supplementary files (parsed from the GEO `suppl/`
|
|
106
|
+
directory index; unverified — NCBI exposes no checksums there).
|
|
107
|
+
- New `geo.py` supplementary-file resolver; `geo:` resolves now populate
|
|
108
|
+
`files[]`. A GEO record with no `suppl/` directory (HTTP 404) degrades to
|
|
109
|
+
`files=[]` rather than failing.
|
|
110
|
+
|
|
111
|
+
### Changed
|
|
112
|
+
|
|
113
|
+
- The `fetch` tool resolves through `router.resolve` (source-agnostic) instead
|
|
114
|
+
of a hardcoded Zenodo path; the `_FETCHABLE_SOURCES` allowlist now includes
|
|
115
|
+
`sra:` and `geo:`.
|
|
116
|
+
|
|
117
|
+
## [0.6.0] - 2026-05-29
|
|
118
|
+
|
|
119
|
+
### Added
|
|
120
|
+
|
|
121
|
+
- Packaging for publication: `python -m data_aggregator_mcp` entry point,
|
|
122
|
+
complete `[project.urls]` + `keywords`, Beta classifier.
|
|
123
|
+
- `server.json` for the official MCP registry
|
|
124
|
+
(`io.github.musharna/data-aggregator-mcp`) + the `mcp-name:` ownership marker
|
|
125
|
+
in the README.
|
|
126
|
+
- GitHub Actions: `ci.yml` (pytest + ruff, Python 3.11/3.12) and `publish.yml`
|
|
127
|
+
(Release-triggered PyPI upload via OIDC trusted publishing — no stored token).
|
|
128
|
+
- `PUBLISH.md` runbook and user-facing install/use docs (`uvx`, `pip`,
|
|
129
|
+
`claude mcp add`).
|
|
130
|
+
|
|
131
|
+
### Notes
|
|
132
|
+
|
|
133
|
+
- Prepare-to-the-gate: the public GitHub repo, the real PyPI upload, and the
|
|
134
|
+
registry submission are documented manual steps, not executed here.
|
|
135
|
+
- HTTP transport remains deferred — distribution is local stdio via PyPI/`uvx`.
|
|
136
|
+
|
|
137
|
+
## [0.5.0] - 2026-05-28
|
|
138
|
+
|
|
139
|
+
### Added
|
|
140
|
+
|
|
141
|
+
- Unifying layer: NCBI-Taxonomy-backed **synonym expansion** on `search`. New
|
|
142
|
+
optional `organism` param — resolves to a taxid and ANDs the query with the
|
|
143
|
+
canonical name + synonyms (e.g. `Orobanche aegyptiaca` also matches
|
|
144
|
+
`Phelipanche aegyptiaca`, taxid 99112). The expansion is echoed in
|
|
145
|
+
`SearchResult.taxon_expansion`.
|
|
146
|
+
- **Organism normalization**: results/resolved records gain `taxa[]`
|
|
147
|
+
(`{taxid, name}`) derived from raw `organism[]` via NCBI Taxonomy; raw strings
|
|
148
|
+
are preserved.
|
|
149
|
+
- **Cross-links**: a `described_in` → `plant-genomics:taxid:<n>` link is attached
|
|
150
|
+
for Viridiplantae (plant) taxa, the seam to the sibling `plant-genomics-mcp`.
|
|
151
|
+
|
|
152
|
+
### Notes
|
|
153
|
+
|
|
154
|
+
- No new search source and no new tool — Phase 5 is a taxonomy module plus a
|
|
155
|
+
post-merge enrichment pass. `fetch` is unchanged (Zenodo-only).
|
|
156
|
+
- Enrichment incurs zero taxonomy calls for records without an organism. A
|
|
157
|
+
taxonomy outage surfaces in `errors["taxonomy"]` on `search` (never silently
|
|
158
|
+
dropped) and degrades gracefully on `resolve`.
|
|
159
|
+
|
|
160
|
+
## [0.4.0] - 2026-05-28
|
|
161
|
+
|
|
162
|
+
### Added
|
|
163
|
+
|
|
164
|
+
- Unified `literature` source: PubMed + OpenAIRE publication discovery, fanned
|
|
165
|
+
out in parallel and merged. Registered as a fourth `search` source.
|
|
166
|
+
- Resolve-time paper→data links: resolving a `pubmed:` id attaches `links[]` to
|
|
167
|
+
`sra:`/`geo:`/`bioproject:` ids via NCBI elink; resolving an `openaire:` id
|
|
168
|
+
attaches `datacite:` links via the ScholeXplorer Scholix API. Publication↔
|
|
169
|
+
publication citation edges are dropped — that is the standalone openalex MCP's
|
|
170
|
+
job, not ours.
|
|
171
|
+
|
|
172
|
+
### Notes
|
|
173
|
+
|
|
174
|
+
- Literature is discovery-only — `fetch` stays Zenodo-only and fails loud for
|
|
175
|
+
`pubmed:`/`openaire:` ids.
|
|
176
|
+
- OpenAIRE paper→dataset Scholix links are sparse (most paper edges are
|
|
177
|
+
citations, which are dropped); the PubMed→GEO/SRA elink path is the reliable
|
|
178
|
+
paper→data bridge. OpenAIRE's contribution is discovery breadth.
|
|
179
|
+
|
|
180
|
+
## [0.3.0] - 2026-05-28
|
|
181
|
+
|
|
182
|
+
### Added
|
|
183
|
+
|
|
184
|
+
- Unified NCBI omics source: GEO + SRA + BioProject discovery via E-utilities,
|
|
185
|
+
fanned out internally and merged. Registered as a third `search` source.
|
|
186
|
+
- ENA filereport FASTQ manifest attached on `resolve` of an `sra:` id (direct
|
|
187
|
+
https URLs).
|
|
188
|
+
- Optional `NCBI_API_KEY` env var to raise the NCBI rate limit (3→10 req/s).
|
|
189
|
+
- Shared round-robin `_merge.interleave` (extracted from the router) so the
|
|
190
|
+
omics fan-out reuses fair merging.
|
|
191
|
+
|
|
192
|
+
### Notes
|
|
193
|
+
|
|
194
|
+
- Omics fetch is deferred — `fetch` remains Zenodo-only and fails loud for omics
|
|
195
|
+
ids. GEO/BioProject are discovery-only (no file manifest in this phase).
|
|
196
|
+
|
|
197
|
+
## [0.2.0] - 2026-05-28
|
|
198
|
+
|
|
199
|
+
### Added
|
|
200
|
+
|
|
201
|
+
- DataCite discovery adapter — one query spans every DataCite client (Dryad,
|
|
202
|
+
Figshare, Dataverse, OSF, Mendeley, …); metadata-only, so resources carry no
|
|
203
|
+
file manifest.
|
|
204
|
+
- Multi-source router: `search` fans out across Zenodo + DataCite in parallel,
|
|
205
|
+
round-robin merges results so the page limit never starves a later source,
|
|
206
|
+
dedups by DOI (native fetch backends win over DataCite metadata), and surfaces
|
|
207
|
+
per-source failures in `errors{}` instead of silently dropping a backend.
|
|
208
|
+
- `search` `sources` filter to restrict fan-out (e.g. `["datacite"]`).
|
|
209
|
+
- Shared `compact()` helper in `models` (extracted from the Zenodo adapter).
|
|
210
|
+
|
|
211
|
+
### Changed
|
|
212
|
+
|
|
213
|
+
- `resolve` routes by id shape (`zenodo:` / bare id / `datacite:` / bare DOI).
|
|
214
|
+
- `fetch` is Zenodo-only in Phase 2 and fails loud (`FetchNotSupportedError`)
|
|
215
|
+
for discovery-only sources; per-repo fetch adapters come in a later phase.
|
|
216
|
+
|
|
217
|
+
## [0.1.0] - 2026-05-28
|
|
218
|
+
|
|
219
|
+
### Added
|
|
220
|
+
|
|
221
|
+
- Initial MCP server: search/resolve/fetch/list_sources over Zenodo.
|
|
222
|
+
- Normalized DataResource model; stream-to-disk fetch with max_bytes guard,
|
|
223
|
+
checksum verification, and provenance sidecar.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jaret Arnold
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: data-aggregator-mcp
|
|
3
|
+
Version: 0.11.0
|
|
4
|
+
Summary: Research-data acquisition MCP — find and fetch datasets across archives, omics registries, and literature
|
|
5
|
+
Project-URL: Homepage, https://github.com/musharna/data-aggregator-mcp
|
|
6
|
+
Project-URL: Repository, https://github.com/musharna/data-aggregator-mcp
|
|
7
|
+
Project-URL: Issues, https://github.com/musharna/data-aggregator-mcp/issues
|
|
8
|
+
Author-email: Jaret Arnold <mjarnold1998@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: bioinformatics,datacite,datasets,geo,mcp,model-context-protocol,ncbi,research-data,sra,zenodo
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Requires-Dist: httpx>=0.27
|
|
18
|
+
Requires-Dist: mcp>=1.0
|
|
19
|
+
Requires-Dist: pydantic>=2.6
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
22
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
23
|
+
Requires-Dist: pytest-httpx>=0.30; extra == 'dev'
|
|
24
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
25
|
+
Requires-Dist: pyyaml>=6.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: ruff>=0.3; extra == 'dev'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# 🔎 data-aggregator-mcp
|
|
30
|
+
|
|
31
|
+
**One MCP server to find and fetch research data across archives, omics
|
|
32
|
+
registries, and literature — behind a single normalized model.**
|
|
33
|
+
|
|
34
|
+
[PyPI version](https://pypi.org/project/data-aggregator-mcp/)
|
|
35
|
+
[Python versions](https://pypi.org/project/data-aggregator-mcp/)
|
|
36
|
+
[License: MIT](LICENSE)
|
|
37
|
+
[CI](https://github.com/musharna/data-aggregator-mcp/actions/workflows/ci.yml)
|
|
38
|
+
|
|
39
|
+
`search` one query across **Zenodo, DataCite** (Dryad / Figshare / Dataverse /
|
|
40
|
+
OSF / Mendeley), **NCBI omics** (GEO / SRA / BioProject), and **literature**
|
|
41
|
+
(PubMed / OpenAIRE) — deduplicated, normalized, and cross-linked. `resolve` any
|
|
42
|
+
hit to its file manifest, citation, and the data it points at. `fetch` it to
|
|
43
|
+
disk with checksum verification.
|
|
44
|
+
|
|
45
|
+
mcp-name: io.github.musharna/data-aggregator-mcp
|
|
46
|
+
|
|
47
|
+
<p align="center">
|
|
48
|
+
<img src="examples/assets/demo.svg"
|
|
49
|
+
alt="data-aggregator-mcp stdio demo — initialize, tools/list (search, resolve, fetch, list_sources), and a live list_sources call showing the four wired sources"
|
|
50
|
+
width="820">
|
|
51
|
+
</p>
|
|
52
|
+
|
|
53
|
+
## ✨ Why this
|
|
54
|
+
|
|
55
|
+
Most data MCPs wrap a single source. This one **unifies** them behind four tools
|
|
56
|
+
and one `DataResource` model, so an agent searches once and gets back comparable
|
|
57
|
+
records:
|
|
58
|
+
|
|
59
|
+
- **Multi-domain, one model** — generalist archives + raw omics + literature,
|
|
60
|
+
deduplicated by DOI (the fetchable record wins over bare metadata).
|
|
61
|
+
- **Taxonomy synonym expansion** — `organism="Orobanche aegyptiaca"` also matches
|
|
62
|
+
`Phelipanche aegyptiaca` (NCBI Taxonomy), so a species rename doesn't cost you
|
|
63
|
+
results.
|
|
64
|
+
- **Paper → data bridge** — resolve a paper and get links to the GEO / SRA /
|
|
65
|
+
BioProject / DataCite records it produced.
|
|
66
|
+
- **Verified fetch** — streams to disk with md5 verification where the source
|
|
67
|
+
exposes a checksum, optional archive unpacking, and a fail-loud integrity
|
|
68
|
+
sniff that rejects an HTML paywall page served as a "PDF".
|
|
69
|
+
- **Citations, access & full text** — render a citation in any CSL style, get
|
|
70
|
+
normalized access/license, and pull open-access full text — all in one
|
|
71
|
+
`resolve`.
|
|
72
|
+
|
|
73
|
+
## ⚡ Quickstart
|
|
74
|
+
|
|
75
|
+
Run with no install:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
uvx data-aggregator-mcp
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Register with Claude Code:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
claude mcp add data-aggregator -- uvx data-aggregator-mcp
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
A typical agent flow:
|
|
88
|
+
|
|
89
|
+
```text
|
|
90
|
+
search("drought stress RNA-seq", organism="Sorghum bicolor")
|
|
91
|
+
→ [ geo:GSE..., sra:SRX..., zenodo:..., pubmed:... ] # deduped, taxa-normalized
|
|
92
|
+
|
|
93
|
+
resolve("sra:SRX079566")
|
|
94
|
+
→ DataResource{ files: [ENA FASTQ urls…], access: "open", taxa: [...] }
|
|
95
|
+
|
|
96
|
+
fetch("sra:SRX079566", dest="./data")
|
|
97
|
+
→ ["./data/SRX079566_1.fastq.gz", …] # md5-verified
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
<details>
|
|
101
|
+
<summary>Other ways to run (pip, python -m, raw client config)</summary>
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pip install data-aggregator-mcp
|
|
105
|
+
data-aggregator-mcp # or: python -m data_aggregator_mcp
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Add to a client's MCP config (e.g. Claude Desktop `claude_desktop_config.json`):
|
|
109
|
+
|
|
110
|
+
```json
|
|
111
|
+
{
|
|
112
|
+
"mcpServers": {
|
|
113
|
+
"data-aggregator": {
|
|
114
|
+
"command": "uvx",
|
|
115
|
+
"args": ["data-aggregator-mcp"],
|
|
116
|
+
"env": { "NCBI_API_KEY": "your-optional-key" }
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
</details>
|
|
123
|
+
|
|
124
|
+
## 🗂️ Sources
|
|
125
|
+
|
|
126
|
+
| Source | Discover | Fetch | Checksum |
|
|
127
|
+
| ---------------------------- | :------: | :---------------: | :--------------: |
|
|
128
|
+
| Zenodo | ✅ | ✅ | md5 |
|
|
129
|
+
| DataCite → Figshare | ✅ | ✅ | md5 |
|
|
130
|
+
| DataCite → Dataverse | ✅ | ✅ | md5 |
|
|
131
|
+
| DataCite → OSF | ✅ | ✅ | md5 |
|
|
132
|
+
| DataCite → Dryad | ✅ | manifest only¹ | sha-256 (listed) |
|
|
133
|
+
| DataCite → Mendeley & others | ✅ | — | — |
|
|
134
|
+
| NCBI SRA | ✅ | ✅ (ENA FASTQ) | md5 |
|
|
135
|
+
| NCBI GEO | ✅ | ✅ (`suppl/`) | none² |
|
|
136
|
+
| NCBI BioProject | ✅ | → SRA links | — |
|
|
137
|
+
| PubMed / OpenAIRE | ✅ | ✅ (OA full text) | none² |
|
|
138
|
+
|
|
139
|
+
¹ Dryad downloads are token / bot-challenge gated, so `fetch` fails loud;
|
|
140
|
+
`resolve` still lists the files.
|
|
141
|
+
² No upstream checksum — `fetch` verifies content-type instead (rejects an HTML
|
|
142
|
+
page served in place of a binary).
|
|
143
|
+
|
|
144
|
+
## 🛠️ Tools
|
|
145
|
+
|
|
146
|
+
### `search(query, size?, sources?, organism?)`
|
|
147
|
+
|
|
148
|
+
Fan out across all wired sources in parallel and return compact `DataResource`
|
|
149
|
+
records, deduped by DOI. Per-source failures land in `errors{}` — never silently
|
|
150
|
+
dropped.
|
|
151
|
+
|
|
152
|
+
- `organism` — expand the query with NCBI-Taxonomy synonyms; the expansion is
|
|
153
|
+
echoed in `taxon_expansion`, and results carry normalized `taxa[]`
|
|
154
|
+
(`{taxid, name}`) plus a `described_in` link to plant-genomics-mcp for plant
|
|
155
|
+
taxa.
|
|
156
|
+
- `sources` — restrict the fan-out, e.g. `["omics"]`.
|
|
157
|
+
- `size` — max results (1–50).
|
|
158
|
+
|
|
159
|
+
### `resolve(id, cite?)`
|
|
160
|
+
|
|
161
|
+
Full record + files manifest. Routes by id shape — `zenodo:7654321`, a bare DOI,
|
|
162
|
+
`datacite:10.5061/dryad.x`, an omics id (`sra:SRX079566`, `geo:GSE332789`,
|
|
163
|
+
`bioproject:PRJNA1468572`), or a literature id (`pubmed:34320281`,
|
|
164
|
+
`openaire:<id>`). Attaches, where available:
|
|
165
|
+
|
|
166
|
+
- **`files[]`** — ENA FASTQ manifest (SRA), GEO `suppl/`, or the host repo's
|
|
167
|
+
native manifest (Figshare / Dataverse / OSF / Dryad).
|
|
168
|
+
- **`links[]`** — paper → data: `pubmed:` → `sra:` / `geo:` / `bioproject:` (NCBI
|
|
169
|
+
elink); `openaire:` → `datacite:` (ScholeXplorer Scholix).
|
|
170
|
+
- **`access` / `license`** — normalized status
|
|
171
|
+
(`open` / `embargoed` / `restricted` / `closed` / `unknown`) and license where
|
|
172
|
+
the source exposes it.
|
|
173
|
+
- **`identifiers`** — normalized `{pmid, pmcid, doi}`, plus an open-access
|
|
174
|
+
full-text `FileEntry` (EuropePMC XML, or an Unpaywall PDF fallback) for papers.
|
|
175
|
+
- **`citation`** — pass `cite=<format>`: `bibtex`, `ris`, `csl-json`, or any CSL
|
|
176
|
+
style name (`apa`, `mla`, `vancouver`, …). DOI records use content
|
|
177
|
+
negotiation; others render CSL-JSON from metadata. Off by default; failures
|
|
178
|
+
degrade quietly.
|
|
179
|
+
|
|
180
|
+
### `fetch(id, dest?, files?, max_bytes?, force?, extract?)`
|
|
181
|
+
|
|
182
|
+
Download files to disk and return their paths. Streams under a `max_bytes` guard
|
|
183
|
+
(`force` to override) with md5 verification wherever a checksum exists.
|
|
184
|
+
|
|
185
|
+
- `files` — restrict to a subset of the resolved manifest.
|
|
186
|
+
- `extract` — unpack downloaded zip / tar archives in place, guarded against
|
|
187
|
+
path traversal and runaway extracted size. Off by default.
|
|
188
|
+
- Unverified fetches (GEO `suppl/`, literature full text) get a content-type
|
|
189
|
+
sniff that fails loud if a declared binary is actually an HTML page.
|
|
190
|
+
- Fetchable: **Zenodo**, **SRA**, **GEO**, DataCite-hosted **Figshare** /
|
|
191
|
+
**Dataverse** / **OSF**, and **literature** open-access full text. **Dryad**
|
|
192
|
+
and other DataCite repos are discovery-only and raise
|
|
193
|
+
`FetchNotSupportedError`.
|
|
194
|
+
|
|
195
|
+
### `list_sources()`
|
|
196
|
+
|
|
197
|
+
Wired sources with their capabilities — layer, kinds, supported filters,
|
|
198
|
+
fetchability, id examples, auth, and rate limits.
|
|
199
|
+
|
|
200
|
+
## ⚙️ Configuration
|
|
201
|
+
|
|
202
|
+
Both settings are optional and are set via environment variables:
|
|
203
|
+
|
|
204
|
+
- `NCBI_API_KEY` — raises the NCBI E-utilities rate limit (3 → 10 req/s) used by
|
|
205
|
+
the omics, literature, and taxonomy lookups.
|
|
206
|
+
- `UNPAYWALL_EMAIL` — enables the Unpaywall fallback leg of literature full-text
|
|
207
|
+
retrieval (the EuropePMC leg works without it).
|
|
208
|
+
|
|
209
|
+
## 🧪 Develop
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
uv venv && uv pip install -e ".[dev]"
|
|
213
|
+
uv run pytest -q
|
|
214
|
+
uv run ruff check src tests
|
|
215
|
+
DATA_AGGREGATOR_MCP_LIVE=1 uv run pytest -k live -q # real-API probes
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
The README demo (`examples/assets/demo.svg`) is recorded network-free from
|
|
219
|
+
`examples/_demo_stdio.py` — see the header of that file to re-record.
|
|
220
|
+
|
|
221
|
+
## License
|
|
222
|
+
|
|
223
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Publishing data-aggregator-mcp
|
|
2
|
+
|
|
3
|
+
Every step below is **outward-facing and irreversible** (PyPI versions cannot be
|
|
4
|
+
re-uploaded or deleted; the PyPI project name and the registry name are
|
|
5
|
+
permanent). The repo is prepared to the gate — nothing here has been executed.
|
|
6
|
+
Run these manually when ready to ship a release.
|
|
7
|
+
|
|
8
|
+
## One-time setup
|
|
9
|
+
|
|
10
|
+
### 1. Create the public GitHub repo
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
gh repo create musharna/data-aggregator-mcp --public --source=. --remote=origin --push
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
### 2. Configure PyPI trusted publisher (no token needed)
|
|
17
|
+
|
|
18
|
+
On https://pypi.org → your account → _Publishing_ → _Add a pending trusted publisher_:
|
|
19
|
+
|
|
20
|
+
- PyPI Project Name: `data-aggregator-mcp`
|
|
21
|
+
- Owner: `musharna`
|
|
22
|
+
- Repository name: `data-aggregator-mcp`
|
|
23
|
+
- Workflow name: `publish.yml`
|
|
24
|
+
- Environment name: `pypi`
|
|
25
|
+
|
|
26
|
+
The first OIDC publish (step 3) creates the project automatically.
|
|
27
|
+
|
|
28
|
+
## Per release
|
|
29
|
+
|
|
30
|
+
### 3. Cut the release (fires `.github/workflows/publish.yml`)
|
|
31
|
+
|
|
32
|
+
Set the version in **all three** places to the release value —
|
|
33
|
+
`pyproject.toml`, `src/data_aggregator_mcp/__init__.py`, and `server.json`
|
|
34
|
+
(both top-level `version` and `packages[0].version`). The tree is already at
|
|
35
|
+
`0.11.0` for the first release, so no bump is needed there — just tag. Then:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
git tag v0.11.0
|
|
39
|
+
git push origin main --tags
|
|
40
|
+
gh release create v0.11.0 --title v0.11.0 --notes-from-tag
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
The publish workflow verifies the tag matches the package version, builds the
|
|
44
|
+
wheel + sdist, and uploads to PyPI via OIDC trusted publishing.
|
|
45
|
+
|
|
46
|
+
### 4. Submit to the official MCP registry (after the PyPI release is live)
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# install the publisher CLI (see modelcontextprotocol/registry releases)
|
|
50
|
+
mcp-publisher login github # GitHub OAuth device flow; grants the io.github.musharna/* namespace
|
|
51
|
+
mcp-publisher publish # reads server.json; validates the README mcp-name marker
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
The registry fetches `https://pypi.org/pypi/data-aggregator-mcp/json` and
|
|
55
|
+
confirms the `mcp-name: io.github.musharna/data-aggregator-mcp` marker is present
|
|
56
|
+
in the published description — so the PyPI release in step 3 must land first.
|
|
57
|
+
|
|
58
|
+
## Future enhancement (not built)
|
|
59
|
+
|
|
60
|
+
Registry submission can be automated in GitHub Actions via OIDC (a separate
|
|
61
|
+
`mcp-publisher` CI step). Left manual here by design.
|