m4-infra 0.0.0.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- m4_infra-0.0.0.dev0/LICENSE +21 -0
- m4_infra-0.0.0.dev0/PKG-INFO +324 -0
- m4_infra-0.0.0.dev0/README.md +279 -0
- m4_infra-0.0.0.dev0/pyproject.toml +134 -0
- m4_infra-0.0.0.dev0/src/m4/__init__.py +54 -0
- m4_infra-0.0.0.dev0/src/m4/api.py +346 -0
- m4_infra-0.0.0.dev0/src/m4/auth.py +433 -0
- m4_infra-0.0.0.dev0/src/m4/cli.py +882 -0
- m4_infra-0.0.0.dev0/src/m4/config.py +252 -0
- m4_infra-0.0.0.dev0/src/m4/console.py +325 -0
- m4_infra-0.0.0.dev0/src/m4/core/__init__.py +28 -0
- m4_infra-0.0.0.dev0/src/m4/core/backends/__init__.py +98 -0
- m4_infra-0.0.0.dev0/src/m4/core/backends/base.py +202 -0
- m4_infra-0.0.0.dev0/src/m4/core/backends/bigquery.py +345 -0
- m4_infra-0.0.0.dev0/src/m4/core/backends/duckdb.py +272 -0
- m4_infra-0.0.0.dev0/src/m4/core/datasets.py +283 -0
- m4_infra-0.0.0.dev0/src/m4/core/exceptions.py +152 -0
- m4_infra-0.0.0.dev0/src/m4/core/serialization.py +146 -0
- m4_infra-0.0.0.dev0/src/m4/core/tools/__init__.py +106 -0
- m4_infra-0.0.0.dev0/src/m4/core/tools/base.py +160 -0
- m4_infra-0.0.0.dev0/src/m4/core/tools/management.py +170 -0
- m4_infra-0.0.0.dev0/src/m4/core/tools/notes.py +350 -0
- m4_infra-0.0.0.dev0/src/m4/core/tools/registry.py +332 -0
- m4_infra-0.0.0.dev0/src/m4/core/tools/tabular.py +215 -0
- m4_infra-0.0.0.dev0/src/m4/core/validation.py +217 -0
- m4_infra-0.0.0.dev0/src/m4/data_io.py +531 -0
- m4_infra-0.0.0.dev0/src/m4/mcp_client_configs/__init__.py +6 -0
- m4_infra-0.0.0.dev0/src/m4/mcp_client_configs/dynamic_mcp_config.py +500 -0
- m4_infra-0.0.0.dev0/src/m4/mcp_client_configs/setup_claude_desktop.py +322 -0
- m4_infra-0.0.0.dev0/src/m4/mcp_server.py +515 -0
- m4_infra-0.0.0.dev0/src/m4/skills/SKILLS_INDEX.md +132 -0
- m4_infra-0.0.0.dev0/src/m4/skills/__init__.py +21 -0
- m4_infra-0.0.0.dev0/src/m4/skills/apsiii-score/SKILL.md +147 -0
- m4_infra-0.0.0.dev0/src/m4/skills/apsiii-score/scripts/apsiii.sql +894 -0
- m4_infra-0.0.0.dev0/src/m4/skills/baseline-creatinine/SKILL.md +145 -0
- m4_infra-0.0.0.dev0/src/m4/skills/baseline-creatinine/scripts/creatinine_baseline.sql +71 -0
- m4_infra-0.0.0.dev0/src/m4/skills/clinical-research-pitfalls/SKILL.md +242 -0
- m4_infra-0.0.0.dev0/src/m4/skills/first-icu-stay/SKILL.md +193 -0
- m4_infra-0.0.0.dev0/src/m4/skills/first-icu-stay/scripts/icustay_detail.sql +47 -0
- m4_infra-0.0.0.dev0/src/m4/skills/gcs-calculation/SKILL.md +161 -0
- m4_infra-0.0.0.dev0/src/m4/skills/gcs-calculation/scripts/gcs.sql +127 -0
- m4_infra-0.0.0.dev0/src/m4/skills/installer.py +197 -0
- m4_infra-0.0.0.dev0/src/m4/skills/kdigo-aki-staging/SKILL.md +207 -0
- m4_infra-0.0.0.dev0/src/m4/skills/kdigo-aki-staging/scripts/kdigo_creatinine.sql +63 -0
- m4_infra-0.0.0.dev0/src/m4/skills/kdigo-aki-staging/scripts/kdigo_stages.sql +153 -0
- m4_infra-0.0.0.dev0/src/m4/skills/kdigo-aki-staging/scripts/kdigo_uo.sql +112 -0
- m4_infra-0.0.0.dev0/src/m4/skills/lods-score/SKILL.md +157 -0
- m4_infra-0.0.0.dev0/src/m4/skills/lods-score/scripts/lods.sql +230 -0
- m4_infra-0.0.0.dev0/src/m4/skills/m4-api/SKILL.md +134 -0
- m4_infra-0.0.0.dev0/src/m4/skills/mimic-eicu-mapping/SKILL.md +205 -0
- m4_infra-0.0.0.dev0/src/m4/skills/mimic-table-relationships/SKILL.md +193 -0
- m4_infra-0.0.0.dev0/src/m4/skills/oasis-score/SKILL.md +131 -0
- m4_infra-0.0.0.dev0/src/m4/skills/oasis-score/scripts/oasis.sql +287 -0
- m4_infra-0.0.0.dev0/src/m4/skills/sapsii-score/SKILL.md +134 -0
- m4_infra-0.0.0.dev0/src/m4/skills/sapsii-score/scripts/sapsii.sql +549 -0
- m4_infra-0.0.0.dev0/src/m4/skills/sepsis-3-cohort/SKILL.md +152 -0
- m4_infra-0.0.0.dev0/src/m4/skills/sepsis-3-cohort/scripts/sepsis3.sql +80 -0
- m4_infra-0.0.0.dev0/src/m4/skills/sirs-criteria/SKILL.md +156 -0
- m4_infra-0.0.0.dev0/src/m4/skills/sirs-criteria/scripts/sirs.sql +100 -0
- m4_infra-0.0.0.dev0/src/m4/skills/sofa-score/SKILL.md +138 -0
- m4_infra-0.0.0.dev0/src/m4/skills/sofa-score/scripts/sofa.sql +379 -0
- m4_infra-0.0.0.dev0/src/m4/skills/suspicion-of-infection/SKILL.md +158 -0
- m4_infra-0.0.0.dev0/src/m4/skills/suspicion-of-infection/scripts/suspicion_of_infection.sql +175 -0
- m4_infra-0.0.0.dev0/src/m4/skills/vasopressor-equivalents/SKILL.md +158 -0
- m4_infra-0.0.0.dev0/src/m4/skills/vasopressor-equivalents/scripts/norepinephrine_equivalent_dose.sql +33 -0
- m4_infra-0.0.0.dev0/tests/core/__init__.py +1 -0
- m4_infra-0.0.0.dev0/tests/core/backends/__init__.py +1 -0
- m4_infra-0.0.0.dev0/tests/core/backends/test_base.py +140 -0
- m4_infra-0.0.0.dev0/tests/core/backends/test_bigquery.py +311 -0
- m4_infra-0.0.0.dev0/tests/core/backends/test_duckdb.py +472 -0
- m4_infra-0.0.0.dev0/tests/core/backends/test_factory.py +120 -0
- m4_infra-0.0.0.dev0/tests/core/test_datasets.py +202 -0
- m4_infra-0.0.0.dev0/tests/core/test_validation.py +348 -0
- m4_infra-0.0.0.dev0/tests/core/tools/__init__.py +1 -0
- m4_infra-0.0.0.dev0/tests/core/tools/test_base.py +222 -0
- m4_infra-0.0.0.dev0/tests/core/tools/test_management.py +367 -0
- m4_infra-0.0.0.dev0/tests/core/tools/test_registry.py +449 -0
- m4_infra-0.0.0.dev0/tests/core/tools/test_tabular.py +281 -0
- m4_infra-0.0.0.dev0/tests/test_api.py +379 -0
- m4_infra-0.0.0.dev0/tests/test_cli.py +237 -0
- m4_infra-0.0.0.dev0/tests/test_config.py +71 -0
- m4_infra-0.0.0.dev0/tests/test_config_scripts.py +118 -0
- m4_infra-0.0.0.dev0/tests/test_data_io.py +141 -0
- m4_infra-0.0.0.dev0/tests/test_dynamic_switching.py +69 -0
- m4_infra-0.0.0.dev0/tests/test_mcp_dataset_tools.py +107 -0
- m4_infra-0.0.0.dev0/tests/test_mcp_server.py +670 -0
- m4_infra-0.0.0.dev0/tests/test_oauth2_basic.py +279 -0
- m4_infra-0.0.0.dev0/tests/utils/__init__.py +5 -0
- m4_infra-0.0.0.dev0/tests/utils/auth_helpers.py +74 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Hannes Ill
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: m4-infra
|
|
3
|
+
Version: 0.0.0.dev0
|
|
4
|
+
Summary: Infrastructure for AI-assisted clinical research with EHR datasets
|
|
5
|
+
Keywords: mimic-iv,clinical-data,mcp,llm,medical,healthcare,duckdb,bigquery
|
|
6
|
+
Author-Email: Rafi Al Attrach <rafiaa@mit.edu>, Pedro Moreira <pedrojfm@mit.edu>, Rajna Fani <rajnaf@mit.edu>
|
|
7
|
+
Maintainer-Email: Rafi Al Attrach <rafiaa@mit.edu>, Pedro Moreira <pedrojfm@mit.edu>, Rajna Fani <rajnaf@mit.edu>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Intended Audience :: Healthcare Industry
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
|
|
13
|
+
Classifier: Topic :: Database :: Database Engines/Servers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Project-URL: Homepage, https://github.com/rafiattrach/m4
|
|
20
|
+
Project-URL: Repository, https://github.com/rafiattrach/m4
|
|
21
|
+
Project-URL: Documentation, https://github.com/rafiattrach/m4#readme
|
|
22
|
+
Project-URL: Issues, https://github.com/rafiattrach/m4/issues
|
|
23
|
+
Project-URL: Changelog, https://github.com/rafiattrach/m4/releases
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Requires-Dist: typer>=0.9.0
|
|
26
|
+
Requires-Dist: rich>=13.0.0
|
|
27
|
+
Requires-Dist: requests>=2.30.0
|
|
28
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
29
|
+
Requires-Dist: polars[pyarrow]>=0.20.10
|
|
30
|
+
Requires-Dist: appdirs>=1.4.0
|
|
31
|
+
Requires-Dist: sqlalchemy>=2.0.0
|
|
32
|
+
Requires-Dist: pandas>=2.0.0
|
|
33
|
+
Requires-Dist: fastmcp>=0.1.0
|
|
34
|
+
Requires-Dist: google-cloud-bigquery>=3.0.0
|
|
35
|
+
Requires-Dist: db-dtypes>=1.0.0
|
|
36
|
+
Requires-Dist: sqlparse>=0.4.0
|
|
37
|
+
Requires-Dist: pyjwt[crypto]>=2.8.0
|
|
38
|
+
Requires-Dist: cryptography>=41.0.0
|
|
39
|
+
Requires-Dist: python-jose[cryptography]>=3.3.0
|
|
40
|
+
Requires-Dist: httpx>=0.24.0
|
|
41
|
+
Requires-Dist: duckdb>=1.4.1
|
|
42
|
+
Requires-Dist: matplotlib>=3.10.8
|
|
43
|
+
Requires-Dist: lifelines>=0.30.0
|
|
44
|
+
Description-Content-Type: text/markdown
|
|
45
|
+
|
|
46
|
+
# M4: A Toolbox for LLMs on Clinical Data
|
|
47
|
+
|
|
48
|
+
<p align="center">
|
|
49
|
+
<img src="webapp/public/m4_logo_transparent.png" alt="M4 Logo" width="180"/>
|
|
50
|
+
</p>
|
|
51
|
+
|
|
52
|
+
<p align="center">
|
|
53
|
+
<strong>Query clinical datasets with natural language through Claude, Cursor, or any MCP client</strong>
|
|
54
|
+
</p>
|
|
55
|
+
|
|
56
|
+
<p align="center">
|
|
57
|
+
<a href="https://www.python.org/downloads/"><img alt="Python" src="https://img.shields.io/badge/Python-3.10+-blue?logo=python&logoColor=white"></a>
|
|
58
|
+
<a href="https://modelcontextprotocol.io/"><img alt="MCP" src="https://img.shields.io/badge/MCP-Compatible-green?logo=ai&logoColor=white"></a>
|
|
59
|
+
<a href="https://github.com/hannesill/m4/actions/workflows/tests.yaml"><img alt="Tests" src="https://github.com/hannesill/m4/actions/workflows/tests.yaml/badge.svg"></a>
|
|
60
|
+
</p>
|
|
61
|
+
|
|
62
|
+
M4 is an infrastructure layer for multimodal EHR data that provides LLM agents with a unified toolbox for querying clinical datasets.
|
|
63
|
+
It supports tabular data and clinical notes, dynamically selecting tools by modality to query MIMIC-IV, eICU, and custom datasets through a single natural-language interface.
|
|
64
|
+
|
|
65
|
+
[Usage example](https://claude.ai/share/93f26832-f298-4d1d-96e3-5608d7f0d7ad)
|
|
66
|
+
|
|
67
|
+
> M4 is a fork of the [M3](https://github.com/rafiattrach/m3) project and would not be possible without it 🫶 Please [cite](#citation) their work when using M4!
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
## Quickstart (3 steps)
|
|
71
|
+
|
|
72
|
+
### 1. Install uv
|
|
73
|
+
|
|
74
|
+
**macOS/Linux:**
|
|
75
|
+
```bash
|
|
76
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**Windows (PowerShell):**
|
|
80
|
+
```powershell
|
|
81
|
+
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### 2. Initialize M4
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
mkdir my-research && cd my-research
|
|
88
|
+
uv init && uv add m4-infra
|
|
89
|
+
uv run m4 init mimic-iv-demo
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
This downloads the free MIMIC-IV demo dataset (~16MB) and sets up a local DuckDB database.
|
|
93
|
+
|
|
94
|
+
### 3. Connect your AI client
|
|
95
|
+
|
|
96
|
+
**Claude Desktop:**
|
|
97
|
+
```bash
|
|
98
|
+
uv run m4 config claude --quick
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**Other clients (Cursor, LibreChat, etc.):**
|
|
102
|
+
```bash
|
|
103
|
+
uv run m4 config --quick
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Copy the generated JSON into your client's MCP settings, restart, and start asking questions!
|
|
107
|
+
|
|
108
|
+
<details>
|
|
109
|
+
<summary>Different setup options</summary>
|
|
110
|
+
|
|
111
|
+
* If you don't want to use uv, you can install M4 with pip instead: `pip install m4-infra`
|
|
112
|
+
|
|
113
|
+
* If you want to use Docker, look at <a href="docs/DEVELOPMENT.md">docs/DEVELOPMENT.md</a>
|
|
114
|
+
</details>
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
## Code Execution
|
|
118
|
+
|
|
119
|
+
For complex analysis that goes beyond simple queries, M4 provides a Python API that returns native Python objects instead of formatted strings (e.g. a `pd.DataFrame` for SQL queries). This transforms M4 from a query tool into a complete clinical data analysis environment.
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from m4 import set_dataset, execute_query, get_schema
|
|
123
|
+
|
|
124
|
+
set_dataset("mimic-iv")
|
|
125
|
+
|
|
126
|
+
# Get schema as a dict
|
|
127
|
+
schema = get_schema()
|
|
128
|
+
print(schema['tables']) # ['admissions', 'diagnoses_icd', ...]
|
|
129
|
+
|
|
130
|
+
# Query returns a pandas DataFrame
|
|
131
|
+
df = execute_query("""
|
|
132
|
+
SELECT diagnosis, COUNT(*) as n
|
|
133
|
+
FROM diagnoses_icd
|
|
134
|
+
GROUP BY diagnosis
|
|
135
|
+
ORDER BY n DESC
|
|
136
|
+
LIMIT 10
|
|
137
|
+
""")
|
|
138
|
+
|
|
139
|
+
# Use full pandas power: filter, join, compute statistics
|
|
140
|
+
df[df['n'] > 100].plot(kind='bar')
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
The API uses the same tools as the MCP server, so behavior is consistent. But instead of parsing text, you get DataFrames you can immediately analyze, visualize, or feed into downstream pipelines.
|
|
144
|
+
|
|
145
|
+
**When to use code execution:**
|
|
146
|
+
- Multi-step analyses where each query informs the next
|
|
147
|
+
- Large result sets (thousands of rows) that shouldn't flood your context
|
|
148
|
+
- Statistical computations, survival analysis, cohort characterization
|
|
149
|
+
- Building reproducible analysis notebooks
|
|
150
|
+
|
|
151
|
+
See [Code Execution Guide](docs/CODE_EXECUTION.md) for the full API reference.
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
## Agent Skills
|
|
155
|
+
|
|
156
|
+
M4 ships with skills that teach AI coding assistants how to use the Python API effectively. Skills are contextual prompts that activate when relevant—when you ask about clinical data analysis, the assistant automatically knows how to use M4's API.
|
|
157
|
+
|
|
158
|
+
**Supported tools:** Claude Code, Cursor, Cline, Codex CLI, Gemini CLI, GitHub Copilot
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
m4 skills # Interactive tool selection
|
|
162
|
+
m4 skills --tools claude,cursor # Install for specific tools
|
|
163
|
+
m4 skills --list # Show installed skills
|
|
164
|
+
m4 config claude --skills # Install during Claude Desktop setup
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
See [Skills Guide](docs/SKILLS.md) for details on the available skills and how to create custom ones.
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
## Example Questions
|
|
171
|
+
|
|
172
|
+
Once connected, try asking:
|
|
173
|
+
|
|
174
|
+
**Tabular data (mimic-iv, eicu):**
|
|
175
|
+
- *"What tables are available in the database?"*
|
|
176
|
+
- *"Show me the race distribution in hospital admissions"*
|
|
177
|
+
- *"Find all ICU stays longer than 7 days"*
|
|
178
|
+
- *"What are the most common lab tests?"*
|
|
179
|
+
|
|
180
|
+
**Clinical notes (mimic-iv-note):**
|
|
181
|
+
- *"Search for notes mentioning diabetes"*
|
|
182
|
+
- *"List all notes for patient 10000032"*
|
|
183
|
+
- *"Get the full discharge summary for this patient"*
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
## Supported Datasets
|
|
187
|
+
|
|
188
|
+
| Dataset | Modality | Size | Access | Local | BigQuery |
|
|
189
|
+
|---------|----------|------|--------|-------|----------|
|
|
190
|
+
| **mimic-iv-demo** | Tabular | 100 patients | Free | Yes | No |
|
|
191
|
+
| **mimic-iv** | Tabular | 365k patients | [PhysioNet credentialed](https://physionet.org/content/mimiciv/) | Yes | Yes |
|
|
192
|
+
| **mimic-iv-note** | Notes | 331k notes | [PhysioNet credentialed](https://physionet.org/content/mimic-iv-note/) | Yes | Yes |
|
|
193
|
+
| **eicu** | Tabular | 200k+ patients | [PhysioNet credentialed](https://physionet.org/content/eicu-crd/) | Yes | Yes |
|
|
194
|
+
|
|
195
|
+
These datasets are supported out of the box. However, it is possible to add any other custom dataset by following [these instructions](docs/CUSTOM_DATASETS.md).
|
|
196
|
+
|
|
197
|
+
Switch datasets anytime:
|
|
198
|
+
```bash
|
|
199
|
+
m4 use mimic-iv # Switch to full MIMIC-IV
|
|
200
|
+
m4 status # Show active dataset details
|
|
201
|
+
m4 status --all # List all available datasets
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
<details>
|
|
205
|
+
<summary><strong>Setting up MIMIC-IV or eICU (credentialed datasets)</strong></summary>
|
|
206
|
+
|
|
207
|
+
1. **Get PhysioNet credentials:** Complete the [credentialing process](https://physionet.org/settings/credentialing/) and sign the data use agreement for the dataset.
|
|
208
|
+
|
|
209
|
+
2. **Download the data:**
|
|
210
|
+
```bash
|
|
211
|
+
# For MIMIC-IV
|
|
212
|
+
wget -r -N -c -np --user YOUR_USERNAME --ask-password \
|
|
213
|
+
https://physionet.org/files/mimiciv/3.1/ \
|
|
214
|
+
-P m4_data/raw_files/mimic-iv
|
|
215
|
+
|
|
216
|
+
# For eICU
|
|
217
|
+
wget -r -N -c -np --user YOUR_USERNAME --ask-password \
|
|
218
|
+
https://physionet.org/files/eicu-crd/2.0/ \
|
|
219
|
+
-P m4_data/raw_files/eicu
|
|
220
|
+
```
|
|
221
|
+
Keep the downloaded data in an `m4_data` directory, ideally located inside your project directory, and name each dataset's directory `mimic-iv` or `eicu` accordingly.
|
|
222
|
+
|
|
223
|
+
3. **Initialize:**
|
|
224
|
+
```bash
|
|
225
|
+
m4 init mimic-iv # or: m4 init eicu
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
This converts the CSV files to Parquet format and creates a local DuckDB database.
|
|
229
|
+
</details>
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
## Available Tools
|
|
233
|
+
|
|
234
|
+
M4 exposes these tools to your AI client. Tools are filtered based on the active dataset's modality.
|
|
235
|
+
|
|
236
|
+
**Dataset Management:**
|
|
237
|
+
| Tool | Description |
|
|
238
|
+
|------|-------------|
|
|
239
|
+
| `list_datasets` | List available datasets and their status |
|
|
240
|
+
| `set_dataset` | Switch the active dataset |
|
|
241
|
+
|
|
242
|
+
**Tabular Data Tools** (mimic-iv, mimic-iv-demo, eicu):
|
|
243
|
+
| Tool | Description |
|
|
244
|
+
|------|-------------|
|
|
245
|
+
| `get_database_schema` | List all available tables |
|
|
246
|
+
| `get_table_info` | Get column details and sample data |
|
|
247
|
+
| `execute_query` | Run SQL SELECT queries |
|
|
248
|
+
|
|
249
|
+
**Clinical Notes Tools** (mimic-iv-note):
|
|
250
|
+
| Tool | Description |
|
|
251
|
+
|------|-------------|
|
|
252
|
+
| `search_notes` | Full-text search with snippets |
|
|
253
|
+
| `get_note` | Retrieve a single note by ID |
|
|
254
|
+
| `list_patient_notes` | List notes for a patient (metadata only) |
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
## More Documentation
|
|
258
|
+
|
|
259
|
+
| Guide | Description |
|
|
260
|
+
|-------|-------------|
|
|
261
|
+
| [Code Execution](docs/CODE_EXECUTION.md) | Python API for programmatic access |
|
|
262
|
+
| [Skills](docs/SKILLS.md) | Claude Code skills for contextual assistance |
|
|
263
|
+
| [Tools Reference](docs/TOOLS.md) | MCP tool documentation |
|
|
264
|
+
| [BigQuery Setup](docs/BIGQUERY.md) | Google Cloud for full datasets |
|
|
265
|
+
| [Custom Datasets](docs/CUSTOM_DATASETS.md) | Add your own PhysioNet datasets |
|
|
266
|
+
| [Development](docs/DEVELOPMENT.md) | Contributing, testing, architecture |
|
|
267
|
+
| [OAuth2 Authentication](docs/OAUTH2_AUTHENTICATION.md) | Enterprise security setup |
|
|
268
|
+
|
|
269
|
+
## Roadmap
|
|
270
|
+
|
|
271
|
+
M4 is designed as a growing toolbox for LLM agents working with EHR data. Planned and ongoing directions include:
|
|
272
|
+
|
|
273
|
+
- **More Tools**
|
|
274
|
+
- Implement tools for current modalities (e.g. statistical reports, RAG)
|
|
275
|
+
- Add tools for new modalities (images, waveforms)
|
|
276
|
+
|
|
277
|
+
- **Better context handling**
|
|
278
|
+
- Concise, dataset-aware context for LLM agents
|
|
279
|
+
|
|
280
|
+
- **Dataset expansion**
|
|
281
|
+
- Out-of-the-box support for additional PhysioNet datasets
|
|
282
|
+
- Improved support for institutional/custom EHR schemas
|
|
283
|
+
|
|
284
|
+
- **Evaluation & reproducibility**
|
|
285
|
+
- Session export and replay
|
|
286
|
+
- Evaluation with the latest LLMs and smaller expert models
|
|
287
|
+
|
|
288
|
+
The roadmap reflects current development goals and may evolve as the project matures.
|
|
289
|
+
|
|
290
|
+
## Troubleshooting
|
|
291
|
+
|
|
292
|
+
**"Parquet not found" error:**
|
|
293
|
+
```bash
|
|
294
|
+
m4 init mimic-iv-demo --force
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
**MCP client won't connect:**
|
|
298
|
+
Check client logs (Claude Desktop: Help → View Logs) and ensure the config JSON is valid.
|
|
299
|
+
|
|
300
|
+
**Need to reconfigure:**
|
|
301
|
+
```bash
|
|
302
|
+
m4 config claude --quick # Regenerate Claude Desktop config
|
|
303
|
+
m4 config --quick # Regenerate generic config
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
## Citation
|
|
307
|
+
|
|
308
|
+
M4 builds on the M3 project. Please cite:
|
|
309
|
+
|
|
310
|
+
```bibtex
|
|
311
|
+
@article{attrach2025conversational,
|
|
312
|
+
title={Conversational LLMs Simplify Secure Clinical Data Access, Understanding, and Analysis},
|
|
313
|
+
author={Attrach, Rafi Al and Moreira, Pedro and Fani, Rajna and Umeton, Renato and Celi, Leo Anthony},
|
|
314
|
+
journal={arXiv preprint arXiv:2507.01053},
|
|
315
|
+
year={2025}
|
|
316
|
+
}
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
---
|
|
320
|
+
|
|
321
|
+
<p align="center">
|
|
322
|
+
<a href="https://github.com/hannesill/m4/issues">Report an Issue</a> ·
|
|
323
|
+
<a href="docs/DEVELOPMENT.md">Contribute</a>
|
|
324
|
+
</p>
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
# M4: A Toolbox for LLMs on Clinical Data
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="webapp/public/m4_logo_transparent.png" alt="M4 Logo" width="180"/>
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<strong>Query clinical datasets with natural language through Claude, Cursor, or any MCP client</strong>
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
<p align="center">
|
|
12
|
+
<a href="https://www.python.org/downloads/"><img alt="Python" src="https://img.shields.io/badge/Python-3.10+-blue?logo=python&logoColor=white"></a>
|
|
13
|
+
<a href="https://modelcontextprotocol.io/"><img alt="MCP" src="https://img.shields.io/badge/MCP-Compatible-green?logo=ai&logoColor=white"></a>
|
|
14
|
+
<a href="https://github.com/hannesill/m4/actions/workflows/tests.yaml"><img alt="Tests" src="https://github.com/hannesill/m4/actions/workflows/tests.yaml/badge.svg"></a>
|
|
15
|
+
</p>
|
|
16
|
+
|
|
17
|
+
M4 is an infrastructure layer for multimodal EHR data that provides LLM agents with a unified toolbox for querying clinical datasets.
|
|
18
|
+
It supports tabular data and clinical notes, dynamically selecting tools by modality to query MIMIC-IV, eICU, and custom datasets through a single natural-language interface.
|
|
19
|
+
|
|
20
|
+
[Usage example](https://claude.ai/share/93f26832-f298-4d1d-96e3-5608d7f0d7ad)
|
|
21
|
+
|
|
22
|
+
> M4 is a fork of the [M3](https://github.com/rafiattrach/m3) project and would not be possible without it 🫶 Please [cite](#citation) their work when using M4!
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
## Quickstart (3 steps)
|
|
26
|
+
|
|
27
|
+
### 1. Install uv
|
|
28
|
+
|
|
29
|
+
**macOS/Linux:**
|
|
30
|
+
```bash
|
|
31
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
**Windows (PowerShell):**
|
|
35
|
+
```powershell
|
|
36
|
+
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### 2. Initialize M4
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
mkdir my-research && cd my-research
|
|
43
|
+
uv init && uv add m4-infra
|
|
44
|
+
uv run m4 init mimic-iv-demo
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
This downloads the free MIMIC-IV demo dataset (~16MB) and sets up a local DuckDB database.
|
|
48
|
+
|
|
49
|
+
### 3. Connect your AI client
|
|
50
|
+
|
|
51
|
+
**Claude Desktop:**
|
|
52
|
+
```bash
|
|
53
|
+
uv run m4 config claude --quick
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
**Other clients (Cursor, LibreChat, etc.):**
|
|
57
|
+
```bash
|
|
58
|
+
uv run m4 config --quick
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Copy the generated JSON into your client's MCP settings, restart, and start asking questions!
|
|
62
|
+
|
|
63
|
+
<details>
|
|
64
|
+
<summary>Different setup options</summary>
|
|
65
|
+
|
|
66
|
+
* If you don't want to use uv, you can install M4 with pip instead: `pip install m4-infra`
|
|
67
|
+
|
|
68
|
+
* If you want to use Docker, look at <a href="docs/DEVELOPMENT.md">docs/DEVELOPMENT.md</a>
|
|
69
|
+
</details>
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
## Code Execution
|
|
73
|
+
|
|
74
|
+
For complex analysis that goes beyond simple queries, M4 provides a Python API that returns native Python objects instead of formatted strings (e.g. a `pd.DataFrame` for SQL queries). This transforms M4 from a query tool into a complete clinical data analysis environment.
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from m4 import set_dataset, execute_query, get_schema
|
|
78
|
+
|
|
79
|
+
set_dataset("mimic-iv")
|
|
80
|
+
|
|
81
|
+
# Get schema as a dict
|
|
82
|
+
schema = get_schema()
|
|
83
|
+
print(schema['tables']) # ['admissions', 'diagnoses_icd', ...]
|
|
84
|
+
|
|
85
|
+
# Query returns a pandas DataFrame
|
|
86
|
+
df = execute_query("""
|
|
87
|
+
SELECT diagnosis, COUNT(*) as n
|
|
88
|
+
FROM diagnoses_icd
|
|
89
|
+
GROUP BY diagnosis
|
|
90
|
+
ORDER BY n DESC
|
|
91
|
+
LIMIT 10
|
|
92
|
+
""")
|
|
93
|
+
|
|
94
|
+
# Use full pandas power: filter, join, compute statistics
|
|
95
|
+
df[df['n'] > 100].plot(kind='bar')
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
The API uses the same tools as the MCP server, so behavior is consistent. But instead of parsing text, you get DataFrames you can immediately analyze, visualize, or feed into downstream pipelines.
|
|
99
|
+
|
|
100
|
+
**When to use code execution:**
|
|
101
|
+
- Multi-step analyses where each query informs the next
|
|
102
|
+
- Large result sets (thousands of rows) that shouldn't flood your context
|
|
103
|
+
- Statistical computations, survival analysis, cohort characterization
|
|
104
|
+
- Building reproducible analysis notebooks
|
|
105
|
+
|
|
106
|
+
See [Code Execution Guide](docs/CODE_EXECUTION.md) for the full API reference.
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
## Agent Skills
|
|
110
|
+
|
|
111
|
+
M4 ships with skills that teach AI coding assistants how to use the Python API effectively. Skills are contextual prompts that activate when relevant—when you ask about clinical data analysis, the assistant automatically knows how to use M4's API.
|
|
112
|
+
|
|
113
|
+
**Supported tools:** Claude Code, Cursor, Cline, Codex CLI, Gemini CLI, GitHub Copilot
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
m4 skills # Interactive tool selection
|
|
117
|
+
m4 skills --tools claude,cursor # Install for specific tools
|
|
118
|
+
m4 skills --list # Show installed skills
|
|
119
|
+
m4 config claude --skills # Install during Claude Desktop setup
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
See [Skills Guide](docs/SKILLS.md) for details on the available skills and how to create custom ones.
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
## Example Questions
|
|
126
|
+
|
|
127
|
+
Once connected, try asking:
|
|
128
|
+
|
|
129
|
+
**Tabular data (mimic-iv, eicu):**
|
|
130
|
+
- *"What tables are available in the database?"*
|
|
131
|
+
- *"Show me the race distribution in hospital admissions"*
|
|
132
|
+
- *"Find all ICU stays longer than 7 days"*
|
|
133
|
+
- *"What are the most common lab tests?"*
|
|
134
|
+
|
|
135
|
+
**Clinical notes (mimic-iv-note):**
|
|
136
|
+
- *"Search for notes mentioning diabetes"*
|
|
137
|
+
- *"List all notes for patient 10000032"*
|
|
138
|
+
- *"Get the full discharge summary for this patient"*
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
## Supported Datasets
|
|
142
|
+
|
|
143
|
+
| Dataset | Modality | Size | Access | Local | BigQuery |
|
|
144
|
+
|---------|----------|------|--------|-------|----------|
|
|
145
|
+
| **mimic-iv-demo** | Tabular | 100 patients | Free | Yes | No |
|
|
146
|
+
| **mimic-iv** | Tabular | 365k patients | [PhysioNet credentialed](https://physionet.org/content/mimiciv/) | Yes | Yes |
|
|
147
|
+
| **mimic-iv-note** | Notes | 331k notes | [PhysioNet credentialed](https://physionet.org/content/mimic-iv-note/) | Yes | Yes |
|
|
148
|
+
| **eicu** | Tabular | 200k+ patients | [PhysioNet credentialed](https://physionet.org/content/eicu-crd/) | Yes | Yes |
|
|
149
|
+
|
|
150
|
+
These datasets are supported out of the box. However, it is possible to add any other custom dataset by following [these instructions](docs/CUSTOM_DATASETS.md).
|
|
151
|
+
|
|
152
|
+
Switch datasets anytime:
|
|
153
|
+
```bash
|
|
154
|
+
m4 use mimic-iv # Switch to full MIMIC-IV
|
|
155
|
+
m4 status # Show active dataset details
|
|
156
|
+
m4 status --all # List all available datasets
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
<details>
|
|
160
|
+
<summary><strong>Setting up MIMIC-IV or eICU (credentialed datasets)</strong></summary>
|
|
161
|
+
|
|
162
|
+
1. **Get PhysioNet credentials:** Complete the [credentialing process](https://physionet.org/settings/credentialing/) and sign the data use agreement for the dataset.
|
|
163
|
+
|
|
164
|
+
2. **Download the data:**
|
|
165
|
+
```bash
|
|
166
|
+
# For MIMIC-IV
|
|
167
|
+
wget -r -N -c -np --user YOUR_USERNAME --ask-password \
|
|
168
|
+
https://physionet.org/files/mimiciv/3.1/ \
|
|
169
|
+
-P m4_data/raw_files/mimic-iv
|
|
170
|
+
|
|
171
|
+
# For eICU
|
|
172
|
+
wget -r -N -c -np --user YOUR_USERNAME --ask-password \
|
|
173
|
+
https://physionet.org/files/eicu-crd/2.0/ \
|
|
174
|
+
-P m4_data/raw_files/eicu
|
|
175
|
+
```
|
|
176
|
+
Keep the downloaded data in an `m4_data` directory, ideally located inside your project directory, and name each dataset's directory `mimic-iv` or `eicu` accordingly.
|
|
177
|
+
|
|
178
|
+
3. **Initialize:**
|
|
179
|
+
```bash
|
|
180
|
+
m4 init mimic-iv # or: m4 init eicu
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
This converts the CSV files to Parquet format and creates a local DuckDB database.
|
|
184
|
+
</details>
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
## Available Tools
|
|
188
|
+
|
|
189
|
+
M4 exposes these tools to your AI client. Tools are filtered based on the active dataset's modality.
|
|
190
|
+
|
|
191
|
+
**Dataset Management:**
|
|
192
|
+
| Tool | Description |
|
|
193
|
+
|------|-------------|
|
|
194
|
+
| `list_datasets` | List available datasets and their status |
|
|
195
|
+
| `set_dataset` | Switch the active dataset |
|
|
196
|
+
|
|
197
|
+
**Tabular Data Tools** (mimic-iv, mimic-iv-demo, eicu):
|
|
198
|
+
| Tool | Description |
|
|
199
|
+
|------|-------------|
|
|
200
|
+
| `get_database_schema` | List all available tables |
|
|
201
|
+
| `get_table_info` | Get column details and sample data |
|
|
202
|
+
| `execute_query` | Run SQL SELECT queries |
|
|
203
|
+
|
|
204
|
+
**Clinical Notes Tools** (mimic-iv-note):
|
|
205
|
+
| Tool | Description |
|
|
206
|
+
|------|-------------|
|
|
207
|
+
| `search_notes` | Full-text search with snippets |
|
|
208
|
+
| `get_note` | Retrieve a single note by ID |
|
|
209
|
+
| `list_patient_notes` | List notes for a patient (metadata only) |
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
## More Documentation
|
|
213
|
+
|
|
214
|
+
| Guide | Description |
|
|
215
|
+
|-------|-------------|
|
|
216
|
+
| [Code Execution](docs/CODE_EXECUTION.md) | Python API for programmatic access |
|
|
217
|
+
| [Skills](docs/SKILLS.md) | Claude Code skills for contextual assistance |
|
|
218
|
+
| [Tools Reference](docs/TOOLS.md) | MCP tool documentation |
|
|
219
|
+
| [BigQuery Setup](docs/BIGQUERY.md) | Google Cloud for full datasets |
|
|
220
|
+
| [Custom Datasets](docs/CUSTOM_DATASETS.md) | Add your own PhysioNet datasets |
|
|
221
|
+
| [Development](docs/DEVELOPMENT.md) | Contributing, testing, architecture |
|
|
222
|
+
| [OAuth2 Authentication](docs/OAUTH2_AUTHENTICATION.md) | Enterprise security setup |
|
|
223
|
+
|
|
224
|
+
## Roadmap
|
|
225
|
+
|
|
226
|
+
M4 is designed as a growing toolbox for LLM agents working with EHR data. Planned and ongoing directions include:
|
|
227
|
+
|
|
228
|
+
- **More Tools**
|
|
229
|
+
- Implement tools for current modalities (e.g. statistical reports, RAG)
|
|
230
|
+
- Add tools for new modalities (images, waveforms)
|
|
231
|
+
|
|
232
|
+
- **Better context handling**
|
|
233
|
+
- Concise, dataset-aware context for LLM agents
|
|
234
|
+
|
|
235
|
+
- **Dataset expansion**
|
|
236
|
+
- Out-of-the-box support for additional PhysioNet datasets
|
|
237
|
+
- Improved support for institutional/custom EHR schemas
|
|
238
|
+
|
|
239
|
+
- **Evaluation & reproducibility**
|
|
240
|
+
- Session export and replay
|
|
241
|
+
- Evaluation with the latest LLMs and smaller expert models
|
|
242
|
+
|
|
243
|
+
The roadmap reflects current development goals and may evolve as the project matures.
|
|
244
|
+
|
|
245
|
+
## Troubleshooting
|
|
246
|
+
|
|
247
|
+
**"Parquet not found" error:**
|
|
248
|
+
```bash
|
|
249
|
+
m4 init mimic-iv-demo --force
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
**MCP client won't connect:**
|
|
253
|
+
Check client logs (Claude Desktop: Help → View Logs) and ensure the config JSON is valid.
|
|
254
|
+
|
|
255
|
+
**Need to reconfigure:**
|
|
256
|
+
```bash
|
|
257
|
+
m4 config claude --quick # Regenerate Claude Desktop config
|
|
258
|
+
m4 config --quick # Regenerate generic config
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
## Citation
|
|
262
|
+
|
|
263
|
+
M4 builds on the M3 project. Please cite:
|
|
264
|
+
|
|
265
|
+
```bibtex
|
|
266
|
+
@article{attrach2025conversational,
|
|
267
|
+
title={Conversational LLMs Simplify Secure Clinical Data Access, Understanding, and Analysis},
|
|
268
|
+
author={Attrach, Rafi Al and Moreira, Pedro and Fani, Rajna and Umeton, Renato and Celi, Leo Anthony},
|
|
269
|
+
journal={arXiv preprint arXiv:2507.01053},
|
|
270
|
+
year={2025}
|
|
271
|
+
}
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
---
|
|
275
|
+
|
|
276
|
+
<p align="center">
|
|
277
|
+
<a href="https://github.com/hannesill/m4/issues">Report an Issue</a> ·
|
|
278
|
+
<a href="docs/DEVELOPMENT.md">Contribute</a>
|
|
279
|
+
</p>
|