m4-infra 0.0.0.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. m4_infra-0.0.0.dev0/LICENSE +21 -0
  2. m4_infra-0.0.0.dev0/PKG-INFO +324 -0
  3. m4_infra-0.0.0.dev0/README.md +279 -0
  4. m4_infra-0.0.0.dev0/pyproject.toml +134 -0
  5. m4_infra-0.0.0.dev0/src/m4/__init__.py +54 -0
  6. m4_infra-0.0.0.dev0/src/m4/api.py +346 -0
  7. m4_infra-0.0.0.dev0/src/m4/auth.py +433 -0
  8. m4_infra-0.0.0.dev0/src/m4/cli.py +882 -0
  9. m4_infra-0.0.0.dev0/src/m4/config.py +252 -0
  10. m4_infra-0.0.0.dev0/src/m4/console.py +325 -0
  11. m4_infra-0.0.0.dev0/src/m4/core/__init__.py +28 -0
  12. m4_infra-0.0.0.dev0/src/m4/core/backends/__init__.py +98 -0
  13. m4_infra-0.0.0.dev0/src/m4/core/backends/base.py +202 -0
  14. m4_infra-0.0.0.dev0/src/m4/core/backends/bigquery.py +345 -0
  15. m4_infra-0.0.0.dev0/src/m4/core/backends/duckdb.py +272 -0
  16. m4_infra-0.0.0.dev0/src/m4/core/datasets.py +283 -0
  17. m4_infra-0.0.0.dev0/src/m4/core/exceptions.py +152 -0
  18. m4_infra-0.0.0.dev0/src/m4/core/serialization.py +146 -0
  19. m4_infra-0.0.0.dev0/src/m4/core/tools/__init__.py +106 -0
  20. m4_infra-0.0.0.dev0/src/m4/core/tools/base.py +160 -0
  21. m4_infra-0.0.0.dev0/src/m4/core/tools/management.py +170 -0
  22. m4_infra-0.0.0.dev0/src/m4/core/tools/notes.py +350 -0
  23. m4_infra-0.0.0.dev0/src/m4/core/tools/registry.py +332 -0
  24. m4_infra-0.0.0.dev0/src/m4/core/tools/tabular.py +215 -0
  25. m4_infra-0.0.0.dev0/src/m4/core/validation.py +217 -0
  26. m4_infra-0.0.0.dev0/src/m4/data_io.py +531 -0
  27. m4_infra-0.0.0.dev0/src/m4/mcp_client_configs/__init__.py +6 -0
  28. m4_infra-0.0.0.dev0/src/m4/mcp_client_configs/dynamic_mcp_config.py +500 -0
  29. m4_infra-0.0.0.dev0/src/m4/mcp_client_configs/setup_claude_desktop.py +322 -0
  30. m4_infra-0.0.0.dev0/src/m4/mcp_server.py +515 -0
  31. m4_infra-0.0.0.dev0/src/m4/skills/SKILLS_INDEX.md +132 -0
  32. m4_infra-0.0.0.dev0/src/m4/skills/__init__.py +21 -0
  33. m4_infra-0.0.0.dev0/src/m4/skills/apsiii-score/SKILL.md +147 -0
  34. m4_infra-0.0.0.dev0/src/m4/skills/apsiii-score/scripts/apsiii.sql +894 -0
  35. m4_infra-0.0.0.dev0/src/m4/skills/baseline-creatinine/SKILL.md +145 -0
  36. m4_infra-0.0.0.dev0/src/m4/skills/baseline-creatinine/scripts/creatinine_baseline.sql +71 -0
  37. m4_infra-0.0.0.dev0/src/m4/skills/clinical-research-pitfalls/SKILL.md +242 -0
  38. m4_infra-0.0.0.dev0/src/m4/skills/first-icu-stay/SKILL.md +193 -0
  39. m4_infra-0.0.0.dev0/src/m4/skills/first-icu-stay/scripts/icustay_detail.sql +47 -0
  40. m4_infra-0.0.0.dev0/src/m4/skills/gcs-calculation/SKILL.md +161 -0
  41. m4_infra-0.0.0.dev0/src/m4/skills/gcs-calculation/scripts/gcs.sql +127 -0
  42. m4_infra-0.0.0.dev0/src/m4/skills/installer.py +197 -0
  43. m4_infra-0.0.0.dev0/src/m4/skills/kdigo-aki-staging/SKILL.md +207 -0
  44. m4_infra-0.0.0.dev0/src/m4/skills/kdigo-aki-staging/scripts/kdigo_creatinine.sql +63 -0
  45. m4_infra-0.0.0.dev0/src/m4/skills/kdigo-aki-staging/scripts/kdigo_stages.sql +153 -0
  46. m4_infra-0.0.0.dev0/src/m4/skills/kdigo-aki-staging/scripts/kdigo_uo.sql +112 -0
  47. m4_infra-0.0.0.dev0/src/m4/skills/lods-score/SKILL.md +157 -0
  48. m4_infra-0.0.0.dev0/src/m4/skills/lods-score/scripts/lods.sql +230 -0
  49. m4_infra-0.0.0.dev0/src/m4/skills/m4-api/SKILL.md +134 -0
  50. m4_infra-0.0.0.dev0/src/m4/skills/mimic-eicu-mapping/SKILL.md +205 -0
  51. m4_infra-0.0.0.dev0/src/m4/skills/mimic-table-relationships/SKILL.md +193 -0
  52. m4_infra-0.0.0.dev0/src/m4/skills/oasis-score/SKILL.md +131 -0
  53. m4_infra-0.0.0.dev0/src/m4/skills/oasis-score/scripts/oasis.sql +287 -0
  54. m4_infra-0.0.0.dev0/src/m4/skills/sapsii-score/SKILL.md +134 -0
  55. m4_infra-0.0.0.dev0/src/m4/skills/sapsii-score/scripts/sapsii.sql +549 -0
  56. m4_infra-0.0.0.dev0/src/m4/skills/sepsis-3-cohort/SKILL.md +152 -0
  57. m4_infra-0.0.0.dev0/src/m4/skills/sepsis-3-cohort/scripts/sepsis3.sql +80 -0
  58. m4_infra-0.0.0.dev0/src/m4/skills/sirs-criteria/SKILL.md +156 -0
  59. m4_infra-0.0.0.dev0/src/m4/skills/sirs-criteria/scripts/sirs.sql +100 -0
  60. m4_infra-0.0.0.dev0/src/m4/skills/sofa-score/SKILL.md +138 -0
  61. m4_infra-0.0.0.dev0/src/m4/skills/sofa-score/scripts/sofa.sql +379 -0
  62. m4_infra-0.0.0.dev0/src/m4/skills/suspicion-of-infection/SKILL.md +158 -0
  63. m4_infra-0.0.0.dev0/src/m4/skills/suspicion-of-infection/scripts/suspicion_of_infection.sql +175 -0
  64. m4_infra-0.0.0.dev0/src/m4/skills/vasopressor-equivalents/SKILL.md +158 -0
  65. m4_infra-0.0.0.dev0/src/m4/skills/vasopressor-equivalents/scripts/norepinephrine_equivalent_dose.sql +33 -0
  66. m4_infra-0.0.0.dev0/tests/core/__init__.py +1 -0
  67. m4_infra-0.0.0.dev0/tests/core/backends/__init__.py +1 -0
  68. m4_infra-0.0.0.dev0/tests/core/backends/test_base.py +140 -0
  69. m4_infra-0.0.0.dev0/tests/core/backends/test_bigquery.py +311 -0
  70. m4_infra-0.0.0.dev0/tests/core/backends/test_duckdb.py +472 -0
  71. m4_infra-0.0.0.dev0/tests/core/backends/test_factory.py +120 -0
  72. m4_infra-0.0.0.dev0/tests/core/test_datasets.py +202 -0
  73. m4_infra-0.0.0.dev0/tests/core/test_validation.py +348 -0
  74. m4_infra-0.0.0.dev0/tests/core/tools/__init__.py +1 -0
  75. m4_infra-0.0.0.dev0/tests/core/tools/test_base.py +222 -0
  76. m4_infra-0.0.0.dev0/tests/core/tools/test_management.py +367 -0
  77. m4_infra-0.0.0.dev0/tests/core/tools/test_registry.py +449 -0
  78. m4_infra-0.0.0.dev0/tests/core/tools/test_tabular.py +281 -0
  79. m4_infra-0.0.0.dev0/tests/test_api.py +379 -0
  80. m4_infra-0.0.0.dev0/tests/test_cli.py +237 -0
  81. m4_infra-0.0.0.dev0/tests/test_config.py +71 -0
  82. m4_infra-0.0.0.dev0/tests/test_config_scripts.py +118 -0
  83. m4_infra-0.0.0.dev0/tests/test_data_io.py +141 -0
  84. m4_infra-0.0.0.dev0/tests/test_dynamic_switching.py +69 -0
  85. m4_infra-0.0.0.dev0/tests/test_mcp_dataset_tools.py +107 -0
  86. m4_infra-0.0.0.dev0/tests/test_mcp_server.py +670 -0
  87. m4_infra-0.0.0.dev0/tests/test_oauth2_basic.py +279 -0
  88. m4_infra-0.0.0.dev0/tests/utils/__init__.py +5 -0
  89. m4_infra-0.0.0.dev0/tests/utils/auth_helpers.py +74 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Hannes Ill
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,324 @@
1
+ Metadata-Version: 2.4
2
+ Name: m4-infra
3
+ Version: 0.0.0.dev0
4
+ Summary: Infrastructure for AI-assisted clinical research with EHR datasets
5
+ Keywords: mimic-iv,clinical-data,mcp,llm,medical,healthcare,duckdb,bigquery
6
+ Author-Email: Rafi Al Attrach <rafiaa@mit.edu>, Pedro Moreira <pedrojfm@mit.edu>, Rajna Fani <rajnaf@mit.edu>
7
+ Maintainer-Email: Rafi Al Attrach <rafiaa@mit.edu>, Pedro Moreira <pedrojfm@mit.edu>, Rajna Fani <rajnaf@mit.edu>
8
+ License-Expression: MIT
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Intended Audience :: Healthcare Industry
12
+ Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
13
+ Classifier: Topic :: Database :: Database Engines/Servers
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Operating System :: OS Independent
19
+ Project-URL: Homepage, https://github.com/rafiattrach/m4
20
+ Project-URL: Repository, https://github.com/rafiattrach/m4
21
+ Project-URL: Documentation, https://github.com/rafiattrach/m4#readme
22
+ Project-URL: Issues, https://github.com/rafiattrach/m4/issues
23
+ Project-URL: Changelog, https://github.com/rafiattrach/m4/releases
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: typer>=0.9.0
26
+ Requires-Dist: rich>=13.0.0
27
+ Requires-Dist: requests>=2.30.0
28
+ Requires-Dist: beautifulsoup4>=4.12.0
29
+ Requires-Dist: polars[pyarrow]>=0.20.10
30
+ Requires-Dist: appdirs>=1.4.0
31
+ Requires-Dist: sqlalchemy>=2.0.0
32
+ Requires-Dist: pandas>=2.0.0
33
+ Requires-Dist: fastmcp>=0.1.0
34
+ Requires-Dist: google-cloud-bigquery>=3.0.0
35
+ Requires-Dist: db-dtypes>=1.0.0
36
+ Requires-Dist: sqlparse>=0.4.0
37
+ Requires-Dist: pyjwt[crypto]>=2.8.0
38
+ Requires-Dist: cryptography>=41.0.0
39
+ Requires-Dist: python-jose[cryptography]>=3.3.0
40
+ Requires-Dist: httpx>=0.24.0
41
+ Requires-Dist: duckdb>=1.4.1
42
+ Requires-Dist: matplotlib>=3.10.8
43
+ Requires-Dist: lifelines>=0.30.0
44
+ Description-Content-Type: text/markdown
45
+
46
+ # M4: A Toolbox for LLMs on Clinical Data
47
+
48
+ <p align="center">
49
+ <img src="webapp/public/m4_logo_transparent.png" alt="M4 Logo" width="180"/>
50
+ </p>
51
+
52
+ <p align="center">
53
+ <strong>Query clinical datasets with natural language through Claude, Cursor, or any MCP client</strong>
54
+ </p>
55
+
56
+ <p align="center">
57
+ <a href="https://www.python.org/downloads/"><img alt="Python" src="https://img.shields.io/badge/Python-3.10+-blue?logo=python&logoColor=white"></a>
58
+ <a href="https://modelcontextprotocol.io/"><img alt="MCP" src="https://img.shields.io/badge/MCP-Compatible-green?logo=ai&logoColor=white"></a>
59
+ <a href="https://github.com/hannesill/m4/actions/workflows/tests.yaml"><img alt="Tests" src="https://github.com/hannesill/m4/actions/workflows/tests.yaml/badge.svg"></a>
60
+ </p>
61
+
62
+ M4 is an infrastructure layer for multimodal EHR data that provides LLM agents with a unified toolbox for querying clinical datasets.
63
+ It supports tabular data and clinical notes, dynamically selecting tools by modality to query MIMIC-IV, eICU, and custom datasets through a single natural-language interface.
64
+
65
+ [Usage example](https://claude.ai/share/93f26832-f298-4d1d-96e3-5608d7f0d7ad)
66
+
67
+ > M4 is a fork of the [M3](https://github.com/rafiattrach/m3) project and would not be possible without it 🫶 Please [cite](#citation) their work when using M4!
68
+
69
+
70
+ ## Quickstart (3 steps)
71
+
72
+ ### 1. Install uv
73
+
74
+ **macOS/Linux:**
75
+ ```bash
76
+ curl -LsSf https://astral.sh/uv/install.sh | sh
77
+ ```
78
+
79
+ **Windows (PowerShell):**
80
+ ```powershell
81
+ powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
82
+ ```
83
+
84
+ ### 2. Initialize M4
85
+
86
+ ```bash
87
+ mkdir my-research && cd my-research
88
+ uv init && uv add m4-infra
89
+ uv run m4 init mimic-iv-demo
90
+ ```
91
+
92
+ This downloads the free MIMIC-IV demo dataset (~16MB) and sets up a local DuckDB database.
93
+
94
+ ### 3. Connect your AI client
95
+
96
+ **Claude Desktop:**
97
+ ```bash
98
+ uv run m4 config claude --quick
99
+ ```
100
+
101
+ **Other clients (Cursor, LibreChat, etc.):**
102
+ ```bash
103
+ uv run m4 config --quick
104
+ ```
105
+
106
+ Copy the generated JSON into your client's MCP settings, restart, and start asking questions!
107
+
108
+ <details>
109
+ <summary>Different setup options</summary>
110
+
111
+ * If you don't want to use uv, you can install M4 with pip instead: `pip install m4-infra`
112
+
113
+ * If you want to use Docker, look at <a href="docs/DEVELOPMENT.md">docs/DEVELOPMENT.md</a>
114
+ </details>
115
+
116
+
117
+ ## Code Execution
118
+
119
+ For complex analysis that goes beyond simple queries, M4 provides a Python API that returns Python data types instead of formatted strings (e.g. pd.DataFrame for SQL queries). This transforms M4 from a query tool into a complete clinical data analysis environment.
120
+
121
+ ```python
122
+ from m4 import set_dataset, execute_query, get_schema
123
+
124
+ set_dataset("mimic-iv")
125
+
126
+ # Get schema as a dict
127
+ schema = get_schema()
128
+ print(schema['tables']) # ['admissions', 'diagnoses_icd', ...]
129
+
130
+ # Query returns a pandas DataFrame
131
+ df = execute_query("""
132
+ SELECT icd_code, COUNT(*) as n
133
+ FROM diagnoses_icd
134
+ GROUP BY icd_code
135
+ ORDER BY n DESC
136
+ LIMIT 10
137
+ """)
138
+
139
+ # Use full pandas power: filter, join, compute statistics
140
+ df[df['n'] > 100].plot(kind='bar')
141
+ ```
142
+
143
+ The API uses the same tools as the MCP server, so behavior is consistent. But instead of parsing text, you get DataFrames you can immediately analyze, visualize, or feed into downstream pipelines.
144
+
145
+ **When to use code execution:**
146
+ - Multi-step analyses where each query informs the next
147
+ - Large result sets (thousands of rows) that shouldn't flood your context
148
+ - Statistical computations, survival analysis, cohort characterization
149
+ - Building reproducible analysis notebooks
150
+
151
+ See [Code Execution Guide](docs/CODE_EXECUTION.md) for the full API reference.
152
+
153
+
154
+ ## Agent Skills
155
+
156
+ M4 ships with skills that teach AI coding assistants how to use the Python API effectively. Skills are contextual prompts that activate when relevant—when you ask about clinical data analysis, the assistant automatically knows how to use M4's API.
157
+
158
+ **Supported tools:** Claude Code, Cursor, Cline, Codex CLI, Gemini CLI, GitHub Copilot
159
+
160
+ ```bash
161
+ m4 skills # Interactive tool selection
162
+ m4 skills --tools claude,cursor # Install for specific tools
163
+ m4 skills --list # Show installed skills
164
+ m4 config claude --skills # Install during Claude Desktop setup
165
+ ```
166
+
167
+ See [Skills Guide](docs/SKILLS.md) for details on the available skills and how to create custom ones.
168
+
169
+
170
+ ## Example Questions
171
+
172
+ Once connected, try asking:
173
+
174
+ **Tabular data (mimic-iv, eicu):**
175
+ - *"What tables are available in the database?"*
176
+ - *"Show me the race distribution in hospital admissions"*
177
+ - *"Find all ICU stays longer than 7 days"*
178
+ - *"What are the most common lab tests?"*
179
+
180
+ **Clinical notes (mimic-iv-note):**
181
+ - *"Search for notes mentioning diabetes"*
182
+ - *"List all notes for patient 10000032"*
183
+ - *"Get the full discharge summary for this patient"*
184
+
185
+
186
+ ## Supported Datasets
187
+
188
+ | Dataset | Modality | Size | Access | Local | BigQuery |
189
+ |---------|----------|------|--------|-------|----------|
190
+ | **mimic-iv-demo** | Tabular | 100 patients | Free | Yes | No |
191
+ | **mimic-iv** | Tabular | 365k patients | [PhysioNet credentialed](https://physionet.org/content/mimiciv/) | Yes | Yes |
192
+ | **mimic-iv-note** | Notes | 331k notes | [PhysioNet credentialed](https://physionet.org/content/mimic-iv-note/) | Yes | Yes |
193
+ | **eicu** | Tabular | 200k+ patients | [PhysioNet credentialed](https://physionet.org/content/eicu-crd/) | Yes | Yes |
194
+
195
+ These datasets are supported out of the box. However, it is possible to add any other custom dataset by following [these instructions](docs/CUSTOM_DATASETS.md).
196
+
197
+ Switch datasets anytime:
198
+ ```bash
199
+ m4 use mimic-iv # Switch to full MIMIC-IV
200
+ m4 status # Show active dataset details
201
+ m4 status --all # List all available datasets
202
+ ```
203
+
204
+ <details>
205
+ <summary><strong>Setting up MIMIC-IV or eICU (credentialed datasets)</strong></summary>
206
+
207
+ 1. **Get PhysioNet credentials:** Complete the [credentialing process](https://physionet.org/settings/credentialing/) and sign the data use agreement for the dataset.
208
+
209
+ 2. **Download the data:**
210
+ ```bash
211
+ # For MIMIC-IV
212
+ wget -r -N -c -np --user YOUR_USERNAME --ask-password \
213
+ https://physionet.org/files/mimiciv/3.1/ \
214
+ -P m4_data/raw_files/mimic-iv
215
+
216
+ # For eICU
217
+ wget -r -N -c -np --user YOUR_USERNAME --ask-password \
218
+ https://physionet.org/files/eicu-crd/2.0/ \
219
+ -P m4_data/raw_files/eicu
220
+ ```
221
+ Keep the downloaded data in an `m4_data` directory, ideally located inside your project directory, and name each dataset's directory `mimic-iv` or `eicu`, as in the commands above.
222
+
223
+ 3. **Initialize:**
224
+ ```bash
225
+ m4 init mimic-iv # or: m4 init eicu
226
+ ```
227
+
228
+ This converts the CSV files to Parquet format and creates a local DuckDB database.
229
+ </details>
230
+
231
+
232
+ ## Available Tools
233
+
234
+ M4 exposes these tools to your AI client. Tools are filtered based on the active dataset's modality.
235
+
236
+ **Dataset Management:**
237
+ | Tool | Description |
238
+ |------|-------------|
239
+ | `list_datasets` | List available datasets and their status |
240
+ | `set_dataset` | Switch the active dataset |
241
+
242
+ **Tabular Data Tools** (mimic-iv, mimic-iv-demo, eicu):
243
+ | Tool | Description |
244
+ |------|-------------|
245
+ | `get_database_schema` | List all available tables |
246
+ | `get_table_info` | Get column details and sample data |
247
+ | `execute_query` | Run SQL SELECT queries |
248
+
249
+ **Clinical Notes Tools** (mimic-iv-note):
250
+ | Tool | Description |
251
+ |------|-------------|
252
+ | `search_notes` | Full-text search with snippets |
253
+ | `get_note` | Retrieve a single note by ID |
254
+ | `list_patient_notes` | List notes for a patient (metadata only) |
255
+
256
+
257
+ ## More Documentation
258
+
259
+ | Guide | Description |
260
+ |-------|-------------|
261
+ | [Code Execution](docs/CODE_EXECUTION.md) | Python API for programmatic access |
262
+ | [Skills](docs/SKILLS.md) | Claude Code skills for contextual assistance |
263
+ | [Tools Reference](docs/TOOLS.md) | MCP tool documentation |
264
+ | [BigQuery Setup](docs/BIGQUERY.md) | Google Cloud for full datasets |
265
+ | [Custom Datasets](docs/CUSTOM_DATASETS.md) | Add your own PhysioNet datasets |
266
+ | [Development](docs/DEVELOPMENT.md) | Contributing, testing, architecture |
267
+ | [OAuth2 Authentication](docs/OAUTH2_AUTHENTICATION.md) | Enterprise security setup |
268
+
269
+ ## Roadmap
270
+
271
+ M4 is designed as a growing toolbox for LLM agents working with EHR data. Planned and ongoing directions include:
272
+
273
+ - **More Tools**
274
+ - Implement tools for current modalities (e.g. statistical reports, RAG)
275
+ - Add tools for new modalities (images, waveforms)
276
+
277
+ - **Better context handling**
278
+ - Concise, dataset-aware context for LLM agents
279
+
280
+ - **Dataset expansion**
281
+ - Out-of-the-box support for additional PhysioNet datasets
282
+ - Improved support for institutional/custom EHR schemas
283
+
284
+ - **Evaluation & reproducibility**
285
+ - Session export and replay
286
+ - Evaluation with the latest LLMs and smaller expert models
287
+
288
+ The roadmap reflects current development goals and may evolve as the project matures.
289
+
290
+ ## Troubleshooting
291
+
292
+ **"Parquet not found" error:**
293
+ ```bash
294
+ m4 init mimic-iv-demo --force
295
+ ```
296
+
297
+ **MCP client won't connect:**
298
+ Check client logs (Claude Desktop: Help → View Logs) and ensure the config JSON is valid.
299
+
300
+ **Need to reconfigure:**
301
+ ```bash
302
+ m4 config claude --quick # Regenerate Claude Desktop config
303
+ m4 config --quick # Regenerate generic config
304
+ ```
305
+
306
+ ## Citation
307
+
308
+ M4 builds on the M3 project. Please cite:
309
+
310
+ ```bibtex
311
+ @article{attrach2025conversational,
312
+ title={Conversational LLMs Simplify Secure Clinical Data Access, Understanding, and Analysis},
313
+ author={Attrach, Rafi Al and Moreira, Pedro and Fani, Rajna and Umeton, Renato and Celi, Leo Anthony},
314
+ journal={arXiv preprint arXiv:2507.01053},
315
+ year={2025}
316
+ }
317
+ ```
318
+
319
+ ---
320
+
321
+ <p align="center">
322
+ <a href="https://github.com/hannesill/m4/issues">Report an Issue</a> ·
323
+ <a href="docs/DEVELOPMENT.md">Contribute</a>
324
+ </p>
@@ -0,0 +1,279 @@
1
+ # M4: A Toolbox for LLMs on Clinical Data
2
+
3
+ <p align="center">
4
+ <img src="webapp/public/m4_logo_transparent.png" alt="M4 Logo" width="180"/>
5
+ </p>
6
+
7
+ <p align="center">
8
+ <strong>Query clinical datasets with natural language through Claude, Cursor, or any MCP client</strong>
9
+ </p>
10
+
11
+ <p align="center">
12
+ <a href="https://www.python.org/downloads/"><img alt="Python" src="https://img.shields.io/badge/Python-3.10+-blue?logo=python&logoColor=white"></a>
13
+ <a href="https://modelcontextprotocol.io/"><img alt="MCP" src="https://img.shields.io/badge/MCP-Compatible-green?logo=ai&logoColor=white"></a>
14
+ <a href="https://github.com/hannesill/m4/actions/workflows/tests.yaml"><img alt="Tests" src="https://github.com/hannesill/m4/actions/workflows/tests.yaml/badge.svg"></a>
15
+ </p>
16
+
17
+ M4 is an infrastructure layer for multimodal EHR data that provides LLM agents with a unified toolbox for querying clinical datasets.
18
+ It supports tabular data and clinical notes, dynamically selecting tools by modality to query MIMIC-IV, eICU, and custom datasets through a single natural-language interface.
19
+
20
+ [Usage example](https://claude.ai/share/93f26832-f298-4d1d-96e3-5608d7f0d7ad)
21
+
22
+ > M4 is a fork of the [M3](https://github.com/rafiattrach/m3) project and would not be possible without it 🫶 Please [cite](#citation) their work when using M4!
23
+
24
+
25
+ ## Quickstart (3 steps)
26
+
27
+ ### 1. Install uv
28
+
29
+ **macOS/Linux:**
30
+ ```bash
31
+ curl -LsSf https://astral.sh/uv/install.sh | sh
32
+ ```
33
+
34
+ **Windows (PowerShell):**
35
+ ```powershell
36
+ powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
37
+ ```
38
+
39
+ ### 2. Initialize M4
40
+
41
+ ```bash
42
+ mkdir my-research && cd my-research
43
+ uv init && uv add m4-infra
44
+ uv run m4 init mimic-iv-demo
45
+ ```
46
+
47
+ This downloads the free MIMIC-IV demo dataset (~16MB) and sets up a local DuckDB database.
48
+
49
+ ### 3. Connect your AI client
50
+
51
+ **Claude Desktop:**
52
+ ```bash
53
+ uv run m4 config claude --quick
54
+ ```
55
+
56
+ **Other clients (Cursor, LibreChat, etc.):**
57
+ ```bash
58
+ uv run m4 config --quick
59
+ ```
60
+
61
+ Copy the generated JSON into your client's MCP settings, restart, and start asking questions!
62
+
63
+ <details>
64
+ <summary>Different setup options</summary>
65
+
66
+ * If you don't want to use uv, you can install M4 with pip instead: `pip install m4-infra`
67
+
68
+ * If you want to use Docker, look at <a href="docs/DEVELOPMENT.md">docs/DEVELOPMENT.md</a>
69
+ </details>
70
+
71
+
72
+ ## Code Execution
73
+
74
+ For complex analysis that goes beyond simple queries, M4 provides a Python API that returns Python data types instead of formatted strings (e.g. pd.DataFrame for SQL queries). This transforms M4 from a query tool into a complete clinical data analysis environment.
75
+
76
+ ```python
77
+ from m4 import set_dataset, execute_query, get_schema
78
+
79
+ set_dataset("mimic-iv")
80
+
81
+ # Get schema as a dict
82
+ schema = get_schema()
83
+ print(schema['tables']) # ['admissions', 'diagnoses_icd', ...]
84
+
85
+ # Query returns a pandas DataFrame
86
+ df = execute_query("""
87
+ SELECT icd_code, COUNT(*) as n
88
+ FROM diagnoses_icd
89
+ GROUP BY icd_code
90
+ ORDER BY n DESC
91
+ LIMIT 10
92
+ """)
93
+
94
+ # Use full pandas power: filter, join, compute statistics
95
+ df[df['n'] > 100].plot(kind='bar')
96
+ ```
97
+
98
+ The API uses the same tools as the MCP server, so behavior is consistent. But instead of parsing text, you get DataFrames you can immediately analyze, visualize, or feed into downstream pipelines.
99
+
100
+ **When to use code execution:**
101
+ - Multi-step analyses where each query informs the next
102
+ - Large result sets (thousands of rows) that shouldn't flood your context
103
+ - Statistical computations, survival analysis, cohort characterization
104
+ - Building reproducible analysis notebooks
105
+
106
+ See [Code Execution Guide](docs/CODE_EXECUTION.md) for the full API reference.
107
+
108
+
109
+ ## Agent Skills
110
+
111
+ M4 ships with skills that teach AI coding assistants how to use the Python API effectively. Skills are contextual prompts that activate when relevant—when you ask about clinical data analysis, the assistant automatically knows how to use M4's API.
112
+
113
+ **Supported tools:** Claude Code, Cursor, Cline, Codex CLI, Gemini CLI, GitHub Copilot
114
+
115
+ ```bash
116
+ m4 skills # Interactive tool selection
117
+ m4 skills --tools claude,cursor # Install for specific tools
118
+ m4 skills --list # Show installed skills
119
+ m4 config claude --skills # Install during Claude Desktop setup
120
+ ```
121
+
122
+ See [Skills Guide](docs/SKILLS.md) for details on the available skills and how to create custom ones.
123
+
124
+
125
+ ## Example Questions
126
+
127
+ Once connected, try asking:
128
+
129
+ **Tabular data (mimic-iv, eicu):**
130
+ - *"What tables are available in the database?"*
131
+ - *"Show me the race distribution in hospital admissions"*
132
+ - *"Find all ICU stays longer than 7 days"*
133
+ - *"What are the most common lab tests?"*
134
+
135
+ **Clinical notes (mimic-iv-note):**
136
+ - *"Search for notes mentioning diabetes"*
137
+ - *"List all notes for patient 10000032"*
138
+ - *"Get the full discharge summary for this patient"*
139
+
140
+
141
+ ## Supported Datasets
142
+
143
+ | Dataset | Modality | Size | Access | Local | BigQuery |
144
+ |---------|----------|------|--------|-------|----------|
145
+ | **mimic-iv-demo** | Tabular | 100 patients | Free | Yes | No |
146
+ | **mimic-iv** | Tabular | 365k patients | [PhysioNet credentialed](https://physionet.org/content/mimiciv/) | Yes | Yes |
147
+ | **mimic-iv-note** | Notes | 331k notes | [PhysioNet credentialed](https://physionet.org/content/mimic-iv-note/) | Yes | Yes |
148
+ | **eicu** | Tabular | 200k+ patients | [PhysioNet credentialed](https://physionet.org/content/eicu-crd/) | Yes | Yes |
149
+
150
+ These datasets are supported out of the box. However, it is possible to add any other custom dataset by following [these instructions](docs/CUSTOM_DATASETS.md).
151
+
152
+ Switch datasets anytime:
153
+ ```bash
154
+ m4 use mimic-iv # Switch to full MIMIC-IV
155
+ m4 status # Show active dataset details
156
+ m4 status --all # List all available datasets
157
+ ```
158
+
159
+ <details>
160
+ <summary><strong>Setting up MIMIC-IV or eICU (credentialed datasets)</strong></summary>
161
+
162
+ 1. **Get PhysioNet credentials:** Complete the [credentialing process](https://physionet.org/settings/credentialing/) and sign the data use agreement for the dataset.
163
+
164
+ 2. **Download the data:**
165
+ ```bash
166
+ # For MIMIC-IV
167
+ wget -r -N -c -np --user YOUR_USERNAME --ask-password \
168
+ https://physionet.org/files/mimiciv/3.1/ \
169
+ -P m4_data/raw_files/mimic-iv
170
+
171
+ # For eICU
172
+ wget -r -N -c -np --user YOUR_USERNAME --ask-password \
173
+ https://physionet.org/files/eicu-crd/2.0/ \
174
+ -P m4_data/raw_files/eicu
175
+ ```
176
+ Keep the downloaded data in an `m4_data` directory, ideally located inside your project directory, and name each dataset's directory `mimic-iv` or `eicu`, as in the commands above.
177
+
178
+ 3. **Initialize:**
179
+ ```bash
180
+ m4 init mimic-iv # or: m4 init eicu
181
+ ```
182
+
183
+ This converts the CSV files to Parquet format and creates a local DuckDB database.
184
+ </details>
185
+
186
+
187
+ ## Available Tools
188
+
189
+ M4 exposes these tools to your AI client. Tools are filtered based on the active dataset's modality.
190
+
191
+ **Dataset Management:**
192
+ | Tool | Description |
193
+ |------|-------------|
194
+ | `list_datasets` | List available datasets and their status |
195
+ | `set_dataset` | Switch the active dataset |
196
+
197
+ **Tabular Data Tools** (mimic-iv, mimic-iv-demo, eicu):
198
+ | Tool | Description |
199
+ |------|-------------|
200
+ | `get_database_schema` | List all available tables |
201
+ | `get_table_info` | Get column details and sample data |
202
+ | `execute_query` | Run SQL SELECT queries |
203
+
204
+ **Clinical Notes Tools** (mimic-iv-note):
205
+ | Tool | Description |
206
+ |------|-------------|
207
+ | `search_notes` | Full-text search with snippets |
208
+ | `get_note` | Retrieve a single note by ID |
209
+ | `list_patient_notes` | List notes for a patient (metadata only) |
210
+
211
+
212
+ ## More Documentation
213
+
214
+ | Guide | Description |
215
+ |-------|-------------|
216
+ | [Code Execution](docs/CODE_EXECUTION.md) | Python API for programmatic access |
217
+ | [Skills](docs/SKILLS.md) | Claude Code skills for contextual assistance |
218
+ | [Tools Reference](docs/TOOLS.md) | MCP tool documentation |
219
+ | [BigQuery Setup](docs/BIGQUERY.md) | Google Cloud for full datasets |
220
+ | [Custom Datasets](docs/CUSTOM_DATASETS.md) | Add your own PhysioNet datasets |
221
+ | [Development](docs/DEVELOPMENT.md) | Contributing, testing, architecture |
222
+ | [OAuth2 Authentication](docs/OAUTH2_AUTHENTICATION.md) | Enterprise security setup |
223
+
224
+ ## Roadmap
225
+
226
+ M4 is designed as a growing toolbox for LLM agents working with EHR data. Planned and ongoing directions include:
227
+
228
+ - **More Tools**
229
+ - Implement tools for current modalities (e.g. statistical reports, RAG)
230
+ - Add tools for new modalities (images, waveforms)
231
+
232
+ - **Better context handling**
233
+ - Concise, dataset-aware context for LLM agents
234
+
235
+ - **Dataset expansion**
236
+ - Out-of-the-box support for additional PhysioNet datasets
237
+ - Improved support for institutional/custom EHR schemas
238
+
239
+ - **Evaluation & reproducibility**
240
+ - Session export and replay
241
+ - Evaluation with the latest LLMs and smaller expert models
242
+
243
+ The roadmap reflects current development goals and may evolve as the project matures.
244
+
245
+ ## Troubleshooting
246
+
247
+ **"Parquet not found" error:**
248
+ ```bash
249
+ m4 init mimic-iv-demo --force
250
+ ```
251
+
252
+ **MCP client won't connect:**
253
+ Check client logs (Claude Desktop: Help → View Logs) and ensure the config JSON is valid.
254
+
255
+ **Need to reconfigure:**
256
+ ```bash
257
+ m4 config claude --quick # Regenerate Claude Desktop config
258
+ m4 config --quick # Regenerate generic config
259
+ ```
260
+
261
+ ## Citation
262
+
263
+ M4 builds on the M3 project. Please cite:
264
+
265
+ ```bibtex
266
+ @article{attrach2025conversational,
267
+ title={Conversational LLMs Simplify Secure Clinical Data Access, Understanding, and Analysis},
268
+ author={Attrach, Rafi Al and Moreira, Pedro and Fani, Rajna and Umeton, Renato and Celi, Leo Anthony},
269
+ journal={arXiv preprint arXiv:2507.01053},
270
+ year={2025}
271
+ }
272
+ ```
273
+
274
+ ---
275
+
276
+ <p align="center">
277
+ <a href="https://github.com/hannesill/m4/issues">Report an Issue</a> ·
278
+ <a href="docs/DEVELOPMENT.md">Contribute</a>
279
+ </p>