m4-infra 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- m4/__init__.py +54 -0
- m4/api.py +346 -0
- m4/auth.py +433 -0
- m4/cli.py +882 -0
- m4/config.py +252 -0
- m4/console.py +325 -0
- m4/core/__init__.py +28 -0
- m4/core/backends/__init__.py +98 -0
- m4/core/backends/base.py +202 -0
- m4/core/backends/bigquery.py +345 -0
- m4/core/backends/duckdb.py +272 -0
- m4/core/datasets.py +283 -0
- m4/core/exceptions.py +152 -0
- m4/core/serialization.py +146 -0
- m4/core/tools/__init__.py +106 -0
- m4/core/tools/base.py +160 -0
- m4/core/tools/management.py +170 -0
- m4/core/tools/notes.py +350 -0
- m4/core/tools/registry.py +332 -0
- m4/core/tools/tabular.py +215 -0
- m4/core/validation.py +217 -0
- m4/data_io.py +531 -0
- m4/mcp_client_configs/__init__.py +6 -0
- m4/mcp_client_configs/dynamic_mcp_config.py +500 -0
- m4/mcp_client_configs/setup_claude_desktop.py +322 -0
- m4/mcp_server.py +515 -0
- m4/skills/SKILLS_INDEX.md +132 -0
- m4/skills/__init__.py +21 -0
- m4/skills/apsiii-score/SKILL.md +147 -0
- m4/skills/apsiii-score/scripts/apsiii.sql +894 -0
- m4/skills/baseline-creatinine/SKILL.md +145 -0
- m4/skills/baseline-creatinine/scripts/creatinine_baseline.sql +71 -0
- m4/skills/clinical-research-pitfalls/SKILL.md +242 -0
- m4/skills/first-icu-stay/SKILL.md +193 -0
- m4/skills/first-icu-stay/scripts/icustay_detail.sql +47 -0
- m4/skills/gcs-calculation/SKILL.md +161 -0
- m4/skills/gcs-calculation/scripts/gcs.sql +127 -0
- m4/skills/installer.py +197 -0
- m4/skills/kdigo-aki-staging/SKILL.md +207 -0
- m4/skills/kdigo-aki-staging/scripts/kdigo_creatinine.sql +63 -0
- m4/skills/kdigo-aki-staging/scripts/kdigo_stages.sql +153 -0
- m4/skills/kdigo-aki-staging/scripts/kdigo_uo.sql +112 -0
- m4/skills/lods-score/SKILL.md +157 -0
- m4/skills/lods-score/scripts/lods.sql +230 -0
- m4/skills/m4-api/SKILL.md +134 -0
- m4/skills/mimic-eicu-mapping/SKILL.md +205 -0
- m4/skills/mimic-table-relationships/SKILL.md +193 -0
- m4/skills/oasis-score/SKILL.md +131 -0
- m4/skills/oasis-score/scripts/oasis.sql +287 -0
- m4/skills/sapsii-score/SKILL.md +134 -0
- m4/skills/sapsii-score/scripts/sapsii.sql +549 -0
- m4/skills/sepsis-3-cohort/SKILL.md +152 -0
- m4/skills/sepsis-3-cohort/scripts/sepsis3.sql +80 -0
- m4/skills/sirs-criteria/SKILL.md +156 -0
- m4/skills/sirs-criteria/scripts/sirs.sql +100 -0
- m4/skills/sofa-score/SKILL.md +138 -0
- m4/skills/sofa-score/scripts/sofa.sql +379 -0
- m4/skills/suspicion-of-infection/SKILL.md +158 -0
- m4/skills/suspicion-of-infection/scripts/suspicion_of_infection.sql +175 -0
- m4/skills/vasopressor-equivalents/SKILL.md +158 -0
- m4/skills/vasopressor-equivalents/scripts/norepinephrine_equivalent_dose.sql +33 -0
- m4_infra-0.0.0.dev0.dist-info/METADATA +324 -0
- m4_infra-0.0.0.dev0.dist-info/RECORD +66 -0
- m4_infra-0.0.0.dev0.dist-info/WHEEL +4 -0
- m4_infra-0.0.0.dev0.dist-info/entry_points.txt +8 -0
- m4_infra-0.0.0.dev0.dist-info/licenses/LICENSE +21 -0
m4/__init__.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""M4: Multi-Dataset Infrastructure for LLM-Assisted Clinical Research.
|
|
2
|
+
|
|
3
|
+
M4 provides rigorous, auditable infrastructure for AI-assisted clinical research,
|
|
4
|
+
offering a safe interface for LLMs and autonomous agents to interact with EHR data.
|
|
5
|
+
|
|
6
|
+
Quick Start:
|
|
7
|
+
from m4 import execute_query, set_dataset, get_schema
|
|
8
|
+
|
|
9
|
+
set_dataset("mimic-iv")
|
|
10
|
+
print(get_schema())
|
|
11
|
+
result = execute_query("SELECT COUNT(*) FROM patients")
|
|
12
|
+
|
|
13
|
+
For MCP server usage, run: m4 serve
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
__version__ = "0.0.0.dev0"
|
|
17
|
+
|
|
18
|
+
# Expose API functions at package level for easy imports
|
|
19
|
+
from m4.api import (
|
|
20
|
+
# Exceptions
|
|
21
|
+
DatasetError,
|
|
22
|
+
M4Error,
|
|
23
|
+
ModalityError,
|
|
24
|
+
QueryError,
|
|
25
|
+
# Tabular data
|
|
26
|
+
execute_query,
|
|
27
|
+
# Dataset management
|
|
28
|
+
get_active_dataset,
|
|
29
|
+
# Clinical notes
|
|
30
|
+
get_note,
|
|
31
|
+
get_schema,
|
|
32
|
+
get_table_info,
|
|
33
|
+
list_datasets,
|
|
34
|
+
list_patient_notes,
|
|
35
|
+
search_notes,
|
|
36
|
+
set_dataset,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Public surface of the ``m4`` package: the names re-exported from ``m4.api``
# by the import statement above, plus the package version string.
__all__ = [
    # Exception types
    "DatasetError",
    "M4Error",
    "ModalityError",
    "QueryError",
    # Package metadata
    "__version__",
    # API functions
    "execute_query",
    "get_active_dataset",
    "get_note",
    "get_schema",
    "get_table_info",
    "list_datasets",
    "list_patient_notes",
    "search_notes",
    "set_dataset",
]
|
m4/api.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
"""M4 Python API for direct access to clinical data tools.
|
|
2
|
+
|
|
3
|
+
This module provides a clean Python API for code execution environments
|
|
4
|
+
like Claude Code. Functions delegate to the same tool classes used by
|
|
5
|
+
the MCP server, ensuring consistent behavior across interfaces.
|
|
6
|
+
|
|
7
|
+
Unlike the MCP server, this API returns native Python types:
|
|
8
|
+
- execute_query() returns pd.DataFrame
|
|
9
|
+
- get_schema() returns dict with tables list
|
|
10
|
+
- get_table_info() returns dict with schema DataFrame
|
|
11
|
+
- etc.
|
|
12
|
+
|
|
13
|
+
Example:
|
|
14
|
+
from m4 import execute_query, set_dataset, get_schema
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
set_dataset("mimic-iv")
|
|
18
|
+
schema = get_schema() # Returns dict with 'tables' list
|
|
19
|
+
print(schema['tables'])
|
|
20
|
+
|
|
21
|
+
df = execute_query("SELECT COUNT(*) FROM patients")
|
|
22
|
+
print(df) # DataFrame
|
|
23
|
+
|
|
24
|
+
All functions work with the currently active dataset. Use set_dataset()
|
|
25
|
+
to switch between datasets.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from typing import Any
|
|
29
|
+
|
|
30
|
+
import pandas as pd
|
|
31
|
+
|
|
32
|
+
from m4.config import get_active_dataset as _get_active_dataset
|
|
33
|
+
from m4.config import set_active_dataset as _set_active_dataset
|
|
34
|
+
from m4.core.datasets import DatasetRegistry
|
|
35
|
+
from m4.core.exceptions import DatasetError, M4Error, ModalityError, QueryError
|
|
36
|
+
from m4.core.tools import ToolRegistry, ToolSelector, init_tools
|
|
37
|
+
from m4.core.tools.notes import (
|
|
38
|
+
GetNoteInput,
|
|
39
|
+
ListPatientNotesInput,
|
|
40
|
+
SearchNotesInput,
|
|
41
|
+
)
|
|
42
|
+
from m4.core.tools.tabular import (
|
|
43
|
+
ExecuteQueryInput,
|
|
44
|
+
GetDatabaseSchemaInput,
|
|
45
|
+
GetTableInfoInput,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
# Run tool initialization once, as a side effect of importing this module.
init_tools()

# Module-wide selector; the notes helpers use it to ask whether a given tool
# is compatible with the active dataset.
_tool_selector = ToolSelector()

# Public names of this module: the exception classes imported from
# m4.core.exceptions (re-exported for convenience) and the API functions
# defined below.
__all__ = [
    # Exception types
    "DatasetError",
    "M4Error",
    "ModalityError",
    "QueryError",
    # API functions
    "execute_query",
    "get_active_dataset",
    "get_note",
    "get_schema",
    "get_table_info",
    "list_datasets",
    "list_patient_notes",
    "search_notes",
    "set_dataset",
]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# =============================================================================
|
|
73
|
+
# Dataset Management
|
|
74
|
+
# =============================================================================
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def list_datasets() -> list[str]:
    """Return the name of every dataset in the DatasetRegistry.

    Returns:
        The ``name`` attribute of each entry yielded by
        ``DatasetRegistry.list_all()``, in that order. These are the
        names intended for use with set_dataset().

    Example:
        >>> list_datasets()
        ['mimic-iv', 'mimic-iv-note', 'eicu']
    """
    names: list[str] = []
    for entry in DatasetRegistry.list_all():
        names.append(entry.name)
    return names
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def set_dataset(name: str) -> str:
    """Set the active dataset for subsequent queries.

    The name is looked up in the DatasetRegistry before the active-dataset
    setting is changed, so a name missing from the registry is rejected
    without modifying that setting.

    Args:
        name: Dataset name (e.g., 'mimic-iv', 'eicu')

    Returns:
        Confirmation message with dataset info.

    Raises:
        DatasetError: If the dataset is not in the registry, or if
            changing the active dataset raises ValueError. The message
            lists the available dataset names.

    Example:
        >>> set_dataset("mimic-iv")
        'Active dataset: mimic-iv (modalities: TABULAR)'
    """
    try:
        # Validate first: the setting must not change for an unknown name.
        dataset = DatasetRegistry.get(name)
        if not dataset:
            raise ValueError(f"Dataset '{name}' not found")
        # Any ValueError raised here is converted to DatasetError below.
        _set_active_dataset(name)
        modalities = ", ".join(m.name for m in dataset.modalities)
        return f"Active dataset: {name} (modalities: {modalities})"
    except ValueError as e:
        available = ", ".join(list_datasets())
        raise DatasetError(f"{e}. Available datasets: {available}") from e
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def get_active_dataset() -> str:
    """Return the name of the dataset that is currently active.

    Returns:
        Name of the active dataset, as reported by the config layer.

    Raises:
        DatasetError: If the config layer raises ValueError (per the
            original documentation, when no dataset is active). The
            ValueError's message is reused and the original exception
            is chained as the cause.
    """
    try:
        active_name = _get_active_dataset()
    except ValueError as err:
        raise DatasetError(str(err)) from err
    else:
        return active_name
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# =============================================================================
|
|
134
|
+
# Tabular Data Tools
|
|
135
|
+
# =============================================================================
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def get_schema() -> dict[str, Any]:
    """Return schema information for the active dataset.

    Delegates to the registered ``get_database_schema`` tool.

    Returns:
        dict with:
            - backend_info: str - Backend description
            - tables: list[str] - List of table names

    Example:
        >>> set_dataset("mimic-iv")
        >>> schema = get_schema()
        >>> print(schema['tables'])
        ['admissions', 'diagnoses_icd', 'patients', ...]
    """
    active = DatasetRegistry.get_active()
    schema_tool = ToolRegistry.get("get_database_schema")
    request = GetDatabaseSchemaInput()
    return schema_tool.invoke(active, request)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def get_table_info(table_name: str, show_sample: bool = True) -> dict[str, Any]:
    """Describe one table of the active dataset, optionally with sample rows.

    Delegates to the registered ``get_table_info`` tool.

    Args:
        table_name: Name of the table to inspect.
        show_sample: If True, include sample rows (default: True).

    Returns:
        dict with:
            - backend_info: str - Backend description
            - table_name: str - Table name
            - schema: pd.DataFrame - Column information
            - sample: pd.DataFrame | None - Sample rows if requested

    Raises:
        QueryError: If table doesn't exist.

    Example:
        >>> info = get_table_info("patients")
        >>> print(info['schema'])  # DataFrame with column info
        >>> print(info['sample'])  # DataFrame with sample rows
    """
    active = DatasetRegistry.get_active()
    info_tool = ToolRegistry.get("get_table_info")
    request = GetTableInfoInput(table_name=table_name, show_sample=show_sample)
    return info_tool.invoke(active, request)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def execute_query(sql: str) -> pd.DataFrame:
    """Run a SQL SELECT query against the active dataset.

    Delegates to the registered ``execute_query`` tool, passing the SQL
    text as its ``sql_query`` input.

    Args:
        sql: SQL SELECT query string.

    Returns:
        pd.DataFrame with query results.

    Raises:
        SecurityError: If query violates security constraints.
        QueryError: If query execution fails.

    Example:
        >>> df = execute_query("SELECT gender, COUNT(*) FROM patients GROUP BY gender")
        >>> print(df)
          gender  count_star()
        0      M           55
        1      F           45
    """
    active = DatasetRegistry.get_active()
    query_tool = ToolRegistry.get("execute_query")
    request = ExecuteQueryInput(sql_query=sql)
    return query_tool.invoke(active, request)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# =============================================================================
|
|
212
|
+
# Clinical Notes Tools
|
|
213
|
+
# =============================================================================
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _check_notes_compatibility(tool_name: str) -> None:
    """Raise ModalityError unless the active dataset can run ``tool_name``.

    Args:
        tool_name: Registry name of the notes tool about to be invoked.

    Raises:
        ModalityError: If the tool selector reports the tool as
            incompatible with the active dataset. The message names the
            dataset and lists the modalities it does provide.
    """
    active = DatasetRegistry.get_active()
    verdict = _tool_selector.check_compatibility(tool_name, active)
    if verdict.compatible:
        return

    dataset_name = active.name
    supported = ", ".join(m.name for m in active.modalities)
    raise ModalityError(
        f"Dataset '{dataset_name}' does not support clinical notes. "
        f"Available modalities: {supported}. "
        f"Use a dataset with NOTES modality (e.g., 'mimic-iv-note')."
    )
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def search_notes(
    query: str,
    note_type: str = "all",
    limit: int = 5,
    snippet_length: int = 300,
) -> dict[str, Any]:
    """Keyword-search clinical notes and return matching snippets.

    Checks that the active dataset supports the ``search_notes`` tool,
    then delegates to it.

    Args:
        query: Search term to find in notes.
        note_type: Type of notes - 'discharge', 'radiology', or 'all'.
        limit: Maximum results per note type (default: 5).
        snippet_length: Characters of context around matches (default: 300).

    Returns:
        dict with:
            - backend_info: str - Backend description
            - query: str - Search term used
            - snippet_length: int - Snippet length
            - results: dict[str, pd.DataFrame] - Results by note type

    Raises:
        ModalityError: If active dataset doesn't support notes.
        QueryError: If note_type is invalid.

    Example:
        >>> set_dataset("mimic-iv-note")
        >>> results = search_notes("pneumonia", limit=3)
        >>> for note_type, df in results['results'].items():
        ...     print(f"{note_type}: {len(df)} matches")
    """
    # Fail early with ModalityError before building the tool request.
    _check_notes_compatibility("search_notes")

    active = DatasetRegistry.get_active()
    search_tool = ToolRegistry.get("search_notes")
    request = SearchNotesInput(
        query=query,
        note_type=note_type,
        limit=limit,
        snippet_length=snippet_length,
    )
    return search_tool.invoke(active, request)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def get_note(note_id: str, max_length: int | None = None) -> dict[str, Any]:
    """Fetch the text of a single clinical note by its ID.

    Checks that the active dataset supports the ``get_note`` tool, then
    delegates to it.

    Args:
        note_id: The note ID (e.g., from search_notes results).
        max_length: Optional maximum characters to return.

    Returns:
        dict with:
            - backend_info: str - Backend description
            - note_id: str - Note identifier
            - subject_id: int - Patient ID
            - text: str - Full note text (possibly truncated)
            - note_length: int - Original note length
            - truncated: bool - Whether text was truncated

    Raises:
        ModalityError: If active dataset doesn't support notes.
        QueryError: If note not found.

    Example:
        >>> note = get_note("10000032_DS-1")
        >>> print(note['text'][:500])
    """
    # Fail early with ModalityError before building the tool request.
    _check_notes_compatibility("get_note")

    active = DatasetRegistry.get_active()
    note_tool = ToolRegistry.get("get_note")
    request = GetNoteInput(note_id=note_id, max_length=max_length)
    return note_tool.invoke(active, request)
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def list_patient_notes(
    subject_id: int,
    note_type: str = "all",
    limit: int = 20,
) -> dict[str, Any]:
    """List the clinical notes available for one patient (metadata only).

    Checks that the active dataset supports the ``list_patient_notes``
    tool, then delegates to it.

    Args:
        subject_id: Patient identifier.
        note_type: Type of notes - 'discharge', 'radiology', or 'all'.
        limit: Maximum notes to return (default: 20).

    Returns:
        dict with:
            - backend_info: str - Backend description
            - subject_id: int - Patient ID
            - notes: dict[str, pd.DataFrame] - Note metadata by type

    Raises:
        ModalityError: If active dataset doesn't support notes.
        QueryError: If note_type is invalid.

    Example:
        >>> notes = list_patient_notes(10000032)
        >>> for note_type, df in notes['notes'].items():
        ...     print(f"{note_type}: {len(df)} notes")
    """
    # Fail early with ModalityError before building the tool request.
    _check_notes_compatibility("list_patient_notes")

    active = DatasetRegistry.get_active()
    listing_tool = ToolRegistry.get("list_patient_notes")
    request = ListPatientNotesInput(
        subject_id=subject_id,
        note_type=note_type,
        limit=limit,
    )
    return listing_tool.invoke(active, request)
|