m4-infra 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. m4/__init__.py +54 -0
  2. m4/api.py +346 -0
  3. m4/auth.py +433 -0
  4. m4/cli.py +882 -0
  5. m4/config.py +252 -0
  6. m4/console.py +325 -0
  7. m4/core/__init__.py +28 -0
  8. m4/core/backends/__init__.py +98 -0
  9. m4/core/backends/base.py +202 -0
  10. m4/core/backends/bigquery.py +345 -0
  11. m4/core/backends/duckdb.py +272 -0
  12. m4/core/datasets.py +283 -0
  13. m4/core/exceptions.py +152 -0
  14. m4/core/serialization.py +146 -0
  15. m4/core/tools/__init__.py +106 -0
  16. m4/core/tools/base.py +160 -0
  17. m4/core/tools/management.py +170 -0
  18. m4/core/tools/notes.py +350 -0
  19. m4/core/tools/registry.py +332 -0
  20. m4/core/tools/tabular.py +215 -0
  21. m4/core/validation.py +217 -0
  22. m4/data_io.py +531 -0
  23. m4/mcp_client_configs/__init__.py +6 -0
  24. m4/mcp_client_configs/dynamic_mcp_config.py +500 -0
  25. m4/mcp_client_configs/setup_claude_desktop.py +322 -0
  26. m4/mcp_server.py +515 -0
  27. m4/skills/SKILLS_INDEX.md +132 -0
  28. m4/skills/__init__.py +21 -0
  29. m4/skills/apsiii-score/SKILL.md +147 -0
  30. m4/skills/apsiii-score/scripts/apsiii.sql +894 -0
  31. m4/skills/baseline-creatinine/SKILL.md +145 -0
  32. m4/skills/baseline-creatinine/scripts/creatinine_baseline.sql +71 -0
  33. m4/skills/clinical-research-pitfalls/SKILL.md +242 -0
  34. m4/skills/first-icu-stay/SKILL.md +193 -0
  35. m4/skills/first-icu-stay/scripts/icustay_detail.sql +47 -0
  36. m4/skills/gcs-calculation/SKILL.md +161 -0
  37. m4/skills/gcs-calculation/scripts/gcs.sql +127 -0
  38. m4/skills/installer.py +197 -0
  39. m4/skills/kdigo-aki-staging/SKILL.md +207 -0
  40. m4/skills/kdigo-aki-staging/scripts/kdigo_creatinine.sql +63 -0
  41. m4/skills/kdigo-aki-staging/scripts/kdigo_stages.sql +153 -0
  42. m4/skills/kdigo-aki-staging/scripts/kdigo_uo.sql +112 -0
  43. m4/skills/lods-score/SKILL.md +157 -0
  44. m4/skills/lods-score/scripts/lods.sql +230 -0
  45. m4/skills/m4-api/SKILL.md +134 -0
  46. m4/skills/mimic-eicu-mapping/SKILL.md +205 -0
  47. m4/skills/mimic-table-relationships/SKILL.md +193 -0
  48. m4/skills/oasis-score/SKILL.md +131 -0
  49. m4/skills/oasis-score/scripts/oasis.sql +287 -0
  50. m4/skills/sapsii-score/SKILL.md +134 -0
  51. m4/skills/sapsii-score/scripts/sapsii.sql +549 -0
  52. m4/skills/sepsis-3-cohort/SKILL.md +152 -0
  53. m4/skills/sepsis-3-cohort/scripts/sepsis3.sql +80 -0
  54. m4/skills/sirs-criteria/SKILL.md +156 -0
  55. m4/skills/sirs-criteria/scripts/sirs.sql +100 -0
  56. m4/skills/sofa-score/SKILL.md +138 -0
  57. m4/skills/sofa-score/scripts/sofa.sql +379 -0
  58. m4/skills/suspicion-of-infection/SKILL.md +158 -0
  59. m4/skills/suspicion-of-infection/scripts/suspicion_of_infection.sql +175 -0
  60. m4/skills/vasopressor-equivalents/SKILL.md +158 -0
  61. m4/skills/vasopressor-equivalents/scripts/norepinephrine_equivalent_dose.sql +33 -0
  62. m4_infra-0.0.0.dev0.dist-info/METADATA +324 -0
  63. m4_infra-0.0.0.dev0.dist-info/RECORD +66 -0
  64. m4_infra-0.0.0.dev0.dist-info/WHEEL +4 -0
  65. m4_infra-0.0.0.dev0.dist-info/entry_points.txt +8 -0
  66. m4_infra-0.0.0.dev0.dist-info/licenses/LICENSE +21 -0
m4/__init__.py ADDED
@@ -0,0 +1,54 @@
1
+ """M4: Multi-Dataset Infrastructure for LLM-Assisted Clinical Research.
2
+
3
+ M4 provides rigorous, auditable infrastructure for AI-assisted clinical research,
4
+ offering a safe interface for LLMs and autonomous agents to interact with EHR data.
5
+
6
+ Quick Start:
7
+ from m4 import execute_query, set_dataset, get_schema
8
+
9
+ set_dataset("mimic-iv")
10
+ print(get_schema())
11
+ result = execute_query("SELECT COUNT(*) FROM patients")
12
+
13
+ For MCP server usage, run: m4 serve
14
+ """
15
+
16
# Version string of the m4 package.
__version__ = "0.0.0.dev0"

# Re-export the public API from m4.api so callers can write
# ``from m4 import execute_query`` and friends. The names cover the exception
# classes, the tabular-data helpers, dataset management, and the clinical
# notes helpers.
from m4.api import (
    DatasetError,
    M4Error,
    ModalityError,
    QueryError,
    execute_query,
    get_active_dataset,
    get_note,
    get_schema,
    get_table_info,
    list_datasets,
    list_patient_notes,
    search_notes,
    set_dataset,
)

# Names exported by ``from m4 import *``.
__all__ = [
    "DatasetError",
    "M4Error",
    "ModalityError",
    "QueryError",
    "__version__",
    "execute_query",
    "get_active_dataset",
    "get_note",
    "get_schema",
    "get_table_info",
    "list_datasets",
    "list_patient_notes",
    "search_notes",
    "set_dataset",
]
m4/api.py ADDED
@@ -0,0 +1,346 @@
1
+ """M4 Python API for direct access to clinical data tools.
2
+
3
+ This module provides a clean Python API for code execution environments
4
+ like Claude Code. Functions delegate to the same tool classes used by
5
+ the MCP server, ensuring consistent behavior across interfaces.
6
+
7
+ Unlike the MCP server, this API returns native Python types:
8
+ - execute_query() returns pd.DataFrame
9
+ - get_schema() returns dict with tables list
10
+ - get_table_info() returns dict with schema DataFrame
11
+ - etc.
12
+
13
+ Example:
14
+ from m4 import execute_query, set_dataset, get_schema
15
+ import pandas as pd
16
+
17
+ set_dataset("mimic-iv")
18
+ schema = get_schema() # Returns dict with 'tables' list
19
+ print(schema['tables'])
20
+
21
+ df = execute_query("SELECT COUNT(*) FROM patients")
22
+ print(df) # DataFrame
23
+
24
+ All functions work with the currently active dataset. Use set_dataset()
25
+ to switch between datasets.
26
+ """
27
+
28
+ from typing import Any
29
+
30
+ import pandas as pd
31
+
32
+ from m4.config import get_active_dataset as _get_active_dataset
33
+ from m4.config import set_active_dataset as _set_active_dataset
34
+ from m4.core.datasets import DatasetRegistry
35
+ from m4.core.exceptions import DatasetError, M4Error, ModalityError, QueryError
36
+ from m4.core.tools import ToolRegistry, ToolSelector, init_tools
37
+ from m4.core.tools.notes import (
38
+ GetNoteInput,
39
+ ListPatientNotesInput,
40
+ SearchNotesInput,
41
+ )
42
+ from m4.core.tools.tabular import (
43
+ ExecuteQueryInput,
44
+ GetDatabaseSchemaInput,
45
+ GetTableInfoInput,
46
+ )
47
+
48
# Public surface of this module: the API functions plus the exception
# classes, which are re-exported for convenience.
__all__ = [
    "DatasetError",
    "M4Error",
    "ModalityError",
    "QueryError",
    "execute_query",
    "get_active_dataset",
    "get_note",
    "get_schema",
    "get_table_info",
    "list_datasets",
    "list_patient_notes",
    "search_notes",
    "set_dataset",
]

# Tool initialisation runs as a side effect of importing this module.
init_tools()

# Module-wide selector used for tool/dataset compatibility checks.
_tool_selector = ToolSelector()
70
+
71
+
72
+ # =============================================================================
73
+ # Dataset Management
74
+ # =============================================================================
75
+
76
+
77
def list_datasets() -> list[str]:
    """Return the name of every dataset in the registry.

    Returns:
        The ``name`` of each entry yielded by ``DatasetRegistry.list_all()``,
        in that order. These names can be passed to set_dataset().

    Example:
        >>> list_datasets()
        ['mimic-iv', 'mimic-iv-note', 'eicu']
    """
    registered = DatasetRegistry.list_all()
    return [entry.name for entry in registered]
88
+
89
+
90
def set_dataset(name: str) -> str:
    """Make ``name`` the active dataset for subsequent calls.

    Args:
        name: Dataset name (e.g., 'mimic-iv', 'eicu').

    Returns:
        Confirmation message naming the dataset and its modalities.

    Raises:
        DatasetError: If activating or looking up the dataset raises a
            ValueError, including when the registry has no entry for
            ``name``. The message lists the available datasets.

    Example:
        >>> set_dataset("mimic-iv")
        'Active dataset: mimic-iv (modalities: TABULAR)'
    """
    try:
        _set_active_dataset(name)
        entry = DatasetRegistry.get(name)
        if entry:
            modality_names = ", ".join([m.name for m in entry.modalities])
            return f"Active dataset: {name} (modalities: {modality_names})"
        # Raised inside the try on purpose: the handler below converts it
        # into a DatasetError with the list of known datasets appended.
        raise ValueError(f"Dataset '{name}' not found")
    except ValueError as err:
        known = ", ".join(list_datasets())
        raise DatasetError(f"{err}. Available datasets: {known}") from err
116
+
117
+
118
def get_active_dataset() -> str:
    """Return the name of the dataset that is currently active.

    Returns:
        Name of the active dataset, as reported by the config layer.

    Raises:
        DatasetError: If the config lookup raises a ValueError; the original
            message is preserved and the ValueError is chained as the cause.
    """
    try:
        active_name = _get_active_dataset()
    except ValueError as err:
        raise DatasetError(str(err)) from err
    else:
        return active_name
131
+
132
+
133
+ # =============================================================================
134
+ # Tabular Data Tools
135
+ # =============================================================================
136
+
137
+
138
def get_schema() -> dict[str, Any]:
    """Describe the database schema of the active dataset.

    Delegates to the registered ``get_database_schema`` tool.

    Returns:
        dict with:
        - backend_info: str - Backend description
        - tables: list[str] - List of table names

    Example:
        >>> set_dataset("mimic-iv")
        >>> schema = get_schema()
        >>> print(schema['tables'])
        ['admissions', 'diagnoses_icd', 'patients', ...]
    """
    active = DatasetRegistry.get_active()
    run_tool = ToolRegistry.get("get_database_schema").invoke
    return run_tool(active, GetDatabaseSchemaInput())
155
+
156
+
157
def get_table_info(table_name: str, show_sample: bool = True) -> dict[str, Any]:
    """Inspect one table of the active dataset.

    Delegates to the registered ``get_table_info`` tool.

    Args:
        table_name: Name of the table to inspect.
        show_sample: If True, include sample rows (default: True).

    Returns:
        dict with:
        - backend_info: str - Backend description
        - table_name: str - Table name
        - schema: pd.DataFrame - Column information
        - sample: pd.DataFrame | None - Sample rows if requested

    Raises:
        QueryError: If table doesn't exist.

    Example:
        >>> info = get_table_info("patients")
        >>> print(info['schema'])  # DataFrame with column info
        >>> print(info['sample'])  # DataFrame with sample rows
    """
    active = DatasetRegistry.get_active()
    run_tool = ToolRegistry.get("get_table_info").invoke
    request = GetTableInfoInput(table_name=table_name, show_sample=show_sample)
    return run_tool(active, request)
184
+
185
+
186
def execute_query(sql: str) -> pd.DataFrame:
    """Run a SQL SELECT query against the active dataset.

    Delegates to the registered ``execute_query`` tool, passing ``sql`` as
    its ``sql_query`` input.

    Args:
        sql: SQL SELECT query string.

    Returns:
        pd.DataFrame with query results.

    Raises:
        SecurityError: If query violates security constraints.
        QueryError: If query execution fails.

    Example:
        >>> df = execute_query("SELECT gender, COUNT(*) FROM patients GROUP BY gender")
        >>> print(df)
          gender  count_star()
        0      M            55
        1      F            45
    """
    active = DatasetRegistry.get_active()
    run_tool = ToolRegistry.get("execute_query").invoke
    request = ExecuteQueryInput(sql_query=sql)
    return run_tool(active, request)
209
+
210
+
211
+ # =============================================================================
212
+ # Clinical Notes Tools
213
+ # =============================================================================
214
+
215
+
216
def _check_notes_compatibility(tool_name: str) -> None:
    """Raise ModalityError unless the active dataset is compatible with a tool.

    Args:
        tool_name: Name of the notes tool to check against the active dataset.

    Raises:
        ModalityError: If the selector reports the tool as incompatible with
            the active dataset. The message lists the dataset's modalities.
    """
    active = DatasetRegistry.get_active()
    verdict = _tool_selector.check_compatibility(tool_name, active)
    if verdict.compatible:
        return

    dataset_name = active.name
    modality_names = ", ".join(m.name for m in active.modalities)
    raise ModalityError(
        f"Dataset '{dataset_name}' does not support clinical notes. "
        f"Available modalities: {modality_names}. "
        f"Use a dataset with NOTES modality (e.g., 'mimic-iv-note')."
    )
226
+
227
+
228
def search_notes(
    query: str,
    note_type: str = "all",
    limit: int = 5,
    snippet_length: int = 300,
) -> dict[str, Any]:
    """Keyword-search clinical notes and return matching snippets.

    The active dataset is first checked for compatibility with the
    ``search_notes`` tool; the search itself is delegated to that tool.

    Args:
        query: Search term to find in notes.
        note_type: Type of notes - 'discharge', 'radiology', or 'all'.
        limit: Maximum results per note type (default: 5).
        snippet_length: Characters of context around matches (default: 300).

    Returns:
        dict with:
        - backend_info: str - Backend description
        - query: str - Search term used
        - snippet_length: int - Snippet length
        - results: dict[str, pd.DataFrame] - Results by note type

    Raises:
        ModalityError: If active dataset doesn't support notes.
        QueryError: If note_type is invalid.

    Example:
        >>> set_dataset("mimic-iv-note")
        >>> results = search_notes("pneumonia", limit=3)
        >>> for note_type, df in results['results'].items():
        ...     print(f"{note_type}: {len(df)} matches")
    """
    _check_notes_compatibility("search_notes")

    active = DatasetRegistry.get_active()
    run_tool = ToolRegistry.get("search_notes").invoke
    request = SearchNotesInput(
        query=query,
        note_type=note_type,
        limit=limit,
        snippet_length=snippet_length,
    )
    return run_tool(active, request)
272
+
273
+
274
def get_note(note_id: str, max_length: int | None = None) -> dict[str, Any]:
    """Fetch the text of a single clinical note by its ID.

    The active dataset is first checked for compatibility with the
    ``get_note`` tool; retrieval itself is delegated to that tool.

    Args:
        note_id: The note ID (e.g., from search_notes results).
        max_length: Optional maximum characters to return.

    Returns:
        dict with:
        - backend_info: str - Backend description
        - note_id: str - Note identifier
        - subject_id: int - Patient ID
        - text: str - Full note text (possibly truncated)
        - note_length: int - Original note length
        - truncated: bool - Whether text was truncated

    Raises:
        ModalityError: If active dataset doesn't support notes.
        QueryError: If note not found.

    Example:
        >>> note = get_note("10000032_DS-1")
        >>> print(note['text'][:500])
    """
    _check_notes_compatibility("get_note")

    active = DatasetRegistry.get_active()
    run_tool = ToolRegistry.get("get_note").invoke
    request = GetNoteInput(note_id=note_id, max_length=max_length)
    return run_tool(active, request)
306
+
307
+
308
def list_patient_notes(
    subject_id: int,
    note_type: str = "all",
    limit: int = 20,
) -> dict[str, Any]:
    """List the clinical notes available for one patient (metadata only).

    The active dataset is first checked for compatibility with the
    ``list_patient_notes`` tool; the listing itself is delegated to that tool.

    Args:
        subject_id: Patient identifier.
        note_type: Type of notes - 'discharge', 'radiology', or 'all'.
        limit: Maximum notes to return (default: 20).

    Returns:
        dict with:
        - backend_info: str - Backend description
        - subject_id: int - Patient ID
        - notes: dict[str, pd.DataFrame] - Note metadata by type

    Raises:
        ModalityError: If active dataset doesn't support notes.
        QueryError: If note_type is invalid.

    Example:
        >>> notes = list_patient_notes(10000032)
        >>> for note_type, df in notes['notes'].items():
        ...     print(f"{note_type}: {len(df)} notes")
    """
    _check_notes_compatibility("list_patient_notes")

    active = DatasetRegistry.get_active()
    run_tool = ToolRegistry.get("list_patient_notes").invoke
    request = ListPatientNotesInput(
        subject_id=subject_id,
        note_type=note_type,
        limit=limit,
    )
    return run_tool(active, request)