trialmatchai 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152)
  1. trialmatchai-0.2.0/LICENSE +31 -0
  2. trialmatchai-0.2.0/PKG-INFO +474 -0
  3. trialmatchai-0.2.0/README.md +416 -0
  4. trialmatchai-0.2.0/pyproject.toml +119 -0
  5. trialmatchai-0.2.0/setup.cfg +4 -0
  6. trialmatchai-0.2.0/src/trialmatchai/__init__.py +3 -0
  7. trialmatchai-0.2.0/src/trialmatchai/__main__.py +4 -0
  8. trialmatchai-0.2.0/src/trialmatchai/cli/__init__.py +1 -0
  9. trialmatchai-0.2.0/src/trialmatchai/cli/bootstrap_data.py +260 -0
  10. trialmatchai-0.2.0/src/trialmatchai/cli/build.py +116 -0
  11. trialmatchai-0.2.0/src/trialmatchai/cli/build_concepts.py +208 -0
  12. trialmatchai-0.2.0/src/trialmatchai/cli/e2e.py +70 -0
  13. trialmatchai-0.2.0/src/trialmatchai/cli/healthcheck.py +106 -0
  14. trialmatchai-0.2.0/src/trialmatchai/cli/import_patient.py +100 -0
  15. trialmatchai-0.2.0/src/trialmatchai/cli/index_data.py +93 -0
  16. trialmatchai-0.2.0/src/trialmatchai/cli/main.py +100 -0
  17. trialmatchai-0.2.0/src/trialmatchai/cli/pipeline.py +106 -0
  18. trialmatchai-0.2.0/src/trialmatchai/cli/run.py +27 -0
  19. trialmatchai-0.2.0/src/trialmatchai/cli/trec.py +69 -0
  20. trialmatchai-0.2.0/src/trialmatchai/cli/update_registry.py +201 -0
  21. trialmatchai-0.2.0/src/trialmatchai/config/__init__.py +0 -0
  22. trialmatchai-0.2.0/src/trialmatchai/config/config.json +141 -0
  23. trialmatchai-0.2.0/src/trialmatchai/config/config_loader.py +178 -0
  24. trialmatchai-0.2.0/src/trialmatchai/config/settings.py +444 -0
  25. trialmatchai-0.2.0/src/trialmatchai/constraints/__init__.py +30 -0
  26. trialmatchai-0.2.0/src/trialmatchai/constraints/evaluation.py +445 -0
  27. trialmatchai-0.2.0/src/trialmatchai/constraints/extraction.py +487 -0
  28. trialmatchai-0.2.0/src/trialmatchai/constraints/models.py +121 -0
  29. trialmatchai-0.2.0/src/trialmatchai/constraints/reports.py +119 -0
  30. trialmatchai-0.2.0/src/trialmatchai/entities/__init__.py +13 -0
  31. trialmatchai-0.2.0/src/trialmatchai/entities/annotator.py +113 -0
  32. trialmatchai-0.2.0/src/trialmatchai/entities/builder.py +193 -0
  33. trialmatchai-0.2.0/src/trialmatchai/entities/concept_sources.py +242 -0
  34. trialmatchai-0.2.0/src/trialmatchai/entities/linker.py +352 -0
  35. trialmatchai-0.2.0/src/trialmatchai/entities/recognizers.py +351 -0
  36. trialmatchai-0.2.0/src/trialmatchai/entities/resources/variant_patterns.tsv +52 -0
  37. trialmatchai-0.2.0/src/trialmatchai/entities/schemas.py +111 -0
  38. trialmatchai-0.2.0/src/trialmatchai/entities/types.py +135 -0
  39. trialmatchai-0.2.0/src/trialmatchai/entity_schemas/trialmatchai.yaml +122 -0
  40. trialmatchai-0.2.0/src/trialmatchai/finetuning/__init__.py +17 -0
  41. trialmatchai-0.2.0/src/trialmatchai/finetuning/_sft.py +231 -0
  42. trialmatchai-0.2.0/src/trialmatchai/finetuning/cli.py +208 -0
  43. trialmatchai-0.2.0/src/trialmatchai/finetuning/config.py +94 -0
  44. trialmatchai-0.2.0/src/trialmatchai/finetuning/cot.py +25 -0
  45. trialmatchai-0.2.0/src/trialmatchai/finetuning/data.py +177 -0
  46. trialmatchai-0.2.0/src/trialmatchai/finetuning/merge.py +45 -0
  47. trialmatchai-0.2.0/src/trialmatchai/finetuning/ner.py +197 -0
  48. trialmatchai-0.2.0/src/trialmatchai/finetuning/reranker.py +30 -0
  49. trialmatchai-0.2.0/src/trialmatchai/interop/__init__.py +25 -0
  50. trialmatchai-0.2.0/src/trialmatchai/interop/detect.py +57 -0
  51. trialmatchai-0.2.0/src/trialmatchai/interop/exporters/__init__.py +9 -0
  52. trialmatchai-0.2.0/src/trialmatchai/interop/exporters/fhir.py +83 -0
  53. trialmatchai-0.2.0/src/trialmatchai/interop/exporters/matching_summary.py +27 -0
  54. trialmatchai-0.2.0/src/trialmatchai/interop/exporters/phenopacket.py +87 -0
  55. trialmatchai-0.2.0/src/trialmatchai/interop/importers/__init__.py +35 -0
  56. trialmatchai-0.2.0/src/trialmatchai/interop/importers/fhir.py +764 -0
  57. trialmatchai-0.2.0/src/trialmatchai/interop/importers/omop.py +417 -0
  58. trialmatchai-0.2.0/src/trialmatchai/interop/importers/phenopacket.py +299 -0
  59. trialmatchai-0.2.0/src/trialmatchai/interop/importers/text.py +116 -0
  60. trialmatchai-0.2.0/src/trialmatchai/interop/models.py +148 -0
  61. trialmatchai-0.2.0/src/trialmatchai/interop/narrative.py +108 -0
  62. trialmatchai-0.2.0/src/trialmatchai/interop/utils.py +252 -0
  63. trialmatchai-0.2.0/src/trialmatchai/main.py +644 -0
  64. trialmatchai-0.2.0/src/trialmatchai/matching/__init__.py +0 -0
  65. trialmatchai-0.2.0/src/trialmatchai/matching/eligibility_base.py +278 -0
  66. trialmatchai-0.2.0/src/trialmatchai/matching/eligibility_reasoning_transformers.py +123 -0
  67. trialmatchai-0.2.0/src/trialmatchai/matching/eligibility_reasoning_vllm.py +348 -0
  68. trialmatchai-0.2.0/src/trialmatchai/matching/query_expansion.py +210 -0
  69. trialmatchai-0.2.0/src/trialmatchai/matching/retrieval/__init__.py +0 -0
  70. trialmatchai-0.2.0/src/trialmatchai/matching/retrieval/criteria_retrieval.py +299 -0
  71. trialmatchai-0.2.0/src/trialmatchai/matching/retrieval/first_level_planner.py +399 -0
  72. trialmatchai-0.2.0/src/trialmatchai/matching/retrieval/location.py +60 -0
  73. trialmatchai-0.2.0/src/trialmatchai/matching/retrieval/synonyms.py +34 -0
  74. trialmatchai-0.2.0/src/trialmatchai/matching/retrieval/trial_retrieval.py +282 -0
  75. trialmatchai-0.2.0/src/trialmatchai/matching/trial_ranker.py +119 -0
  76. trialmatchai-0.2.0/src/trialmatchai/models/__init__.py +0 -0
  77. trialmatchai-0.2.0/src/trialmatchai/models/embedding/__init__.py +8 -0
  78. trialmatchai-0.2.0/src/trialmatchai/models/embedding/text_embedder.py +181 -0
  79. trialmatchai-0.2.0/src/trialmatchai/models/llm/__init__.py +0 -0
  80. trialmatchai-0.2.0/src/trialmatchai/models/llm/llm_reranker.py +141 -0
  81. trialmatchai-0.2.0/src/trialmatchai/models/llm/transformers_reranker.py +101 -0
  82. trialmatchai-0.2.0/src/trialmatchai/models/llm/vllm_loader.py +308 -0
  83. trialmatchai-0.2.0/src/trialmatchai/orchestration.py +539 -0
  84. trialmatchai-0.2.0/src/trialmatchai/pipeline.py +211 -0
  85. trialmatchai-0.2.0/src/trialmatchai/registry/__init__.py +13 -0
  86. trialmatchai-0.2.0/src/trialmatchai/registry/clinicaltrials_gov.py +187 -0
  87. trialmatchai-0.2.0/src/trialmatchai/registry/criteria_chunking.py +229 -0
  88. trialmatchai-0.2.0/src/trialmatchai/registry/defaults.py +20 -0
  89. trialmatchai-0.2.0/src/trialmatchai/registry/manifest.py +55 -0
  90. trialmatchai-0.2.0/src/trialmatchai/registry/normalization.py +140 -0
  91. trialmatchai-0.2.0/src/trialmatchai/registry/preparation.py +305 -0
  92. trialmatchai-0.2.0/src/trialmatchai/registry/updater.py +250 -0
  93. trialmatchai-0.2.0/src/trialmatchai/schemas/__init__.py +3 -0
  94. trialmatchai-0.2.0/src/trialmatchai/schemas/phenopacket.py +22 -0
  95. trialmatchai-0.2.0/src/trialmatchai/search/__init__.py +15 -0
  96. trialmatchai-0.2.0/src/trialmatchai/search/lancedb_backend.py +855 -0
  97. trialmatchai-0.2.0/src/trialmatchai/services/__init__.py +0 -0
  98. trialmatchai-0.2.0/src/trialmatchai/services/preflight.py +292 -0
  99. trialmatchai-0.2.0/src/trialmatchai/trec/__init__.py +17 -0
  100. trialmatchai-0.2.0/src/trialmatchai/trec/corpus.py +71 -0
  101. trialmatchai-0.2.0/src/trialmatchai/trec/metrics.py +102 -0
  102. trialmatchai-0.2.0/src/trialmatchai/trec/qrels.py +223 -0
  103. trialmatchai-0.2.0/src/trialmatchai/trec/runner.py +154 -0
  104. trialmatchai-0.2.0/src/trialmatchai/trec/topics.py +275 -0
  105. trialmatchai-0.2.0/src/trialmatchai/utils/__init__.py +0 -0
  106. trialmatchai-0.2.0/src/trialmatchai/utils/file_utils.py +78 -0
  107. trialmatchai-0.2.0/src/trialmatchai/utils/json_utils.py +35 -0
  108. trialmatchai-0.2.0/src/trialmatchai/utils/logging_config.py +74 -0
  109. trialmatchai-0.2.0/src/trialmatchai/utils/temporal_utils.py +43 -0
  110. trialmatchai-0.2.0/src/trialmatchai/utils/text.py +21 -0
  111. trialmatchai-0.2.0/src/trialmatchai/utils/timing.py +15 -0
  112. trialmatchai-0.2.0/src/trialmatchai.egg-info/PKG-INFO +474 -0
  113. trialmatchai-0.2.0/src/trialmatchai.egg-info/SOURCES.txt +150 -0
  114. trialmatchai-0.2.0/src/trialmatchai.egg-info/dependency_links.txt +1 -0
  115. trialmatchai-0.2.0/src/trialmatchai.egg-info/entry_points.txt +2 -0
  116. trialmatchai-0.2.0/src/trialmatchai.egg-info/requires.txt +44 -0
  117. trialmatchai-0.2.0/src/trialmatchai.egg-info/top_level.txt +1 -0
  118. trialmatchai-0.2.0/tests/test_bootstrap_data.py +124 -0
  119. trialmatchai-0.2.0/tests/test_concept_sources.py +51 -0
  120. trialmatchai-0.2.0/tests/test_config_pytest.py +39 -0
  121. trialmatchai-0.2.0/tests/test_constraints.py +317 -0
  122. trialmatchai-0.2.0/tests/test_criteria_chunking.py +105 -0
  123. trialmatchai-0.2.0/tests/test_deployment_readiness.py +112 -0
  124. trialmatchai-0.2.0/tests/test_embedding.py +22 -0
  125. trialmatchai-0.2.0/tests/test_entities.py +247 -0
  126. trialmatchai-0.2.0/tests/test_fhir_robustness.py +211 -0
  127. trialmatchai-0.2.0/tests/test_file_utils_pytest.py +15 -0
  128. trialmatchai-0.2.0/tests/test_finetuning.py +220 -0
  129. trialmatchai-0.2.0/tests/test_first_level_planner.py +327 -0
  130. trialmatchai-0.2.0/tests/test_first_level_search_pytest.py +42 -0
  131. trialmatchai-0.2.0/tests/test_import_patient_cli.py +37 -0
  132. trialmatchai-0.2.0/tests/test_lancedb_search_backend.py +183 -0
  133. trialmatchai-0.2.0/tests/test_location_filter.py +165 -0
  134. trialmatchai-0.2.0/tests/test_logging.py +34 -0
  135. trialmatchai-0.2.0/tests/test_logging_pytest.py +27 -0
  136. trialmatchai-0.2.0/tests/test_metrics.py +57 -0
  137. trialmatchai-0.2.0/tests/test_package_imports.py +43 -0
  138. trialmatchai-0.2.0/tests/test_patient_interop.py +347 -0
  139. trialmatchai-0.2.0/tests/test_patient_runtime_loading.py +112 -0
  140. trialmatchai-0.2.0/tests/test_pipeline.py +96 -0
  141. trialmatchai-0.2.0/tests/test_preflight_and_indexer.py +222 -0
  142. trialmatchai-0.2.0/tests/test_qrels_eval.py +53 -0
  143. trialmatchai-0.2.0/tests/test_registry_updater.py +347 -0
  144. trialmatchai-0.2.0/tests/test_schemas.py +30 -0
  145. trialmatchai-0.2.0/tests/test_search_queries_pytest.py +55 -0
  146. trialmatchai-0.2.0/tests/test_second_level_search_pytest.py +197 -0
  147. trialmatchai-0.2.0/tests/test_settings.py +88 -0
  148. trialmatchai-0.2.0/tests/test_trec_reproduction.py +46 -0
  149. trialmatchai-0.2.0/tests/test_trial_ranker_pytest.py +68 -0
  150. trialmatchai-0.2.0/tests/test_trial_ranker_scoring_contract.py +84 -0
  151. trialmatchai-0.2.0/tests/test_trial_ranker_tiebreak.py +47 -0
  152. trialmatchai-0.2.0/tests/test_variant_recognizer.py +51 -0
@@ -0,0 +1,31 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024
4
+ Majd Abdallah (1,2) (abdallahmajd7@gmail.com)
5
+ Macha Nikolski (1,2) (macha.nikolski@u-bordeaux.fr)
6
+ Mikaël Georges (1,2) (mikael.georges@u-bordeaux.fr)
7
+
8
+ (1) CBiB - University of Bordeaux,
9
+ 146, rue Leo Saignat, 33076 Bordeaux, France
10
+
11
+ (2) CNRS, IBGC - University of Bordeaux,
12
+ 1, rue Camille Saint-Saens, 33077 Bordeaux, France
13
+
14
+
15
+ Permission is hereby granted, free of charge, to any person obtaining a copy
16
+ of this software and associated documentation files (the "Software"), to deal
17
+ in the Software without restriction, including without limitation the rights
18
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19
+ copies of the Software, and to permit persons to whom the Software is
20
+ furnished to do so, subject to the following conditions:
21
+
22
+ The above copyright notice and this permission notice shall be included in all
23
+ copies or substantial portions of the Software.
24
+
25
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31
+ SOFTWARE.
@@ -0,0 +1,474 @@
1
+ Metadata-Version: 2.4
2
+ Name: trialmatchai
3
+ Version: 0.2.0
4
+ Summary: AI-driven patient-to-clinical-trial matching: hybrid retrieval + LLM eligibility reasoning.
5
+ Author-email: Majd Abdallah <abdallahmajd7@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/cbib/TrialMatchAI
8
+ Project-URL: Repository, https://github.com/cbib/TrialMatchAI
9
+ Project-URL: Issues, https://github.com/cbib/TrialMatchAI/issues
10
+ Project-URL: Paper, https://doi.org/10.1038/s41467-026-70509-w
11
+ Keywords: clinical-trials,patient-trial-matching,biomedical-nlp,information-retrieval,large-language-models,eligibility,healthcare
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Healthcare Industry
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Operating System :: POSIX :: Linux
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: <3.12,>=3.11
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: numpy<3,>=2.0
24
+ Requires-Dist: pandas==2.2.3
25
+ Requires-Dist: python-dateutil==2.9.0.post0
26
+ Requires-Dist: requests==2.34.2
27
+ Requires-Dist: tenacity==9.0.0
28
+ Requires-Dist: tqdm==4.67.1
29
+ Requires-Dist: PyYAML==6.0.2
30
+ Requires-Dist: lancedb<0.26,>=0.25.0
31
+ Requires-Dist: pyarrow<25,>=24.0.0
32
+ Requires-Dist: pydantic<3,>=2.12.0
33
+ Provides-Extra: gpu
34
+ Requires-Dist: bitsandbytes==0.49.2; (sys_platform == "linux" or sys_platform == "win32") and extra == "gpu"
35
+ Requires-Dist: vllm==0.23.0; sys_platform == "linux" and extra == "gpu"
36
+ Provides-Extra: llm
37
+ Requires-Dist: torch==2.11.0; extra == "llm"
38
+ Requires-Dist: transformers==5.6.2; extra == "llm"
39
+ Requires-Dist: accelerate==1.8.1; extra == "llm"
40
+ Requires-Dist: tokenizers==0.22.1; extra == "llm"
41
+ Requires-Dist: safetensors<0.7,>=0.6.2; extra == "llm"
42
+ Requires-Dist: sentencepiece==0.2.1; extra == "llm"
43
+ Requires-Dist: peft==0.15.2; extra == "llm"
44
+ Requires-Dist: einops==0.8.0; extra == "llm"
45
+ Provides-Extra: entity
46
+ Requires-Dist: torch==2.11.0; extra == "entity"
47
+ Requires-Dist: transformers==5.6.2; extra == "entity"
48
+ Requires-Dist: gliner2<2,>=1.3.1; extra == "entity"
49
+ Provides-Extra: finetune
50
+ Requires-Dist: torch==2.11.0; extra == "finetune"
51
+ Requires-Dist: transformers==5.6.2; extra == "finetune"
52
+ Requires-Dist: accelerate==1.8.1; extra == "finetune"
53
+ Requires-Dist: peft==0.15.2; extra == "finetune"
54
+ Requires-Dist: datasets<5,>=3.0; extra == "finetune"
55
+ Requires-Dist: gliner2<2,>=1.3.1; extra == "finetune"
56
+ Requires-Dist: bitsandbytes==0.49.2; (sys_platform == "linux" or sys_platform == "win32") and extra == "finetune"
57
+ Dynamic: license-file
58
+
59
+ <div align="center">
60
+
61
+ <img src="img/logo.png" alt="TrialMatchAI" width="480"/>
62
+
63
+ <p><b>AI-driven clinical trial matching.</b> Import a patient — text, FHIR, Phenopacket, or OMOP — and get ranked, eligible trials with criterion-level eligibility explanations. Local LanceDB search + vLLM reasoning on a single GPU server; no Elasticsearch or hosted vector database to run.</p>
64
+
65
+ <p>
66
+ <a href="#install">Install</a> ·
67
+ <a href="#quickstart">Quickstart</a> ·
68
+ <a href="#how-it-works">How it works</a> ·
69
+ <a href="#configuration">Configuration</a> ·
70
+ <a href="#cli-reference">CLI</a>
71
+ </p>
72
+
73
+ </div>
74
+
75
+ > **⚕️ For research and informational use only.** TrialMatchAI is not medical
76
+ > advice, not a medical device, and must not replace review by qualified
77
+ > healthcare professionals.
78
+
79
+ ## TL;DR
80
+
81
+ TrialMatchAI runs in **two halves**: **build the system once**, then **match
82
+ patients many times**. Both commands are idempotent and resume after interruption.
83
+
84
+ ```bash
85
+ uv sync --extra llm --extra gpu --extra entity # GPU host + HuggingFace access
86
+ uv run trialmatchai bootstrap-data # fetch prepared corpus + adapters
87
+ uv run trialmatchai build # 1) BUILD: prepare + index (once)
88
+ uv run trialmatchai e2e --input patient.txt # 2) MATCH: ingest + match a patient
89
+ # -> results/<patient_id>/ranked_trials.json
90
+ ```
91
+
92
+ ## Requirements
93
+
94
+ - Python 3.11 (`pyproject.toml` requires `>=3.11,<3.12`)
95
+ - `uv` recommended, or `pip` with an editable install
96
+ - NVIDIA GPU for vLLM-backed matching and fine-tuning
97
+ - Around 100 GB disk for datasets, model artifacts, LanceDB tables, manifests,
98
+ and run outputs
99
+ - OMOP vocabulary files if you want to build the concept-linking table locally
100
+
101
+ ## Install
102
+
103
+ Clone the repository and install from the project root:
104
+
105
+ ```bash
106
+ git clone https://github.com/cbib/TrialMatchAI.git
107
+ cd TrialMatchAI
108
+ ```
109
+
110
+ Base install with `uv` gives the package and CLI entry points without the heavy
111
+ model stack:
112
+
113
+ ```bash
114
+ uv sync
115
+ uv run trialmatchai --help
116
+ ```
117
+
118
+ Install the full model-backed runtime:
119
+
120
+ ```bash
121
+ uv sync --extra llm --extra gpu --extra entity
122
+ ```
123
+
124
+ Install the fine-tuning stack:
125
+
126
+ ```bash
127
+ uv sync --extra finetune
128
+ ```
129
+
130
+ Editable install with `pip` is also supported:
131
+
132
+ ```bash
133
+ python3.11 -m venv .venv
134
+ source .venv/bin/activate
135
+ pip install -e .
136
+ ```
137
+
138
+ Optional extras with `pip`:
139
+
140
+ ```bash
141
+ pip install -e ".[entity]"
142
+ pip install -e ".[llm,entity]"
143
+ pip install -e ".[llm,gpu,entity]"
144
+ pip install -e ".[finetune]"
145
+ ```
146
+
147
+ | Extra | Adds |
148
+ | --- | --- |
149
+ | `entity` | GLiNER2 biomedical extraction |
150
+ | `llm` | local embedding and LLM dependencies |
151
+ | `gpu` | vLLM and bitsandbytes; intended for Linux CUDA hosts |
152
+ | `finetune` | training dependencies for `trialmatchai finetune` |
153
+
154
+ Installing the package only gives you the CLI. Real matching also needs the
155
+ trial corpus, model artifacts, and LanceDB search tables — all produced by the
156
+ **build** step below.
157
+
158
+ ## Quickstart
159
+
160
+ The pipeline has two halves. **Build** is the heavy, one-time setup (GPU); it is
161
+ resumable and only does work that is not already done. **Match** is fast and
162
+ repeatable against the built system.
163
+
164
+ ### 0. Set up the runtime (GPU host)
165
+
166
+ ```bash
167
+ uv sync --extra llm --extra gpu --extra entity # model-backed runtime
168
+ cp .env.example .env # optional local overrides
169
+ export HF_TOKEN=<token> # required for gated models (phi-4, gemma-2)
170
+ ```
171
+
172
+ ### 1. Build the system — once
173
+
174
+ ```bash
175
+ uv run trialmatchai bootstrap-data # download the prepared corpus + LoRA adapters
176
+ uv run trialmatchai build # prepare embeddings/entities + build the index
177
+ uv run trialmatchai build --status # see exactly what is built (and what isn't)
178
+ ```
179
+
180
+ `build` fails fast if a GPU, an extra, or model access is missing — and resumes
181
+ from where it left off if interrupted. Bringing your **own** trials instead of
182
+ bootstrapping? Put normalized JSON in `data/trials_jsons/` and `build` will
183
+ prepare them. To enable entity→concept linking, add `--concepts` (open
184
+ vocabularies, **auto-downloaded**) — and optionally an OMOP `CONCEPT.csv` for
185
+ SNOMED/LOINC/RxNorm on top:
186
+
187
+ ```bash
188
+ uv run trialmatchai build --concepts # genes, diseases, chemicals, cells, phenotypes
189
+ uv run trialmatchai build --concepts --concepts-csv data/omop/CONCEPT.csv --synonym-csv data/omop/CONCEPT_SYNONYM.csv
190
+ ```
191
+
192
+ #### What gets fetched, and how
193
+
194
+ | Resource | How you get it | Automatic? |
195
+ | --- | --- | --- |
196
+ | Trial corpus (`processed_trials` + criteria) | `trialmatchai bootstrap-data` (Zenodo) | ✅ automatic |
197
+ | Fine-tuned LoRA adapters (CoT + reranker) | `trialmatchai bootstrap-data` (Zenodo) | ✅ automatic |
198
+ | Fine-tuning datasets (only if you re-train) | `trialmatchai bootstrap-data --finetune-data` (Zenodo) | ✅ automatic (opt-in) |
199
+ | Embedder (`BAAI/bge-m3`) | downloaded from HuggingFace on first use | ✅ automatic |
200
+ | Concept-linking vocabularies (genes, diseases, …) | `trialmatchai build --concepts` | ✅ automatic |
201
+ | Base LLMs (`microsoft/phi-4`, `google/gemma-2-2b-it`) | HuggingFace on first use | ⚠️ automatic, but **gated** models need a **one-time** `hf auth login` + accepting the model licence |
202
+ | OMOP clinical vocab (SNOMED/LOINC/RxNorm) | download `CONCEPT.csv` from [OHDSI Athena](https://athena.ohdsi.org/) | ❌ manual (licensed); linking works without it |
203
+
204
+ So a from-scratch user runs **two commands** (`bootstrap-data`, then `build --concepts`) after a one-time `hf auth login`. Everything else is pulled on demand.
205
+
206
+ ### 2. Match patients — repeatably
207
+
208
+ `e2e` ingests the patient (format auto-detected) and matches in one command:
209
+
210
+ ```bash
211
+ uv run trialmatchai e2e --input data/patients/raw/patient-1.txt
212
+ uv run trialmatchai e2e --input data/patients/raw/patient-1.fhir.json
213
+ uv run trialmatchai e2e --input data/patients/omop_extract
214
+ ```
215
+
216
+ Results land in `results/<patient_id>/` (ranked trials + eligibility
217
+ explanations). Re-running skips patients already matched.
218
+
219
+ ### Health and keeping trials current
220
+
221
+ ```bash
222
+ uv run trialmatchai healthcheck # validate config/paths/deps
223
+ ```
224
+
225
+ Fold new/changed ClinicalTrials.gov studies into the **live index** — fetch →
226
+ embed + entity-annotate → upsert, incremental and idempotent (unchanged studies
227
+ are skipped via a manifest, so it is safe to re-run):
228
+
229
+ ```bash
230
+ uv run trialmatchai update-registry --since 2026-06-01 # one-shot
231
+ uv run trialmatchai update-registry --watch --interval 86400 # server: update daily
232
+ ```
233
+
234
+ To schedule one-shot runs, you can also drive `update-registry` from cron, a systemd
235
+ timer, or GitHub Actions — see [docs/registry-updater.md](docs/registry-updater.md).
236
+
237
+ <details>
238
+ <summary>Manual / advanced control (the steps <code>build</code> and <code>e2e</code> wrap)</summary>
239
+
240
+ ```bash
241
+ uv run trialmatchai index --prepare # prepare + index from trials_jsons (what `build` runs)
242
+ uv run trialmatchai import-patient --input patient.txt # stage a profile only
243
+ uv run trialmatchai run # match already-staged profiles
244
+ uv run trialmatchai trec --tracks "21 22" # benchmark: official TREC CT eval
245
+ ```
246
+
247
+ </details>
248
+
249
+ ## How It Works
250
+
251
+ The diagram below is the **match** path. The one-time **build** step produces the
252
+ LanceDB index it queries — trial and criterion embeddings, entity annotations,
253
+ and parsed eligibility constraints.
254
+
255
+ ```text
256
+ Patient data (text / FHIR / Phenopacket / OMOP)
257
+ |
258
+ v
259
+ Interop importers -> canonical PatientProfile
260
+ |
261
+ v
262
+ GLiNER2 entity extraction + deterministic variant patterns -> concept linking
263
+ |
264
+ v
265
+ First-level trial retrieval in LanceDB (BM25 + embeddings)
266
+ |
267
+ v
268
+ Multi-channel query fusion for broad candidate recall
269
+ |
270
+ v
271
+ Criterion retrieval + vLLM Yes/No reranker
272
+ |
273
+ v
274
+ Constraint-aware criterion scoring
275
+ |
276
+ v
277
+ vLLM eligibility reasoning per criterion
278
+ |
279
+ v
280
+ Final ranking + explanations in results/
281
+ ```
282
+
283
+ The generative LLM stages (the reranker and eligibility reasoning) run on vLLM.
284
+ LoRA adapters are served natively through vLLM. NER, reranker, and eligibility
285
+ reasoning are configurable and fine-tunable.
286
+
287
+ ## Data and storage
288
+
289
+ Everything is **embedded LanceDB** — no external services. There are two databases: a search DB
290
+ (`data/search`, with `trials` + `criteria` tables) and a concept-linking DB
291
+ (`data/concepts`). ClinicalTrials.gov records are normalized to
292
+ `data/trials_jsons/<NCT_ID>.json`, then prepared into one trial row and one
293
+ criteria row per eligibility criterion (text + embeddings + entity annotations +
294
+ parsed constraints). Both tables carry full-text and vector columns, so retrieval
295
+ runs in `bm25`, `vector`, or `hybrid` mode. Imported patients live under
296
+ `data/patients/{profiles,summaries}/`.
297
+
298
+ ## Patient Inputs
299
+
300
+ The importer supports:
301
+
302
+ - free-text notes: `.txt` and `.md`
303
+ - GA4GH Phenopacket JSON
304
+ - HL7 FHIR R4 Bundle JSON, individual FHIR resource JSON, NDJSON, and JSONL
305
+ - OMOP CDM extract folders with CSV or Parquet tables
306
+
307
+ Importers preserve provenance and unsupported source elements where possible.
308
+ The matching summary is rendered deterministically from the canonical
309
+ `PatientProfile`; raw patient files are not consumed directly by
310
+ `trialmatchai run`.
311
+
312
+ See [docs/interoperability.md](docs/interoperability.md) for format details.
313
+
314
+ ## Learn more
315
+
316
+ Deeper guides live in the **[documentation site](https://cbib.github.io/TrialMatchAI/)**:
317
+
318
+ - **[Pipeline &amp; CLI](https://cbib.github.io/TrialMatchAI/pipeline/)** — the stage registry, `--only/--skip/--from/--to/--force`, ablation, and presets.
319
+ - **[Architecture](https://cbib.github.io/TrialMatchAI/architecture/)** — multi-channel first-level retrieval, constraint-aware ranking, and the LanceDB tables.
320
+ - **[Patient interoperability](https://cbib.github.io/TrialMatchAI/interoperability/)** — text / FHIR / Phenopacket / OMOP importers.
321
+ - **[Fine-tuning &amp; custom models](https://cbib.github.io/TrialMatchAI/finetuning/)** — swap the NER, reranker, and CoT models; training-data formats.
322
+ - **[Registry updater](https://cbib.github.io/TrialMatchAI/registry-updater/)** — keep trials current from ClinicalTrials.gov.
323
+ - **[API reference](https://cbib.github.io/TrialMatchAI/api/)** — the Python API.
324
+
325
+ To bring your own models, point `entity_extraction.model_name`,
326
+ `model.reranker_adapter_path`, and `model.cot_adapter_path` at your checkpoints /
327
+ adapters, or train them with `trialmatchai finetune {cot,reranker,ner}` — see the
328
+ [fine-tuning guide](https://cbib.github.io/TrialMatchAI/finetuning/).
329
+
330
+ ## Configuration
331
+
332
+ Defaults live in `src/trialmatchai/config/config.json`. Runtime overrides can be
333
+ set in `.env` or as environment variables.
334
+
335
+ Common overrides:
336
+
337
+ ```bash
338
+ TRIALMATCHAI_OUTPUT_DIR=results
339
+ TRIALMATCHAI_TRIALS_JSON_FOLDER=data/trials_jsons
340
+ TRIALMATCHAI_SEARCH_DB_PATH=data/search
341
+ TRIALMATCHAI_SEARCH_MODE=hybrid
342
+ TRIALMATCHAI_FIRST_LEVEL_MAX_TRIALS=1000
343
+ TRIALMATCHAI_FIRST_LEVEL_PER_CHANNEL_SIZE=300
344
+ TRIALMATCHAI_FIRST_LEVEL_VECTOR_SCORE_THRESHOLD=0.0
345
+ TRIALMATCHAI_FIRST_LEVEL_LLM_EXPANSION_ENABLED=false
346
+ TRIALMATCHAI_ENTITY_BACKEND=gliner2
347
+ TRIALMATCHAI_ENTITY_SCHEMA_PATH=entity_schemas/trialmatchai.yaml
348
+ TRIALMATCHAI_CONCEPT_DB_PATH=data/concepts
349
+ TRIALMATCHAI_LINK_ACCEPT=0.80
350
+ TRIALMATCHAI_LINK_REJECT=0.30
351
+ TRIALMATCHAI_CONSTRAINTS_ENABLED=true
352
+ TRIALMATCHAI_CONSTRAINTS_SCORE_WEIGHT=0.25
353
+ TRIALMATCHAI_CONSTRAINTS_LLM_EXTRACTION_ENABLED=false
354
+ TRIALMATCHAI_CONSTRAINTS_WRITE_REPORTS=true
355
+ TRIALMATCHAI_MODEL_TRUST_REMOTE_CODE=false
356
+ TRIALMATCHAI_LOG_JSON=1
357
+ ```
358
+
359
+ The full override list is in [`.env.example`](.env.example).
360
+
361
+ ## CLI Reference
362
+
363
+ There is a single entry point — `trialmatchai` — and every capability is a
364
+ subcommand. Under the hood they are all slices of **one idempotent pipeline**.
365
+
366
+ **The unified pipeline (run any subset)**
367
+
368
+ | Command | Purpose |
369
+ | --- | --- |
370
+ | `trialmatchai pipeline` | Run the whole pipeline, or any slice: `--only` / `--from` / `--to` / `--skip` / `--force` over the stages `prepare → concepts → index → ingest → expand → match → eval`. Idempotent — finished work is skipped. See [docs](https://cbib.github.io/TrialMatchAI/pipeline/). |
371
+
372
+ The commands below are convenience presets over that pipeline.
373
+
374
+ **Build the system (setup half)**
375
+
376
+ | Command | Purpose |
377
+ | --- | --- |
378
+ | `trialmatchai build` | Prepare the corpus (embeddings + entities) and build the search index — resumable, with `--status` |
379
+ | `trialmatchai bootstrap-data` | Download and extract the prepared corpus + model adapters |
380
+ | `trialmatchai build-concepts` | Build the LanceDB concept table for entity normalization (optional, OMOP) |
381
+ | `trialmatchai update-registry` | Fetch changed ClinicalTrials.gov studies and upsert LanceDB |
382
+
383
+ **Match patients (run half)**
384
+
385
+ | Command | Purpose |
386
+ | --- | --- |
387
+ | `trialmatchai e2e` | Ingest a patient and match end-to-end (idempotent, per-patient resume) |
388
+ | `trialmatchai import-patient` | Import text, FHIR, Phenopacket, or OMOP patient data into a profile |
389
+ | `trialmatchai run` | Match already-staged patient profiles |
390
+ | `trialmatchai trec` | Benchmark: end-to-end evaluation on the official TREC Clinical Trials tracks |
391
+
392
+ **Utility**
393
+
394
+ | Command | Purpose |
395
+ | --- | --- |
396
+ | `trialmatchai healthcheck` | Validate config, paths, optional model deps, and LanceDB tables |
397
+ | `trialmatchai index` | Lower-level prepare/index of trial and criteria search tables |
398
+ | `trialmatchai finetune` | Fine-tune NER, reranker, or eligibility reasoning models |
399
+
400
+ ```bash
401
+ uv run trialmatchai build --status # what is built
402
+ uv run python -m trialmatchai e2e --input patient.txt
403
+ ```
404
+
405
+ ## Deployment
406
+
407
+ The supported deployment is a single Python 3.11 GPU server or VM. Search tables
408
+ are local LanceDB files under `data/search`, and concept linking uses a separate
409
+ LanceDB database under `data/concepts`.
410
+
411
+ The registry updater is designed for cron, systemd timers, or GitHub Actions.
412
+ See [docs/registry-updater.md](docs/registry-updater.md).
413
+
414
+ ## Development
415
+
416
+ ```bash
417
+ uv sync
418
+ uv run ruff check .
419
+ uv run pytest
420
+ uv run pre-commit run --all-files # ruff + gitleaks secret scan + hygiene
421
+ uv run pip-audit --progress-spinner off --ignore-vuln CVE-2025-3000
422
+ ```
423
+
424
+ Install the git hooks once so secret scanning and linting run on every commit:
425
+
426
+ ```bash
427
+ uv run pre-commit install
428
+ ```
429
+
430
+ ## Security
431
+
432
+ Never commit real credentials, private keys, datasets, models, local LanceDB
433
+ data, run manifests, or results. Keep runtime values local:
434
+
435
+ ```bash
436
+ cp .env.example .env
437
+ ```
438
+
439
+ Artifact bootstrap supports optional SHA-256 verification through these environment variables:
440
+
441
+ - `TRIALMATCHAI_PROCESSED_TRIALS_SHA256`
442
+ - `TRIALMATCHAI_MODELS_SHA256`
443
+ - `TRIALMATCHAI_CRITERIA_PART_<N>_SHA256`
444
+
445
+ Dependency auditing currently ignores `CVE-2025-3000` because vLLM 0.23 pins
446
+ Torch 2.11.0 and the advisory lists no fixed Torch version. Revisit this when
447
+ upgrading vLLM or Torch.
448
+
449
+ ## Citation
450
+
451
+ If you use TrialMatchAI in your research, please cite the Nature Communications
452
+ paper:
453
+
454
+ > Abdallah, M. _et al._ TrialMatchAI: an end-to-end AI-powered clinical trial
455
+ > recommendation system to streamline patient-to-trial matching. _Nature
456
+ > Communications_ **17**, 4472 (2026). <https://doi.org/10.1038/s41467-026-70509-w>
457
+
458
+ ```bibtex
459
+ @article{abdallah2026trialmatchai,
460
+ title = {TrialMatchAI: an end-to-end AI-powered clinical trial recommendation system to streamline patient-to-trial matching},
461
+ author = {Abdallah, Majd and Nakken, Sigve and Georges, Mikael and Bierkens, Mariska and Galvis, Johanna and Groppi, Alexis and Karkar, Slim and Meiqari, Lana and Rujano, Maria Alexandra and Canham, Steve and Dienstmann, Rodrigo and Fijneman, Remond and Hovig, Eivind and Meijer, Gerrit and Nikolski, Macha},
462
+ journal = {Nature Communications},
463
+ volume = {17},
464
+ pages = {4472},
465
+ year = {2026},
466
+ doi = {10.1038/s41467-026-70509-w},
467
+ url = {https://doi.org/10.1038/s41467-026-70509-w}
468
+ }
469
+ ```
470
+
471
+ ## Support
472
+
473
+ - Email: abdallahmajd7@gmail.com
474
+ - Software archive (DOI): <https://doi.org/10.5281/zenodo.18329084>