syntha-ehr 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. syntha_ehr-0.5.0/LICENSE +17 -0
  2. syntha_ehr-0.5.0/PKG-INFO +526 -0
  3. syntha_ehr-0.5.0/README.md +503 -0
  4. syntha_ehr-0.5.0/pyproject.toml +63 -0
  5. syntha_ehr-0.5.0/setup.cfg +4 -0
  6. syntha_ehr-0.5.0/src/syntha/__init__.py +14 -0
  7. syntha_ehr-0.5.0/src/syntha/cli.py +256 -0
  8. syntha_ehr-0.5.0/src/syntha/conditional.py +176 -0
  9. syntha_ehr-0.5.0/src/syntha/data.py +30 -0
  10. syntha_ehr-0.5.0/src/syntha/export_model.py +50 -0
  11. syntha_ehr-0.5.0/src/syntha/fhir/__init__.py +3 -0
  12. syntha_ehr-0.5.0/src/syntha/fhir/clinical_extras.py +246 -0
  13. syntha_ehr-0.5.0/src/syntha/fhir/codes.py +95 -0
  14. syntha_ehr-0.5.0/src/syntha/fhir/export.py +326 -0
  15. syntha_ehr-0.5.0/src/syntha/fhir/panels.py +104 -0
  16. syntha_ehr-0.5.0/src/syntha/fhir/resources.py +154 -0
  17. syntha_ehr-0.5.0/src/syntha/fhir/rxnorm.py +57 -0
  18. syntha_ehr-0.5.0/src/syntha/generator/__init__.py +4 -0
  19. syntha_ehr-0.5.0/src/syntha/generator/constraints.py +79 -0
  20. syntha_ehr-0.5.0/src/syntha/generator/copula.py +191 -0
  21. syntha_ehr-0.5.0/src/syntha/generator/missingness.py +155 -0
  22. syntha_ehr-0.5.0/src/syntha/generator/mixed_corr.py +229 -0
  23. syntha_ehr-0.5.0/src/syntha/locale/__init__.py +19 -0
  24. syntha_ehr-0.5.0/src/syntha/locale/turkish.py +121 -0
  25. syntha_ehr-0.5.0/src/syntha/longitudinal.py +79 -0
  26. syntha_ehr-0.5.0/src/syntha/longitudinal_labs.py +228 -0
  27. syntha_ehr-0.5.0/src/syntha/models/__init__.py +3 -0
  28. syntha_ehr-0.5.0/src/syntha/models/registry.py +124 -0
  29. syntha_ehr-0.5.0/src/syntha/modules/__init__.py +32 -0
  30. syntha_ehr-0.5.0/src/syntha/modules/asthma_copd.py +45 -0
  31. syntha_ehr-0.5.0/src/syntha/modules/base.py +46 -0
  32. syntha_ehr-0.5.0/src/syntha/modules/depression_anxiety.py +64 -0
  33. syntha_ehr-0.5.0/src/syntha/modules/diabetes.py +53 -0
  34. syntha_ehr-0.5.0/src/syntha/modules/hyperlipidemia.py +32 -0
  35. syntha_ehr-0.5.0/src/syntha/modules/hypertension.py +47 -0
  36. syntha_ehr-0.5.0/src/syntha/modules/ihd.py +28 -0
  37. syntha_ehr-0.5.0/src/syntha/modules/thyroid.py +28 -0
  38. syntha_ehr-0.5.0/src/syntha/pipeline.py +193 -0
  39. syntha_ehr-0.5.0/src/syntha/preprocess.py +36 -0
  40. syntha_ehr-0.5.0/src/syntha/privacy.py +204 -0
  41. syntha_ehr-0.5.0/src/syntha/reference_ranges.py +175 -0
  42. syntha_ehr-0.5.0/src/syntha/schema.py +127 -0
  43. syntha_ehr-0.5.0/src/syntha/server.py +187 -0
  44. syntha_ehr-0.5.0/src/syntha/validate.py +138 -0
  45. syntha_ehr-0.5.0/src/syntha_ehr.egg-info/PKG-INFO +526 -0
  46. syntha_ehr-0.5.0/src/syntha_ehr.egg-info/SOURCES.txt +66 -0
  47. syntha_ehr-0.5.0/src/syntha_ehr.egg-info/dependency_links.txt +1 -0
  48. syntha_ehr-0.5.0/src/syntha_ehr.egg-info/entry_points.txt +2 -0
  49. syntha_ehr-0.5.0/src/syntha_ehr.egg-info/requires.txt +13 -0
  50. syntha_ehr-0.5.0/src/syntha_ehr.egg-info/top_level.txt +1 -0
  51. syntha_ehr-0.5.0/tests/test_clinical_extras.py +81 -0
  52. syntha_ehr-0.5.0/tests/test_conditional.py +85 -0
  53. syntha_ehr-0.5.0/tests/test_constraints.py +49 -0
  54. syntha_ehr-0.5.0/tests/test_copula.py +95 -0
  55. syntha_ehr-0.5.0/tests/test_fhir.py +56 -0
  56. syntha_ehr-0.5.0/tests/test_locale.py +38 -0
  57. syntha_ehr-0.5.0/tests/test_longitudinal.py +55 -0
  58. syntha_ehr-0.5.0/tests/test_longitudinal_labs.py +122 -0
  59. syntha_ehr-0.5.0/tests/test_missingness.py +148 -0
  60. syntha_ehr-0.5.0/tests/test_mixed_corr.py +137 -0
  61. syntha_ehr-0.5.0/tests/test_modules.py +71 -0
  62. syntha_ehr-0.5.0/tests/test_panels.py +69 -0
  63. syntha_ehr-0.5.0/tests/test_pipeline.py +52 -0
  64. syntha_ehr-0.5.0/tests/test_privacy.py +86 -0
  65. syntha_ehr-0.5.0/tests/test_reference_ranges.py +91 -0
  66. syntha_ehr-0.5.0/tests/test_registry.py +69 -0
  67. syntha_ehr-0.5.0/tests/test_server.py +76 -0
  68. syntha_ehr-0.5.0/tests/test_validate.py +61 -0
@@ -0,0 +1,17 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ Copyright 2026 Ariorad Moniri
6
+
7
+ Licensed under the Apache License, Version 2.0 (the "License");
8
+ you may not use this file except in compliance with the License.
9
+ You may obtain a copy of the License at
10
+
11
+ http://www.apache.org/licenses/LICENSE-2.0
12
+
13
+ Unless required by applicable law or agreed to in writing, software
14
+ distributed under the License is distributed on an "AS IS" BASIS,
15
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ See the License for the specific language governing permissions and
17
+ limitations under the License.
@@ -0,0 +1,526 @@
1
+ Metadata-Version: 2.4
2
+ Name: syntha-ehr
3
+ Version: 0.5.0
4
+ Summary: Synthetic patient record generator (Synthea-inspired) trained on pristine-healthy episode data
5
+ Author: Ario Moniri
6
+ License: Apache-2.0
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: pandas>=2.0
11
+ Requires-Dist: numpy>=1.24
12
+ Requires-Dist: scipy>=1.10
13
+ Requires-Dist: scikit-learn>=1.3
14
+ Requires-Dist: click>=8.0
15
+ Requires-Dist: pyyaml>=6.0
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest>=7.0; extra == "dev"
18
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
19
+ Requires-Dist: matplotlib>=3.7; extra == "dev"
20
+ Requires-Dist: ruff>=0.8; extra == "dev"
21
+ Requires-Dist: pre-commit>=4.0; extra == "dev"
22
+ Dynamic: license-file
23
+
24
+ # 🩺 syntha
25
+
26
+ > **A [Synthea](https://github.com/synthetichealth/synthea)-inspired hybrid synthetic patient record generator**
27
+ > — learns the joint distribution of real anonymized Turkish-cohort EHR episodes with a Gaussian copula, then layers Synthea-style clinical pathways on top to emit fully-coded FHIR R4 bundles in Turkish.
28
+
29
+ [![CI](https://github.com/ArioMoniri/syntha/actions/workflows/ci.yml/badge.svg)](https://github.com/ArioMoniri/syntha/actions/workflows/ci.yml)
30
+ [![Cross-platform](https://github.com/ArioMoniri/syntha/actions/workflows/cross-platform.yml/badge.svg)](https://github.com/ArioMoniri/syntha/actions/workflows/cross-platform.yml)
31
+ [![Release](https://github.com/ArioMoniri/syntha/actions/workflows/release.yml/badge.svg)](https://github.com/ArioMoniri/syntha/actions/workflows/release.yml)
32
+ [![Install buttons](https://github.com/ArioMoniri/syntha/actions/workflows/verify-install-buttons.yml/badge.svg)](https://github.com/ArioMoniri/syntha/actions/workflows/verify-install-buttons.yml)
33
+ [![Codecov](https://codecov.io/gh/ArioMoniri/syntha/branch/main/graph/badge.svg)](https://codecov.io/gh/ArioMoniri/syntha)
34
+ [![Latest release](https://img.shields.io/github/v/release/ArioMoniri/syntha?include_prereleases&sort=semver&label=latest&color=2563eb)](https://github.com/ArioMoniri/syntha/releases/latest)
35
+ [![Downloads](https://img.shields.io/github/downloads/ArioMoniri/syntha/total?color=2563eb)](https://github.com/ArioMoniri/syntha/releases)
36
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](LICENSE)
37
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
38
+ [![FHIR R4](https://img.shields.io/badge/FHIR-R4-orange)](https://hl7.org/fhir/R4/)
39
+ [![Locale: tr-TR](https://img.shields.io/badge/locale-tr--TR-red)](#-turkish-cohort--turkish-output)
40
+
41
+ ---
42
+
43
+ ## 🖥️ Desktop app — generate synthetic patients without code
44
+
45
+ <p align="center">
46
+ <a href="https://github.com/ArioMoniri/syntha/releases/latest/download/syntha_aarch64.dmg"><img src="docs/assets/download-macos.png" alt="Download macOS Apple Silicon (.dmg)" height="64"/></a>
47
+ &nbsp;
48
+ <a href="https://github.com/ArioMoniri/syntha/releases/latest/download/syntha_x64-setup.exe"><img src="docs/assets/download-windows.png" alt="Download Windows installer (.exe)" height="64"/></a>
49
+ &nbsp;
50
+ <a href="https://github.com/ArioMoniri/syntha/releases/latest/download/syntha_amd64.AppImage"><img src="docs/assets/download-linux.png" alt="Download Linux AppImage" height="64"/></a>
51
+ </p>
52
+
53
+ <p align="center">
54
+ <sub>A Tauri 2 desktop app that bundles the trained Gaussian copula and samples synthetic patients <b>fully client-side</b> (no Python required). Pick cohort + n + seed + constraints, hit <b>Generate</b>, and download a CSV.</sub>
55
+ </p>
56
+
57
+ > 📦 Installers are produced by the [release workflow](.github/workflows/release.yml) on every `v*` tag push and live at stable filenames (`syntha_aarch64.dmg`, `syntha_x64-setup.exe`, `syntha_amd64.AppImage`). The buttons above all use `releases/latest/download/…` so they **track the latest release automatically** — no manual link maintenance per version. A daily [Install-buttons verification workflow](.github/workflows/verify-install-buttons.yml) HEAD-checks each URL and opens an issue if any of them returns a 404. Source for the app lives in [`app/`](app/).
58
+
59
+ > 🛡️ **macOS sees `"syntha.app" is damaged`?** That's Gatekeeper's misleading error for unsigned apps. Until the signing pipeline ships ([app/README.md → signing setup](app/README.md#macos-code-signing--notarization)), strip the quarantine flag manually:
60
+ > ```bash
61
+ > xattr -dr com.apple.quarantine /Applications/syntha.app
62
+ > ```
63
+
64
+ ---
65
+
66
+ ## 📑 Table of contents
67
+
68
+ - [🔍 Why syntha?](#-why-syntha)
69
+ - [🎯 What it produces](#-what-it-produces)
70
+ - [⚠️ The catch (what it is *not*)](#%EF%B8%8F-the-catch-what-it-is-not)
71
+ - [🇹🇷 Turkish cohort + Turkish output](#-turkish-cohort--turkish-output)
72
+ - [🧪 Use cases](#-use-cases)
73
+ - [🚀 Quick start](#-quick-start)
74
+ - [📊 Distribution fidelity](#-distribution-fidelity)
75
+ - [📦 Example output](#-example-output-embedded)
76
+ - [🌐 FHIR endpoints](#-fhir-endpoints)
77
+ - [🧱 Architecture](#-architecture)
78
+ - [🧬 Synthea-style clinical modules](#-synthea-style-clinical-modules)
79
+ - [🛠️ CLI reference](#%EF%B8%8F-cli-reference)
80
+ - [🗺️ Roadmap](#%EF%B8%8F-roadmap)
81
+ - [🤝 Contributing + clinician curation](#-contributing--clinician-curation-welcome)
82
+ - [📄 License + citation](#-license--citation)
83
+
84
+ ---
85
+
86
+ ## 🔍 Why syntha?
87
+
88
+ Synthea is the gold standard for synthetic FHIR patients, but it is **rules-only** and tuned to US population priors. CTGAN-style purely-generative models capture data faithfully but emit physiologically impossible tuples and have no clinical-pathway awareness. **syntha gives you both:**
89
+
90
+ | | Synthea (rules-only) | CTGAN / copula-only | **syntha (hybrid)** |
91
+ |---|---|---|---|
92
+ | Matches *this cohort's* lab distributions | ❌ generic US priors | ✅ | ✅ |
93
+ | Coherent prescriptions per condition | ✅ | ❌ | ✅ |
94
+ | Physiologically valid (BP, eGFR…) | ✅ | ⚠️ sometimes | ✅ |
95
+ | LOINC + SNOMED + ICD-10 + RxNorm-coded FHIR | ✅ | ❌ | ✅ |
96
+ | Longitudinal trajectories | ✅ state machines | ❌ | ✅ drift + sticky flags |
97
+ | Turkish locale (names, addresses, displays) | ❌ | ❌ | ✅ |
98
+
99
+ ## 🎯 What it produces
100
+
101
+ For each synthetic patient, syntha emits a FHIR R4 *transaction* `Bundle` containing:
102
+
103
+ - 👤 **Patient** — Turkish HumanName + Address + `tr` language code, derived birthDate
104
+ - 🧪 **Observation** × ~12 — LOINC-coded labs and vitals (glucose, lipid panel, CBC, LFTs, eGFR, BP, …)
105
+ - 🩺 **Condition** × N — every active comorbidity flag, **dual-coded SNOMED CT + ICD-10**, with English/Turkish display text
106
+ - 🏥 **Encounter** × M — one per active condition, driven by the relevant clinical module
107
+ - 💊 **MedicationRequest** × P — RxNorm-coded, dosage included
108
+ - 🔬 **Procedure** × Q — e.g. HbA1c, lipid panel, ECG, spirometry
109
+ - 📋 **CarePlan** × R — disease-specific lifestyle / monitoring plans
110
+
111
+ Plus a flat CSV that matches the **input schema** for drop-in use as training data.
112
+
113
+ ## ⚠️ The catch (what it is *not*)
114
+
115
+ - 🚫 **Not** a substitute for real PHI when validity hinges on rare events — the copula reproduces the *bulk* of the joint distribution, not the long tails.
116
+ - 🚫 **Not** privacy-proof. Gaussian copulas are not differentially private; if the source has fewer than ~50 patients with a rare combination, syntha may reproduce that combination too closely. **Do not use** when the source is a small sensitive cohort without adding a DP mechanism.
117
+ - 🚫 **No disease *progression* simulator** yet — the copula gives a cross-sectional snapshot; longitudinal mode adds plausible drift but is not a Synthea-PADM state machine. (See [v0.8 in the roadmap](ROADMAP.md).)
118
+ - 🚫 The source CSVs are **anonymized retrospective Turkish-cohort episodes of healthy patients** — synthetic disease prevalence is *lower* than Turkish national averages (TÜİK). If you need a population-representative Turkish cohort, calibrate per the [`v0.6` roadmap items](ROADMAP.md).
119
+ - ⚠️ **Continuous↔binary correlations are attenuated ~50% in magnitude** (signs are correct since v0.3.2). Pure Spearman rank correlation on tied binary columns is biased toward zero; the proper fix is the polyserial/tetrachoric correlation, queued as [v0.4 in the roadmap](ROADMAP.md). For most downstream uses (training risk models, healthy-control comparisons) this is acceptable; if you need exact lab↔disease correlations, wait for v0.4 or contribute the fix.
120
+
121
+ ## 🇹🇷 Turkish cohort + Turkish output
122
+
123
+ The training data comes from `pristine_strict_episodes.csv` and `pristine_tolerant_episodes.csv` — anonymized retrospective EHR episodes from a Turkish patient cohort selected to represent *clinically pristine* (i.e. healthy / minimally medicated) adults. Source CSVs are **never** committed to this repo (gitignored).
124
+
125
+ Synthetic output is **Turkish-localized**:
126
+
127
+ - Patient names sampled from common Turkish given-name and family-name distributions (`src/syntha/locale/turkish.py`).
128
+ - Addresses use real Turkish cities weighted by approximate population, with ISO 3166-2:TR province codes.
129
+ - Every Condition emits both an English SNOMED display and a clinical-Turkish translation in `Condition.code.text`.
130
+ - Patient.communication is set to `tr`.
131
+
132
+ All clinical terminology used (LOINC, SNOMED CT, ICD-10, RxNorm) comes from **open international standards** — no licensed terminology content is reproduced or embedded.
133
+
134
+ ## 🧪 Use cases
135
+
136
+ | Where to use it | Why |
137
+ |---|---|
138
+ | 🤖 **Training ML risk models** without exposing real PHI | The copula preserves joint distributions, so a model trained on synthetic data transfers reasonably to real test sets (TSTR benchmark in v0.9). |
139
+ | 🧬 **Bioinformatics healthy-control cohorts** | The source is *pristine healthy* episodes — use the synthetic patients as a normal-baseline group to compare against your disease cohort. |
140
+ | 🛠️ **EHR pipeline / ETL integration testing** | Realistic-but-fake FHIR R4 bundles with valid LOINC/SNOMED/ICD-10/RxNorm codes are ideal for testing FHIR consumers, mapping pipelines, and OMOP/i2b2 ETLs without DPA paperwork. |
141
+ | 📚 **Teaching / coursework** | Drop-in dataset for biostatistics, epidemiology, and clinical-informatics teaching without IRB. |
142
+ | 🔬 **Data augmentation** | Boost rare-event coverage by oversampling synthetic patients with specific comorbidity combinations (conditional sampling lands in v0.7). |
143
+
144
+ ## 🚀 Quick start
145
+
146
+ ```bash
147
+ # 1. Install
148
+ git clone https://github.com/ArioMoniri/syntha.git
149
+ cd syntha
150
+ pip install -e .
151
+
152
+ # 2. (Optional) Ingest your source CSVs — files in data/raw/ are gitignored
153
+ bash scripts/ingest_csvs.sh
154
+
155
+ # 3. Generate 1000 synthetic episodes + FHIR bundles + model card + validation report
156
+ syntha generate \
157
+ --input data/raw/pristine_tolerant_episodes.csv \
158
+ --output output/tolerant \
159
+ --n 1000 --cohort tolerant
160
+
161
+ # 4. Longitudinal — 500 baseline patients × ~4 encounters over 3 years
162
+ syntha generate \
163
+ --input data/raw/pristine_tolerant_episodes.csv \
164
+ --output output/tolerant_long \
165
+ --n 2000 --cohort tolerant \
166
+ --longitudinal --encounters-per-patient 4 --years-of-history 3
167
+
168
+ # 5. Validate any synthetic CSV against its source
169
+ syntha validate \
170
+ --source data/raw/pristine_tolerant_episodes.csv \
171
+ --synthetic output/tolerant/synthetic_tolerant_episodes.csv \
172
+ --output output/tolerant/validation.json
173
+ ```
174
+
175
+ ## 📊 Distribution fidelity
176
+
177
+ A 100-episode sample of the `tolerant` cohort vs the full 135 569-row source:
178
+
179
+ ### Marginal distributions
180
+
181
+ ![Marginal distributions — source vs synthetic](docs/figures/distributions.png)
182
+
183
+ ### Spearman correlation structure
184
+
185
+ ![Spearman correlations — source vs synthetic vs diff](docs/figures/correlations.png)
186
+
187
+ ### Disease prevalence
188
+
189
+ ![Comorbidity prevalence — source vs synthetic](docs/figures/prevalence.png)
190
+
191
+ ### Numbers (from `examples/sample_output/sample_validation_report.json`)
192
+
193
+ | Metric | Value |
194
+ |---|---|
195
+ | n (source / synthetic) | 135 569 / 100 |
196
+ | **Max Kolmogorov–Smirnov** across continuous columns | **0.14** |
197
+ | Mean KS | 0.07 |
198
+ | **Max binary-prevalence error** | **0.025** (`has_rx_data`) |
199
+ | Disease-prevalence error (HTN / DM / hyperlipidemia) | 0.015 / 0.004 / 0.010 |
200
+ | Spearman correlation-matrix Frobenius diff | 2.94 |
201
+
202
+ > 📏 The KS statistic is below the typical 0.20 "noticeable difference" threshold for every column (max 0.14); binary marginals (gender, disease prevalence) match to within ~2.5 percentage points (max absolute error 0.025).
203
+
204
+ ## 📦 Example output (embedded)
205
+
206
+ A pretty-printed sample FHIR Bundle, a 100-episode synthetic CSV, the model card, and the validation report all live under [`examples/sample_output/`](examples/sample_output/) and are tracked in git.
207
+
208
+ | File | Click to view (GitHub built-in viewer) | What's inside |
209
+ |---|---|---|
210
+ | 🧾 **Full FHIR Bundle (pretty)** | [`sample_bundle_pretty.json`](examples/sample_output/sample_bundle_pretty.json) | One transaction Bundle: Patient + Observations + Conditions + Encounter + MedicationRequests + Procedure + CarePlan |
211
+ | 📑 **100 bundles, NDJSON** | [`sample_bundles.ndjson`](examples/sample_output/sample_bundles.ndjson) | Bulk-FHIR-style export, one transaction Bundle per line |
212
+ | 📊 **Flat CSV** | [`sample_episodes.csv`](examples/sample_output/sample_episodes.csv) | 100 synthetic episodes matching input schema |
213
+ | 🗒️ **Model card** | [`sample_model_card.json`](examples/sample_output/sample_model_card.json) | source sha256, n_train, marginals, top correlations |
214
+ | ✅ **Validation report** | [`sample_validation_report.json`](examples/sample_output/sample_validation_report.json) | KS / Wasserstein / correlation-Frobenius per column |
215
+
216
+ > 💡 **Embedded viewer.** GitHub renders the linked JSON files with syntax highlighting and a collapsible outline (click the `{}` icon top-right of the file view). For full **FHIR-aware** validation and tree-view rendering, drag the file onto [simplifier.net](https://simplifier.net/) or paste it into the [official HL7 Clinical FHIR Renderer](https://clinical-fhir.github.io/Renderer/).
217
+
218
+ <details>
219
+ <summary>👁️ <b>Inline preview — first synthetic patient (click to expand)</b></summary>
220
+
221
+ ```json
222
+ {
223
+ "resourceType": "Bundle",
224
+ "type": "transaction",
225
+ "timestamp": "2017-05-27T21:49:42Z",
226
+ "entry": [
227
+ {
228
+ "resource": {
229
+ "resourceType": "Patient",
230
+ "id": "20f13c43-d17b-443b-b7a7-69ccc40631c6",
231
+ "gender": "male",
232
+ "name": [{"use": "official", "family": "Avcı", "given": ["Furkan"]}],
233
+ "address": [{
234
+ "use": "home", "type": "physical",
235
+ "city": "İstanbul", "state": "TR-34", "country": "TR"
236
+ }],
237
+ "communication": [{
238
+ "language": {"coding": [{"system": "urn:ietf:bcp:47", "code": "tr", "display": "Turkish"}]},
239
+ "preferred": true
240
+ }],
241
+ "birthDate": "1975-…"
242
+ }
243
+ },
244
+ {
245
+ "resource": {
246
+ "resourceType": "Observation",
247
+ "code": {
248
+ "coding": [{"system": "http://loinc.org", "code": "8480-6",
249
+ "display": "Systolic blood pressure"}]
250
+ },
251
+ "valueQuantity": {"value": 118.72, "unit": "mm[Hg]"}
252
+ }
253
+ },
254
+ {
255
+ "resource": {
256
+ "resourceType": "Condition",
257
+ "code": {
258
+ "coding": [
259
+ {"system": "http://snomed.info/sct", "code": "414545008",
260
+ "display": "Ischemic heart disease (disorder)"},
261
+ {"system": "http://hl7.org/fhir/sid/icd-10", "code": "I25.9",
262
+ "display": "Chronic ischaemic heart disease, unspecified"}
263
+ ],
264
+ "text": "Ischemic heart disease (disorder) / İskemik kalp hastalığı"
265
+ }
266
+ }
267
+ },
268
+ {
269
+ "resource": {
270
+ "resourceType": "MedicationRequest",
271
+ "medicationCodeableConcept": {
272
+ "coding": [{
273
+ "system": "http://www.nlm.nih.gov/research/umls/rxnorm",
274
+ "code": "243670", "display": "Aspirin 81 MG Oral Tablet"
275
+ }]
276
+ },
277
+ "dosageInstruction": [{"text": "81 mg daily"}]
278
+ }
279
+ }
280
+ ]
281
+ }
282
+ ```
283
+
284
+ </details>
285
+
286
+ <details>
287
+ <summary>👁️ <b>Inline preview — first 5 rows of the CSV</b></summary>
288
+
289
+ | RF_EPISODE2 | HASTA_ID | episode_date | gender | age | bp_sys | bp_dia | hdl | ldl | hgb | egfr | Hipertansiyon | DM_Tum |
290
+ |---|---|---|---|---|---|---|---|---|---|---|---|---|
291
+ | 92893619 | SYN_7D70431D | 2017-05-27 | M | 42 | 118.7 | 63.0 | 95.0 | 58.0 | 12.9 | 105.7 | 0 | 0 |
292
+ | … | … | … | … | … | … | … | … | … | … | … | … | … |
293
+
294
+ Full file: [`examples/sample_output/sample_episodes.csv`](examples/sample_output/sample_episodes.csv) (100 rows × 73 cols).
295
+
296
+ </details>
297
+
298
+ <details>
299
+ <summary>👁️ <b>Inline preview — validation report summary</b></summary>
300
+
301
+ ```json
302
+ {
303
+ "n_source": 135569,
304
+ "n_synthetic": 100,
305
+ "ks_max": 0.14,
306
+ "ks_mean": 0.07,
307
+ "binary_max_abs_error": 0.025,
308
+ "correlation_frobenius": 2.94
309
+ }
310
+ ```
311
+
312
+ </details>
313
+
314
+ ## 🌐 FHIR endpoints
315
+
316
+ syntha emits canonical FHIR R4 resources, so every emitted resource type maps to its standard REST endpoint:
317
+
318
+ | Resource type | GET (read) | GET (search) | Create (POST to base) |
319
+ |---|---|---|---|
320
+ | 👤 Patient | `GET /Patient/{id}` | `GET /Patient` | as part of transaction `Bundle` |
321
+ | 🧪 Observation | `GET /Observation/{id}` | `GET /Observation?subject={ref}` | ↑ |
322
+ | 🩺 Condition | `GET /Condition/{id}` | `GET /Condition?patient={id}` | ↑ |
323
+ | 🏥 Encounter | `GET /Encounter/{id}` | `GET /Encounter?patient={id}` | ↑ |
324
+ | 💊 MedicationRequest | `GET /MedicationRequest/{id}` | `GET /MedicationRequest?patient={id}` | ↑ |
325
+ | 🔬 Procedure | `GET /Procedure/{id}` | `GET /Procedure?patient={id}` | ↑ |
326
+ | 📋 CarePlan | `GET /CarePlan/{id}` | `GET /CarePlan?patient={id}` | ↑ |
327
+ | 📦 Bundle | `GET /Bundle/{id}` | — | `POST /` (transaction) |
328
+
329
+ ### Spin up a demo FHIR server locally
330
+
331
+ ```bash
332
+ syntha serve --bundles examples/sample_output/sample_bundles.ndjson --port 8080
333
+ ```
334
+
335
+ Then:
336
+
337
+ ```bash
338
+ curl http://127.0.0.1:8080/metadata # CapabilityStatement
339
+ curl http://127.0.0.1:8080/Patient # searchset Bundle (all Patients)
340
+ curl http://127.0.0.1:8080/Patient/{id} # single Patient
341
+ curl http://127.0.0.1:8080/Observation # all Observations
342
+ curl http://127.0.0.1:8080/\$export # FHIR Bulk Data export (NDJSON)
343
+ ```
344
+
345
+ This is a **read-only demo server** (stdlib `http.server`, no dependencies). For a production-grade FHIR server, POST the bundles to a HAPI / Microsoft FHIR / Google Healthcare API instance — see below.
346
+
347
+ ### POST the bundles to any FHIR R4 server
348
+
349
+ `scripts/post_to_fhir.sh` POSTs every transaction Bundle in an NDJSON file to a configurable FHIR endpoint (default: the public [HAPI test server](https://hapi.fhir.org/baseR4)):
350
+
351
+ ```bash
352
+ # To the public HAPI playground:
353
+ bash scripts/post_to_fhir.sh examples/sample_output/sample_bundles.ndjson
354
+
355
+ # To your own server:
356
+ FHIR_BASE=http://localhost:8080/fhir bash scripts/post_to_fhir.sh examples/sample_output/sample_bundles.ndjson
357
+ ```
358
+
359
+ Once uploaded, you can browse the resources in any FHIR UI — e.g. [HAPI's built-in browser](https://hapi.fhir.org/) or the [Open Patient Browser](https://patient-browser.smarthealthit.org/).
360
+
361
+ ## 🧱 Architecture
362
+
363
+ ```
364
+ ┌──────────────┐   ┌──────────────────┐   ┌──────────────────────┐
365
+ │  Source CSV  │──▶│ Gaussian copula  │──▶│ Physiologic filter   │
366
+ │  (Turkish    │   │ (Spearman → ρ;   │   │ (BP, Friedewald,     │
367
+ │   pristine)  │   │  nearest-PSD)    │   │  eGFR/creatinine)    │
368
+ └──────────────┘   └──────────────────┘   └─────────┬────────────┘
369
+                                                     │
370
+                                ┌────────────────────┴────────────────────┐
371
+                                │                                         │
372
+                                ▼                                         ▼
373
+                      ┌──────────────────┐                 ┌──────────────────────────┐
374
+                      │  Longitudinal    │   (optional)    │  Direct single-episode   │
375
+                      │  expansion       │ ───────────────▶│  CSV + FHIR R4 export    │
376
+                      │ (drift, Poisson) │                 │  with Synthea-style      │
377
+                      └─────────┬────────┘                 │  module activation       │
378
+                                │                          └──────────────────────────┘
379
+                                ▼
380
+                       (same FHIR export)
381
+ ```
382
+
383
+ Read [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the math (Spearman→Gaussian transform, nearest-PSD projection, constraint rules).
384
+
385
+ ## 🧬 Synthea-style clinical modules
386
+
387
+ Nine modules ship out of the box (`src/syntha/modules/`); each fires on its corresponding source-CSV comorbidity flag:
388
+
389
+ | Module | Flag(s) | Emits |
390
+ |---|---|---|
391
+ | 🫀 Hypertension | `Hipertansiyon` | Encounter, 1–2 antihypertensives (stage 2 → dual), CarePlan |
392
+ | 🍬 Diabetes | `DM_Tum`, `DM_Komplikasyonlu` | Encounter, HbA1c, metformin (+ insulin if severe), CarePlan |
393
+ | 🧀 Hyperlipidemia | `Hiperlipidemi` | Encounter, lipid panel, statin (high-intensity if LDL ≥ 190) |
394
+ | 🦋 Thyroid | `Tiroid` | Encounter, TSH, levothyroxine |
395
+ | 😔 Depression | `Depresyon` | Psych encounter, sertraline, CBT CarePlan |
396
+ | 😰 Anxiety | `Anksiyete` | Psych encounter, escitalopram (or buspirone if already on SSRI) |
397
+ | ❤️ IHD | `Iskemik_Kalp` | Cardiology encounter, ECG, aspirin + β-blocker + statin |
398
+ | 🌬️ Asthma | `Astim` | Resp encounter, spirometry, SABA + ICS |
399
+ | 🚭 COPD | `COPD` | Resp encounter, spirometry, LABA + SABA |
400
+
401
+ See [docs/MODULES.md](docs/MODULES.md) for the authoring guide. Clinician contributions for **TR-specific drug choices** are highly welcome — see [CONTRIBUTING.md](CONTRIBUTING.md).
402
+
403
+ ## 🛠️ CLI reference
404
+
405
+ | Command | Description |
406
+ |---|---|
407
+ | `syntha generate` | End-to-end: train copula + sample + modules + CSV/FHIR + model card + validation report |
408
+ | `syntha fit` | Fit and persist a copula in a registry without sampling |
409
+ | `syntha sample` | Raw sampling from a registered model |
410
+ | `syntha fhir` | Convert an existing synthetic CSV to FHIR bundles |
411
+ | `syntha validate` | KS / Wasserstein / correlation diff between source and synthetic |
412
+ | `syntha serve` | Boot a read-only FHIR R4 demo server from a bundles NDJSON file |
413
+ | `syntha export-model` | Export a registered copula to a compact JSON the desktop app consumes |
414
+ | `syntha list-models` | List models in a registry |
415
+ | `syntha show-card` | Print a model card |
416
+
417
+ Run `syntha <cmd> --help` for full option lists.
418
+
419
+ ## 🗺️ Roadmap
420
+
421
+ The full phased roadmap (v0.1 → v1.0) lives in [ROADMAP.md](ROADMAP.md). Highlights:
422
+
423
+ - **v0.6 — clinician curation** 🟣 — needs Dr. Moniri (or a collaborator)
424
+ - **v0.7 — optional CTGAN/TVAE backend** ⬜
425
+ - **v0.8 — true Synthea PADM-style state machines** ⬜
426
+ - **v0.9 — TSTR benchmark** ⬜
427
+ - **v1.0 — PyPI + paper** ⬜
428
+
429
+ ## 🤝 Contributing + clinician curation welcome
430
+
431
+ There are **three ways** to feed clinical guidance into syntha — pick whichever has the least friction for you:
432
+
433
+ ### 1. 🚀 Just tell me (lowest friction)
434
+
435
+ Reply in any open conversation with Claude (the agent that maintains this repo) saying e.g.
436
+
437
+ > *"In Türkiye, perindopril 5 mg is the typical first-line ACEi for uncomplicated hypertension per TKD 2023 — switch the default in the hypertension module."*
438
+
439
+ …and I'll edit the relevant file, push, and re-run CI. No GitHub UI needed.
440
+
441
+ ### 2. 📝 GitHub issue (recommended for asynchronous tracking)
442
+
443
+ Open an issue using the **🧑‍⚕️ Clinical curation** template — one click:
444
+
445
+ 👉 **[Open a Clinical curation issue](https://github.com/ArioMoniri/syntha/issues/new?template=clinical_curation.md&labels=clinical-curation&title=%5Bclinical-curation%5D%20)** 👈
446
+
447
+ The template pre-lists the files most likely to need changes:
448
+
449
+ | If you want to change… | Edit this file |
450
+ |---|---|
451
+ | Which drug a module prescribes | `src/syntha/modules/<condition>.py` |
452
+ | The RxNorm code or dose text | `src/syntha/fhir/rxnorm.py` |
453
+ | The SNOMED / ICD-10 code for a Condition | `src/syntha/fhir/codes.py` |
454
+ | Turkish display strings | `src/syntha/locale/turkish.py` |
455
+ | Prevalence calibration / disease-progression rules | `src/syntha/longitudinal.py` |
456
+
457
+ ### 3. 🔧 Pull request
458
+
459
+ ```bash
460
+ git clone https://github.com/ArioMoniri/syntha
461
+ cd syntha
462
+ pip install -e ".[dev]"
463
+ # … edit files …
464
+ pytest -q
465
+ git checkout -b clinical/<short-topic>
466
+ git commit -am "clinical: <what you changed and why>"
467
+ git push -u origin clinical/<short-topic>
468
+ gh pr create # or open via the GitHub UI
469
+ ```
470
+
471
+ ### What's currently flagged 🟣 (waiting for clinician input)
472
+
473
+ Per [ROADMAP.md → v0.6](ROADMAP.md):
474
+
475
+ - 🟣 **TR-specific first-line drug calibration** — current defaults are international (lisinopril/amlodipine for HTN, metformin for DM, atorvastatin for hyperlipidemia). Turkish primary-care reality may differ (e.g. perindopril, nebivolol).
476
+ - 🟣 **New modules**: CKD staging (eGFR-driven), MAFLD (ALT/AST + obesity), anemia (Hb-driven), B12 deficiency (vit B12 column directly available).
477
+ - 🟣 **Prevalence calibration to TÜİK** — synthetic disease rates currently mirror the *pristine-healthy* source cohort. To use syntha as a Turkish-population baseline rather than a healthy baseline, the marginals should be calibrated to TÜİK figures.
478
+ - 🟣 **Turkish display string review** — confirm clinical-Turkish preferred terms match `Türk Tabipleri Birliği` / TR-specific usage rather than literal translations.
479
+ - 🟣 **ICD-10 specificity** — the current mapping uses unspecified (".9") forms; specifying further (`E11.65`, `I50.32`, etc.) when the source flag carries the information would improve downstream realism.
480
+
481
+ Full developer guide: [CONTRIBUTING.md](CONTRIBUTING.md). All PRs must pass the CI matrix (Py 3.10 → 3.13) before merge.
482
+
483
+ ## 📄 License + citation
484
+
485
+ Apache 2.0 © 2026 **Ariorad Moniri** — see [LICENSE](LICENSE).
486
+
487
+ If you use syntha in academic work, please cite:
488
+
489
+ ```
490
+ Moniri, A. (2026). syntha: hybrid synthetic patient record generator
491
+ trained on Turkish pristine-healthy EHR cohorts.
492
+ https://github.com/ArioMoniri/syntha
493
+ ```
494
+
495
+ ---
496
+
497
+ ### Acknowledgements
498
+
499
+ - 🩺 [Synthea](https://github.com/synthetichealth/synthea) — the inspiration for the clinical-module layer and FHIR output format.
500
+ - 🌐 Open clinical terminologies: [LOINC](https://loinc.org/), [SNOMED CT](https://www.snomed.org/), [ICD-10](https://icd.who.int/browse10/), [RxNorm](https://www.nlm.nih.gov/research/umls/rxnorm/).
501
+ - 📊 The anonymized Turkish-cohort EHR data used to train the copula (de-identified by the upstream data steward; never redistributed by this repo).
502
+
503
+ ### Contributors
504
+
505
+ <!-- ALL-CONTRIBUTORS-LIST:START - Do not remove or modify this section -->
506
+ <!-- prettier-ignore-start -->
507
+ <!-- markdownlint-disable -->
508
+ <table>
509
+ <tbody>
510
+ <tr>
511
+ <td align="center" valign="top" width="14.28%"><a href="https://github.com/ArioMoniri"><img src="https://avatars.githubusercontent.com/u/ArioMoniri?v=4?s=80" width="80px;" alt="Ariorad Moniri"/><br /><sub><b>Ariorad Moniri</b></sub></a><br /><a href="https://github.com/ArioMoniri/syntha/commits?author=ArioMoniri" title="Code">💻</a> <a href="#design-ArioMoniri" title="Design">🎨</a> <a href="https://github.com/ArioMoniri/syntha/commits?author=ArioMoniri" title="Documentation">📖</a> <a href="#maintenance-ArioMoniri" title="Maintenance">🚧</a> <a href="#ideas-ArioMoniri" title="Ideas & Planning">🤔</a> <a href="https://github.com/ArioMoniri/syntha/pulls?q=is%3Apr+reviewed-by%3AArioMoniri" title="Reviewed Pull Requests">👀</a> <a href="#infra-ArioMoniri" title="Infrastructure">🚇</a> <a href="https://github.com/ArioMoniri/syntha/commits?author=ArioMoniri" title="Tests">⚠️</a></td>
512
+ </tr>
513
+ </tbody>
514
+ </table>
515
+ <!-- markdownlint-restore -->
516
+ <!-- prettier-ignore-end -->
517
+
518
+ <!-- ALL-CONTRIBUTORS-LIST:END -->
519
+
520
+ This project follows the [all-contributors](https://allcontributors.org/) specification — contributions of any kind welcome. Comment `@all-contributors please add @username for code,doc` on an issue or PR to nominate someone.
521
+
522
+ ### 💬 Community
523
+
524
+ - 🗨️ **[GitHub Discussions](https://github.com/ArioMoniri/syntha/discussions)** — open questions, "is this the right tool for X?", show-and-tell
525
+ - 🐛 **[Issues](https://github.com/ArioMoniri/syntha/issues)** — bug reports + feature requests + clinical-curation
526
+ - 📖 **[Contributing](CONTRIBUTING.md)** — dev setup + commit conventions + clinical-curation workflow