phenocluster 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. phenocluster-0.1.0/LICENSE +21 -0
  2. phenocluster-0.1.0/PKG-INFO +241 -0
  3. phenocluster-0.1.0/README.md +192 -0
  4. phenocluster-0.1.0/phenocluster/__init__.py +190 -0
  5. phenocluster-0.1.0/phenocluster/cache.py +239 -0
  6. phenocluster-0.1.0/phenocluster/cli.py +543 -0
  7. phenocluster-0.1.0/phenocluster/config.py +866 -0
  8. phenocluster-0.1.0/phenocluster/core/__init__.py +32 -0
  9. phenocluster-0.1.0/phenocluster/core/exceptions.py +86 -0
  10. phenocluster-0.1.0/phenocluster/core/types.py +78 -0
  11. phenocluster-0.1.0/phenocluster/data/__init__.py +14 -0
  12. phenocluster-0.1.0/phenocluster/data/encoder.py +186 -0
  13. phenocluster-0.1.0/phenocluster/data/imputer.py +189 -0
  14. phenocluster-0.1.0/phenocluster/data/outlier_handler.py +93 -0
  15. phenocluster-0.1.0/phenocluster/data/preprocessor.py +123 -0
  16. phenocluster-0.1.0/phenocluster/data/scaler.py +43 -0
  17. phenocluster-0.1.0/phenocluster/data/splitter.py +109 -0
  18. phenocluster-0.1.0/phenocluster/evaluation/__init__.py +32 -0
  19. phenocluster-0.1.0/phenocluster/evaluation/cluster_statistics.py +80 -0
  20. phenocluster-0.1.0/phenocluster/evaluation/data_quality.py +910 -0
  21. phenocluster-0.1.0/phenocluster/evaluation/external_validation.py +316 -0
  22. phenocluster-0.1.0/phenocluster/evaluation/feature_characterization.py +351 -0
  23. phenocluster-0.1.0/phenocluster/evaluation/metrics.py +61 -0
  24. phenocluster-0.1.0/phenocluster/evaluation/multistate/__init__.py +25 -0
  25. phenocluster-0.1.0/phenocluster/evaluation/multistate/analyzer.py +382 -0
  26. phenocluster-0.1.0/phenocluster/evaluation/multistate/hazard_ratios.py +276 -0
  27. phenocluster-0.1.0/phenocluster/evaluation/multistate/pathways.py +64 -0
  28. phenocluster-0.1.0/phenocluster/evaluation/multistate/trajectory.py +177 -0
  29. phenocluster-0.1.0/phenocluster/evaluation/multistate/transition_hazards.py +376 -0
  30. phenocluster-0.1.0/phenocluster/evaluation/multistate/types.py +80 -0
  31. phenocluster-0.1.0/phenocluster/evaluation/outcome_analysis.py +187 -0
  32. phenocluster-0.1.0/phenocluster/evaluation/stability.py +495 -0
  33. phenocluster-0.1.0/phenocluster/evaluation/stats_utils.py +65 -0
  34. phenocluster-0.1.0/phenocluster/evaluation/survival.py +402 -0
  35. phenocluster-0.1.0/phenocluster/feature_selection/__init__.py +22 -0
  36. phenocluster-0.1.0/phenocluster/feature_selection/base.py +195 -0
  37. phenocluster-0.1.0/phenocluster/feature_selection/correlation.py +121 -0
  38. phenocluster-0.1.0/phenocluster/feature_selection/lasso.py +184 -0
  39. phenocluster-0.1.0/phenocluster/feature_selection/mixed_selector.py +178 -0
  40. phenocluster-0.1.0/phenocluster/feature_selection/mutual_info.py +122 -0
  41. phenocluster-0.1.0/phenocluster/feature_selection/variance.py +115 -0
  42. phenocluster-0.1.0/phenocluster/model_selection/__init__.py +34 -0
  43. phenocluster-0.1.0/phenocluster/model_selection/grid_search.py +485 -0
  44. phenocluster-0.1.0/phenocluster/model_selection/scorers.py +211 -0
  45. phenocluster-0.1.0/phenocluster/pipeline.py +1394 -0
  46. phenocluster-0.1.0/phenocluster/profiles.py +419 -0
  47. phenocluster-0.1.0/phenocluster/utils/__init__.py +14 -0
  48. phenocluster-0.1.0/phenocluster/utils/logging.py +134 -0
  49. phenocluster-0.1.0/phenocluster/utils/report/__init__.py +8 -0
  50. phenocluster-0.1.0/phenocluster/utils/report/_css.py +233 -0
  51. phenocluster-0.1.0/phenocluster/utils/report/_helpers.py +117 -0
  52. phenocluster-0.1.0/phenocluster/utils/report/generator.py +141 -0
  53. phenocluster-0.1.0/phenocluster/utils/report/sections/__init__.py +1 -0
  54. phenocluster-0.1.0/phenocluster/utils/report/sections/classification_quality.py +102 -0
  55. phenocluster-0.1.0/phenocluster/utils/report/sections/clusters.py +66 -0
  56. phenocluster-0.1.0/phenocluster/utils/report/sections/data_quality.py +56 -0
  57. phenocluster-0.1.0/phenocluster/utils/report/sections/external_validation.py +329 -0
  58. phenocluster-0.1.0/phenocluster/utils/report/sections/feature_importance.py +94 -0
  59. phenocluster-0.1.0/phenocluster/utils/report/sections/header.py +73 -0
  60. phenocluster-0.1.0/phenocluster/utils/report/sections/methods.py +62 -0
  61. phenocluster-0.1.0/phenocluster/utils/report/sections/model_selection.py +50 -0
  62. phenocluster-0.1.0/phenocluster/utils/report/sections/multistate.py +184 -0
  63. phenocluster-0.1.0/phenocluster/utils/report/sections/outcomes.py +122 -0
  64. phenocluster-0.1.0/phenocluster/utils/report/sections/stability.py +85 -0
  65. phenocluster-0.1.0/phenocluster/utils/report/sections/summary.py +62 -0
  66. phenocluster-0.1.0/phenocluster/utils/report/sections/survival.py +125 -0
  67. phenocluster-0.1.0/phenocluster/utils/report/sections/validation.py +198 -0
  68. phenocluster-0.1.0/phenocluster/utils/results_io.py +241 -0
  69. phenocluster-0.1.0/phenocluster/visualization/__init__.py +12 -0
  70. phenocluster-0.1.0/phenocluster/visualization/_base.py +317 -0
  71. phenocluster-0.1.0/phenocluster/visualization/_cluster_distribution.py +247 -0
  72. phenocluster-0.1.0/phenocluster/visualization/_cluster_heatmap.py +531 -0
  73. phenocluster-0.1.0/phenocluster/visualization/_cluster_quality.py +305 -0
  74. phenocluster-0.1.0/phenocluster/visualization/_multistate.py +778 -0
  75. phenocluster-0.1.0/phenocluster/visualization/_outcome.py +220 -0
  76. phenocluster-0.1.0/phenocluster/visualization/_survival.py +426 -0
  77. phenocluster-0.1.0/phenocluster/visualization/plots.py +206 -0
  78. phenocluster-0.1.0/phenocluster.egg-info/PKG-INFO +241 -0
  79. phenocluster-0.1.0/phenocluster.egg-info/SOURCES.txt +90 -0
  80. phenocluster-0.1.0/phenocluster.egg-info/dependency_links.txt +1 -0
  81. phenocluster-0.1.0/phenocluster.egg-info/entry_points.txt +2 -0
  82. phenocluster-0.1.0/phenocluster.egg-info/requires.txt +22 -0
  83. phenocluster-0.1.0/phenocluster.egg-info/top_level.txt +1 -0
  84. phenocluster-0.1.0/pyproject.toml +87 -0
  85. phenocluster-0.1.0/requirements.txt +26 -0
  86. phenocluster-0.1.0/setup.cfg +4 -0
  87. phenocluster-0.1.0/tests/test_cache.py +182 -0
  88. phenocluster-0.1.0/tests/test_external_validation.py +306 -0
  89. phenocluster-0.1.0/tests/test_multistate.py +269 -0
  90. phenocluster-0.1.0/tests/test_pipeline.py +812 -0
  91. phenocluster-0.1.0/tests/test_stats_utils.py +75 -0
  92. phenocluster-0.1.0/tests/test_survival.py +107 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ettore Rocchi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,241 @@
1
+ Metadata-Version: 2.4
2
+ Name: phenocluster
3
+ Version: 0.1.0
4
+ Summary: Clinical Phenotype Discovery using Latent Class / Profile Analysis with Automatic Model Selection
5
+ Author-email: Ettore Rocchi <ettore.rocchi3@unibo.it>
6
+ Maintainer-email: Ettore Rocchi <ettore.rocchi3@unibo.it>
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/EttoreRocchi/PhenoCluster
9
+ Project-URL: Documentation, https://ettorerocchi.github.io/PhenoCluster
10
+ Project-URL: Repository, https://github.com/EttoreRocchi/PhenoCluster
11
+ Project-URL: Bug Tracker, https://github.com/EttoreRocchi/PhenoCluster/issues
12
+ Project-URL: Changelog, https://github.com/EttoreRocchi/PhenoCluster/blob/main/docs/changelog.rst
13
+ Keywords: clinical,phenotype,clustering,latent-class-analysis,latent-profile-analysis,machine-learning,bioinformatics,healthcare,data-science
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: Intended Audience :: Healthcare Industry
17
+ Classifier: License :: OSI Approved :: MIT License
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
23
+ Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
24
+ Classifier: Operating System :: OS Independent
25
+ Requires-Python: >=3.11
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: numpy<2.0.0,>=1.21.0
29
+ Requires-Dist: pandas>=2.1.0
30
+ Requires-Dist: scikit-learn>=1.0.0
31
+ Requires-Dist: scipy>=1.11.0
32
+ Requires-Dist: pyyaml>=5.4.0
33
+ Requires-Dist: joblib>=1.3.0
34
+ Requires-Dist: stepmix>=2.1.0
35
+ Requires-Dist: statsmodels>=0.14.0
36
+ Requires-Dist: plotly>=5.3.0
37
+ Requires-Dist: typer>=0.9.0
38
+ Requires-Dist: rich>=13.0.0
39
+ Requires-Dist: tqdm>=4.62.0
40
+ Requires-Dist: lifelines>=0.30.0
41
+ Provides-Extra: dev
42
+ Requires-Dist: pytest>=7.0; extra == "dev"
43
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
44
+ Provides-Extra: docs
45
+ Requires-Dist: sphinx>=7.0; extra == "docs"
46
+ Requires-Dist: sphinx-click>=5.0; extra == "docs"
47
+ Requires-Dist: furo; extra == "docs"
48
+ Dynamic: license-file
49
+
50
+ <p align="center">
51
+ <img src="docs/phenocluster_logo.png" alt="PhenoCluster" width="280"/>
52
+ </p>
53
+
54
+ <p align="center">
55
+ <strong>A flexible data-driven framework for identifying clinical phenotypes using latent class and profile analysis</strong>
56
+ </p>
57
+
58
+ [![PyPI version](https://img.shields.io/pypi/v/phenocluster)](https://pypi.org/project/phenocluster/)
59
+ [![Python versions](https://img.shields.io/pypi/pyversions/phenocluster)](https://www.python.org/downloads/)
60
+ [![MIT License](https://img.shields.io/badge/license-MIT-green)](https://opensource.org/licenses/MIT)
61
+ [![CI](https://github.com/EttoreRocchi/PhenoCluster/actions/workflows/ci.yml/badge.svg)](https://github.com/EttoreRocchi/PhenoCluster/actions/workflows/ci.yml)
62
+ [![Docs](https://img.shields.io/badge/docs-GitHub%20Pages-blue)](https://ettorerocchi.github.io/PhenoCluster)
63
+
64
+ ---
65
+
66
+ ## Overview
67
+
68
+ PhenoCluster is a Python framework for unsupervised discovery of clinical phenotypes from heterogeneous patient data. It implements an end-to-end pipeline: from data preprocessing and latent class identification to outcome association analysis, survival modelling, and multistate transition modelling.
69
+
70
+ The framework is **domain-agnostic** and can be applied to any clinical cohort study where the goal is to identify latent patient subgroups and characterise their relationship with clinical outcomes. Users supply a dataset and a YAML configuration file; PhenoCluster handles model selection, phenotype assignment, and downstream inference automatically.
71
+
72
+ ### Key capabilities
73
+
74
+ - **Latent Class / Profile Analysis** via the [StepMix](https://github.com/Labo-Lacourse/stepmix) framework with native support for mixed continuous/categorical data and missing values
75
+ - **Automatic model selection** using information criteria (BIC, AIC, ICL, CAIC, SABIC) with configurable cluster-size constraints
76
+ - **Classification quality assessment** with per-phenotype Average Posterior Probability (AvePP) and assignment confidence metrics
77
+ - **Outcome association analysis** with logistic regression yielding odds ratios, confidence intervals, and FDR-corrected p-values
78
+ - **Survival analysis** with log-rank tests and Cox proportional hazards models producing hazard ratios
79
+ - **Multistate modelling** with transition-specific Cox PH analysis, Monte Carlo simulation for state occupation probabilities with confidence interval bands, and clinical pathway enumeration
80
+ - **Comprehensive output** including an interactive HTML report, forest plots with confidence intervals, Kaplan-Meier and Nelson-Aalen curves, heatmaps, and JSON/CSV data exports
81
+
82
+ ## Installation
83
+
84
+ > **Requires Python ≥ 3.11**
85
+
86
+ ### From PyPI
87
+
88
+ ```bash
89
+ pip install phenocluster
90
+ ```
91
+
92
+ ### From source
93
+
94
+ ```bash
95
+ git clone https://github.com/EttoreRocchi/PhenoCluster.git
96
+ cd PhenoCluster
97
+ pip install -e ".[dev]"
98
+ ```
99
+
100
+ ## Quick start
101
+
102
+ ### 1. Generate a configuration file
103
+
104
+ ```bash
105
+ phenocluster create-config -p complete -o config.yaml
106
+ ```
107
+
108
+ ### 2. Edit the configuration
109
+
110
+ Open `config.yaml` and fill in your dataset-specific parameters:
111
+
112
+ ```yaml
113
+ global:
114
+ project_name: "My Study"
115
+ output_dir: "results"
116
+ random_state: 42
117
+
118
+ data:
119
+ continuous_columns:
120
+ - age
121
+ - bmi
122
+ - lab_value_1
123
+ categorical_columns:
124
+ - sex
125
+ - smoking_status
126
+ - disease_stage
127
+ split:
128
+ test_size: 0.2
129
+
130
+ outcome:
131
+ enabled: true
132
+ outcome_columns:
133
+ - mortality_30d
134
+ - readmission_30d
135
+
136
+ survival:
137
+ enabled: true
138
+ targets:
139
+ - name: "overall_survival"
140
+ time_column: "time_to_death"
141
+ event_column: "death_indicator"
142
+ ```
143
+
144
+ ### 3. Run the pipeline
145
+
146
+ ```bash
147
+ phenocluster run -d data.csv -c config.yaml
148
+ ```
149
+
150
+ ### 4. Inspect results
151
+
152
+ Results are written to the output directory (default: `results/`):
153
+
154
+ | File | Description |
155
+ |------|-------------|
156
+ | `analysis_report.html` | Comprehensive HTML report with all results and visualisations |
157
+ | `cluster_statistics.json` | Phenotype sizes, feature distributions, and classification quality |
158
+ | `outcome_results.json` | Odds ratios with confidence intervals and p-values |
159
+ | `survival_results.json` | Kaplan-Meier estimates and Cox PH hazard ratios |
160
+ | `multistate_results.json` | Transition-specific hazard ratios, pathways, and state occupation |
161
+ | `data/model_fit_metrics.csv` | Information criteria, entropy, and average posterior probabilities |
162
+ | `data/phenotypes_data.csv` | Original data augmented with phenotype assignments |
163
+ | `data/posterior_probabilities.csv` | Posterior class membership probabilities |
164
+ | `results/model_selection_summary.json` | Model selection comparison table and best model info |
165
+ | `results/feature_importance.json` | Feature characterisation per phenotype |
166
+ | `results/validation_report.json` | Internal validation metrics (train/test comparison) |
167
+ | `results/stability_results.json` | Consensus clustering stability metrics |
168
+ | `results/split_info.json` | Train/test split details |
169
+ | `results/external_validation_results.json` | External validation results (when enabled) |
170
+ | `phenocluster.log` | Pipeline execution log |
171
+ | `artifacts/` | Cached intermediate results for incremental re-runs |
172
+
173
+ ## Pipeline overview
174
+
175
+ PhenoCluster executes the following stages in order:
176
+
177
+ 1. **Data quality assessment.** Missingness patterns, correlations, variance, and MCAR testing.
178
+ 2. **Train/test split.** Stratified splitting with configurable test size, performed before preprocessing to prevent data leakage.
179
+ 3. **Preprocessing.** Imputation, outlier handling, categorical encoding, standardisation, and feature selection -- fitted on the training data only, then applied to the test set.
180
+ 4. **Model selection.** Cross-validated information criterion search over cluster counts (training set only).
181
+ 5. **Full-cohort refit.** Once the number of clusters K is selected, the preprocessing steps and the LCA/LPA model are refitted on the entire cohort; phenotypes are then reordered by size (largest = Phenotype 0).
182
+ 6. **Stability analysis.** Consensus clustering over subsampled runs.
183
+ 7. **Internal validation.** Train/test log-likelihood comparison, cluster proportion stability, and outcome OR consistency.
184
+ 8. **Outcome association.** Logistic regression for binary outcomes with FDR-corrected p-values (optional).
185
+ 9. **Survival analysis.** Kaplan-Meier curves, Nelson-Aalen estimators, log-rank tests, and Cox PH hazard ratios (optional).
186
+ 10. **Multistate modelling.** Transition-specific Cox PH models, transition hazard ratios, and Monte Carlo simulation (optional).
187
+ 11. **Report generation.** Interactive HTML report with all figures and tables.
188
+
189
+ ## CLI reference
190
+
191
+ | Command | Description |
192
+ |---------|-------------|
193
+ | `phenocluster run -d DATA -c CONFIG [--force-rerun]` | Run the full pipeline |
194
+ | `phenocluster create-config [-p PROFILE] [-o OUTPUT]` | Generate a config YAML from a profile template |
195
+ | `phenocluster validate-config -c CONFIG [-d DATA]` | Validate config structure; when `-d DATA` is given, also cross-check columns against the data |
196
+ | `phenocluster version` | Show version, repository link, and documentation link |
197
+
198
+ ## Configuration profiles
199
+
200
+ Profiles set sensible defaults for common use-cases. Generate one with `phenocluster create-config -p <profile>`:
201
+
202
+ | Profile | Description | Inference | Stability | Multistate |
203
+ |---------|-------------|:---------:|:---------:|:----------:|
204
+ | `descriptive` | Phenotype discovery only, no statistical inference | off | on | off |
205
+ | `complete` | All analyses enabled (outcomes, survival, multistate) | on | on | on |
206
+ | `quick` | Fast iteration for development | on | off | off |
207
+
208
+ ## Configuration reference
209
+
210
+ See the full [Configuration Reference](https://ettorerocchi.github.io/PhenoCluster/configuration.html) in the documentation.
211
+
212
+ ## Documentation
213
+
214
+ Full documentation (statistical methods, configuration reference, output descriptions) is available at **[ettorerocchi.github.io/PhenoCluster](https://ettorerocchi.github.io/PhenoCluster)**.
215
+
216
+ ## Testing
217
+
218
+ ```bash
219
+ pip install -e ".[dev]"
220
+ pytest tests/ -v
221
+ ```
222
+
223
+ ## License
224
+
225
+ This project is licensed under the [MIT](LICENSE) License.
226
+
227
+ ## Citation
228
+
229
+ If you use **PhenoCluster** in your research, please cite:
230
+
231
+ ```bibtex
232
+
233
+ ```
234
+
235
+ ## Acknowledgment
236
+
237
+ This project relies on **StepMix**, a Python package for pseudo-likelihood estimation of generalized mixture models with external variables. We thank the authors for making their work openly available.
238
+
239
+ If you use PhenoCluster, please also cite the StepMix paper:
240
+
241
+ > Morin, S., Legault, R., Laliberté, F., Bakk, Z., Giguère, C.-É., de la Sablonnière, R., & Lacourse, É. (2025). StepMix: A Python Package for Pseudo-Likelihood Estimation of Generalized Mixture Models with External Variables. Journal of Statistical Software, 113(8), 1-39. doi: [10.18637/jss.v113.i08](https://doi.org/10.18637/jss.v113.i08)
@@ -0,0 +1,192 @@
1
+ <p align="center">
2
+ <img src="docs/phenocluster_logo.png" alt="PhenoCluster" width="280"/>
3
+ </p>
4
+
5
+ <p align="center">
6
+ <strong>A flexible data-driven framework for identifying clinical phenotypes using latent class and profile analysis</strong>
7
+ </p>
8
+
9
+ [![PyPI version](https://img.shields.io/pypi/v/phenocluster)](https://pypi.org/project/phenocluster/)
10
+ [![Python versions](https://img.shields.io/pypi/pyversions/phenocluster)](https://www.python.org/downloads/)
11
+ [![MIT License](https://img.shields.io/badge/license-MIT-green)](https://opensource.org/licenses/MIT)
12
+ [![CI](https://github.com/EttoreRocchi/PhenoCluster/actions/workflows/ci.yml/badge.svg)](https://github.com/EttoreRocchi/PhenoCluster/actions/workflows/ci.yml)
13
+ [![Docs](https://img.shields.io/badge/docs-GitHub%20Pages-blue)](https://ettorerocchi.github.io/PhenoCluster)
14
+
15
+ ---
16
+
17
+ ## Overview
18
+
19
+ PhenoCluster is a Python framework for unsupervised discovery of clinical phenotypes from heterogeneous patient data. It implements an end-to-end pipeline: from data preprocessing and latent class identification to outcome association analysis, survival modelling, and multistate transition modelling.
20
+
21
+ The framework is **domain-agnostic** and can be applied to any clinical cohort study where the goal is to identify latent patient subgroups and characterise their relationship with clinical outcomes. Users supply a dataset and a YAML configuration file; PhenoCluster handles model selection, phenotype assignment, and downstream inference automatically.
22
+
23
+ ### Key capabilities
24
+
25
+ - **Latent Class / Profile Analysis** via the [StepMix](https://github.com/Labo-Lacourse/stepmix) framework with native support for mixed continuous/categorical data and missing values
26
+ - **Automatic model selection** using information criteria (BIC, AIC, ICL, CAIC, SABIC) with configurable cluster-size constraints
27
+ - **Classification quality assessment** with per-phenotype Average Posterior Probability (AvePP) and assignment confidence metrics
28
+ - **Outcome association analysis** with logistic regression yielding odds ratios, confidence intervals, and FDR-corrected p-values
29
+ - **Survival analysis** with log-rank tests and Cox proportional hazards models producing hazard ratios
30
+ - **Multistate modelling** with transition-specific Cox PH analysis, Monte Carlo simulation for state occupation probabilities with confidence interval bands, and clinical pathway enumeration
31
+ - **Comprehensive output** including an interactive HTML report, forest plots with confidence intervals, Kaplan-Meier and Nelson-Aalen curves, heatmaps, and JSON/CSV data exports
32
+
33
+ ## Installation
34
+
35
+ > **Requires Python ≥ 3.11**
36
+
37
+ ### From PyPI
38
+
39
+ ```bash
40
+ pip install phenocluster
41
+ ```
42
+
43
+ ### From source
44
+
45
+ ```bash
46
+ git clone https://github.com/EttoreRocchi/PhenoCluster.git
47
+ cd PhenoCluster
48
+ pip install -e ".[dev]"
49
+ ```
50
+
51
+ ## Quick start
52
+
53
+ ### 1. Generate a configuration file
54
+
55
+ ```bash
56
+ phenocluster create-config -p complete -o config.yaml
57
+ ```
58
+
59
+ ### 2. Edit the configuration
60
+
61
+ Open `config.yaml` and fill in your dataset-specific parameters:
62
+
63
+ ```yaml
64
+ global:
65
+ project_name: "My Study"
66
+ output_dir: "results"
67
+ random_state: 42
68
+
69
+ data:
70
+ continuous_columns:
71
+ - age
72
+ - bmi
73
+ - lab_value_1
74
+ categorical_columns:
75
+ - sex
76
+ - smoking_status
77
+ - disease_stage
78
+ split:
79
+ test_size: 0.2
80
+
81
+ outcome:
82
+ enabled: true
83
+ outcome_columns:
84
+ - mortality_30d
85
+ - readmission_30d
86
+
87
+ survival:
88
+ enabled: true
89
+ targets:
90
+ - name: "overall_survival"
91
+ time_column: "time_to_death"
92
+ event_column: "death_indicator"
93
+ ```
94
+
95
+ ### 3. Run the pipeline
96
+
97
+ ```bash
98
+ phenocluster run -d data.csv -c config.yaml
99
+ ```
100
+
101
+ ### 4. Inspect results
102
+
103
+ Results are written to the output directory (default: `results/`):
104
+
105
+ | File | Description |
106
+ |------|-------------|
107
+ | `analysis_report.html` | Comprehensive HTML report with all results and visualisations |
108
+ | `cluster_statistics.json` | Phenotype sizes, feature distributions, and classification quality |
109
+ | `outcome_results.json` | Odds ratios with confidence intervals and p-values |
110
+ | `survival_results.json` | Kaplan-Meier estimates and Cox PH hazard ratios |
111
+ | `multistate_results.json` | Transition-specific hazard ratios, pathways, and state occupation |
112
+ | `data/model_fit_metrics.csv` | Information criteria, entropy, and average posterior probabilities |
113
+ | `data/phenotypes_data.csv` | Original data augmented with phenotype assignments |
114
+ | `data/posterior_probabilities.csv` | Posterior class membership probabilities |
115
+ | `results/model_selection_summary.json` | Model selection comparison table and best model info |
116
+ | `results/feature_importance.json` | Feature characterisation per phenotype |
117
+ | `results/validation_report.json` | Internal validation metrics (train/test comparison) |
118
+ | `results/stability_results.json` | Consensus clustering stability metrics |
119
+ | `results/split_info.json` | Train/test split details |
120
+ | `results/external_validation_results.json` | External validation results (when enabled) |
121
+ | `phenocluster.log` | Pipeline execution log |
122
+ | `artifacts/` | Cached intermediate results for incremental re-runs |
123
+
124
+ ## Pipeline overview
125
+
126
+ PhenoCluster executes the following stages in order:
127
+
128
+ 1. **Data quality assessment.** Missingness patterns, correlations, variance, and MCAR testing.
129
+ 2. **Train/test split.** Stratified splitting with configurable test size, performed before preprocessing to prevent data leakage.
130
+ 3. **Preprocessing.** Imputation, outlier handling, categorical encoding, standardisation, and feature selection -- fitted on the training data only, then applied to the test set.
131
+ 4. **Model selection.** Cross-validated information criterion search over cluster counts (training set only).
132
+ 5. **Full-cohort refit.** Once the number of clusters K is selected, the preprocessing steps and the LCA/LPA model are refitted on the entire cohort; phenotypes are then reordered by size (largest = Phenotype 0).
133
+ 6. **Stability analysis.** Consensus clustering over subsampled runs.
134
+ 7. **Internal validation.** Train/test log-likelihood comparison, cluster proportion stability, and outcome OR consistency.
135
+ 8. **Outcome association.** Logistic regression for binary outcomes with FDR-corrected p-values (optional).
136
+ 9. **Survival analysis.** Kaplan-Meier curves, Nelson-Aalen estimators, log-rank tests, and Cox PH hazard ratios (optional).
137
+ 10. **Multistate modelling.** Transition-specific Cox PH models, transition hazard ratios, and Monte Carlo simulation (optional).
138
+ 11. **Report generation.** Interactive HTML report with all figures and tables.
139
+
140
+ ## CLI reference
141
+
142
+ | Command | Description |
143
+ |---------|-------------|
144
+ | `phenocluster run -d DATA -c CONFIG [--force-rerun]` | Run the full pipeline |
145
+ | `phenocluster create-config [-p PROFILE] [-o OUTPUT]` | Generate a config YAML from a profile template |
146
+ | `phenocluster validate-config -c CONFIG [-d DATA]` | Validate config structure; when `-d DATA` is given, also cross-check columns against the data |
147
+ | `phenocluster version` | Show version, repository link, and documentation link |
148
+
149
+ ## Configuration profiles
150
+
151
+ Profiles set sensible defaults for common use-cases. Generate one with `phenocluster create-config -p <profile>`:
152
+
153
+ | Profile | Description | Inference | Stability | Multistate |
154
+ |---------|-------------|:---------:|:---------:|:----------:|
155
+ | `descriptive` | Phenotype discovery only, no statistical inference | off | on | off |
156
+ | `complete` | All analyses enabled (outcomes, survival, multistate) | on | on | on |
157
+ | `quick` | Fast iteration for development | on | off | off |
158
+
159
+ ## Configuration reference
160
+
161
+ See the full [Configuration Reference](https://ettorerocchi.github.io/PhenoCluster/configuration.html) in the documentation.
162
+
163
+ ## Documentation
164
+
165
+ Full documentation (statistical methods, configuration reference, output descriptions) is available at **[ettorerocchi.github.io/PhenoCluster](https://ettorerocchi.github.io/PhenoCluster)**.
166
+
167
+ ## Testing
168
+
169
+ ```bash
170
+ pip install -e ".[dev]"
171
+ pytest tests/ -v
172
+ ```
173
+
174
+ ## License
175
+
176
+ This project is licensed under the [MIT](LICENSE) License.
177
+
178
+ ## Citation
179
+
180
+ If you use **PhenoCluster** in your research, please cite:
181
+
182
+ ```bibtex
183
+
184
+ ```
185
+
186
+ ## Acknowledgment
187
+
188
+ This project relies on **StepMix**, a Python package for pseudo-likelihood estimation of generalized mixture models with external variables. We thank the authors for making their work openly available.
189
+
190
+ If you use PhenoCluster, please also cite the StepMix paper:
191
+
192
+ > Morin, S., Legault, R., Laliberté, F., Bakk, Z., Giguère, C.-É., de la Sablonnière, R., & Lacourse, É. (2025). StepMix: A Python Package for Pseudo-Likelihood Estimation of Generalized Mixture Models with External Variables. Journal of Statistical Software, 113(8), 1-39. doi: [10.18637/jss.v113.i08](https://doi.org/10.18637/jss.v113.i08)
@@ -0,0 +1,190 @@
1
+ """
2
+ PhenoCluster: Clinical Phenotype Discovery using Latent Class / Profile Analysis.
3
+
4
+ A pipeline for identifying latent clinical phenotypes with automatic
5
+ model selection, comprehensive validation, and advanced visualizations.
6
+
7
+ Author: Ettore Rocchi <ettore.rocchi3@unibo.it>
8
+ License: MIT
9
+ """
10
+
11
+ __version__ = "0.1.0"
12
+ __author__ = "Ettore Rocchi"
13
+ __email__ = "ettore.rocchi3@unibo.it"
14
+
15
+ # Configuration classes
16
+ from .config import (
17
+ CacheConfig,
18
+ CategoricalEncodingConfig,
19
+ CategoricalFlowConfig,
20
+ DataQualityConfig,
21
+ DataSplitConfig,
22
+ ExternalValidationConfig,
23
+ FeatureCharacterizationConfig,
24
+ FeatureSelectionConfig,
25
+ ImputationConfig,
26
+ InferenceConfig,
27
+ LoggingConfig,
28
+ ModelSelectionConfig,
29
+ MultistateConfig,
30
+ MultistateState,
31
+ MultistateTransition,
32
+ OutcomeConfig,
33
+ OutlierConfig,
34
+ PhenoClusterConfig,
35
+ ReferenceConfig,
36
+ RowFilterConfig,
37
+ StabilityConfig,
38
+ StepMixConfig,
39
+ SurvivalConfig,
40
+ SurvivalTarget,
41
+ VisualizationConfig,
42
+ )
43
+
44
+ # Core exceptions
45
+ from .core.exceptions import (
46
+ DataSplitError,
47
+ FeatureSelectionError,
48
+ ModelNotFittedError,
49
+ PhenoClusterError,
50
+ )
51
+
52
+ # Core types
53
+ from .core.types import (
54
+ DataSplitResult,
55
+ ModelSelectionResult,
56
+ )
57
+
58
+ # Data handling
59
+ from .data import DataPreprocessor, DataSplitter
60
+
61
+ # Evaluation
62
+ from .evaluation import (
63
+ ClusterEvaluator,
64
+ ClusterStatistics,
65
+ DataQualityAssessor,
66
+ ExternalValidator,
67
+ FeatureCharacterizer,
68
+ MonteCarloResults,
69
+ MultistateAnalyzer,
70
+ MultistateResults,
71
+ OutcomeAnalyzer,
72
+ StabilityAnalyzer,
73
+ SurvivalAnalyzer,
74
+ )
75
+
76
+ # Feature selection
77
+ from .feature_selection import (
78
+ BaseFeatureSelector,
79
+ CorrelationSelector,
80
+ LassoSelector,
81
+ MixedDataFeatureSelector,
82
+ MutualInfoSelector,
83
+ VarianceSelector,
84
+ )
85
+
86
+ # Model selection
87
+ from .model_selection import (
88
+ AVAILABLE_CRITERIA,
89
+ StepMixModelSelector,
90
+ aic_score,
91
+ bic_score,
92
+ caic_score,
93
+ create_scorer,
94
+ get_all_criteria,
95
+ icl_score,
96
+ relative_entropy_score,
97
+ sabic_score,
98
+ )
99
+
100
+ # Pipeline
101
+ from .pipeline import PhenoClusterPipeline, run_pipeline
102
+
103
+ # Config profiles
104
+ from .profiles import create_config_yaml, get_profile, list_profiles
105
+
106
+ # Utils
107
+ from .utils import PhenoClusterLogger
108
+
109
+ # Visualization
110
+ from .visualization import Visualizer
111
+
112
+ __all__ = [
113
+ # Main pipeline
114
+ "PhenoClusterPipeline",
115
+ "run_pipeline",
116
+ # Configuration
117
+ "PhenoClusterConfig",
118
+ "CacheConfig",
119
+ "CategoricalEncodingConfig",
120
+ "CategoricalFlowConfig",
121
+ "DataQualityConfig",
122
+ "DataSplitConfig",
123
+ "FeatureCharacterizationConfig",
124
+ "FeatureSelectionConfig",
125
+ "ExternalValidationConfig",
126
+ "InferenceConfig",
127
+ "ImputationConfig",
128
+ "LoggingConfig",
129
+ "ModelSelectionConfig",
130
+ "MultistateConfig",
131
+ "MultistateState",
132
+ "MultistateTransition",
133
+ "OutcomeConfig",
134
+ "OutlierConfig",
135
+ "ReferenceConfig",
136
+ "RowFilterConfig",
137
+ "StabilityConfig",
138
+ "StepMixConfig",
139
+ "SurvivalConfig",
140
+ "SurvivalTarget",
141
+ "VisualizationConfig",
142
+ # Config profiles
143
+ "list_profiles",
144
+ "get_profile",
145
+ "create_config_yaml",
146
+ # Data handling
147
+ "DataPreprocessor",
148
+ "DataSplitter",
149
+ "DataSplitResult",
150
+ # Evaluation
151
+ "ClusterEvaluator",
152
+ "ClusterStatistics",
153
+ "OutcomeAnalyzer",
154
+ "FeatureCharacterizer",
155
+ "StabilityAnalyzer",
156
+ "SurvivalAnalyzer",
157
+ "DataQualityAssessor",
158
+ "ExternalValidator",
159
+ "MultistateAnalyzer",
160
+ "MultistateResults",
161
+ "MonteCarloResults",
162
+ # Visualization
163
+ "Visualizer",
164
+ # Utils
165
+ "PhenoClusterLogger",
166
+ # Feature selection
167
+ "BaseFeatureSelector",
168
+ "VarianceSelector",
169
+ "CorrelationSelector",
170
+ "MutualInfoSelector",
171
+ "LassoSelector",
172
+ "MixedDataFeatureSelector",
173
+ # Model selection
174
+ "StepMixModelSelector",
175
+ "bic_score",
176
+ "aic_score",
177
+ "caic_score",
178
+ "sabic_score",
179
+ "icl_score",
180
+ "relative_entropy_score",
181
+ "create_scorer",
182
+ "get_all_criteria",
183
+ "AVAILABLE_CRITERIA",
184
+ "ModelSelectionResult",
185
+ # Exceptions
186
+ "PhenoClusterError",
187
+ "ModelNotFittedError",
188
+ "FeatureSelectionError",
189
+ "DataSplitError",
190
+ ]