stratiphy 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. stratiphy-0.3.2/LICENSE +29 -0
  2. stratiphy-0.3.2/PKG-INFO +86 -0
  3. stratiphy-0.3.2/README.md +17 -0
  4. stratiphy-0.3.2/pyproject.toml +62 -0
  5. stratiphy-0.3.2/setup.cfg +4 -0
  6. stratiphy-0.3.2/src/stratiphy/__init__.py +14 -0
  7. stratiphy-0.3.2/src/stratiphy/_cli.py +426 -0
  8. stratiphy-0.3.2/src/stratiphy/analysis/__init__.py +0 -0
  9. stratiphy-0.3.2/src/stratiphy/analysis/explain/__init__.py +15 -0
  10. stratiphy-0.3.2/src/stratiphy/analysis/explain/_explain.py +421 -0
  11. stratiphy-0.3.2/src/stratiphy/analysis/gap.py +124 -0
  12. stratiphy-0.3.2/src/stratiphy/analysis/metrics.py +60 -0
  13. stratiphy-0.3.2/src/stratiphy/analysis/simulate/__init__.py +11 -0
  14. stratiphy-0.3.2/src/stratiphy/analysis/simulate/_impl.py +318 -0
  15. stratiphy-0.3.2/src/stratiphy/analysis/split.py +164 -0
  16. stratiphy-0.3.2/src/stratiphy/bench/__init__.py +5 -0
  17. stratiphy-0.3.2/src/stratiphy/bench/_bencher.py +125 -0
  18. stratiphy-0.3.2/src/stratiphy/bench/_cli.py +390 -0
  19. stratiphy-0.3.2/src/stratiphy/bench/_data.py +170 -0
  20. stratiphy-0.3.2/src/stratiphy/bench/_io.py +49 -0
  21. stratiphy-0.3.2/src/stratiphy/bench/_model.py +69 -0
  22. stratiphy-0.3.2/src/stratiphy/cluster/__init__.py +18 -0
  23. stratiphy-0.3.2/src/stratiphy/cluster/_base.py +41 -0
  24. stratiphy-0.3.2/src/stratiphy/cluster/_sim.py +91 -0
  25. stratiphy-0.3.2/src/stratiphy/cluster/_sklearn_sim.py +33 -0
  26. stratiphy-0.3.2/src/stratiphy/config/__init__.py +5 -0
  27. stratiphy-0.3.2/src/stratiphy/config/_workflow.py +43 -0
  28. stratiphy-0.3.2/src/stratiphy/io.py +164 -0
  29. stratiphy-0.3.2/src/stratiphy/model/__init__.py +9 -0
  30. stratiphy-0.3.2/src/stratiphy/model/_base.py +357 -0
  31. stratiphy-0.3.2/src/stratiphy/preprocessing/__init__.py +3 -0
  32. stratiphy-0.3.2/src/stratiphy/preprocessing/annoqc.py +76 -0
  33. stratiphy-0.3.2/src/stratiphy/preprocessing/phenopackets.py +190 -0
  34. stratiphy-0.3.2/src/stratiphy/preprocessing/sanitize/__init__.py +19 -0
  35. stratiphy-0.3.2/src/stratiphy/preprocessing/sanitize/_api.py +36 -0
  36. stratiphy-0.3.2/src/stratiphy/preprocessing/sanitize/_convenience.py +147 -0
  37. stratiphy-0.3.2/src/stratiphy/preprocessing/sanitize/_impl.py +620 -0
  38. stratiphy-0.3.2/src/stratiphy/preprocessing/sanitize/_model.py +271 -0
  39. stratiphy-0.3.2/src/stratiphy/preprocessing/sanitize/_test__model.py +148 -0
  40. stratiphy-0.3.2/src/stratiphy/preprocessing/summarize/__init__.py +6 -0
  41. stratiphy-0.3.2/src/stratiphy/preprocessing/summarize/_summarize.py +54 -0
  42. stratiphy-0.3.2/src/stratiphy/preprocessing/validate/__init__.py +7 -0
  43. stratiphy-0.3.2/src/stratiphy/preprocessing/validate/_base.py +149 -0
  44. stratiphy-0.3.2/src/stratiphy/preprocessing/validate/_simple.py +9 -0
  45. stratiphy-0.3.2/src/stratiphy/py.typed +0 -0
  46. stratiphy-0.3.2/src/stratiphy/semsim/__init__.py +26 -0
  47. stratiphy-0.3.2/src/stratiphy/semsim/_base.py +223 -0
  48. stratiphy-0.3.2/src/stratiphy/semsim/_ic.py +58 -0
  49. stratiphy-0.3.2/src/stratiphy/semsim/_pe.py +139 -0
  50. stratiphy-0.3.2/src/stratiphy/semsim/_sts.py +121 -0
  51. stratiphy-0.3.2/src/stratiphy/semsim/_test__base.py +52 -0
  52. stratiphy-0.3.2/src/stratiphy/semsim/_test__sts.py +74 -0
  53. stratiphy-0.3.2/src/stratiphy/util.py +85 -0
  54. stratiphy-0.3.2/src/stratiphy/workflow/__init__.py +13 -0
  55. stratiphy-0.3.2/src/stratiphy/workflow/_base.py +705 -0
  56. stratiphy-0.3.2/src/stratiphy/workflow/util.py +201 -0
  57. stratiphy-0.3.2/src/stratiphy/workflow/workflow_pb2.py +58 -0
  58. stratiphy-0.3.2/src/stratiphy/workflow/workflow_pb2.pyi +95 -0
  59. stratiphy-0.3.2/src/stratiphy.egg-info/PKG-INFO +86 -0
  60. stratiphy-0.3.2/src/stratiphy.egg-info/SOURCES.txt +64 -0
  61. stratiphy-0.3.2/src/stratiphy.egg-info/dependency_links.txt +1 -0
  62. stratiphy-0.3.2/src/stratiphy.egg-info/entry_points.txt +3 -0
  63. stratiphy-0.3.2/src/stratiphy.egg-info/requires.txt +19 -0
  64. stratiphy-0.3.2/src/stratiphy.egg-info/top_level.txt +1 -0
  65. stratiphy-0.3.2/tests/test_model.py +38 -0
  66. stratiphy-0.3.2/tests/test_prepare_test_data.py +47 -0
@@ -0,0 +1,29 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2026 Daniel Danis
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ 3. Neither the name of the copyright holder nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,86 @@
1
+ Metadata-Version: 2.4
2
+ Name: stratiphy
3
+ Version: 0.3.2
4
+ Summary: Phenotype-driven clustering of cohorts for discovery of novel disease subgroups
5
+ Author-email: Daniel Danis <daniel.gordon.danis@protonmail.com>, Peter N Robinson <peter.robinson@bih-charite.de>
6
+ License: BSD 3-Clause License
7
+
8
+ Copyright (c) 2026 Daniel Danis
9
+ All rights reserved.
10
+
11
+ Redistribution and use in source and binary forms, with or without
12
+ modification, are permitted provided that the following conditions are met:
13
+
14
+ 1. Redistributions of source code must retain the above copyright notice, this
15
+ list of conditions and the following disclaimer.
16
+
17
+ 2. Redistributions in binary form must reproduce the above copyright notice,
18
+ this list of conditions and the following disclaimer in the documentation
19
+ and/or other materials provided with the distribution.
20
+
21
+ 3. Neither the name of the copyright holder nor the names of its
22
+ contributors may be used to endorse or promote products derived from
23
+ this software without specific prior written permission.
24
+
25
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
28
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
29
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
31
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
32
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
34
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
+
36
+ Project-URL: homepage, https://github.com/P2GX/stratiphy
37
+ Project-URL: repository, https://github.com/P2GX/stratiphy.git
38
+ Project-URL: documentation, https://P2GX.github.io/stratiphy/stable
39
+ Project-URL: bugtracker, https://github.com/P2GX/stratiphy/issues
40
+ Keywords: clustering,human phenotype ontology,phenopackets
41
+ Classifier: Development Status :: 3 - Alpha
42
+ Classifier: Operating System :: OS Independent
43
+ Classifier: Programming Language :: Python :: 3.10
44
+ Classifier: Programming Language :: Python :: 3.11
45
+ Classifier: Programming Language :: Python :: 3.12
46
+ Classifier: Programming Language :: Python :: 3.13
47
+ Classifier: Programming Language :: Python :: 3.14
48
+ Requires-Python: >=3.10
49
+ Description-Content-Type: text/markdown
50
+ License-File: LICENSE
51
+ Requires-Dist: hpo-toolkit>=0.8.1
52
+ Requires-Dist: numpy>=1.10
53
+ Requires-Dist: pandas<3.0.0,>=2.0
54
+ Requires-Dist: phenopackets~=2.0.2
55
+ Requires-Dist: protobuf<8.0.0,>=3.20.2
56
+ Requires-Dist: scikit-learn<2.0.0,>=1.0.0
57
+ Requires-Dist: scipy<2.0,>=1.1.0
58
+ Requires-Dist: statsmodels<1.0.0,>=0.14.6
59
+ Requires-Dist: tqdm<5.0,>=4.0
60
+ Provides-Extra: test
61
+ Requires-Dist: pytest<8.0.0,>=7.0.0; extra == "test"
62
+ Requires-Dist: ruff==0.12.2; extra == "test"
63
+ Provides-Extra: docs
64
+ Requires-Dist: mkdocs>=1.6.1; extra == "docs"
65
+ Requires-Dist: mkdocs-material>=9.7.0; extra == "docs"
66
+ Requires-Dist: mkdocstrings[python]>=0.30.1; extra == "docs"
67
+ Requires-Dist: mkdocs-api-autonav>=0.4.0; extra == "docs"
68
+ Dynamic: license-file
69
+
70
+ [![Build status](https://github.com/P2GX/stratiphy/workflows/CI/badge.svg)](https://github.com/P2GX/stratiphy/actions/workflows/python_ci.yml)
71
+ [![GitHub release](https://img.shields.io/github/release/P2GX/stratiphy.svg)](https://github.com/P2GX/stratiphy/releases)
72
+ ![PyPi downloads](https://img.shields.io/pypi/dm/stratiphy.svg?label=Pypi%20downloads)
73
+ ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/stratiphy)
74
+
75
+ # Stratiphy
76
+
77
+ A Python package for phenotype-driven clustering of cohorts for discovery of novel disease subgroups.
78
+
79
+ See our documentation for the [setup](https://p2gx.github.io/stratiphy/stable/setup.html) instructions,
80
+ a [tutorial](https://p2gx.github.io/stratiphy/stable/tutorial.html) with an end-to-end genotype-phenotype association analysis,
81
+ and a comprehensive [user guide](https://p2gx.github.io/stratiphy/stable/user-guide/index.html) with everything else.
82
+
83
+ The documentation comes in two flavors:
84
+
85
+ - [Stable documentation](https://p2gx.github.io/stratiphy/stable/) (last release on `main` branch)
86
+ - [Latest documentation](https://p2gx.github.io/stratiphy/latest) (bleeding edge, latest commit on `development` branch)
@@ -0,0 +1,17 @@
1
+ [![Build status](https://github.com/P2GX/stratiphy/workflows/CI/badge.svg)](https://github.com/P2GX/stratiphy/actions/workflows/python_ci.yml)
2
+ [![GitHub release](https://img.shields.io/github/release/P2GX/stratiphy.svg)](https://github.com/P2GX/stratiphy/releases)
3
+ ![PyPi downloads](https://img.shields.io/pypi/dm/stratiphy.svg?label=Pypi%20downloads)
4
+ ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/stratiphy)
5
+
6
+ # Stratiphy
7
+
8
+ A Python package for phenotype-driven clustering of cohorts for discovery of novel disease subgroups.
9
+
10
+ See our documentation for the [setup](https://p2gx.github.io/stratiphy/stable/setup.html) instructions,
11
+ a [tutorial](https://p2gx.github.io/stratiphy/stable/tutorial.html) with an end-to-end genotype-phenotype association analysis,
12
+ and a comprehensive [user guide](https://p2gx.github.io/stratiphy/stable/user-guide/index.html) with everything else.
13
+
14
+ The documentation comes in two flavors:
15
+
16
+ - [Stable documentation](https://p2gx.github.io/stratiphy/stable/) (last release on `main` branch)
17
+ - [Latest documentation](https://p2gx.github.io/stratiphy/latest) (bleeding edge, latest commit on `development` branch)
@@ -0,0 +1,62 @@
1
+ [build-system]
2
+ requires = ["setuptools>=65.6.3"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "stratiphy"
7
+ description = "Phenotype-driven clustering of cohorts for discovery of novel disease subgroups"
8
+ authors = [
9
+ { name = "Daniel Danis", email = "daniel.gordon.danis@protonmail.com" },
10
+ { name = "Peter N Robinson", email = "peter.robinson@bih-charite.de" },
11
+ ]
12
+
13
+ version = "0.3.2"
14
+ readme = "README.md"
15
+ # As of May 2026, Python 3.10 is the oldest supported version
16
+ # Source: https://devguide.python.org/versions/
17
+ requires-python = ">=3.10"
18
+ keywords = ["clustering", "human phenotype ontology", "phenopackets"]
19
+ license = { file = "LICENSE" }
20
+ classifiers = [
21
+ "Development Status :: 3 - Alpha",
22
+ "Operating System :: OS Independent",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Programming Language :: Python :: 3.13",
27
+ "Programming Language :: Python :: 3.14",
28
+ ]
29
+
30
+ dependencies = [
31
+ "hpo-toolkit >= 0.8.1",
32
+ "numpy >= 1.10",
33
+ "pandas >= 2.0, < 3.0.0",
34
+ "phenopackets ~= 2.0.2",
35
+ "protobuf >=3.20.2, <8.0.0", # we should be OK even with more recent versions but they must be tested first.
36
+ "scikit-learn >= 1.0.0, <2.0.0",
37
+ "scipy >= 1.1.0, < 2.0", # scipy `1.1.0` is used by scikit-learn 1.0.0
38
+ "statsmodels >= 0.14.6, <1.0.0",
39
+ "tqdm >=4.0 , < 5.0",
40
+ ]
41
+
42
+ [project.optional-dependencies]
43
+ test = ["pytest>=7.0.0, <8.0.0", "ruff==0.12.2",]
44
+ docs = [
45
+ "mkdocs>=1.6.1",
46
+ "mkdocs-material >= 9.7.0",
47
+ "mkdocstrings[python] >= 0.30.1",
48
+ "mkdocs-api-autonav >= 0.4.0",
49
+ ]
50
+
51
+ [project.urls]
52
+ homepage = "https://github.com/P2GX/stratiphy"
53
+ repository = "https://github.com/P2GX/stratiphy.git"
54
+ documentation = "https://P2GX.github.io/stratiphy/stable"
55
+ bugtracker = "https://github.com/P2GX/stratiphy/issues"
56
+
57
+ [project.scripts]
58
+ stratiphy = "stratiphy._cli:main"
59
+ stratiphy-bench = "stratiphy.bench._cli:main"
60
+
61
+ [tool.setuptools]
62
+ package-dir = { "" = "src" }
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,14 @@
1
+ """
2
+ The API documentation for the `stratiphy` Python package.
3
+
4
+ The API documentation is targeted at advanced users who want to use
5
+ `stratiphy` as a Python library. For the general public, we recommend using
6
+ the command-line interface (CLI).
7
+
8
+ See the [Tutorial](../../tutorial.md) and [User Guide](../../user-guide/index.md)
9
+ for an overview of the CLI main use cases.
10
+ """
11
+
12
+ from importlib.metadata import version
13
+
14
+ __version__ = version("stratiphy")
@@ -0,0 +1,426 @@
1
+ import argparse
2
+ import json
3
+ import logging
4
+ import os
5
+ import pathlib
6
+ import sys
7
+ import typing
8
+
9
+ import hpotk
10
+ from hpotk.util import open_text_io_handle_for_reading, open_text_io_handle_for_writing
11
+
12
+ import stratiphy
13
+ from stratiphy.io import StratiphyJSONDecoder
14
+ from stratiphy.model import Sample
15
+ from stratiphy.util import setup_logging
16
+
17
# Program name; also used as the name of the CLI logger.
PROG = "stratiphy"

# Default file and directory names used by the subcommands.
DEFAULT_DATA_PATH = "data"
DEFAULT_HPO_PATH = "hp.json"
DEFAULT_SAMPLES_PATH = "samples.json.gz"
DEFAULT_RESULTS_PATH = "results.pb"
DEFAULT_RESULTS_JSON_PATH = "results.json"


# ################################## CLI ######################################

logger = logging.getLogger(PROG)

# Root parser; the subcommands register their own parsers on `subparsers`.
parser = argparse.ArgumentParser(
    prog=PROG,
    description="Phenotype-driven stratification of patient cohorts",
    epilog="Find more info at https://P2GX.github.io/stratiphy/stable",
    formatter_class=argparse.RawTextHelpFormatter,
)

# Each `-v` on the command line bumps the counter by one.
parser.add_argument("-v", "--verbosity", action="count", default=0, help="increase verbosity")

# `%(prog)s` is left for argparse to expand when the version is printed.
parser.add_argument(
    "--version",
    action="version",
    version=f"%(prog)s {stratiphy.__version__}",
)

# generate subparsers/subcommands
subparsers = parser.add_subparsers(dest="command")
53
+
54
+
55
+ # #################### ------------ `setup` ---------------- ####################
56
+
57
parser_setup = subparsers.add_parser(
    "setup",
    help="initialize stratiphy resources",
    description="Initialize stratiphy resources",
    epilog="Find more info at https://P2GX.github.io/stratiphy/stable",
)

# Nested subcommands of `setup`; the selected one is stored under `command_setup`.
subparsers_setup = parser_setup.add_subparsers(dest="command_setup")


# #################### ------------ `setup download` ------- ####################
parser_setup_download = subparsers_setup.add_parser("download", help="download the resource files")

# The default data directory is resolved against the working directory at import time.
parser_setup_download.add_argument(
    "-d",
    "--data",
    type=pathlib.Path,
    default=pathlib.Path(os.getcwd()) / DEFAULT_DATA_PATH,
    help=f"where to download the resources (default: {DEFAULT_DATA_PATH})",
)

# `store_true` already defaults to `False` when the flag is absent.
parser_setup_download.add_argument(
    "-w",
    "--overwrite",
    action="store_true",
    help="overwrite previously downloaded resource files",
)
86
+
87
+
88
def setup_download(
    data: pathlib.Path,
    overwrite: bool,
) -> int:
    """
    Download the resource files into the `data` directory.

    The directory is created when it does not exist yet. The HPO JSON file
    is fetched unless `should_execute` decides to keep the existing copy.

    Args:
        data: the directory for storing the resources.
        overwrite: `True` if a previously downloaded file should be replaced.

    Returns:
        `0` on success or `1` if `data` points to an existing file.
    """
    # Ensure the `data` directory exists
    if os.path.isfile(data):
        logger.error("`-d | --data` must point to a directory, but %s is a file", data)
        return 1
    if not os.path.exists(data):
        logger.debug("Creating directory at %s", data)
        os.makedirs(data, exist_ok=True)

    # Download HPO, if needed
    hpo_destination = data.joinpath(DEFAULT_HPO_PATH)
    if should_execute(hpo_destination, "HPO", "download", overwrite):
        download_resource(
            "https://purl.obolibrary.org/obo/hp.json",
            str(hpo_destination),
            "HPO",
        )

    return 0
118
+
119
+
120
def should_execute(
    fpath: pathlib.Path,
    resource_name: str,
    action_name: str,
    overwrite: bool,
) -> bool:
    """
    Decide whether the action that produces `fpath` should be carried out.

    The decision is logged at the INFO level.

    Args:
        fpath: path where the resource is expected.
        resource_name: resource name used in the log messages.
        action_name: action name used in the log messages.
        overwrite: `True` if an existing file may be replaced.

    Returns:
        `True` if `fpath` is not an existing file or if `overwrite` is set,
        and `False` otherwise.
    """
    if not os.path.isfile(fpath):
        logger.info("Proceeding with the %s of %s", action_name, resource_name)
        return True

    if overwrite:
        logger.info("Overwriting %s at %s", resource_name, fpath)
        return True

    logger.info(
        "Cowardly refusing to %s %s since it already exists at %s",
        action_name,
        resource_name,
        fpath,
    )
    return False
141
+
142
+
143
def download_resource(
    url: str,
    destination: str,
    resource_name: str,
):
    """
    Copy the text content available at `url` to `destination`.

    The entire content is read into memory before it is written out.

    Args:
        url: where to read the resource from.
        destination: where to write the resource to.
        resource_name: resource name used in the log messages.
    """
    logger.debug("Fetching %s from %s", resource_name, url)
    logger.debug("Storing %s to %s", resource_name, destination)

    with open_text_io_handle_for_reading(url) as source:
        with open_text_io_handle_for_writing(destination) as sink:
            sink.write(source.read())
156
+
157
+
158
+ # #################### ------------ `preprocess` ----------- ####################
159
+
160
parser_preprocess = subparsers.add_parser(
    "preprocess",
    help="prepare phenopackets for clustering",
)

parser_preprocess.add_argument(
    "-d",
    "--data",
    type=pathlib.Path,
    default=pathlib.Path(os.getcwd()).joinpath(DEFAULT_DATA_PATH),
    help="path to stratiphy data directory",
)
parser_preprocess.add_argument(
    "--controversy",
    type=str,
    default="small",
    choices=("high", "moderate", "small", "none"),
    help="try to sanitize issues with controversy less than this threshold",
)
# `outdir` is a required positional argument. argparse never falls back
# to a default for it, hence no `default` is declared.
parser_preprocess.add_argument(
    "outdir",
    type=pathlib.Path,
    help="folder for storing the preprocessed files",
)
parser_preprocess.add_argument(
    "phenopackets",
    nargs="+",
    type=pathlib.Path,
    help="phenopacket JSON files with case reports for clustering",
)
191
+
192
+
193
def preprocess(
    data: pathlib.Path,
    controversy: typing.Literal["high", "moderate", "small", "none"],
    outdir: pathlib.Path,
    phenopackets: typing.Sequence[pathlib.Path],
) -> int:
    """
    Read phenopackets, sanitize the samples, and store them in `outdir`.

    The sanitation actions are printed to the standard output, one block
    per sample, and the sanitized samples are serialized as JSON
    into the `DEFAULT_SAMPLES_PATH` file in `outdir`.

    Args:
        data: the data directory, expected to include the HPO file.
        controversy: name of the controversy threshold (case-insensitive).
        outdir: the output folder; created if it does not exist.
        phenopackets: paths to the phenopacket files.

    Returns:
        `0` on success or `1` if the HPO file is missing
        or if `controversy` is not a supported level.
    """
    from stratiphy.io import StratiphyJSONEncoder
    from stratiphy.preprocessing.phenopackets import read_phenopacket
    from stratiphy.preprocessing.sanitize import Controversy, sanitize_samples

    # Check inputs
    fpath_hpo = data.joinpath(DEFAULT_HPO_PATH)
    if not os.path.isfile(fpath_hpo):
        logger.error("HPO is not present at %s", fpath_hpo.absolute())
        return 1
    # Check the controversy threshold. An explicit check is used in place
    # of `assert`, because assertions are removed when running with `-O`.
    allowed_levels = ("high", "moderate", "small", "none")
    if controversy.lower() not in allowed_levels:
        logger.error(
            "`--controversy` must be one of %s but was %s",
            ", ".join(allowed_levels),
            controversy,
        )
        return 1
    # Try to create the output folder, including possibly non-existent parent folders
    os.makedirs(outdir, exist_ok=True)

    # Read phenopackets
    logger.info("Reading phenopackets")
    logger.debug(
        "Phenopacket paths: %s",
        [str(pp) for pp in phenopackets],
    )
    samples = tuple(read_phenopacket(pp) for pp in phenopackets)
    logger.info("Read %d phenopackets", len(samples))

    # Sanitize samples
    logger.info("Sanitizing samples")
    logger.debug("Loading HPO from %s", fpath_hpo.absolute())
    hpo = hpotk.load_minimal_ontology(str(fpath_hpo.absolute()))
    level = Controversy[controversy.upper()]
    logger.debug("Fixing sanity issues at or below %s level of controversy", level.name.lower())
    sanitation_result = sanitize_samples(
        samples=samples,
        hpo=hpo,
        threshold=level,
    )

    # Report the sanitation actions to the standard output
    for sample, actions in sanitation_result.get_samples_and_actions():
        print(f"Sample: {sample.labels}")
        for action in actions:
            print(f" - {action}")

    # Serialize the samples
    logger.info("Serializing the sanitized samples")
    fpath_cohort = os.path.abspath(os.path.join(outdir, DEFAULT_SAMPLES_PATH))
    with open_text_io_handle_for_writing(fpath_cohort) as fh:
        json.dump(sanitation_result.sanitized_samples, fh, cls=StratiphyJSONEncoder)
    logger.info("Wrote the samples to %s", fpath_cohort)

    return 0
249
+
250
+
251
+ # #################### ------------- `compute` ------------- ####################
252
+
253
parser_compute = subparsers.add_parser(
    "compute",
    help="execute the clustering workflow",
)

parser_compute.add_argument(
    "-d",
    "--data",
    type=pathlib.Path,
    default=pathlib.Path(os.getcwd()).joinpath(DEFAULT_DATA_PATH),
    help="path to stratiphy data directory",
)
parser_compute.add_argument(
    "--rand-iter",
    type=int,
    default=200,
    help="the number of random cohorts to simulate",
)
parser_compute.add_argument(
    "-k",
    "--k-clusters",
    nargs="+",
    type=int,
    default=(2, 3, 4, 5, 6),
    help="k clusters to test",
)
parser_compute.add_argument(
    "--mc-iter",
    type=int,
    default=1_000_000,
    help="count of Monte-Carlo simulations for testing term-cluster association",
)
# No argparse default for `--samples` and `--results`:
# `None` makes `compute` fall back to the default file name in `outdir`.
parser_compute.add_argument(
    "-s",
    "--samples",
    metavar=DEFAULT_SAMPLES_PATH,
    default=None,
    help="path to JSON file with preprocessed samples",
)
parser_compute.add_argument(
    "-r",
    "--results",
    metavar=DEFAULT_RESULTS_PATH,
    default=None,
    help="path to store the clustering result data",
)
# `outdir` is a required positional argument. argparse never falls back
# to a default for it, hence no `default` is declared.
parser_compute.add_argument(
    "outdir",
    type=pathlib.Path,
    help="folder with the preprocessed samples; also the default location of the results",
)
305
+
306
+
307
def compute(
    k_clusters: typing.Sequence[int],
    n_rand_cohort: int,
    mc_iter: int,
    fpath_samples: typing.Optional[pathlib.Path],
    fpath_results: typing.Optional[pathlib.Path],
    data: pathlib.Path,
    outdir: pathlib.Path,
) -> int:
    """
    Run the clustering workflow on the samples and store the results.

    Args:
        k_clusters: the cluster counts passed to the workflow.
        n_rand_cohort: the number of random cohorts for configuring the workflow.
        mc_iter: the Monte-Carlo iteration count for configuring the workflow.
        fpath_samples: path to the samples file or `None` to read
            the `DEFAULT_SAMPLES_PATH` file from `outdir`.
        fpath_results: path for storing the results or `None` to write
            the `DEFAULT_RESULTS_PATH` file into `outdir`.
        data: the data directory, expected to include the HPO file.
        outdir: the folder used to resolve the default paths.

    Returns:
        `0` on success or `1` if the HPO file is missing or not readable.
    """
    from stratiphy.config import configure_workflow

    # Check the HPO file first, to fail before any expensive work is done.
    # The error is reported in the same way as in `preprocess`.
    fpath_hpo = data.joinpath(DEFAULT_HPO_PATH)
    try:
        _validate_is_readable_file(fpath_hpo)
    except ValueError:
        logger.error("HPO is not present or not readable at %s", fpath_hpo.absolute())
        return 1

    # Make sure the results can be stored before running the workflow,
    # to prevent losing the results of a long computation.
    if fpath_results is None:
        fpath_results = outdir.joinpath(DEFAULT_RESULTS_PATH)
    fpath_results.parent.mkdir(parents=True, exist_ok=True)

    samples = _read_samples(fpath_samples, outdir)
    logger.info("Read %d samples", len(samples))

    logger.info("Configuring the clustering workflow")
    logger.debug("%d random cohorts", n_rand_cohort)
    logger.debug("Using HPO at %s", fpath_hpo.absolute())
    hpo = hpotk.load_minimal_ontology(str(fpath_hpo))
    workflow = configure_workflow(
        hpo=hpo,
        rand_cohorts=n_rand_cohort,
        mc_iter=mc_iter,
    )

    logger.info("Executing the workflow")
    result = workflow.run(
        samples=samples,
        k_clusters=k_clusters,
    )

    logger.debug("Serializing clustering results")
    result.to_protobuf(fpath_results)
    logger.info("Serialized the results to %s", fpath_results.absolute())

    return 0
348
+
349
+
350
+ # ###############################################################################
351
+ # Utils
352
def _make_optional_path(
    path: typing.Optional[str],
) -> typing.Optional[pathlib.Path]:
    """
    Wrap `path` into a `pathlib.Path`, passing `None` through unchanged.
    """
    if path is None:
        return None
    return pathlib.Path(path)
356
+
357
+
358
def _read_samples(
    fpath_samples: typing.Optional[pathlib.Path],
    outdir: pathlib.Path,
) -> typing.Sequence[Sample]:
    """
    Load the samples from a JSON file.

    Args:
        fpath_samples: path to the samples file or `None` to read
            the `DEFAULT_SAMPLES_PATH` file from `outdir`.
        outdir: the folder with the default samples file.

    Returns:
        the outcome of decoding the file with `StratiphyJSONDecoder`.
    """
    source = outdir.joinpath(DEFAULT_SAMPLES_PATH) if fpath_samples is None else fpath_samples
    logger.debug("Reading samples from %s", source.absolute())

    with open_text_io_handle_for_reading(source) as fh:
        samples = json.load(fh, cls=StratiphyJSONDecoder)
    return samples
371
+
372
+
373
def _validate_is_readable_file(fpath: typing.Union[str, pathlib.Path]):
    """
    Check that `fpath` points to a readable file.

    Args:
        fpath: the path to check.

    Raises:
        ValueError: if `fpath` is not a `str` or a `pathlib.Path`,
            or if it does not point to an existing file that can be read.
    """
    is_path_like = isinstance(fpath, (str, pathlib.Path))
    # Short-circuiting ensures the file system is only queried for `str` or `Path` inputs.
    if is_path_like and os.path.isfile(fpath) and os.access(fpath, os.R_OK):
        return
    raise ValueError(f"{fpath} is not a `str` or `pathlib.Path` pointing to a readable file")
376
+
377
+
378
+ # ###############################################################################
379
+
380
+
381
def main():
    """
    Parse the command-line arguments and run the selected subcommand.

    The process always ends via `sys.exit`. The exit code is the return value
    of the subcommand, or `1` after printing the help when no arguments
    or no known subcommand were provided.
    """
    argv = sys.argv[1:]
    if not argv:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(argv)
    setup_logging(logger, args.verbosity)

    command = args.command
    if command == "setup" and args.command_setup == "download":
        exit_code = setup_download(
            data=args.data,
            overwrite=args.overwrite,
        )
    elif command == "setup":
        # `setup` without a known nested subcommand
        parser_setup.print_help()
        exit_code = 1
    elif command == "preprocess":
        exit_code = preprocess(
            data=args.data,
            controversy=args.controversy,
            outdir=args.outdir,
            phenopackets=args.phenopackets,
        )
    elif command == "compute":
        exit_code = compute(
            k_clusters=args.k_clusters,
            n_rand_cohort=args.rand_iter,
            mc_iter=args.mc_iter,
            fpath_samples=_make_optional_path(args.samples),
            fpath_results=_make_optional_path(args.results),
            data=args.data,
            outdir=args.outdir,
        )
    else:
        parser.print_help()
        exit_code = 1

    sys.exit(exit_code)
File without changes
@@ -0,0 +1,15 @@
1
+ from ._explain import (
2
+ FisherExplainMethod,
3
+ TermAssociation,
4
+ TermCounter,
5
+ TermFilter,
6
+ TermTest,
7
+ )
8
+
9
+ __all__ = [
10
+ "FisherExplainMethod",
11
+ "TermAssociation",
12
+ "TermCounter",
13
+ "TermFilter",
14
+ "TermTest",
15
+ ]