stratiphy 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stratiphy-0.3.2/LICENSE +29 -0
- stratiphy-0.3.2/PKG-INFO +86 -0
- stratiphy-0.3.2/README.md +17 -0
- stratiphy-0.3.2/pyproject.toml +62 -0
- stratiphy-0.3.2/setup.cfg +4 -0
- stratiphy-0.3.2/src/stratiphy/__init__.py +14 -0
- stratiphy-0.3.2/src/stratiphy/_cli.py +426 -0
- stratiphy-0.3.2/src/stratiphy/analysis/__init__.py +0 -0
- stratiphy-0.3.2/src/stratiphy/analysis/explain/__init__.py +15 -0
- stratiphy-0.3.2/src/stratiphy/analysis/explain/_explain.py +421 -0
- stratiphy-0.3.2/src/stratiphy/analysis/gap.py +124 -0
- stratiphy-0.3.2/src/stratiphy/analysis/metrics.py +60 -0
- stratiphy-0.3.2/src/stratiphy/analysis/simulate/__init__.py +11 -0
- stratiphy-0.3.2/src/stratiphy/analysis/simulate/_impl.py +318 -0
- stratiphy-0.3.2/src/stratiphy/analysis/split.py +164 -0
- stratiphy-0.3.2/src/stratiphy/bench/__init__.py +5 -0
- stratiphy-0.3.2/src/stratiphy/bench/_bencher.py +125 -0
- stratiphy-0.3.2/src/stratiphy/bench/_cli.py +390 -0
- stratiphy-0.3.2/src/stratiphy/bench/_data.py +170 -0
- stratiphy-0.3.2/src/stratiphy/bench/_io.py +49 -0
- stratiphy-0.3.2/src/stratiphy/bench/_model.py +69 -0
- stratiphy-0.3.2/src/stratiphy/cluster/__init__.py +18 -0
- stratiphy-0.3.2/src/stratiphy/cluster/_base.py +41 -0
- stratiphy-0.3.2/src/stratiphy/cluster/_sim.py +91 -0
- stratiphy-0.3.2/src/stratiphy/cluster/_sklearn_sim.py +33 -0
- stratiphy-0.3.2/src/stratiphy/config/__init__.py +5 -0
- stratiphy-0.3.2/src/stratiphy/config/_workflow.py +43 -0
- stratiphy-0.3.2/src/stratiphy/io.py +164 -0
- stratiphy-0.3.2/src/stratiphy/model/__init__.py +9 -0
- stratiphy-0.3.2/src/stratiphy/model/_base.py +357 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/__init__.py +3 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/annoqc.py +76 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/phenopackets.py +190 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/sanitize/__init__.py +19 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/sanitize/_api.py +36 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/sanitize/_convenience.py +147 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/sanitize/_impl.py +620 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/sanitize/_model.py +271 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/sanitize/_test__model.py +148 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/summarize/__init__.py +6 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/summarize/_summarize.py +54 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/validate/__init__.py +7 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/validate/_base.py +149 -0
- stratiphy-0.3.2/src/stratiphy/preprocessing/validate/_simple.py +9 -0
- stratiphy-0.3.2/src/stratiphy/py.typed +0 -0
- stratiphy-0.3.2/src/stratiphy/semsim/__init__.py +26 -0
- stratiphy-0.3.2/src/stratiphy/semsim/_base.py +223 -0
- stratiphy-0.3.2/src/stratiphy/semsim/_ic.py +58 -0
- stratiphy-0.3.2/src/stratiphy/semsim/_pe.py +139 -0
- stratiphy-0.3.2/src/stratiphy/semsim/_sts.py +121 -0
- stratiphy-0.3.2/src/stratiphy/semsim/_test__base.py +52 -0
- stratiphy-0.3.2/src/stratiphy/semsim/_test__sts.py +74 -0
- stratiphy-0.3.2/src/stratiphy/util.py +85 -0
- stratiphy-0.3.2/src/stratiphy/workflow/__init__.py +13 -0
- stratiphy-0.3.2/src/stratiphy/workflow/_base.py +705 -0
- stratiphy-0.3.2/src/stratiphy/workflow/util.py +201 -0
- stratiphy-0.3.2/src/stratiphy/workflow/workflow_pb2.py +58 -0
- stratiphy-0.3.2/src/stratiphy/workflow/workflow_pb2.pyi +95 -0
- stratiphy-0.3.2/src/stratiphy.egg-info/PKG-INFO +86 -0
- stratiphy-0.3.2/src/stratiphy.egg-info/SOURCES.txt +64 -0
- stratiphy-0.3.2/src/stratiphy.egg-info/dependency_links.txt +1 -0
- stratiphy-0.3.2/src/stratiphy.egg-info/entry_points.txt +3 -0
- stratiphy-0.3.2/src/stratiphy.egg-info/requires.txt +19 -0
- stratiphy-0.3.2/src/stratiphy.egg-info/top_level.txt +1 -0
- stratiphy-0.3.2/tests/test_model.py +38 -0
- stratiphy-0.3.2/tests/test_prepare_test_data.py +47 -0
stratiphy-0.3.2/LICENSE
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Daniel Danis
|
|
4
|
+
All rights reserved.
|
|
5
|
+
|
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
|
8
|
+
|
|
9
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
10
|
+
list of conditions and the following disclaimer.
|
|
11
|
+
|
|
12
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
|
14
|
+
and/or other materials provided with the distribution.
|
|
15
|
+
|
|
16
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
17
|
+
contributors may be used to endorse or promote products derived from
|
|
18
|
+
this software without specific prior written permission.
|
|
19
|
+
|
|
20
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
21
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
22
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
23
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
24
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
25
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
26
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
27
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
28
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
29
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
stratiphy-0.3.2/PKG-INFO
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: stratiphy
|
|
3
|
+
Version: 0.3.2
|
|
4
|
+
Summary: Phenotype-driven clustering of cohorts for discovery of novel disease subgroups
|
|
5
|
+
Author-email: Daniel Danis <daniel.gordon.danis@protonmail.com>, Peter N Robinson <peter.robinson@bih-charite.de>
|
|
6
|
+
License: BSD 3-Clause License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Daniel Danis
|
|
9
|
+
All rights reserved.
|
|
10
|
+
|
|
11
|
+
Redistribution and use in source and binary forms, with or without
|
|
12
|
+
modification, are permitted provided that the following conditions are met:
|
|
13
|
+
|
|
14
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
15
|
+
list of conditions and the following disclaimer.
|
|
16
|
+
|
|
17
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
18
|
+
this list of conditions and the following disclaimer in the documentation
|
|
19
|
+
and/or other materials provided with the distribution.
|
|
20
|
+
|
|
21
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
22
|
+
contributors may be used to endorse or promote products derived from
|
|
23
|
+
this software without specific prior written permission.
|
|
24
|
+
|
|
25
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
26
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
27
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
28
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
29
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
30
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
31
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
32
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
33
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
34
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
35
|
+
|
|
36
|
+
Project-URL: homepage, https://github.com/P2GX/stratiphy
|
|
37
|
+
Project-URL: repository, https://github.com/P2GX/stratiphy.git
|
|
38
|
+
Project-URL: documentation, https://P2GX.github.io/stratiphy/stable
|
|
39
|
+
Project-URL: bugtracker, https://github.com/P2GX/stratiphy/issues
|
|
40
|
+
Keywords: clustering,human phenotype ontology,phenopackets
|
|
41
|
+
Classifier: Development Status :: 3 - Alpha
|
|
42
|
+
Classifier: Operating System :: OS Independent
|
|
43
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
44
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
45
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
46
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
47
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
48
|
+
Requires-Python: >=3.10
|
|
49
|
+
Description-Content-Type: text/markdown
|
|
50
|
+
License-File: LICENSE
|
|
51
|
+
Requires-Dist: hpo-toolkit>=0.8.1
|
|
52
|
+
Requires-Dist: numpy>=1.10
|
|
53
|
+
Requires-Dist: pandas<3.0.0,>=2.0
|
|
54
|
+
Requires-Dist: phenopackets~=2.0.2
|
|
55
|
+
Requires-Dist: protobuf<8.0.0,>=3.20.2
|
|
56
|
+
Requires-Dist: scikit-learn<2.0.0,>=1.0.0
|
|
57
|
+
Requires-Dist: scipy<2.0,>=1.1.0
|
|
58
|
+
Requires-Dist: statsmodels<1.0.0,>=0.14.6
|
|
59
|
+
Requires-Dist: tqdm<5.0,>=4.0
|
|
60
|
+
Provides-Extra: test
|
|
61
|
+
Requires-Dist: pytest<8.0.0,>=7.0.0; extra == "test"
|
|
62
|
+
Requires-Dist: ruff==0.12.2; extra == "test"
|
|
63
|
+
Provides-Extra: docs
|
|
64
|
+
Requires-Dist: mkdocs>=1.6.1; extra == "docs"
|
|
65
|
+
Requires-Dist: mkdocs-material>=9.7.0; extra == "docs"
|
|
66
|
+
Requires-Dist: mkdocstrings[python]>=0.30.1; extra == "docs"
|
|
67
|
+
Requires-Dist: mkdocs-api-autonav>=0.4.0; extra == "docs"
|
|
68
|
+
Dynamic: license-file
|
|
69
|
+
|
|
70
|
+
[](https://github.com/P2GX/stratiphy/actions/workflows/python_ci.yml)
|
|
71
|
+
[](https://github.com/P2GX/stratiphy/releases)
|
|
72
|
+

|
|
73
|
+

|
|
74
|
+
|
|
75
|
+
# Stratiphy
|
|
76
|
+
|
|
77
|
+
A Python package for phenotype-driven clustering of cohorts for discovery of novel disease subgroups.
|
|
78
|
+
|
|
79
|
+
See our documentation for the [setup](https://p2gx.github.io/stratiphy/stable/setup.html) instructions,
|
|
80
|
+
a [tutorial](https://p2gx.github.io/stratiphy/stable/tutorial.html) with an end-to-end genotype-phenotype association analysis,
|
|
81
|
+
and a comprehensive [user guide](https://p2gx.github.io/stratiphy/stable/user-guide/index.html) with everything else.
|
|
82
|
+
|
|
83
|
+
The documentation comes in two flavors:
|
|
84
|
+
|
|
85
|
+
- [Stable documentation](https://p2gx.github.io/stratiphy/stable/) (last release on `main` branch)
|
|
86
|
+
- [Latest documentation](https://p2gx.github.io/stratiphy/latest) (bleeding edge, latest commit on `development` branch)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
[](https://github.com/P2GX/stratiphy/actions/workflows/python_ci.yml)
|
|
2
|
+
[](https://github.com/P2GX/stratiphy/releases)
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+
|
|
6
|
+
# Stratiphy
|
|
7
|
+
|
|
8
|
+
A Python package for phenotype-driven clustering of cohorts for discovery of novel disease subgroups.
|
|
9
|
+
|
|
10
|
+
See our documentation for the [setup](https://p2gx.github.io/stratiphy/stable/setup.html) instructions,
|
|
11
|
+
a [tutorial](https://p2gx.github.io/stratiphy/stable/tutorial.html) with an end-to-end genotype-phenotype association analysis,
|
|
12
|
+
and a comprehensive [user guide](https://p2gx.github.io/stratiphy/stable/user-guide/index.html) with everything else.
|
|
13
|
+
|
|
14
|
+
The documentation comes in two flavors:
|
|
15
|
+
|
|
16
|
+
- [Stable documentation](https://p2gx.github.io/stratiphy/stable/) (last release on `main` branch)
|
|
17
|
+
- [Latest documentation](https://p2gx.github.io/stratiphy/latest) (bleeding edge, latest commit on `development` branch)
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=65.6.3"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "stratiphy"
|
|
7
|
+
description = "Phenotype-driven clustering of cohorts for discovery of novel disease subgroups"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "Daniel Danis", email = "daniel.gordon.danis@protonmail.com" },
|
|
10
|
+
{ name = "Peter N Robinson", email = "peter.robinson@bih-charite.de" },
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
version = "0.3.2"
|
|
14
|
+
readme = "README.md"
|
|
15
|
+
# As of May 2026, Python 3.10 is the oldest supported version
|
|
16
|
+
# Source: https://devguide.python.org/versions/
|
|
17
|
+
requires-python = ">=3.10"
|
|
18
|
+
keywords = ["clustering", "human phenotype ontology", "phenopackets"]
|
|
19
|
+
license = { file = "LICENSE" }
|
|
20
|
+
classifiers = [
|
|
21
|
+
"Development Status :: 3 - Alpha",
|
|
22
|
+
"Operating System :: OS Independent",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Programming Language :: Python :: 3.13",
|
|
27
|
+
"Programming Language :: Python :: 3.14",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
dependencies = [
|
|
31
|
+
"hpo-toolkit >= 0.8.1",
|
|
32
|
+
"numpy >= 1.10",
|
|
33
|
+
"pandas >= 2.0, < 3.0.0",
|
|
34
|
+
"phenopackets ~= 2.0.2",
|
|
35
|
+
"protobuf >=3.20.2, <8.0.0", # we should be OK even with more recent versions but they must be tested first.
|
|
36
|
+
"scikit-learn >= 1.0.0, <2.0.0",
|
|
37
|
+
"scipy >= 1.1.0, < 2.0", # scipy `1.1.0` is used by scikit-learn 1.0.0
|
|
38
|
+
"statsmodels >= 0.14.6, <1.0.0",
|
|
39
|
+
"tqdm >=4.0 , < 5.0",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.optional-dependencies]
|
|
43
|
+
test = ["pytest>=7.0.0, <8.0.0", "ruff==0.12.2",]
|
|
44
|
+
docs = [
|
|
45
|
+
"mkdocs>=1.6.1",
|
|
46
|
+
"mkdocs-material >= 9.7.0",
|
|
47
|
+
"mkdocstrings[python] >= 0.30.1",
|
|
48
|
+
"mkdocs-api-autonav >= 0.4.0",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[project.urls]
|
|
52
|
+
homepage = "https://github.com/P2GX/stratiphy"
|
|
53
|
+
repository = "https://github.com/P2GX/stratiphy.git"
|
|
54
|
+
documentation = "https://P2GX.github.io/stratiphy/stable"
|
|
55
|
+
bugtracker = "https://github.com/P2GX/stratiphy/issues"
|
|
56
|
+
|
|
57
|
+
[project.scripts]
|
|
58
|
+
stratiphy = "stratiphy._cli:main"
|
|
59
|
+
stratiphy-bench = "stratiphy.bench._cli:main"
|
|
60
|
+
|
|
61
|
+
[tool.setuptools]
|
|
62
|
+
package-dir = { "" = "src" }
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""
|
|
2
|
+
The API documentation for the `stratiphy` Python package.
|
|
3
|
+
|
|
4
|
+
The API documentation is targeted at advanced users who want to use
|
|
5
|
+
`stratiphy` as a Python library. For the general public, we recommend using
|
|
6
|
+
the command-line interface (CLI).
|
|
7
|
+
|
|
8
|
+
See the [Tutorial](../../tutorial.md) and [User Guide](../../user-guide/index.md)
|
|
9
|
+
for an overview of the CLI main use cases.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from importlib.metadata import version
|
|
13
|
+
|
|
14
|
+
__version__ = version("stratiphy")
|
|
@@ -0,0 +1,426 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import pathlib
|
|
6
|
+
import sys
|
|
7
|
+
import typing
|
|
8
|
+
|
|
9
|
+
import hpotk
|
|
10
|
+
from hpotk.util import open_text_io_handle_for_reading, open_text_io_handle_for_writing
|
|
11
|
+
|
|
12
|
+
import stratiphy
|
|
13
|
+
from stratiphy.io import StratiphyJSONDecoder
|
|
14
|
+
from stratiphy.model import Sample
|
|
15
|
+
from stratiphy.util import setup_logging
|
|
16
|
+
|
|
17
|
+
# Program name; used as the argument parser's `prog` and as the logger name.
PROG = "stratiphy"
# Default folder and file names used to build the default paths of the CLI.
DEFAULT_DATA_PATH = "data"
DEFAULT_HPO_PATH = "hp.json"
DEFAULT_SAMPLES_PATH = "samples.json.gz"
DEFAULT_RESULTS_PATH = "results.pb"
# NOTE(review): not referenced in the visible part of this module - confirm it is still needed.
DEFAULT_RESULTS_JSON_PATH = "results.json"


# ################################## CLI ######################################

logger = logging.getLogger(PROG)


# Root parser; the sub-commands are registered on `subparsers` below.
parser = argparse.ArgumentParser(
    prog=PROG,
    formatter_class=argparse.RawTextHelpFormatter,
    description="Phenotype-driven stratification of patient cohorts",
    epilog="Find more info at https://P2GX.github.io/stratiphy/stable",
)

# `-v` may be repeated; the resulting count is handed to `setup_logging` in `main`.
parser.add_argument(
    "-v",
    "--verbosity",
    action="count",
    default=0,
    help="increase verbosity",
)

parser.add_argument(
    "--version",
    action="version",
    version="%(prog)s {version}".format(version=stratiphy.__version__),
)

# generate subparsers/subcommands
# The selected sub-command name is stored in `args.command`.
subparsers = parser.add_subparsers(dest="command")


# #################### ------------ `setup` ---------------- ####################

parser_setup = subparsers.add_parser(
    "setup",
    description="Initialize stratiphy resources",
    help="initialize stratiphy resources",
    epilog="Find more info at https://P2GX.github.io/stratiphy/stable",
)

# The selected `setup` sub-command name is stored in `args.command_setup`.
subparsers_setup = parser_setup.add_subparsers(dest="command_setup")


# #################### ------------ `setup download` ------- ####################
parser_setup_download = subparsers_setup.add_parser(
    "download",
    help="download the resource files",
)
parser_setup_download.add_argument(
    "-d",
    "--data",
    type=pathlib.Path,
    # The default is resolved against the working directory when the module is imported.
    default=pathlib.Path(os.getcwd()).joinpath(DEFAULT_DATA_PATH),
    help=f"where to download the resources (default: {DEFAULT_DATA_PATH})",
)
parser_setup_download.add_argument(
    "-w",
    "--overwrite",
    default=False,
    action="store_true",
    help="overwrite previously downloaded resource files",
)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def setup_download(
    data: pathlib.Path,
    overwrite: bool,
) -> int:
    """Download the resource files into the `data` folder.

    The folder is created if it does not exist yet. HPO is fetched
    unless it is already present and `overwrite` is not set.

    Args:
        data: folder where the resources are stored.
        overwrite: `True` if a previously downloaded file should be replaced.

    Returns:
        `0` on success, `1` if `data` points to a file.
    """
    if os.path.exists(data):
        if os.path.isfile(data):
            logger.error("`-d | --data` must point to a directory, but %s is a file", data)
            return 1
    else:
        logger.debug("Creating directory at %s", data)
        os.makedirs(data, exist_ok=True)

    # HPO is the only resource handled here.
    hpo_destination = data.joinpath(DEFAULT_HPO_PATH)
    if should_execute(hpo_destination, "HPO", "download", overwrite):
        download_resource(
            "https://purl.obolibrary.org/obo/hp.json",
            str(hpo_destination),
            "HPO",
        )

    return 0
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def should_execute(
    fpath: pathlib.Path,
    resource_name: str,
    action_name: str,
    overwrite: bool,
) -> bool:
    """Decide if an action producing `fpath` should be carried out.

    Args:
        fpath: path of the file that the action would create.
        resource_name: resource name used in the log messages.
        action_name: action name used in the log messages.
        overwrite: `True` if an existing file may be replaced.

    Returns:
        `True` if there is no file at `fpath` or if `overwrite` is set,
        `False` otherwise.
    """
    # Nothing is there yet, so there is nothing to protect.
    if not os.path.isfile(fpath):
        logger.info("Proceeding with the %s of %s", action_name, resource_name)
        return True

    if not overwrite:
        logger.info(
            "Cowardly refusing to %s %s since it already exists at %s",
            action_name,
            resource_name,
            fpath,
        )
        return False

    logger.info("Overwriting %s at %s", resource_name, fpath)
    return True
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def download_resource(
    url: str,
    destination: str,
    resource_name: str,
) -> None:
    """Fetch the text resource at `url` and store it at `destination`.

    If the transfer fails after `destination` was opened for writing,
    the incomplete file is removed. Otherwise a truncated file would stay
    behind and `should_execute` would refuse to download it again
    unless the user asks for overwriting.

    Args:
        url: where to read the resource from.
        destination: path of the file to write.
        resource_name: resource name used in the log messages.

    Raises:
        Whatever the underlying I/O raises; the error is re-raised
        after the cleanup.
    """
    logger.debug("Fetching %s from %s", resource_name, url)
    logger.debug("Storing %s to %s", resource_name, destination)

    # Open the source first: if this fails, `destination` is left untouched.
    with open_text_io_handle_for_reading(url) as fhin:
        try:
            with open_text_io_handle_for_writing(destination) as fhout:
                fhout.write(fhin.read())
        except BaseException:
            # Opening for writing has already clobbered any previous content,
            # hence dropping the partial file loses nothing.
            try:
                os.remove(destination)
            except OSError:
                # Best effort only; the original error is the one worth reporting.
                pass
            raise
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# #################### ------------ `preprocess` ----------- ####################
|
|
159
|
+
|
|
160
|
+
# Parser of the `preprocess` sub-command; the parsed values are passed to `preprocess` in `main`.
parser_preprocess = subparsers.add_parser(
    "preprocess",
    help="prepare phenopackets for clustering",
)

parser_preprocess.add_argument(
    "-d",
    "--data",
    type=pathlib.Path,
    default=pathlib.Path(os.getcwd()).joinpath(DEFAULT_DATA_PATH),
    help="path to stratiphy data directory",
)
parser_preprocess.add_argument(
    "--controversy",
    type=str,
    default="small",
    choices=("high", "moderate", "small", "none"),
    help="try to sanitize issues with controversy less than this threshold",
)
# NOTE(review): argparse consults `default` of a positional only with `nargs="?"` or `nargs="*"`,
# so `outdir` is effectively required and the default below is inert - confirm the intent.
parser_preprocess.add_argument(
    "outdir",
    type=pathlib.Path,
    default=pathlib.Path(os.getcwd()),
    help="folder for storing the preprocessed files",
)
parser_preprocess.add_argument(
    "phenopackets",
    nargs="+",
    type=pathlib.Path,
    help="phenopacket JSON files with case reports for clustering",
)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def preprocess(
    data: pathlib.Path,
    controversy: typing.Literal["high", "moderate", "small", "none"],
    outdir: pathlib.Path,
    phenopackets: typing.Sequence[pathlib.Path],
) -> int:
    """Read phenopackets, sanitize the samples, and store them in `outdir`.

    The sanitation actions are printed to the standard output and the
    sanitized samples are written as JSON to `DEFAULT_SAMPLES_PATH` in `outdir`.

    Args:
        data: folder with the resource files; HPO is expected at `DEFAULT_HPO_PATH`.
        controversy: name of the controversy level passed as the sanitation threshold.
        outdir: folder for the output; created if missing.
        phenopackets: paths to the phenopacket files to read.

    Returns:
        `0` on success, `1` if HPO is missing or `controversy` is not a known level.
    """
    from stratiphy.io import StratiphyJSONEncoder
    from stratiphy.preprocessing.phenopackets import read_phenopacket
    from stratiphy.preprocessing.sanitize import Controversy, sanitize_samples

    # Check inputs
    fpath_hpo = data.joinpath(DEFAULT_HPO_PATH)
    if not os.path.isfile(fpath_hpo):
        logger.error("HPO is not present at %s", fpath_hpo.absolute())
        return 1
    # Try to create the output folder, including possibly non-existent parent folders
    os.makedirs(outdir, exist_ok=True)
    # Check the controversy threshold. An explicit check is used instead of `assert`
    # because assertions are removed when Python runs with `-O`.
    if controversy.lower() not in ("high", "moderate", "small", "none"):
        logger.error(
            "Unknown controversy level %s, expected one of high, moderate, small, none",
            controversy,
        )
        return 1

    # Read phenopackets
    logger.info("Reading phenopackets")
    logger.debug(
        "Phenopacket paths: %s",
        [str(pp) for pp in phenopackets],
    )
    samples = tuple(read_phenopacket(pp) for pp in phenopackets)
    logger.info("Read %d phenopackets", len(samples))

    # Sanitize samples
    logger.info("Sanitizing samples")
    logger.debug("Loading HPO from %s", fpath_hpo.absolute())
    hpo = hpotk.load_minimal_ontology(str(fpath_hpo.absolute()))
    level = Controversy[controversy.upper()]
    logger.debug("Fixing sanity issues at or below %s level of controversy", level.name.lower())
    sanitation_result = sanitize_samples(
        samples=samples,
        hpo=hpo,
        threshold=level,
    )

    # Report what was done to each sample.
    for sample, actions in sanitation_result.get_samples_and_actions():
        print(f"Sample: {sample.labels}")
        for action in actions:
            print(f" - {action}")

    # Serialize the samples
    logger.info("Serializing the sanitized samples")
    fpath_cohort = os.path.abspath(os.path.join(outdir, DEFAULT_SAMPLES_PATH))
    with open_text_io_handle_for_writing(fpath_cohort) as fh:
        json.dump(sanitation_result.sanitized_samples, fh, cls=StratiphyJSONEncoder)
    logger.info("Wrote the samples to %s", fpath_cohort)

    return 0
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
# #################### ------------- `compute` ------------- ####################
|
|
252
|
+
|
|
253
|
+
# Parser of the `compute` sub-command; the parsed values are passed to `compute` in `main`.
parser_compute = subparsers.add_parser(
    "compute",
    help="execute the clustering workflow",
)

parser_compute.add_argument(
    "-d",
    "--data",
    type=pathlib.Path,
    default=pathlib.Path(os.getcwd()).joinpath(DEFAULT_DATA_PATH),
    help="path to stratiphy data directory",
)
parser_compute.add_argument(
    "--rand-iter",
    type=int,
    default=200,
    help="the number of random cohorts to simulate",
)
parser_compute.add_argument(
    "-k",
    "--k-clusters",
    nargs="+",
    type=int,
    default=(2, 3, 4, 5, 6),
    help="k clusters to test",
)
parser_compute.add_argument(
    "--mc-iter",
    type=int,
    default=1_000_000,
    help="count of Monte-Carlo simulations for testing term-cluster association",
)
# `--samples` and `--results` stay plain strings here; `main` turns them into optional paths.
parser_compute.add_argument(
    "-s",
    "--samples",
    metavar=DEFAULT_SAMPLES_PATH,
    default=None,
    help="path to JSON file with preprocessed samples",
)
parser_compute.add_argument(
    "-r",
    "--results",
    metavar=DEFAULT_RESULTS_PATH,
    default=None,
    help="path to store the clustering result data",
)
# NOTE(review): argparse consults `default` of a positional only with `nargs="?"` or `nargs="*"`,
# so `outdir` is effectively required and the default below is inert - confirm the intent.
parser_compute.add_argument(
    "outdir",
    type=pathlib.Path,
    default=pathlib.Path(os.getcwd()),
    help="folder with the preprocessed samples; the results are stored here by default",
)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def compute(
    k_clusters: typing.Sequence[int],
    n_rand_cohort: int,
    mc_iter: int,
    fpath_samples: typing.Optional[pathlib.Path],
    fpath_results: typing.Optional[pathlib.Path],
    data: pathlib.Path,
    outdir: pathlib.Path,
) -> int:
    """Run the clustering workflow on the samples and serialize the results.

    Args:
        k_clusters: the cluster counts passed to the workflow.
        n_rand_cohort: the number of random cohorts used to configure the workflow.
        mc_iter: the number of Monte-Carlo iterations used to configure the workflow.
        fpath_samples: path to the samples; `DEFAULT_SAMPLES_PATH` in `outdir` if `None`.
        fpath_results: where to store the results; `DEFAULT_RESULTS_PATH` in `outdir` if `None`.
        data: folder with the resource files; HPO is expected at `DEFAULT_HPO_PATH`.
        outdir: folder used for the default sample and result paths.

    Returns:
        `0` on success, `1` if HPO is not a readable file.
    """
    from stratiphy.config import configure_workflow

    samples = _read_samples(fpath_samples, outdir)
    logger.info("Read %d samples", len(samples))

    logger.info("Configuring the clustering workflow")
    logger.debug("%d random cohorts", n_rand_cohort)
    fpath_hpo = data.joinpath(DEFAULT_HPO_PATH)
    logger.debug("Using HPO at %s", fpath_hpo.absolute())

    # Sanitize inputs. Report the missing HPO via log and exit code, as `preprocess` does.
    try:
        _validate_is_readable_file(fpath_hpo)
    except ValueError:
        logger.error("HPO is not present or not readable at %s", fpath_hpo.absolute())
        return 1

    # Make sure the results can be stored *before* running the workflow,
    # so that a missing folder does not waste the whole computation.
    if fpath_results is None:
        fpath_results = outdir.joinpath(DEFAULT_RESULTS_PATH)
    os.makedirs(fpath_results.parent, exist_ok=True)

    hpo = hpotk.load_minimal_ontology(str(fpath_hpo))
    workflow = configure_workflow(
        hpo=hpo,
        rand_cohorts=n_rand_cohort,
        mc_iter=mc_iter,
    )

    logger.info("Executing the workflow")
    result = workflow.run(
        samples=samples,
        k_clusters=k_clusters,
    )

    logger.debug("Serializing clustering results")
    result.to_protobuf(fpath_results)
    logger.info("Serialized the results to %s", fpath_results.absolute())

    return 0
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
# ###############################################################################
|
|
351
|
+
# Utils
|
|
352
|
+
def _make_optional_path(
    path: typing.Optional[str],
) -> typing.Optional[pathlib.Path]:
    """Turn `path` into a `pathlib.Path`, passing `None` through unchanged."""
    if path is None:
        return None
    return pathlib.Path(path)
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def _read_samples(
    fpath_samples: typing.Optional[pathlib.Path],
    outdir: pathlib.Path,
) -> typing.Sequence[Sample]:
    """Load the samples from a JSON file.

    Args:
        fpath_samples: path to the samples file; `DEFAULT_SAMPLES_PATH` in `outdir`
            is used if `None`.
        outdir: folder with the default samples file.

    Returns:
        The value decoded from the file with `StratiphyJSONDecoder`.
    """
    source = outdir.joinpath(DEFAULT_SAMPLES_PATH) if fpath_samples is None else fpath_samples
    logger.debug("Reading samples from %s", source.absolute())

    with open_text_io_handle_for_reading(source) as handle:
        decoded = json.load(handle, cls=StratiphyJSONDecoder)
    return decoded
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def _validate_is_readable_file(fpath: typing.Union[str, pathlib.Path]):
    """Check that `fpath` is a `str` or a `pathlib.Path` of a readable file.

    Raises:
        ValueError: if `fpath` has another type, or if it does not point
            to an existing file that can be read.
    """
    # The file system is consulted only for values of a supported type.
    is_supported_type = isinstance(fpath, (str, pathlib.Path))
    if is_supported_type and os.path.isfile(fpath) and os.access(fpath, os.R_OK):
        return
    raise ValueError(f"{fpath} is not a `str` or `pathlib.Path` pointing to a readable file")
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
# ###############################################################################
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def main():
    """Entry point of the CLI.

    Parses the command-line arguments, sets up logging, runs the chosen
    command, and exits with the command's return code. The help is printed
    and the exit code is `1` if no (known) command is given.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(cli_args)
    setup_logging(logger, args.verbosity)

    command = args.command
    if command == "setup":
        # `download` is the only `setup` sub-command.
        if args.command_setup != "download":
            parser_setup.print_help()
            sys.exit(1)
        exit_code = setup_download(
            data=args.data,
            overwrite=args.overwrite,
        )
    elif command == "preprocess":
        exit_code = preprocess(
            data=args.data,
            controversy=args.controversy,
            outdir=args.outdir,
            phenopackets=args.phenopackets,
        )
    elif command == "compute":
        exit_code = compute(
            k_clusters=args.k_clusters,
            n_rand_cohort=args.rand_iter,
            mc_iter=args.mc_iter,
            fpath_samples=_make_optional_path(args.samples),
            fpath_results=_make_optional_path(args.results),
            data=args.data,
            outdir=args.outdir,
        )
    else:
        parser.print_help()
        sys.exit(1)

    sys.exit(exit_code)
|
|
File without changes
|