stratiphy 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. stratiphy/__init__.py +14 -0
  2. stratiphy/_cli.py +426 -0
  3. stratiphy/analysis/__init__.py +0 -0
  4. stratiphy/analysis/explain/__init__.py +15 -0
  5. stratiphy/analysis/explain/_explain.py +421 -0
  6. stratiphy/analysis/gap.py +124 -0
  7. stratiphy/analysis/metrics.py +60 -0
  8. stratiphy/analysis/simulate/__init__.py +11 -0
  9. stratiphy/analysis/simulate/_impl.py +318 -0
  10. stratiphy/analysis/split.py +164 -0
  11. stratiphy/bench/__init__.py +5 -0
  12. stratiphy/bench/_bencher.py +125 -0
  13. stratiphy/bench/_cli.py +390 -0
  14. stratiphy/bench/_data.py +170 -0
  15. stratiphy/bench/_io.py +49 -0
  16. stratiphy/bench/_model.py +69 -0
  17. stratiphy/cluster/__init__.py +18 -0
  18. stratiphy/cluster/_base.py +41 -0
  19. stratiphy/cluster/_sim.py +91 -0
  20. stratiphy/cluster/_sklearn_sim.py +33 -0
  21. stratiphy/config/__init__.py +5 -0
  22. stratiphy/config/_workflow.py +43 -0
  23. stratiphy/io.py +164 -0
  24. stratiphy/model/__init__.py +9 -0
  25. stratiphy/model/_base.py +357 -0
  26. stratiphy/preprocessing/__init__.py +3 -0
  27. stratiphy/preprocessing/annoqc.py +76 -0
  28. stratiphy/preprocessing/phenopackets.py +190 -0
  29. stratiphy/preprocessing/sanitize/__init__.py +19 -0
  30. stratiphy/preprocessing/sanitize/_api.py +36 -0
  31. stratiphy/preprocessing/sanitize/_convenience.py +147 -0
  32. stratiphy/preprocessing/sanitize/_impl.py +620 -0
  33. stratiphy/preprocessing/sanitize/_model.py +271 -0
  34. stratiphy/preprocessing/sanitize/_test__model.py +148 -0
  35. stratiphy/preprocessing/summarize/__init__.py +6 -0
  36. stratiphy/preprocessing/summarize/_summarize.py +54 -0
  37. stratiphy/preprocessing/validate/__init__.py +7 -0
  38. stratiphy/preprocessing/validate/_base.py +149 -0
  39. stratiphy/preprocessing/validate/_simple.py +9 -0
  40. stratiphy/py.typed +0 -0
  41. stratiphy/semsim/__init__.py +26 -0
  42. stratiphy/semsim/_base.py +223 -0
  43. stratiphy/semsim/_ic.py +58 -0
  44. stratiphy/semsim/_pe.py +139 -0
  45. stratiphy/semsim/_sts.py +121 -0
  46. stratiphy/semsim/_test__base.py +52 -0
  47. stratiphy/semsim/_test__sts.py +74 -0
  48. stratiphy/util.py +85 -0
  49. stratiphy/workflow/__init__.py +13 -0
  50. stratiphy/workflow/_base.py +705 -0
  51. stratiphy/workflow/util.py +201 -0
  52. stratiphy/workflow/workflow_pb2.py +58 -0
  53. stratiphy/workflow/workflow_pb2.pyi +95 -0
  54. stratiphy-0.3.2.dist-info/METADATA +86 -0
  55. stratiphy-0.3.2.dist-info/RECORD +59 -0
  56. stratiphy-0.3.2.dist-info/WHEEL +5 -0
  57. stratiphy-0.3.2.dist-info/entry_points.txt +3 -0
  58. stratiphy-0.3.2.dist-info/licenses/LICENSE +29 -0
  59. stratiphy-0.3.2.dist-info/top_level.txt +1 -0
stratiphy/__init__.py ADDED
@@ -0,0 +1,14 @@
1
"""
The API documentation for the `stratiphy` Python package.

The API documentation is aimed at advanced users who want to use
`stratiphy` as a Python library. For general use, we recommend
the command-line interface (CLI).

See the [Tutorial](../../tutorial.md) and [User Guide](../../user-guide/index.md)
for an overview of the main CLI use cases.
"""

from importlib.metadata import version

# Look up the version of the installed `stratiphy` distribution.
__version__ = version("stratiphy")
stratiphy/_cli.py ADDED
@@ -0,0 +1,426 @@
1
+ import argparse
2
+ import json
3
+ import logging
4
+ import os
5
+ import pathlib
6
+ import sys
7
+ import typing
8
+
9
+ import hpotk
10
+ from hpotk.util import open_text_io_handle_for_reading, open_text_io_handle_for_writing
11
+
12
+ import stratiphy
13
+ from stratiphy.io import StratiphyJSONDecoder
14
+ from stratiphy.model import Sample
15
+ from stratiphy.util import setup_logging
16
+
17
# Program name and the default names of the files used by the CLI.
PROG = "stratiphy"
DEFAULT_DATA_PATH = "data"
DEFAULT_HPO_PATH = "hp.json"
DEFAULT_SAMPLES_PATH = "samples.json.gz"
DEFAULT_RESULTS_PATH = "results.pb"
DEFAULT_RESULTS_JSON_PATH = "results.json"


# ################################## CLI ######################################

logger = logging.getLogger(PROG)


# The top-level parser with the options shared by all subcommands.
parser = argparse.ArgumentParser(
    description="Phenotype-driven stratification of patient cohorts",
    epilog="Find more info at https://P2GX.github.io/stratiphy/stable",
    formatter_class=argparse.RawTextHelpFormatter,
    prog=PROG,
)
# `-v` can be repeated; the number of occurrences is stored in `verbosity`.
parser.add_argument("-v", "--verbosity", action="count", default=0, help="increase verbosity")
parser.add_argument(
    "--version",
    action="version",
    version=f"%(prog)s {stratiphy.__version__}",
)

# The name of the chosen subcommand is stored in `command`.
subparsers = parser.add_subparsers(dest="command")


# #################### ------------ `setup` ---------------- ####################

parser_setup = subparsers.add_parser(
    "setup",
    help="initialize stratiphy resources",
    description="Initialize stratiphy resources",
    epilog="Find more info at https://P2GX.github.io/stratiphy/stable",
)

# The name of the chosen `setup` subcommand is stored in `command_setup`.
subparsers_setup = parser_setup.add_subparsers(dest="command_setup")


# #################### ------------ `setup download` ------- ####################
parser_setup_download = subparsers_setup.add_parser("download", help="download the resource files")
parser_setup_download.add_argument(
    "-d",
    "--data",
    type=pathlib.Path,
    # Resolved against the working directory at import time.
    default=pathlib.Path(os.getcwd()) / DEFAULT_DATA_PATH,
    help=f"where to download the resources (default: {DEFAULT_DATA_PATH})",
)
parser_setup_download.add_argument(
    "-w",
    "--overwrite",
    action="store_true",
    default=False,
    help="overwrite previously downloaded resource files",
)
86
+
87
+
88
def setup_download(
    data: pathlib.Path,
    overwrite: bool,
) -> int:
    """
    Download the resource files into the `data` directory.

    The directory is created if nothing exists at `data`. The HPO JSON file
    is then stored as `DEFAULT_HPO_PATH` inside `data`, provided that
    `should_execute` approves the download.

    :param data: path to the directory for storing the resources.
    :param overwrite: `True` if a previously downloaded file should be replaced.
    :returns: `1` if `data` points to an existing file and `0` otherwise.
    """
    if os.path.exists(data):
        if os.path.isfile(data):
            logger.error("`-d | --data` must point to a directory, but %s is a file", data)
            return 1
    else:
        logger.debug("Creating directory at %s", data)
        os.makedirs(data, exist_ok=True)

    # HPO is the only resource to fetch.
    hpo_destination = data.joinpath(DEFAULT_HPO_PATH)
    if should_execute(hpo_destination, "HPO", "download", overwrite):
        download_resource(
            "https://purl.obolibrary.org/obo/hp.json",
            str(hpo_destination),
            "HPO",
        )

    return 0
118
+
119
+
120
def should_execute(
    fpath: pathlib.Path,
    resource_name: str,
    action_name: str,
    overwrite: bool,
) -> bool:
    """
    Decide if an action that produces the file at `fpath` should run.

    The decision is logged at the INFO level.

    :param fpath: path where the resource is or will be stored.
    :param resource_name: resource name to use in the log messages.
    :param action_name: action name to use in the log messages.
    :param overwrite: `True` if an existing file may be replaced.
    :returns: `False` if a file exists at `fpath` and `overwrite` is not set,
        `True` otherwise.
    """
    # Nothing to protect if the file is not there yet.
    if not os.path.isfile(fpath):
        logger.info("Proceeding with the %s of %s", action_name, resource_name)
        return True

    if overwrite:
        logger.info("Overwriting %s at %s", resource_name, fpath)
        return True

    logger.info(
        "Cowardly refusing to %s %s since it already exists at %s",
        action_name,
        resource_name,
        fpath,
    )
    return False
141
+
142
+
143
def download_resource(
    url: str,
    destination: str,
    resource_name: str,
) -> None:
    """
    Copy the text resource at `url` into the file at `destination`.

    The content is copied in chunks, so the entire resource is never held
    in memory at once. If the transfer fails after the destination was opened
    for writing, the incomplete file is removed and the error is re-raised.
    Otherwise, `should_execute` would find the truncated file on the next run
    and refuse to download the resource again.

    :param url: location of the resource to fetch.
    :param destination: path of the file to write.
    :param resource_name: resource name to use in the log messages.
    """
    import shutil

    logger.debug("Fetching %s from %s", resource_name, url)
    logger.debug("Storing %s to %s", resource_name, destination)

    with open_text_io_handle_for_reading(url) as fhin:
        # Open the destination outside of the `try` block. If the opening fails,
        # nothing was written and we must not remove a file that may already exist.
        fhout = open_text_io_handle_for_writing(destination)
        try:
            with fhout:
                shutil.copyfileobj(fhin, fhout)
        except BaseException:
            # `BaseException` to clean up after `KeyboardInterrupt` as well.
            logger.debug("Removing the incomplete %s at %s", resource_name, destination)
            try:
                os.remove(destination)
            except OSError:
                logger.warning("Could not remove the incomplete file at %s", destination)
            raise
156
+
157
+
158
+ # #################### ------------ `preprocess` ----------- ####################
159
+
160
parser_preprocess = subparsers.add_parser(
    "preprocess",
    help="prepare phenopackets for clustering",
)

parser_preprocess.add_argument(
    "-d",
    "--data",
    type=pathlib.Path,
    # Resolved against the working directory at import time.
    default=pathlib.Path(os.getcwd()).joinpath(DEFAULT_DATA_PATH),
    help="path to stratiphy data directory",
)
parser_preprocess.add_argument(
    "--controversy",
    type=str,
    default="small",
    choices=("high", "moderate", "small", "none"),
    help="try to sanitize issues with controversy less than this threshold",
)
# `outdir` is a required positional argument. argparse never consults `default`
# for a required positional, hence no default is set.
parser_preprocess.add_argument(
    "outdir",
    type=pathlib.Path,
    help="folder for storing the preprocessed files",
)
parser_preprocess.add_argument(
    "phenopackets",
    nargs="+",
    type=pathlib.Path,
    help="phenopacket JSON files with case reports for clustering",
)
191
+
192
+
193
def preprocess(
    data: pathlib.Path,
    controversy: typing.Literal["high", "moderate", "small", "none"],
    outdir: pathlib.Path,
    phenopackets: typing.Sequence[pathlib.Path],
) -> int:
    """
    Read phenopackets, sanitize the samples, and store them in `outdir`.

    HPO is loaded from the `DEFAULT_HPO_PATH` file in the `data` directory.
    The sanitation actions are printed to the standard output, sample by sample.
    The sanitized samples are written as JSON into the `DEFAULT_SAMPLES_PATH` file
    in `outdir`. The folder is created if it does not exist.

    :param data: path to the directory with the HPO file.
    :param controversy: name of the controversy threshold (case-insensitive).
    :param outdir: folder for storing the preprocessed samples.
    :param phenopackets: paths to the phenopacket files to read.
    :returns: `1` if the HPO file is missing or `controversy` is not one of
        the supported levels, `0` on success.
    """
    from stratiphy.io import StratiphyJSONEncoder
    from stratiphy.preprocessing.phenopackets import read_phenopacket
    from stratiphy.preprocessing.sanitize import Controversy, sanitize_samples

    # Check inputs
    fpath_hpo = data.joinpath(DEFAULT_HPO_PATH)
    if not os.path.isfile(fpath_hpo):
        logger.error("HPO is not present at %s", fpath_hpo.absolute())
        return 1
    # Check the controversy threshold. We do not use `assert` because assertions
    # are stripped when Python runs with `-O`. We check before touching the filesystem.
    supported_levels = ("high", "moderate", "small", "none")
    if controversy.lower() not in supported_levels:
        logger.error(
            "Unknown controversy level %r, expected one of %s",
            controversy,
            ", ".join(supported_levels),
        )
        return 1
    # Try to create the output folder, including possibly non-existent parent folders
    os.makedirs(outdir, exist_ok=True)

    # Read phenopackets
    logger.info("Reading phenopackets")
    logger.debug(
        "Phenopacket paths: %s",
        [str(pp) for pp in phenopackets],
    )
    samples = tuple(read_phenopacket(pp) for pp in phenopackets)
    logger.info("Read %d phenopackets", len(samples))

    # Sanitize samples
    logger.info("Sanitizing samples")
    logger.debug("Loading HPO from %s", fpath_hpo.absolute())
    hpo = hpotk.load_minimal_ontology(str(fpath_hpo.absolute()))
    level = Controversy[controversy.upper()]
    logger.debug("Fixing sanity issues at or below %s level of controversy", level.name.lower())
    sanitation_result = sanitize_samples(
        samples=samples,
        hpo=hpo,
        threshold=level,
    )

    # Report the sanitation actions to the user
    for sample, actions in sanitation_result.get_samples_and_actions():
        print(f"Sample: {sample.labels}")
        for action in actions:
            print(f" - {action}")

    # Serialize the samples
    logger.info("Serializing the sanitized samples")
    fpath_cohort = os.path.abspath(os.path.join(outdir, DEFAULT_SAMPLES_PATH))
    with open_text_io_handle_for_writing(fpath_cohort) as fh:
        json.dump(sanitation_result.sanitized_samples, fh, cls=StratiphyJSONEncoder)
    logger.info("Wrote the samples to %s", fpath_cohort)

    return 0
249
+
250
+
251
+ # #################### ------------- `compute` ------------- ####################
252
+
253
parser_compute = subparsers.add_parser(
    "compute",
    help="execute the clustering workflow",
)

parser_compute.add_argument(
    "-d",
    "--data",
    type=pathlib.Path,
    # Resolved against the working directory at import time.
    default=pathlib.Path(os.getcwd()).joinpath(DEFAULT_DATA_PATH),
    help="path to stratiphy data directory",
)
parser_compute.add_argument(
    "--rand-iter",
    type=int,
    default=200,
    help="the number of random cohorts to simulate",
)
parser_compute.add_argument(
    "-k",
    "--k-clusters",
    nargs="+",
    type=int,
    default=(2, 3, 4, 5, 6),
    help="k clusters to test",
)
parser_compute.add_argument(
    "--mc-iter",
    type=int,
    default=1_000_000,
    help="count of Monte-Carlo simulations for testing term-cluster association",
)
# The default file names are shown as metavars. The fallback to the files
# in `outdir` is done in `compute` when the options are not provided.
parser_compute.add_argument(
    "-s",
    "--samples",
    metavar=DEFAULT_SAMPLES_PATH,
    default=None,
    help="path to JSON file with preprocessed samples",
)
parser_compute.add_argument(
    "-r",
    "--results",
    metavar=DEFAULT_RESULTS_PATH,
    default=None,
    help="path to store the clustering result data",
)
# `outdir` is a required positional argument. argparse never consults `default`
# for a required positional, hence no default is set.
parser_compute.add_argument(
    "outdir",
    type=pathlib.Path,
    help="folder with the preprocessed samples; the results are stored here by default",
)
305
+
306
+
307
def compute(
    k_clusters: typing.Sequence[int],
    n_rand_cohort: int,
    mc_iter: int,
    fpath_samples: typing.Optional[pathlib.Path],
    fpath_results: typing.Optional[pathlib.Path],
    data: pathlib.Path,
    outdir: pathlib.Path,
) -> int:
    """
    Run the clustering workflow on the samples and store the results.

    :param k_clusters: the cluster counts passed to the workflow.
    :param n_rand_cohort: passed to the workflow configuration as `rand_cohorts`.
    :param mc_iter: passed to the workflow configuration as `mc_iter`.
    :param fpath_samples: path to the samples file or `None` to read
        the `DEFAULT_SAMPLES_PATH` file in `outdir`.
    :param fpath_results: path for storing the results or `None` to write
        the `DEFAULT_RESULTS_PATH` file in `outdir`.
    :param data: path to the directory with the HPO file.
    :param outdir: folder used to resolve the default sample and result paths.
    :returns: `0` on success.
    :raises ValueError: if the HPO file is not a readable file.
    """
    from stratiphy.config import configure_workflow

    cohort = _read_samples(fpath_samples, outdir)
    logger.info("Read %d samples", len(cohort))

    logger.info("Configuring the clustering workflow")
    logger.debug("%d random cohorts", n_rand_cohort)
    hpo_path = data / DEFAULT_HPO_PATH
    logger.debug("Using HPO at %s", hpo_path.absolute())

    # HPO must be readable before we attempt to load it.
    _validate_is_readable_file(hpo_path)
    workflow = configure_workflow(
        hpo=hpotk.load_minimal_ontology(str(hpo_path)),
        rand_cohorts=n_rand_cohort,
        mc_iter=mc_iter,
    )

    logger.info("Executing the workflow")
    clustering = workflow.run(samples=cohort, k_clusters=k_clusters)

    logger.debug("Serializing clustering results")
    destination = outdir / DEFAULT_RESULTS_PATH if fpath_results is None else fpath_results
    clustering.to_protobuf(destination)
    logger.info("Serialized the results to %s", destination.absolute())

    return 0
348
+
349
+
350
+ # ###############################################################################
351
+ # Utils
352
def _make_optional_path(
    path: typing.Optional[str],
) -> typing.Optional[pathlib.Path]:
    """
    Wrap `path` into a `pathlib.Path` while passing `None` through unchanged.

    :param path: the path as a `str` or `None`.
    :returns: `None` if `path` is `None`, otherwise the corresponding `pathlib.Path`.
    """
    if path is None:
        return None
    return pathlib.Path(path)
356
+
357
+
358
def _read_samples(
    fpath_samples: typing.Optional[pathlib.Path],
    outdir: pathlib.Path,
) -> typing.Sequence[Sample]:
    """
    Decode the samples from a JSON file.

    :param fpath_samples: path to the samples file or `None` to read
        the `DEFAULT_SAMPLES_PATH` file in `outdir`.
    :param outdir: folder with the default samples file.
    :returns: the samples decoded with `StratiphyJSONDecoder`.
    """
    source = outdir.joinpath(DEFAULT_SAMPLES_PATH) if fpath_samples is None else fpath_samples
    logger.debug(
        "Reading samples from %s",
        source.absolute(),
    )
    # NOTE(review): the path is passed as-is, without a `str()` conversion.
    # Presumably this requires hpotk>=0.6.1 - TODO confirm.
    with open_text_io_handle_for_reading(source) as handle:
        return json.load(handle, cls=StratiphyJSONDecoder)
371
+
372
+
373
def _validate_is_readable_file(fpath: typing.Union[str, pathlib.Path]):
    """
    Check that `fpath` is a `str` or `pathlib.Path` that points to a readable file.

    :param fpath: the path to check.
    :raises ValueError: if `fpath` is of another type, does not point
        to an existing file, or the file cannot be read.
    """
    # The type is checked first, so the filesystem is never queried with an unsupported value.
    has_supported_type = isinstance(fpath, (str, pathlib.Path))
    if has_supported_type and os.path.isfile(fpath) and os.access(fpath, os.R_OK):
        return
    raise ValueError(f"{fpath} is not a `str` or `pathlib.Path` pointing to a readable file")
376
+
377
+
378
+ # ###############################################################################
379
+
380
+
381
def main():
    """
    Parse the command-line arguments and run the selected subcommand.

    The process always ends with `sys.exit`. The exit status is `1` if no
    arguments are provided or no known subcommand is selected (the help is
    printed in such case), and the value returned by the subcommand
    function otherwise.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(cli_args)
    setup_logging(logger, args.verbosity)

    command = args.command
    if command == "setup":
        # `download` is the only `setup` subcommand.
        if args.command_setup != "download":
            parser_setup.print_help()
            sys.exit(1)
        exit_code = setup_download(
            data=args.data,
            overwrite=args.overwrite,
        )
    elif command == "preprocess":
        exit_code = preprocess(
            data=args.data,
            controversy=args.controversy,
            outdir=args.outdir,
            phenopackets=args.phenopackets,
        )
    elif command == "compute":
        exit_code = compute(
            k_clusters=args.k_clusters,
            n_rand_cohort=args.rand_iter,
            mc_iter=args.mc_iter,
            fpath_samples=_make_optional_path(args.samples),
            fpath_results=_make_optional_path(args.results),
            data=args.data,
            outdir=args.outdir,
        )
    else:
        parser.print_help()
        exit_code = 1

    sys.exit(exit_code)
File without changes
@@ -0,0 +1,15 @@
1
+ from ._explain import (
2
+ FisherExplainMethod,
3
+ TermAssociation,
4
+ TermCounter,
5
+ TermFilter,
6
+ TermTest,
7
+ )
8
+
9
+ __all__ = [
10
+ "FisherExplainMethod",
11
+ "TermAssociation",
12
+ "TermCounter",
13
+ "TermFilter",
14
+ "TermTest",
15
+ ]