SigProfilerExtractor 1.1.24__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/MANIFEST.in +1 -0
  2. {SigProfilerExtractor-1.1.24/SigProfilerExtractor.egg-info → sigprofilerextractor-1.2.0}/PKG-INFO +65 -43
  3. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/README.md +42 -41
  4. sigprofilerextractor-1.2.0/SigProfilerExtractor/controllers/cli_controller.py +266 -0
  5. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/nmf_cpu.py +4 -4
  6. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/nmf_gpu.py +4 -4
  7. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/sigpro.py +4 -0
  8. sigprofilerextractor-1.2.0/SigProfilerExtractor/sigprofilerextractor_cli.py +34 -0
  9. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/subroutines.py +25 -15
  10. sigprofilerextractor-1.2.0/SigProfilerExtractor/version.py +7 -0
  11. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0/SigProfilerExtractor.egg-info}/PKG-INFO +65 -43
  12. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor.egg-info/SOURCES.txt +3 -0
  13. sigprofilerextractor-1.2.0/SigProfilerExtractor.egg-info/entry_points.txt +2 -0
  14. sigprofilerextractor-1.2.0/SigProfilerExtractor.egg-info/requires.txt +11 -0
  15. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/setup.py +13 -10
  16. SigProfilerExtractor-1.1.24/SigProfilerExtractor/version.py +0 -7
  17. SigProfilerExtractor-1.1.24/SigProfilerExtractor.egg-info/requires.txt +0 -14
  18. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/LICENSE.txt +0 -0
  19. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/__init__.py +0 -0
  20. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/CNVInput/Battenberg_test.tsv +0 -0
  21. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/CSVInput/csv_example.csv +0 -0
  22. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/MatObjInput/21_breast_WGS_substitutions.mat +0 -0
  23. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/ReferenceFiles/CNV_features.tsv +0 -0
  24. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/ReferenceFiles/CN_classes_dictionary.txt +0 -0
  25. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/TextInput/Samples_CNV.txt +0 -0
  26. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/TextInput/Samples_DBS.txt +0 -0
  27. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/TextInput/Samples_ID.txt +0 -0
  28. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/TextInput/Samples_SBS.txt +0 -0
  29. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/TextInput/Samples_SV.txt +0 -0
  30. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/VCFInput/PD3851a.vcf +0 -0
  31. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/VCFInput/PD3890a.vcf +0 -0
  32. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/VCFInput/PD3904a.vcf +0 -0
  33. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/VCFInput/PD3905a.vcf +0 -0
  34. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/data/VCFInput/PD3945a.vcf +0 -0
  35. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor/estimate_best_solution.py +0 -0
  36. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor.egg-info/dependency_links.txt +0 -0
  37. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor.egg-info/not-zip-safe +0 -0
  38. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/SigProfilerExtractor.egg-info/top_level.txt +0 -0
  39. {SigProfilerExtractor-1.1.24 → sigprofilerextractor-1.2.0}/setup.cfg +0 -0
@@ -4,4 +4,5 @@ include SigProfilerExtractor/data/CNVInput/*
4
4
  include SigProfilerExtractor/data/CSVInput/*
5
5
  include SigProfilerExtractor/data/MatObjInput/*
6
6
  include SigProfilerExtractor/data/ReferenceFiles/*
7
+ include SigProfilerExtractor/controllers/*
7
8
 
@@ -1,17 +1,38 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: SigProfilerExtractor
3
- Version: 1.1.24
3
+ Version: 1.2.0
4
4
  Summary: Extracts mutational signatures from mutational catalogues
5
5
  Home-page: https://github.com/AlexandrovLab/SigProfilerExtractor.git
6
6
  Author: S Mishu Ashiqul Islam
7
7
  Author-email: m0islam@ucsd.edu
8
8
  License: UCSD
9
+ Requires-Python: >=3.9
9
10
  Description-Content-Type: text/markdown
10
11
  License-File: LICENSE.txt
12
+ Requires-Dist: scipy>=1.6.3
13
+ Requires-Dist: torch>=1.8.1
14
+ Requires-Dist: numpy>=2.0.0
15
+ Requires-Dist: pandas>=2.0.0
16
+ Requires-Dist: nimfa>=1.1.0
17
+ Requires-Dist: sigProfilerPlotting>=1.4.0
18
+ Requires-Dist: SigProfilerMatrixGenerator>=1.3.0
19
+ Requires-Dist: SigProfilerAssignment>=0.2.0
20
+ Requires-Dist: statsmodels>=0.9.0
21
+ Requires-Dist: scikit-learn>=0.24.2
22
+ Requires-Dist: psutil>=5.6.1
23
+ Dynamic: author
24
+ Dynamic: author-email
25
+ Dynamic: description
26
+ Dynamic: description-content-type
27
+ Dynamic: home-page
28
+ Dynamic: license
29
+ Dynamic: requires-dist
30
+ Dynamic: requires-python
31
+ Dynamic: summary
11
32
 
12
33
  [![Docs](https://img.shields.io/badge/docs-latest-blue.svg)](https://osf.io/t6j7u/wiki/home/)
13
34
  [![License](https://img.shields.io/badge/License-BSD\%202--Clause-orange.svg)](https://opensource.org/licenses/BSD-2-Clause)
14
- [![Build Status](https://travis-ci.com/AlexandrovLab/SigProfilerExtractor.svg?branch=master)](https://app.travis-ci.com/AlexandrovLab/SigProfilerExtractor)
35
+ [![Build Status](https://app.travis-ci.com/AlexandrovLab/SigProfilerExtractor.svg?branch=master)](https://app.travis-ci.com/AlexandrovLab/SigProfilerExtractor)
15
36
 
16
37
  # SigProfilerExtractor
17
38
  SigProfilerExtractor allows de novo extraction of mutational signatures from data generated in a matrix format.
@@ -104,43 +125,44 @@ sigProfilerExtractor(input_type, out_put, input_data, reference_genome="GRCh37",
104
125
  | Category | Parameter | Variable Type | Parameter Description |
105
126
  | --------- | --------------------- | -------- |-------- |
106
127
  | **Input Data** | | | |
107
- | | **input_type** | String | The type of input:<br><ul><li>"vcf": used for vcf format inputs.</li><li>"matrix": used for table format inputs using a tab separated file.</li><li>"bedpe": used for bedpe files with each SV annotated with its type, size bin, and clustered/non-clustered status. Please check the required format at https://github.com/AlexandrovLab/SigProfilerMatrixGenerator#structural-variant-matrix-generation.</li><li>"seg:TYPE": used for a multi-sample segmentation file for copy number analysis. Please check the required format at https://github.com/AlexandrovLab/SigProfilerMatrixGenerator#copy-number-matrix-generation. The accepted callers for TYPE are the following {"ASCAT", "ASCAT_NGS", "SEQUENZA", "ABSOLUTE", "BATTENBERG", "FACETS", "PURPLE", "TCGA"}. For example, when using segmentation file from BATTENBERG then set input_type to "seg:BATTENBERG".</li></ul> |
128
+ | | **input_type** | String | The type of input:<br><ul><li>`"vcf"`: used for vcf format inputs.</li><li>`"matrix"`: used for table format inputs using a tab separated file.</li><li>`"bedpe"`: used for bedpe files with each SV annotated with its type, size bin, and clustered/non-clustered status. Please check the required format at https://github.com/AlexandrovLab/SigProfilerMatrixGenerator#structural-variant-matrix-generation.</li><li>`"seg:TYPE"`: used for a multi-sample segmentation file for copy number analysis. Please check the required format at https://github.com/AlexandrovLab/SigProfilerMatrixGenerator#copy-number-matrix-generation. The accepted callers for TYPE are the following {"ASCAT", "ASCAT_NGS", "SEQUENZA", "ABSOLUTE", "BATTENBERG", "FACETS", "PURPLE", "TCGA"}. For example, when using segmentation file from BATTENBERG then set input_type to "seg:BATTENBERG".</li></ul> |
108
129
  | | **output** | String | The name of the output folder. The output folder will be generated in the current working directory. |
109
- | | **input_data** | String | <br>Path to input folder for input_type:<ul><li>vcf</li><li>bedpe</li></ul>Path to file for input_type:<ul><li>matrix</li><li>seg:TYPE</li></ul> |
110
- | | **reference_genome** | String | The name of the reference genome. The default reference genome is "GRCh37". This parameter is applicable only if the input_type is "vcf". |
111
- | | **opportunity_genome** | String | The build or version of the reference genome for the reference signatures. The default opportunity genome is GRCh37. If the input_type is "vcf", the opportunity_genome automatically matches the input reference genome value. Only the genomes available in COSMIC are supported (GRCh37, GRCh38, mm9, mm10 and rn6). If a different opportunity genome is selected, the default genome GRCh37 will be used. |
112
- | | **context_type** | String | A string of mutaion context name/names separated by comma (","). The items in the list defines the mutational contexts to be considered to extract the signatures. The default value is "96,DINUC,ID", where "96" is the SBS96 context, "DINUC" is the DINUCLEOTIDE context and ID is INDEL context. |
113
- | | **exome** | Boolean | Defines if the exomes will be extracted. The default value is "False". |
130
+ | | **input_data** | String | <br>Path to input folder for input_type:<ul><li>`vcf`</li><li>`bedpe`</li></ul>Path to file for input_type:<ul><li>`matrix`</li><li>`seg:TYPE`</li></ul> |
131
+ | | **reference_genome** | String | The name of the reference genome (default: `"GRCh37"`). This parameter is applicable only if the `input_type` is `"vcf"`. |
132
+ | | **opportunity_genome** | String | The build or version of the reference genome for the reference signatures (default: `"GRCh37"`). When the input_type is `"vcf"`, the opportunity_genome automatically matches the input reference genome value. Only the genomes available in COSMIC are supported (`GRCh37`, `GRCh38`, `mm9`, `mm10`, and `rn6`). If a different opportunity genome is selected, the default genome `GRCh37` will be used. |
133
+ | | **context_type** | String | Mutation context name(s), separated by commas (`,`), that define the mutational contexts for signature extraction (default: `"96,DINUC,ID"`). In the default value, `96` represents the SBS96 context, `DINUC` represents the dinucleotide context, and `ID` represents the indel context. |
134
+ | | **exome** | Boolean | Defines if the exomes will be extracted (default: `False`). |
114
135
  | **NMF Replicates** | | | |
115
- | | **minimum_signatures** | Positive Integer | The minimum number of signatures to be extracted. The default value is 1. |
116
- | | **maximum_signatures** | Positive Integer | The maximum number of signatures to be extracted. The default value is 25. |
117
- | | **nmf_replicates** | Positive Integer | The number of iteration to be performed to extract each number signature. The default value is 100. |
118
- | | **resample** | Boolean | Default is True. If True, add poisson noise to samples by resampling. |
119
- | | **seeds** | String | It can be used to get reproducible resamples for the NMF replicates. A path of a tab separated .txt file containing the replicated id and preset seeds in a two columns dataframe can be passed through this parameter. The Seeds.txt file in the results folder from a previous analysis can be used for the seeds parameter in a new analysis. The Default value for this parameter is "random". When "random", the seeds for resampling will be random for different analysis. |
136
+ | | **minimum_signatures** | Positive Integer | The minimum number of signatures to be extracted (default: `1`). |
137
+ | | **maximum_signatures** | Positive Integer | The maximum number of signatures to be extracted (default: `25`). |
138
+ | | **nmf_replicates** | Positive Integer | The number of iterations to be performed to extract each number of signatures (default: `100`). |
139
+ | | **resample** | Boolean | If `True`, add Poisson noise to samples by resampling (default: `True`). |
140
+ | | **seeds** | String | Ensures reproducible NMF replicate resamples. Provide the path to the `Seeds.txt` file (found in the results folder from a previous analysis) to reproduce results (default: `"random"`). |
120
141
  | **NMF Engines** | | | |
121
- | | **matrix_normalization** | String | Method of normalizing the genome matrix before it is analyzed by NMF. Default is value is "gmm". Other options are, "log2", "custom" or "none". |
122
- | | **nmf_init** | String | The initialization algorithm for W and H matrix of NMF. Options are 'random', 'nndsvd', 'nndsvda', 'nndsvdar' and 'nndsvd_min'. Default is 'random'. |
123
- | | **precision** | String | Values should be single or double. Default is single. |
124
- | | **min_nmf_iterations** | Integer | Value defines the minimum number of iterations to be completed before NMF converges. Default is 10000. |
125
- | | **max_nmf_iterations** | Integer | Value defines the maximum number of iterations to be completed before NMF converges. Default is 1000000. |
126
- | | **nmf_test_conv** | Integer | Value defines the number number of iterations to done between checking next convergence. Default is 10000. |
127
- | | **nmf_tolerance** | Float | Value defines the tolerance to achieve to converge. Default is 1e-15. |
142
+ | | **matrix_normalization** | String | Method of normalizing the genome matrix before it is analyzed by NMF (default: `"gmm"`). Other options are `"log2"`, `"custom"`, or `"none"`. |
143
+ | | **nmf_init** | String | The initialization algorithm for W and H matrix of NMF (default: `"random"`). Options are `"random"`, `"nndsvd"`, `"nndsvda"`, `"nndsvdar"` and `"nndsvd_min"`. |
144
+ | | **precision** | String | Values should be single or double (default: `"single"`). |
145
+ | | **min_nmf_iterations** | Integer | Value defines the minimum number of iterations to be completed before NMF converges (default: `10000`). |
146
+ | | **max_nmf_iterations** | Integer | Value defines the maximum number of iterations to be completed before NMF converges (default: `1000000`). |
147
+ | | **nmf_test_conv** | Integer | Value defines the number of iterations to be done between convergence checks (default: `10000`). |
148
+ | | **nmf_tolerance** | Float | Value defines the tolerance to achieve to converge (default: `1e-15`).|
128
149
  | **Execution** | | | |
129
- | | **cpu** | Integer | The number of processors to be used to extract the signatures. The default value is -1 which will use all available processors. |
130
- | | **gpu** | Boolean | Defines if the GPU resource will used if available. Default is False. If True, the GPU resources will be used in the computation. *Note: All available CPU processors are used by default, which may cause a memory error. This error can be resolved by reducing the number of CPU processes through the **cpu** parameter.*|
131
- | | **batch_size** | Integer | Will be effective only if the GPU is used. Defines the number of NMF replicates to be performed by each CPU during the parallel processing. Default is 1. |
150
+ | | **cpu** | Integer | The number of processors to be used to extract the signatures (default: all processors). |
151
+ | | **gpu** | Boolean | Defines if the GPU resource will be used if available (default: `False`). If `True`, the GPU resources will be used in the computation. *Note: All available CPU processors are used by default, which may cause a memory error. This error can be resolved by reducing the number of CPU processes through the `cpu` parameter.*|
152
+ | | **batch_size** | Integer | Will be effective only if the GPU is used. Defines the number of NMF replicates to be performed by each CPU during the parallel processing (default: `1`). *Note: For `batch_size` values greater than 1, each NMF replicate will update until `max_nmf_iterations` is reached.*|
132
153
  | **Solution Estimation Thresholds** | | | |
133
- | | **stability** | Float | Default is 0.8. The cutoff thresh-hold of the average stability. Solutions with average stabilities below this thresh-hold will not be considered. |
134
- | | **min_stability** | Float | Default is 0.2. The cutoff thresh-hold of the minimum stability. Solutions with minimum stabilities below this thresh-hold will not be considered. |
135
- | | **combined_stability** | Float | Default is 1.0. The cutoff thresh-hold of the combined stability (sum of average and minimum stability). Solutions with combined stabilities below this thresh-hold will not be considered. |
136
- | | **allow_stability_drop** | Boolean | Default is False. Defines if solutions with a drop in stability with respect to the highest stable number of signatures will be considered. |
154
+ | | **stability** | Float | The cutoff threshold of the average stability (default: `0.8`). Solutions with average stabilities below this threshold will not be considered. |
155
+ | | **min_stability** | Float | The cutoff threshold of the minimum stability (default: `0.2`). Solutions with minimum stabilities below this threshold will not be considered. |
156
+ | | **combined_stability** | Float | The cutoff threshold of the combined stability (sum of average and minimum stability) (default: `1.0`). Solutions with combined stabilities below this threshold will not be considered. |
157
+ | | **allow_stability_drop** | Boolean | Defines if solutions with a drop in stability with respect to the highest stable number of signatures will be considered (default: `False`). |
137
158
  | **Decomposition** | | | |
138
- | | **cosmic_version** | Float | Takes a positive float among 1, 2, 3, 3.1, 3.2, 3.3, and 3.4. Default is 3.4. Defines the version of the COSMIC reference signatures. |
139
- | | **make_decomposition_plots** | Boolean | Defualt is True. If True, Denovo to Cosmic sigantures decompostion plots will be created as a part the results. |
140
- | | **collapse_to_SBS96** | Boolean | Defualt is True. If True, SBS288 and SBS1536 Denovo signatures will be mapped to SBS96 reference signatures. If False, those will be mapped to reference signatures of the same context.
159
+ | | **cosmic_version** | Float | Defines the version of the COSMIC reference signatures (default: `3.4`). Takes a positive float among `1`, `2`, `3`, `3.1`, `3.2`, `3.3`, and `3.4`.|
160
+ | | **make_decomposition_plots** | Boolean | Generate de novo to COSMIC signature decomposition plots as part of the results (default: `True`). Set to `False` to skip generating these plots. |
161
+ | | **collapse_to_SBS96** | Boolean | If `True`, SBS288 and SBS1536 de novo signatures will be mapped to SBS96 reference signatures (default: `True`). If `False`, those will be mapped to reference signatures of the same context. |
141
162
  | **Others** | | | |
142
- | | **get_all_signature_matrices** | Boolean | If True, the Ws and Hs from all the NMF iterations are generated in the output. |
143
- | | **export_probabilities** | Boolean | Defualt is True. If False, then doesn't create the probability matrix. |
163
+ | | **get_all_signature_matrices** | Boolean | Write to output Ws and Hs from all the NMF iterations (default: `False`). |
164
+ | | **export_probabilities** | Boolean | Create the probability matrix (default: `True`). |
165
+ | | **volume** | String | Path to the volume for writing and loading reference genomes, plotting templates, and COSMIC signature plots (default: `None`). Environment variables take precedence: `SIGPROFILERMATRIXGENERATOR_VOLUME`, `SIGPROFILERPLOTTING_VOLUME`, and `SIGPROFILERASSIGNMENT_VOLUME`. |
144
166
 
145
167
  #### sigProfilerExtractor Example
146
168
  VCF Files as Input
@@ -191,16 +213,16 @@ estimate_solution(base_csvfile="All_solutions_stat.csv",
191
213
 
192
214
  | Parameter | Variable Type | Parameter Description |
193
215
  | --------------------- | -------- |-------- |
194
- | **base_csvfile** | String | Default is "All_solutions_stat.csv". Path to a csv file that contains the statistics of all solutions. |
195
- | **All_solution** | String | Default is "All_Solutions". Path to a folder that contains the results of all solutions. |
196
- | **genomes** | String | Default is Samples.txt. Path to a tab delimilted file that contains the mutation counts for all genomes given to different mutation types. |
197
- | **output** | String | Default is "results". Path to the output folder. |
198
- | **title** | String | Default is "Selection_Plot". This sets the title of the selection_plot.pdf |
216
+ | **base_csvfile** | String | Default is `"All_solutions_stat.csv"`. Path to a CSV file that contains the statistics of all solutions. |
217
+ | **All_solution** | String | Default is `"All_Solutions"`. Path to a folder that contains the results of all solutions. |
218
+ | **genomes** | String | Default is `"Samples.txt"`. Path to a tab delimited file that contains the mutation counts for all genomes given to different mutation types. |
219
+ | **output** | String | Default is `"results"`. Path to the output folder. |
220
+ | **title** | String | Default is `"Selection_Plot"`. This sets the title of the selection_plot.pdf |
199
221
  | **stability** | Float | Default is 0.8. The cutoff threshold of the average stability. Solutions with average stabilities below this threshold will not be considered. |
200
- | **min_stability** | Float | Default is 0.2. The cutoff thresh-hold of the minimum stability. Solutions with minimum stabilities below this thresh-hold will not be considered. |
201
- | **combined_stability** | Float | Default is 1.0. The cutoff thresh-hold of the combined stability (sum of average and minimum stability). Solutions with combined stabilities below this thresh-hold will not be considered. |
202
- | **allow_stability_drop** | Boolean | Default is False. Defines if solutions with a drop in stability with respect to the highest stable number of signatures will be considered. |
203
- | **exome** | Boolean | Default is "False". Defines if exomes samples are used. |
222
+ | **min_stability** | Float | Default is `0.2`. The cutoff threshold of the minimum stability. Solutions with minimum stabilities below this threshold will not be considered. |
223
+ | **combined_stability** | Float | Default is `1.0`. The cutoff threshold of the combined stability (sum of average and minimum stability). Solutions with combined stabilities below this threshold will not be considered. |
224
+ | **allow_stability_drop** | Boolean | Default is `False`. Defines if solutions with a drop in stability with respect to the highest stable number of signatures will be considered. |
225
+ | **exome** | Boolean | Default is `False`. Defines if exome samples are used. |
204
226
 
205
227
 
206
228
  #### Estimation of the Optimum Solution Example
@@ -227,7 +249,7 @@ The files below will be generated in the output folder:
227
249
 
228
250
  ### <a name="decompose"></a> Decompose
229
251
 
230
- For decomposition of denovo signatures please use [SigProfilerAssignment](https://github.com/AlexandrovLab/SigProfilerAssignment)
252
+ For decomposition of de novo signatures please use [SigProfilerAssignment](https://github.com/AlexandrovLab/SigProfilerAssignment)
231
253
 
232
254
  ### <a name="plotActivity"></a> Activity Stacked Bar Plot
233
255
  Generates a stacked bar plot showing activities in individuals
@@ -1,6 +1,6 @@
1
1
  [![Docs](https://img.shields.io/badge/docs-latest-blue.svg)](https://osf.io/t6j7u/wiki/home/)
2
2
  [![License](https://img.shields.io/badge/License-BSD\%202--Clause-orange.svg)](https://opensource.org/licenses/BSD-2-Clause)
3
- [![Build Status](https://travis-ci.com/AlexandrovLab/SigProfilerExtractor.svg?branch=master)](https://app.travis-ci.com/AlexandrovLab/SigProfilerExtractor)
3
+ [![Build Status](https://app.travis-ci.com/AlexandrovLab/SigProfilerExtractor.svg?branch=master)](https://app.travis-ci.com/AlexandrovLab/SigProfilerExtractor)
4
4
 
5
5
  # SigProfilerExtractor
6
6
  SigProfilerExtractor allows de novo extraction of mutational signatures from data generated in a matrix format.
@@ -93,43 +93,44 @@ sigProfilerExtractor(input_type, out_put, input_data, reference_genome="GRCh37",
93
93
  | Category | Parameter | Variable Type | Parameter Description |
94
94
  | --------- | --------------------- | -------- |-------- |
95
95
  | **Input Data** | | | |
96
- | | **input_type** | String | The type of input:<br><ul><li>"vcf": used for vcf format inputs.</li><li>"matrix": used for table format inputs using a tab separated file.</li><li>"bedpe": used for bedpe files with each SV annotated with its type, size bin, and clustered/non-clustered status. Please check the required format at https://github.com/AlexandrovLab/SigProfilerMatrixGenerator#structural-variant-matrix-generation.</li><li>"seg:TYPE": used for a multi-sample segmentation file for copy number analysis. Please check the required format at https://github.com/AlexandrovLab/SigProfilerMatrixGenerator#copy-number-matrix-generation. The accepted callers for TYPE are the following {"ASCAT", "ASCAT_NGS", "SEQUENZA", "ABSOLUTE", "BATTENBERG", "FACETS", "PURPLE", "TCGA"}. For example, when using segmentation file from BATTENBERG then set input_type to "seg:BATTENBERG".</li></ul> |
96
+ | | **input_type** | String | The type of input:<br><ul><li>`"vcf"`: used for vcf format inputs.</li><li>`"matrix"`: used for table format inputs using a tab separated file.</li><li>`"bedpe"`: used for bedpe files with each SV annotated with its type, size bin, and clustered/non-clustered status. Please check the required format at https://github.com/AlexandrovLab/SigProfilerMatrixGenerator#structural-variant-matrix-generation.</li><li>`"seg:TYPE"`: used for a multi-sample segmentation file for copy number analysis. Please check the required format at https://github.com/AlexandrovLab/SigProfilerMatrixGenerator#copy-number-matrix-generation. The accepted callers for TYPE are the following {"ASCAT", "ASCAT_NGS", "SEQUENZA", "ABSOLUTE", "BATTENBERG", "FACETS", "PURPLE", "TCGA"}. For example, when using segmentation file from BATTENBERG then set input_type to "seg:BATTENBERG".</li></ul> |
97
97
  | | **output** | String | The name of the output folder. The output folder will be generated in the current working directory. |
98
- | | **input_data** | String | <br>Path to input folder for input_type:<ul><li>vcf</li><li>bedpe</li></ul>Path to file for input_type:<ul><li>matrix</li><li>seg:TYPE</li></ul> |
99
- | | **reference_genome** | String | The name of the reference genome. The default reference genome is "GRCh37". This parameter is applicable only if the input_type is "vcf". |
100
- | | **opportunity_genome** | String | The build or version of the reference genome for the reference signatures. The default opportunity genome is GRCh37. If the input_type is "vcf", the opportunity_genome automatically matches the input reference genome value. Only the genomes available in COSMIC are supported (GRCh37, GRCh38, mm9, mm10 and rn6). If a different opportunity genome is selected, the default genome GRCh37 will be used. |
101
- | | **context_type** | String | A string of mutaion context name/names separated by comma (","). The items in the list defines the mutational contexts to be considered to extract the signatures. The default value is "96,DINUC,ID", where "96" is the SBS96 context, "DINUC" is the DINUCLEOTIDE context and ID is INDEL context. |
102
- | | **exome** | Boolean | Defines if the exomes will be extracted. The default value is "False". |
98
+ | | **input_data** | String | <br>Path to input folder for input_type:<ul><li>`vcf`</li><li>`bedpe`</li></ul>Path to file for input_type:<ul><li>`matrix`</li><li>`seg:TYPE`</li></ul> |
99
+ | | **reference_genome** | String | The name of the reference genome (default: `"GRCh37"`). This parameter is applicable only if the `input_type` is `"vcf"`. |
100
+ | | **opportunity_genome** | String | The build or version of the reference genome for the reference signatures (default: `"GRCh37"`). When the input_type is `"vcf"`, the opportunity_genome automatically matches the input reference genome value. Only the genomes available in COSMIC are supported (`GRCh37`, `GRCh38`, `mm9`, `mm10`, and `rn6`). If a different opportunity genome is selected, the default genome `GRCh37` will be used. |
101
+ | | **context_type** | String | Mutation context name(s), separated by commas (`,`), that define the mutational contexts for signature extraction (default: `"96,DINUC,ID"`). In the default value, `96` represents the SBS96 context, `DINUC` represents the dinucleotide context, and `ID` represents the indel context. |
102
+ | | **exome** | Boolean | Defines if the exomes will be extracted (default: `False`). |
103
103
  | **NMF Replicates** | | | |
104
- | | **minimum_signatures** | Positive Integer | The minimum number of signatures to be extracted. The default value is 1. |
105
- | | **maximum_signatures** | Positive Integer | The maximum number of signatures to be extracted. The default value is 25. |
106
- | | **nmf_replicates** | Positive Integer | The number of iteration to be performed to extract each number signature. The default value is 100. |
107
- | | **resample** | Boolean | Default is True. If True, add poisson noise to samples by resampling. |
108
- | | **seeds** | String | It can be used to get reproducible resamples for the NMF replicates. A path of a tab separated .txt file containing the replicated id and preset seeds in a two columns dataframe can be passed through this parameter. The Seeds.txt file in the results folder from a previous analysis can be used for the seeds parameter in a new analysis. The Default value for this parameter is "random". When "random", the seeds for resampling will be random for different analysis. |
104
+ | | **minimum_signatures** | Positive Integer | The minimum number of signatures to be extracted (default: `1`). |
105
+ | | **maximum_signatures** | Positive Integer | The maximum number of signatures to be extracted (default: `25`). |
106
+ | | **nmf_replicates** | Positive Integer | The number of NMF replicates to be performed for each number of signatures (default: `100`). |
107
+ | | **resample** | Boolean | If `True`, add poisson noise to samples by resampling (default: `True`). |
108
+ | | **seeds** | String | Ensures reproducible NMF replicate resamples. Provide the path to the `Seeds.txt` file (found in the results folder from a previous analysis) to reproduce results (default: `"random"`). |
109
109
  | **NMF Engines** | | | |
110
- | | **matrix_normalization** | String | Method of normalizing the genome matrix before it is analyzed by NMF. Default is value is "gmm". Other options are, "log2", "custom" or "none". |
111
- | | **nmf_init** | String | The initialization algorithm for W and H matrix of NMF. Options are 'random', 'nndsvd', 'nndsvda', 'nndsvdar' and 'nndsvd_min'. Default is 'random'. |
112
- | | **precision** | String | Values should be single or double. Default is single. |
113
- | | **min_nmf_iterations** | Integer | Value defines the minimum number of iterations to be completed before NMF converges. Default is 10000. |
114
- | | **max_nmf_iterations** | Integer | Value defines the maximum number of iterations to be completed before NMF converges. Default is 1000000. |
115
- | | **nmf_test_conv** | Integer | Value defines the number number of iterations to done between checking next convergence. Default is 10000. |
116
- | | **nmf_tolerance** | Float | Value defines the tolerance to achieve to converge. Default is 1e-15. |
110
+ | | **matrix_normalization** | String | Method of normalizing the genome matrix before it is analyzed by NMF (default: `"gmm"`). Options are `"gmm"`, `"log2"`, `"custom"`, or `"none"`. |
111
+ | | **nmf_init** | String | The initialization algorithm for W and H matrix of NMF (default: `"random"`). Options are `"random"`, `"nndsvd"`, `"nndsvda"`, `"nndsvdar"` and `"nndsvd_min"`. |
112
+ | | **precision** | String | Values should be single or double (default: `"single"`). |
113
+ | | **min_nmf_iterations** | Integer | Value defines the minimum number of iterations to be completed before NMF converges (default: `10000`). |
114
+ | | **max_nmf_iterations** | Integer | Value defines the maximum number of iterations to be completed before NMF converges (default: `1000000`). |
115
+ | | **nmf_test_conv** | Integer | Value defines the number of iterations to be performed between convergence checks (default: `10000`). |
116
+ | | **nmf_tolerance** | Float | Value defines the tolerance to achieve to converge (default: `1e-15`).|
117
117
  | **Execution** | | | |
118
- | | **cpu** | Integer | The number of processors to be used to extract the signatures. The default value is -1 which will use all available processors. |
119
- | | **gpu** | Boolean | Defines if the GPU resource will used if available. Default is False. If True, the GPU resources will be used in the computation. *Note: All available CPU processors are used by default, which may cause a memory error. This error can be resolved by reducing the number of CPU processes through the **cpu** parameter.*|
120
- | | **batch_size** | Integer | Will be effective only if the GPU is used. Defines the number of NMF replicates to be performed by each CPU during the parallel processing. Default is 1. |
118
+ | | **cpu** | Integer | The number of processors to be used to extract the signatures (default: all processors). |
119
+ | | **gpu** | Boolean | Defines if the GPU resource will be used if available (default: `False`). If `True`, the GPU resources will be used in the computation. *Note: All available CPU processors are used by default, which may cause a memory error. This error can be resolved by reducing the number of CPU processes through the `cpu` parameter.*|
120
+ | | **batch_size** | Integer | Will be effective only if the GPU is used. Defines the number of NMF replicates to be performed by each CPU during the parallel processing (default: `1`). *Note: For `batch_size` values greater than 1, each NMF replicate will update until `max_nmf_iterations` is reached.*|
121
121
  | **Solution Estimation Thresholds** | | | |
122
- | | **stability** | Float | Default is 0.8. The cutoff thresh-hold of the average stability. Solutions with average stabilities below this thresh-hold will not be considered. |
123
- | | **min_stability** | Float | Default is 0.2. The cutoff thresh-hold of the minimum stability. Solutions with minimum stabilities below this thresh-hold will not be considered. |
124
- | | **combined_stability** | Float | Default is 1.0. The cutoff thresh-hold of the combined stability (sum of average and minimum stability). Solutions with combined stabilities below this thresh-hold will not be considered. |
125
- | | **allow_stability_drop** | Boolean | Default is False. Defines if solutions with a drop in stability with respect to the highest stable number of signatures will be considered. |
122
+ | | **stability** | Float | The cutoff threshold of the average stability (default: `0.8`). Solutions with average stabilities below this threshold will not be considered. |
123
+ | | **min_stability** | Float | The cutoff threshold of the minimum stability (default: `0.2`). Solutions with minimum stabilities below this threshold will not be considered. |
124
+ | | **combined_stability** | Float | The cutoff threshold of the combined stability (sum of average and minimum stability) (default: `1.0`). Solutions with combined stabilities below this threshold will not be considered. |
125
+ | | **allow_stability_drop** | Boolean | Defines if solutions with a drop in stability with respect to the highest stable number of signatures will be considered (default: `False`). |
126
126
  | **Decomposition** | | | |
127
- | | **cosmic_version** | Float | Takes a positive float among 1, 2, 3, 3.1, 3.2, 3.3, and 3.4. Default is 3.4. Defines the version of the COSMIC reference signatures. |
128
- | | **make_decomposition_plots** | Boolean | Defualt is True. If True, Denovo to Cosmic sigantures decompostion plots will be created as a part the results. |
129
- | | **collapse_to_SBS96** | Boolean | Defualt is True. If True, SBS288 and SBS1536 Denovo signatures will be mapped to SBS96 reference signatures. If False, those will be mapped to reference signatures of the same context.
127
+ | | **cosmic_version** | Float | Defines the version of the COSMIC reference signatures (default: `3.4`). Takes a positive float among `1`, `2`, `3`, `3.1`, `3.2`, `3.3`, and `3.4`.|
128
+ | | **make_decomposition_plots** | Boolean | Generate de novo to COSMIC signature decomposition plots as part of the results (default: `True`). Set to `False` to skip generating these plots. |
129
+ | | **collapse_to_SBS96** | Boolean | If `True`, SBS288 and SBS1536 de novo signatures will be mapped to SBS96 reference signatures (default: `True`). If `False`, those will be mapped to reference signatures of the same context. |
130
130
  | **Others** | | | |
131
- | | **get_all_signature_matrices** | Boolean | If True, the Ws and Hs from all the NMF iterations are generated in the output. |
132
- | | **export_probabilities** | Boolean | Defualt is True. If False, then doesn't create the probability matrix. |
131
+ | | **get_all_signature_matrices** | Boolean | Write to output Ws and Hs from all the NMF iterations (default: `False`) |
132
+ | | **export_probabilities** | Boolean | Create the probability matrix (default: `True`). |
133
+ | | **volume** | String | Path to the volume for writing and loading reference genomes, plotting templates, and COSMIC signature plots (default: `None`). Environmental variables take precedence: `SIGPROFILERMATRIXGENERATOR_VOLUME`, `SIGPROFILERPLOTTING_VOLUME`, and `SIGPROFILERASSIGNMENT_VOLUME`. |
133
134
 
134
135
  #### sigProfilerExtractor Example
135
136
  VCF Files as Input
@@ -180,16 +181,16 @@ estimate_solution(base_csvfile="All_solutions_stat.csv",
180
181
 
181
182
  | Parameter | Variable Type | Parameter Description |
182
183
  | --------------------- | -------- |-------- |
183
- | **base_csvfile** | String | Default is "All_solutions_stat.csv". Path to a csv file that contains the statistics of all solutions. |
184
- | **All_solution** | String | Default is "All_Solutions". Path to a folder that contains the results of all solutions. |
185
- | **genomes** | String | Default is Samples.txt. Path to a tab delimilted file that contains the mutation counts for all genomes given to different mutation types. |
186
- | **output** | String | Default is "results". Path to the output folder. |
187
- | **title** | String | Default is "Selection_Plot". This sets the title of the selection_plot.pdf |
184
+ | **base_csvfile** | String | Default is `"All_solutions_stat.csv"`. Path to a CSV file that contains the statistics of all solutions. |
185
+ | **All_solution** | String | Default is `"All_Solutions"`. Path to a folder that contains the results of all solutions. |
186
+ | **genomes** | String | Default is `"Samples.txt"`. Path to a tab-delimited file that contains the mutation counts for all genomes given to different mutation types. |
187
+ | **output** | String | Default is `"results"`. Path to the output folder. |
188
+ | **title** | String | Default is `"Selection_Plot"`. This sets the title of the selection_plot.pdf |
188
189
  | **stability** | Float | Default is 0.8. The cutoff thresh-hold of the average stability. Solutions with average stabilities below this thresh-hold will not be considered. |
189
- | **min_stability** | Float | Default is 0.2. The cutoff thresh-hold of the minimum stability. Solutions with minimum stabilities below this thresh-hold will not be considered. |
190
- | **combined_stability** | Float | Default is 1.0. The cutoff thresh-hold of the combined stability (sum of average and minimum stability). Solutions with combined stabilities below this thresh-hold will not be considered. |
191
- | **allow_stability_drop** | Boolean | Default is False. Defines if solutions with a drop in stability with respect to the highest stable number of signatures will be considered. |
192
- | **exome** | Boolean | Default is "False". Defines if exomes samples are used. |
190
+ | **min_stability** | Float | Default is `0.2`. The cutoff threshold of the minimum stability. Solutions with minimum stabilities below this threshold will not be considered. |
191
+ | **combined_stability** | Float | Default is `1.0`. The cutoff threshold of the combined stability (sum of average and minimum stability). Solutions with combined stabilities below this threshold will not be considered. |
192
+ | **allow_stability_drop** | Boolean | Default is `False`. Defines if solutions with a drop in stability with respect to the highest stable number of signatures will be considered. |
193
+ | **exome** | Boolean | Default is `False`. Defines if exome samples are used. |
193
194
 
194
195
 
195
196
  #### Estimation of the Optimum Solution Example
@@ -216,7 +217,7 @@ The files below will be generated in the output folder:
216
217
 
217
218
  ### <a name="decompose"></a> Decompose
218
219
 
219
- For decomposition of denovo signatures please use [SigProfilerAssignment](https://github.com/AlexandrovLab/SigProfilerAssignment)
220
+ For decomposition of de novo signatures please use [SigProfilerAssignment](https://github.com/AlexandrovLab/SigProfilerAssignment)
220
221
 
221
222
  ### <a name="plotActivity"></a> Activity Stacked Bar Plot
222
223
  Generates a stacked bar plot showing activities in individuals
@@ -0,0 +1,266 @@
1
+ import argparse
2
+ from typing import List
3
+ from SigProfilerExtractor import sigpro
4
+
5
+
6
def str2bool(v):
    """Convert a command-line token to a boolean, for use as an argparse ``type``.

    Args:
        v: A ``bool`` (returned unchanged) or one of the strings
            "yes"/"no", "true"/"false", "t"/"f", "y"/"n", "1"/"0".
            Matching is case-insensitive and ignores surrounding whitespace.

    Returns:
        The boolean value the token stands for.

    Raises:
        argparse.ArgumentTypeError: If ``v`` is neither a bool nor one of the
            recognised boolean strings.
    """
    if isinstance(v, bool):
        return v
    if not isinstance(v, str):
        # Without this guard a non-string (e.g. None) would escape as an
        # AttributeError from ``.lower()`` instead of a clean argparse error.
        raise argparse.ArgumentTypeError("Boolean value expected.")

    token = v.strip().lower()
    if token in ("yes", "true", "t", "y", "1"):
        return True
    if token in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
15
+
16
+
17
def parse_arguments_extractor(args: List[str], description: str) -> argparse.Namespace:
    """Parse the command-line arguments of the signature extraction command.

    Args:
        args: Argument tokens to parse (without the program name).
        description: Text shown at the top of the ``--help`` output.

    Returns:
        A namespace holding the three positional arguments (``input_type``,
        ``output``, ``input_data``) and every optional setting, with the
        defaults below applied to options that were not supplied.
    """
    parser = argparse.ArgumentParser(description=description)

    def add_bool_flag(flag: str, default: bool, help_text: str) -> None:
        # A bare ``--flag`` yields True (const); ``--flag VALUE`` is converted
        # by str2bool; an absent flag falls back to ``default``.
        parser.add_argument(
            flag,
            type=str2bool,
            nargs="?",
            const=True,
            default=default,
            help=help_text,
        )

    # Core required arguments
    parser.add_argument(
        "input_type",
        help=(
            "The input file type: 'vcf', 'matrix', 'bedpe', or 'seg:TYPE'. "
            "Accepted callers for TYPE: {'ASCAT', 'ASCAT_NGS', 'SEQUENZA', "
            "'ABSOLUTE', 'BATTENBERG', 'FACETS', 'PURPLE', 'TCGA'}."
        ),
    )
    parser.add_argument(
        "output",
        help="Path to the output folder.",
    )
    parser.add_argument(
        "input_data",
        help=(
            "Path to input data. For 'vcf' or 'bedpe', provide an input folder. "
            "For 'matrix' or 'seg:TYPE', provide an input file."
        ),
    )

    # Optional arguments with defaults
    parser.add_argument(
        "--reference_genome",
        default="GRCh37",
        help="Reference genome (default: 'GRCh37'). This parameter is applicable only if the input_type is 'vcf'.",
    )
    parser.add_argument(
        "--opportunity_genome",
        default="GRCh37",
        # Fixed: the help used to say 'opportunity_genome' replaces itself.
        help="The build or version of the reference genome for the reference signatures (default: 'GRCh37'). When the input type is 'vcf' the value for 'reference_genome' will be used instead.",
    )
    parser.add_argument(
        "--context_type",
        # Presumably the sentinel "default" is expanded downstream to
        # '96,DINUC,ID' as the help text states -- TODO confirm in sigpro.
        default="default",
        help="Mutational context types (default: '96,DINUC,ID').",
    )
    add_bool_flag("--exome", False, "Extract exomes (default: False).")
    parser.add_argument(
        "--minimum_signatures",
        type=int,
        default=1,
        help="Minimum number of signatures to be extracted (default: 1).",
    )
    parser.add_argument(
        "--maximum_signatures",
        type=int,
        # NOTE(review): the README parameter table lists 25 as the default for
        # maximum_signatures; confirm which value is intended.
        default=10,
        help="Maximum number of signatures to be extracted (default: 10).",
    )
    parser.add_argument(
        "--nmf_replicates",
        type=int,
        default=100,
        help="Number of NMF replicates to be performed at each rank using W and H (default: 100).",
    )
    add_bool_flag(
        "--resample",
        True,
        "Add poisson noise to samples by resampling (default: True).",
    )
    parser.add_argument(
        "--seeds",
        default="random",
        help="Seeds for reproducible resamples, file path or 'random' (default: 'random').",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=1,
        help="Batch size is for GPU only and defines the number of NMF replicates to be performed by each CPU during parallel processing (default: 1).",
    )
    parser.add_argument(
        "--cpu",
        type=int,
        # -1 is the value advertised in the help as "all available".
        default=-1,
        help="Number of processors to use (default: all available).",
    )
    add_bool_flag(
        "--gpu",
        False,
        "Use GPU if available (default: False). note: All available CPU processors are used by default, which may cause a memory error. This error can be resolved by reducing the number of CPU processes through the 'cpu' parameter.",
    )
    parser.add_argument(
        "--nmf_init",
        default="random",
        help="The initialization algorithm for W and H matrix of NMF (default: 'random'). Options are 'random', 'nndsvd', 'nndsvda', 'nndsvdar' and 'nndsvd_min'.",
    )
    parser.add_argument(
        "--precision",
        default="single",
        help="Precision for calculations (default: 'single'). Options are 'single' and 'double'.",
    )
    parser.add_argument(
        "--matrix_normalization",
        default="gmm",
        help="Method of normalizing the genome matrix before it is analyzed by NMF (default: 'gmm'). Options are 'custom', 'gmm', 'log2', or 'none'.",
    )
    parser.add_argument(
        "--min_nmf_iterations",
        type=int,
        default=10000,
        help="Minimum NMF iterations (default: 10000).",
    )
    parser.add_argument(
        "--max_nmf_iterations",
        type=int,
        default=1000000,
        help="Maximum NMF iterations (default: 1000000).",
    )
    parser.add_argument(
        "--nmf_test_conv",
        type=int,
        default=10000,
        help="Test convergence every X iterations (default: 10000).",
    )
    parser.add_argument(
        "--nmf_tolerance",
        type=float,
        default=1e-15,
        help="NMF tolerance for convergence (default: 1e-15).",
    )
    add_bool_flag(
        "--get_all_signature_matrices",
        False,
        "Get all NMF matrices (default: False).",
    )
    add_bool_flag(
        "--export_probabilities",
        True,
        "Export probability matrix (default: True).",
    )
    parser.add_argument(
        "--stability",
        type=float,
        default=0.8,
        help="Average stability cutoff (default: 0.8).",
    )
    parser.add_argument(
        "--min_stability",
        type=float,
        default=0.2,
        help="Minimum stability cutoff (default: 0.2).",
    )
    parser.add_argument(
        "--combined_stability",
        type=float,
        default=1.0,
        help="Combined stability cutoff (default: 1.0).",
    )
    add_bool_flag(
        "--allow_stability_drop",
        False,
        "Allow stability drop (default: False).",
    )
    parser.add_argument(
        "--cosmic_version",
        type=float,
        default=3.4,
        help="COSMIC version for reference signatures. Valid values are 1, 2, 3, 3.1, 3.2, 3.3, and 3.4 (default: 3.4).",
    )
    add_bool_flag(
        "--make_decomposition_plots",
        True,
        "Generate decomposition plots (default: True).",
    )
    add_bool_flag(
        "--collapse_to_SBS96",
        True,
        # Fixed: the previous sentence ("Collapse to SBS288 and SBS1536
        # matrices to SBS96") was ungrammatical and hard to interpret.
        "Collapse SBS288 and SBS1536 de novo signatures to SBS96 reference signatures. If False, they are mapped to reference signatures of the same context (default: True).",
    )

    return parser.parse_args(args)
227
+
228
+
229
class CliController:
    """Entry points that map command-line invocations onto the Python API."""

    def dispatch_sigProfilerExtractor(self, user_args: List[str]) -> None:
        """Parse ``user_args`` and run ``sigpro.sigProfilerExtractor`` with them.

        Args:
            user_args: Command-line tokens (without the program name).
        """
        options = parse_arguments_extractor(
            user_args, "Extract mutational signatures from input samples."
        )

        # Each parsed option is forwarded as the keyword argument of the
        # same name; the tuple keeps the forwarded set explicit.
        forwarded_names = (
            "input_type",
            "output",
            "input_data",
            "reference_genome",
            "opportunity_genome",
            "context_type",
            "exome",
            "minimum_signatures",
            "maximum_signatures",
            "nmf_replicates",
            "resample",
            "seeds",
            "batch_size",
            "cpu",
            "gpu",
            "nmf_init",
            "precision",
            "matrix_normalization",
            "min_nmf_iterations",
            "max_nmf_iterations",
            "nmf_test_conv",
            "nmf_tolerance",
            "get_all_signature_matrices",
            "export_probabilities",
            "stability",
            "min_stability",
            "combined_stability",
            "allow_stability_drop",
            "cosmic_version",
            "make_decomposition_plots",
            "collapse_to_SBS96",
        )
        extractor_kwargs = {name: getattr(options, name) for name in forwarded_names}
        sigpro.sigProfilerExtractor(**extractor_kwargs)
@@ -113,7 +113,7 @@ class NMF:
113
113
  H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]])
114
114
  nv = nndsvd.Nndsvd()
115
115
  for i in range(self._V.shape[0]):
116
- vin = np.mat(self._V.cpu().numpy()[i])
116
+ vin = np.asmatrix(self._V.cpu().numpy()[i])
117
117
  W[i, :, :], H[i, :, :] = nv.initialize(
118
118
  vin, self._rank, options={"flag": 0}
119
119
  )
@@ -123,7 +123,7 @@ class NMF:
123
123
  H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]])
124
124
  nv = nndsvd.Nndsvd()
125
125
  for i in range(self._V.shape[0]):
126
- vin = np.mat(self._V.cpu().numpy()[i])
126
+ vin = np.asmatrix(self._V.cpu().numpy()[i])
127
127
  W[i, :, :], H[i, :, :] = nv.initialize(
128
128
  vin, self._rank, options={"flag": 1}
129
129
  )
@@ -133,7 +133,7 @@ class NMF:
133
133
  H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]])
134
134
  nv = nndsvd.Nndsvd()
135
135
  for i in range(self._V.shape[0]):
136
- vin = np.mat(self._V.cpu().numpy()[i])
136
+ vin = np.asmatrix(self._V.cpu().numpy()[i])
137
137
  W[i, :, :], H[i, :, :] = nv.initialize(
138
138
  vin, self._rank, options={"flag": 2}
139
139
  )
@@ -142,7 +142,7 @@ class NMF:
142
142
  H = np.zeros([self._V.shape[0], self._rank, self._V.shape[2]])
143
143
  nv = nndsvd.Nndsvd()
144
144
  for i in range(self._V.shape[0]):
145
- vin = np.mat(self._V.cpu().numpy()[i])
145
+ vin = np.asmatrix(self._V.cpu().numpy()[i])
146
146
  w, h = nv.initialize(vin, self._rank, options={"flag": 2})
147
147
  min_X = np.min(vin[vin > 0])
148
148
  h[h <= min_X] = min_X