biopipen 0.25.4__py3-none-any.whl → 0.26.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

biopipen/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.25.4"
1
+ __version__ = "0.26.1"
biopipen/core/config.toml CHANGED
@@ -27,6 +27,8 @@ convert = "convert"
27
27
  wget = "wget"
28
28
  # aria2c
29
29
  aria2c = "aria2c"
30
+ # plink
31
+ plink = "plink"
30
32
  # tabix
31
33
  tabix = "tabix"
32
34
  # sambamba
biopipen/ns/rnaseq.py CHANGED
@@ -5,17 +5,154 @@ from ..core.config import config
5
5
 
6
6
 
7
7
  class UnitConversion(Proc):
8
- """Convert expression value units back and forth"""
8
+ """Convert expression value units back and forth
9
+
10
+ See <https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/>
11
+ and <https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#fpkm>.
12
+
13
+ The following conversions are supported -
14
+ * `count -> cpm, fpkm/rpkm, fpkmuq/rpkmuq, tpm, tmm`
15
+ * `fpkm/rpkm -> count, tpm, cpm`
16
+ * `tpm -> count, fpkm/rpkm, cpm`
17
+ * `cpm -> count, fpkm/rpkm, tpm`
18
+ NOTE that during some conversions, `sum(counts/effLen)` is approximated to
19
+ `sum(counts)/sum(effLen) * length(effLen)`
20
+
21
+ You can also use this process to just transform the expression values, e.g., take
22
+ log2 of the expression values. In this case, you can set `inunit` and `outunit` to
23
+ `count` and `log2(count + 1)` respectively.
24
+
25
+ Input:
26
+ infile: Input file containing expression values
27
+ The file should be a matrix with rows representing genes and columns
28
+ representing samples.
29
+ It could be an RDS file containing a data frame or a matrix, or a
30
+ text file containing a matrix with tab as the delimiter. The text
31
+ file can be gzipped.
32
+
33
+ Output:
34
+ outfile: Output file containing the converted expression values
35
+ The file will be a matrix with rows representing genes and columns
36
+ representing samples.
37
+
38
+ Envs:
39
+ inunit: The input unit of the expression values.
40
+ You can also use an expression to indicate the input unit, e.g.,
41
+ `log2(counts + 1)`. The expression should be like `A * fn(B*X + C) + D`,
42
+ where `A`, `B`, `C` and `D` are constants, `fn` is a function, and X is
43
+ the input unit.
44
+ Currently only `expr`, `sqrt`, `log2`, `log10` and `log` are supported as
45
+ functions.
46
+ Supported input units are:
47
+ * counts/count/rawcounts/rawcount: raw counts.
48
+ * cpm: counts per million.
49
+ * fpkm/rpkm: fragments per kilobase of transcript per million.
50
+ * fpkmuq/rpkmuq: upper quartile normalized FPKM/RPKM.
51
+ * tpm: transcripts per million.
52
+ * tmm: trimmed mean of M-values.
53
+ outunit: The output unit of the expression values. An expression can also be
54
+ used for transformation (e.g. `log2(tpm + 1)`). If `inunit` is `count`,
55
+ then this means we are converting raw counts to tpm, and transforming it
56
+ to `log2(tpm + 1)` as the output. Any expression supported by `R` can be
57
+ used. Same units as `inunit` are supported.
58
+ refexon: Path to the reference exon gff file.
59
+ meanfl (type=auto): A file containing the mean fragment length for each sample
60
+ by rows (samples as rowname), without header.
61
+ Or a fixed universal estimated number (1 used by TCGA).
62
+ nreads (type=auto): The estimated total number of reads for each sample.
63
+ Or you can pass a file with the number for each sample by rows
64
+ (samples as rowname), without header.
65
+ When converting `fpkm/rpkm -> count`, it should be total reads of that sample.
66
+ When converting `cpm -> count`: it should be total reads of that sample.
67
+ When converting `tpm -> count`: it should be total reads of that sample.
68
+ When converting `tpm -> cpm`: it should be total reads of that sample.
69
+ When converting `tpm -> fpkm/rpkm`: it should be `sum(fpkm)` of that sample.
70
+ It is not used when converting `count -> cpm, fpkm/rpkm, tpm`.
71
+ """ # noqa: E501
9
72
  input = "infile:file"
10
73
  output = "outfile:file:{{in.infile | basename}}"
11
74
  lang = config.lang.rscript
12
75
  envs = {
13
- "infmt": "matrix", # or rds
14
76
  "inunit": None,
15
77
  "outunit": None,
16
78
  "refexon": config.ref.refexon,
17
- "meanfl": None,
18
- "inlog2p": False,
19
- "outlog2p": False,
79
+ "meanfl": 1,
80
+ "nreads": 1_000_000,
20
81
  }
21
82
  script = "file://../scripts/rnaseq/UnitConversion.R"
83
+
84
+
85
+ class Simulation(Proc):
86
+ """Simulate RNA-seq data using ESCO/RUVcorr package
87
+
88
+ Input:
89
+ ngenes: Number of genes to simulate
90
+ nsamples: Number of samples to simulate
91
+ If you want to force the process to re-simulate for the same
92
+ `ngenes` and `nsamples`, you can set a different value for `envs.seed`.
93
+ Note that the samples will be shown as cells in the output (since
94
+ the simulation is designed for single-cell RNA-seq data).
95
+
96
+ Output:
97
+ outfile: Output file containing the simulated data with rows representing
98
+ genes and columns representing samples.
99
+ outdir: Output directory containing the simulated data
100
+ `sim.rds` and `True.rds` will be generated.
101
+ For `ESCO`, `sim.rds` contains the simulated data in a
102
+ `SingleCellExperiment` object, and `True.rds` contains the matrix of true
103
+ counts.
104
+ For `RUVcorr`, `sim.rds` contains the simulated data in a list with
105
+ `Truth`, A matrix containing the values of Xβ; `Y` A matrix containing the
106
+ values in `Y`; `Noise` A matrix containing the values in `Wα`; `Sigma`
107
+ A matrix containing the true gene-gene correlations, as defined by Xβ; and
108
+ `Info` A matrix containing some of the general information about the
109
+ simulation.
110
+ For all matrices, rows represent genes and columns represent samples.
111
+
112
+ Envs:
113
+ tool (choice): Which tool to use for simulation.
114
+ - ESCO: uses the [ESCO](https://github.com/JINJINT/ESCO) package.
115
+ - RUVcorr: uses the [RUVcorr](https://rdrr.io/bioc/RUVcorr/) package.
116
+ ncores (type=int): Number of cores to use.
117
+ seed (type=int): Random seed.
118
+ If not set, seed will not be set.
119
+ esco_args (ns): Additional arguments to pass to the simulation function.
120
+ - save (choice): Which type of data to save to `out.outfile`.
121
+ - `simulated-truth`: saves the simulated true counts.
122
+ - `zero-inflated`: saves the zero-inflated counts.
123
+ - `down-sampled`: saves the down-sampled counts.
124
+ - type (choice): Which type of heterogeneity to use.
125
+ - single: produces a single population.
126
+ - group: produces distinct groups.
127
+ - tree: produces distinct groups but admits a tree structure.
128
+ - traj: produces distinct groups but admits a smooth trajectory
129
+ structure.
130
+ - <more>: See <https://rdrr.io/github/JINJINT/ESCO/man/escoParams.html>.
131
+ ruvcorr_args (ns): Additional arguments to pass to the simulation
132
+ function.
133
+ - <more>: See <https://rdrr.io/bioc/RUVcorr/man/simulateGEdata.html>.
134
+ transpose_output (flag): If set, the output will be transposed.
135
+ index_start (type=int): The index to start from when naming the samples.
136
+ Affects the sample names in `out.outfile` only.
137
+ """
138
+ input = "ngenes:var, nsamples:var"
139
+ output = [
140
+ "outfile:file:{{in.ngenes}}x{{in.nsamples}}.sim/simulated.txt",
141
+ "outdir:dir:{{in.ngenes}}x{{in.nsamples}}.sim",
142
+ ]
143
+ lang = config.lang.rscript
144
+ envs = {
145
+ "tool": "RUVcorr",
146
+ "ncores": config.misc.ncores,
147
+ "type": "single",
148
+ "esco_args": {
149
+ "dropout-type": "none",
150
+ "save": "simulated-truth",
151
+ "type": "single",
152
+ },
153
+ "ruvcorr_args": {},
154
+ "seed": None,
155
+ "transpose_output": False,
156
+ "index_start": 1,
157
+ }
158
+ script = "file://../scripts/rnaseq/Simulation.R"
biopipen/ns/scrna.py CHANGED
@@ -483,14 +483,18 @@ class SeuratClusterStats(Proc):
483
483
  The parameters from the cases can overwrite the default parameters.
484
484
  - frac (flag): Whether to output the fraction of cells instead of number.
485
485
  - pie (flag): Also output a pie chart?
486
+ - circos (flag): Also output a circos plot?
486
487
  - table (flag): Whether to output a table (in tab-delimited format) and in the report.
487
488
  - frac_ofall(flag): Whether to output the fraction against all cells,
488
489
  instead of the fraction in each group.
490
+ Does not work for circos plot.
489
491
  Only works when `frac` is `True` and `group-by` is specified.
490
492
  - transpose (flag): Whether to transpose the cluster and group, that is,
491
493
  using group as the x-axis and cluster to fill the plot.
494
+ For circos plot, when transposed, the arrows will be drawn from the idents (by `ident`) to the
495
+ groups (by `group-by`).
492
496
  Only works when `group-by` is specified.
493
- - position (choice): The position of the bars.
497
+ - position (choice): The position of the bars. Does not work for pie and circos plots.
494
498
  - stack: Use `position_stack()`.
495
499
  - fill: Use `position_fill()`.
496
500
  - dodge: Use `position_dodge()`.
@@ -499,8 +503,13 @@ class SeuratClusterStats(Proc):
499
503
  - group-by: The column name in metadata to group the cells.
500
504
  Does NOT support pie charts.
501
505
  - split-by: The column name in metadata to split the cells into different plots.
506
+ Does NOT support circos plots.
502
507
  - subset: An expression to subset the cells, will be passed to
503
508
  `dplyr::filter()` on metadata.
509
+ - circos_devpars (ns): The device parameters for the circos plots.
510
+ - res (type=int): The resolution of the plots.
511
+ - height (type=int): The height of the plots.
512
+ - width (type=int): The width of the plots.
504
513
  - pie_devpars (ns): The device parameters for the pie charts.
505
514
  - res (type=int): The resolution of the plots.
506
515
  - height (type=int): The height of the plots.
@@ -634,6 +643,7 @@ class SeuratClusterStats(Proc):
634
643
  "stats_defaults": {
635
644
  "frac": False,
636
645
  "pie": False,
646
+ "circos": False,
637
647
  "table": False,
638
648
  "position": "auto",
639
649
  "frac_ofall": False,
@@ -644,6 +654,7 @@ class SeuratClusterStats(Proc):
644
654
  "subset": None,
645
655
  "devpars": {"res": 100, "height": 600, "width": 800},
646
656
  "pie_devpars": {"res": 100, "height": 600, "width": 800},
657
+ "circos_devpars": {"res": 100, "height": 600, "width": 600},
647
658
  },
648
659
  "stats": {
649
660
  "Number of cells in each cluster": {
@@ -882,8 +893,9 @@ class CellsDistribution(Proc):
882
893
  each: The column name in metadata to separate the cells into different plots.
883
894
  section: The section to show in the report. This allows different cases to be put in the same section in report.
884
895
  Only works when `each` is not specified.
885
- overlap (list): Plot the overlap of cells in different cases under the same section.
886
- The section must have at least 2 cases.
896
+ overlap (list): Plot the overlap of cell groups (values of `cells_by`) in different cases
897
+ under the same section.
898
+ The section must have at least 2 cases, each case should have a single `cells_by` column.
887
899
  cases (type=json;order=99): If you have multiple cases, you can specify them here.
888
900
  Keys are the names of the cases and values are the options above except `mutaters`.
889
901
  If some options are not specified, the options in `envs` will be used.
@@ -1141,6 +1153,7 @@ class TopExpressingGenes(Proc):
1141
1153
  markers See below for all libraries.
1142
1154
  <https://maayanlab.cloud/Enrichr/#libraries>
1143
1155
  n (type=int): The number of top expressing genes to find.
1156
+ subset: An expression to subset the cells for each case.
1144
1157
  cases (type=json): If you have multiple cases, you can specify them
1145
1158
  here. The keys are the names of the cases and the values are the
1146
1159
  above options except `mutaters`. If some options are
@@ -1161,6 +1174,7 @@ class TopExpressingGenes(Proc):
1161
1174
  "section": "DEFAULT",
1162
1175
  "dbs": ["KEGG_2021_Human", "MSigDB_Hallmark_2020"],
1163
1176
  "n": 250,
1177
+ "subset": None,
1164
1178
  "cases": {},
1165
1179
  }
1166
1180
  plugin_opts = {
biopipen/ns/snp.py ADDED
@@ -0,0 +1,70 @@
1
+ """Plink processes"""
2
+
3
+ from ..core.proc import Proc
4
+ from ..core.config import config
5
+
6
+
7
+ class PlinkSimulation(Proc):
8
+ """Simulate SNPs using PLINK v1.9
9
+
10
+ See also <https://www.cog-genomics.org/plink/1.9/input#simulate>.
11
+
12
+ Input:
13
+ nsnps: Number of SNPs to simulate
14
+ ncases: Number of cases to simulate
15
+ nctrls: Number of controls to simulate
16
+
17
+ Output:
18
+ outdir: Output directory containing the simulated data
19
+ `plink_sim.bed`, `plink_sim.bim`, and `plink_sim.fam` will be generated.
20
+ gtmat: Genotype matrix file containing the simulated data with rows representing
21
+ SNPs and columns representing samples.
22
+
23
+ Envs:
24
+ plink: Path to PLINK v1.9
25
+ seed (type=int): Random seed.
26
+ If not set, seed will not be set.
27
+ label: Prefix label for the SNPs.
28
+ prevalence (type=float): Disease prevalence.
29
+ minfreq (type=float): Minimum allele frequency.
30
+ maxfreq (type=float): Maximum allele frequency.
31
+ hetodds (type=float): Odds ratio for heterozygous genotypes.
32
+ homodds (type=float): Odds ratio for homozygous genotypes.
33
+ missing (type=float): Proportion of missing genotypes.
34
+ args (ns): Additional arguments to pass to PLINK.
35
+ - <more>: see <https://www.cog-genomics.org/plink/1.9/input#simulate>.
36
+ transpose_gtmat (flag): If set, the genotype matrix (`out.gtmat`) will
37
+ be transposed.
38
+ sample_prefix: Use this prefix for the sample names. If not set, the sample
39
+ names will be `per0_per0`, `per1_per1`, `per2_per2`, etc. If set, the
40
+ sample names will be `prefix0`, `prefix1`, `prefix2`, etc.
41
+ This only affects the sample names in the genotype matrix file
42
+ (`out.gtmat`).
43
+ """
44
+ input = "nsnps:var, ncases:var, nctrls:var"
45
+ output = [
46
+ (
47
+ "outdir:dir:{{in.nsnps | int}}_"
48
+ "{{in.ncases | int}}xcases_{{in.nctrls | int}}xctrls.plink_sim"
49
+ ),
50
+ (
51
+ "gtmat:file:{{in.nsnps | int}}_"
52
+ "{{in.ncases | int}}xcases_{{in.nctrls | int}}xctrls.plink_sim/gtmat.txt"
53
+ ),
54
+ ]
55
+ lang = config.lang.python
56
+ envs = {
57
+ "plink": config.exe.plink,
58
+ "seed": None,
59
+ "label": "SNP",
60
+ "prevalence": 0.01,
61
+ "minfreq": 0.0,
62
+ "maxfreq": 1.0,
63
+ "hetodds": 1.0,
64
+ "homodds": 1.0,
65
+ "missing": 0.0,
66
+ "args": {},
67
+ "transpose_gtmat": False,
68
+ "sample_prefix": None,
69
+ }
70
+ script = "file://../scripts/snp/PlinkSimulation.py"
biopipen/ns/stats.py ADDED
@@ -0,0 +1,320 @@
1
+ """Provides processes for statistics."""
2
+
3
+ from ..core.proc import Proc
4
+ from ..core.config import config
5
+
6
+
7
+ class ChowTest(Proc):
8
+ """Massive Chow tests.
9
+
10
+ See Also https://en.wikipedia.org/wiki/Chow_test
11
+
12
+ Input:
13
+ infile: The input data file. The rows are samples and the columns are
14
+ features. It must be tab-delimited.
15
+ ```
16
+ Sample F1 F2 F3 ... Fn
17
+ S1 1.2 3.4 5.6 7.8
18
+ S2 2.3 4.5 6.7 8.9
19
+ ...
20
+ Sm 5.6 7.8 9.0 1.2
21
+ ```
22
+ groupfile: The group file. The rows are the samples and the columns
23
+ are the groupings. It must be tab-delimited.
24
+ ```
25
+ Sample G1 G2 G3 ... Gk
26
+ S1 0 1 0 0
27
+ S2 2 1 0 NA # exclude this sample
28
+ ...
29
+ Sm 1 0 0 0
30
+ ```
31
+ fmlfile: The formula file. The first column is grouping and the
32
+ second column is the formula. It must be tab-delimited.
33
+ ```
34
+ Group Formula ... # Other columns to be added to outfile
35
+ G1 Fn ~ F1 + Fx + Fy # Fx, Fy could be covariates
36
+ G1 Fn ~ F2 + Fx + Fy
37
+ ...
38
+ Gk Fn ~ F3 + Fx + Fy
39
+ ```
40
+
41
+ Output:
42
+ outfile: The output file. It is a tab-delimited file with the first
43
+ column as the grouping and the second column as the p-value.
44
+ ```
45
+ Group Formula ... Pooled Groups SSR SumSSR Fstat Pval Padj
46
+ G1 Fn ~ F1 0.123 2 1 0.123 0.123 0.123 0.123
47
+ G1 Fn ~ F2 0.123 2 1 0.123 0.123 0.123 0.123
48
+ ...
49
+ Gk Fn ~ F3 0.123 2 1 0.123 0.123 0.123 0.123
50
+ ```
51
+
52
+ Envs:
53
+ padj (choice): The method for p-value adjustment.
54
+ - none: No p-value adjustment (no Padj column in outfile).
55
+ - holm: Holm-Bonferroni method.
56
+ - hochberg: Hochberg method.
57
+ - hommel: Hommel method.
58
+ - bonferroni: Bonferroni method.
59
+ - BH: Benjamini-Hochberg method.
60
+ - BY: Benjamini-Yekutieli method.
61
+ - fdr: FDR correction method.
62
+ transpose_input (flag): Whether to transpose the input file.
63
+ transpose_group (flag): Whether to transpose the group file.
64
+ """
65
+ input = "infile:file, groupfile:file, fmlfile:file"
66
+ output = "outfile:file:{{in.infile | stem}}.chowtest.txt"
67
+ lang = config.lang.rscript
68
+ envs = {
69
+ "padj": "none",
70
+ "transpose_input": False,
71
+ "transpose_group": False,
72
+ }
73
+ script = "file://../scripts/stats/ChowTest.R"
74
+
75
+
76
+ class LiquidAssoc(Proc):
77
+ """Liquid association tests.
78
+
79
+ See Also https://github.com/gundt/fastLiquidAssociation
80
+ Requires https://github.com/pwwang/fastLiquidAssociation
81
+
82
+ Input:
83
+ infile: The input data file. The rows are samples and the columns are
84
+ features. It must be tab-delimited.
85
+ ```
86
+ Sample F1 F2 F3 ... Fn
87
+ S1 1.2 3.4 5.6 7.8
88
+ S2 2.3 4.5 6.7 8.9
89
+ ...
90
+ Sm 5.6 7.8 9.0 1.2
91
+ ```
92
+ The features (columns) will be tested pairwise, which will be the X and
93
+ Y columns in the result of `fastMLA`
94
+ covfile: The covariate file. The rows are the samples and the columns
95
+ are the covariates. It must be tab-delimited.
96
+ If provided, the data in `in.infile` will be adjusted by covariates by
97
+ regressing out the covariates and the residuals will be used for
98
+ liquid association tests.
99
+ groupfile: The group file. The rows are the samples and the columns
100
+ are the groupings. It must be tab-delimited.
101
+ ```
102
+ Sample G1 G2 G3 ... Gk
103
+ S1 0 1 0 0
104
+ S2 2 1 0 NA # exclude this sample
105
+ ...
106
+ Sm 1 0 0 0
107
+ ```
108
+ This will be served as the Z column in the result of `fastMLA`
109
+ This can be omitted. If so, `envs.nvec` should be specified, which is
110
+ to select column from `in.infile` as Z.
111
+ fmlfile: The formula file. The 3 columns are X3, X12 and X21. The results
112
+ will be filtered based on the formula. It must be tab-delimited without
113
+ header.
114
+
115
+ Output:
116
+ outfile: The output file.
117
+ ```
118
+ X12 X21 X3 rhodiff MLA value estimates san.se wald Pval model
119
+ C38 C46 C5 0.87 0.32 0.67 0.20 10.87 0 F
120
+ C46 C38 C5 0.87 0.32 0.67 0.20 10.87 0 F
121
+ C27 C39 C4 0.94 0.34 1.22 0.38 10.03 0 F
122
+ ```
123
+
124
+ Envs:
125
+ nvec: The column index (1-based) of Z in `in.infile`, if `in.groupfile` is
126
+ omitted. You can specify multiple columns by comma-separated values, or
127
+ a range of columns by `-`. For example, `1,3,5-7,9`. It also supports
128
+ column names. For example, `F1,F3`. `-` is not supported for column
129
+ names.
130
+ x: Similar to `nvec`, but limit X group to given features.
131
+ The rest of features (other than X and Z) in `in.infile` will
132
+ be used as Y.
133
+ The features in `in.infile` will still be tested pairwise, but only
134
+ features in X and Y will be kept.
135
+ topn (type=int): Number of results to return by `fastMLA`, ordered from
136
+ highest `|MLA|` value descending.
137
+ The default of the package is 2000, but here we set to 1e6 to return as
138
+ many results as possible (also good to do pvalue adjustment).
139
+ rvalue (type=float): Tolerance value for LA approximation. Lower values of
140
+ rvalue will cause a more thorough search, but take longer.
141
+ cut (type=int): Value passed to the GLA function to create buckets
142
+ (equal to number of buckets+1). Values placing between 15-30 samples per
143
+ bucket are optimal. Must be a positive integer>1. By default,
144
+ `max(ceiling(nrow(data)/22), 4)` is used.
145
+ ncores (type=int): Number of cores to use for parallelization.
146
+ padj (choice): The method for p-value adjustment.
147
+ - none: No p-value adjustment (no Padj column in outfile).
148
+ - holm: Holm-Bonferroni method.
149
+ - hochberg: Hochberg method.
150
+ - hommel: Hommel method.
151
+ - bonferroni: Bonferroni method.
152
+ - BH: Benjamini-Hochberg method.
153
+ - BY: Benjamini-Yekutieli method.
154
+ - fdr: FDR correction method.
155
+ transpose_input (flag): Whether to transpose the input file.
156
+ transpose_group (flag): Whether to transpose the group file.
157
+ transpose_cov (flag): Whether to transpose the covariate file.
158
+ xyz_names: The names of X12, X21 and X3 in the final output file. Separated
159
+ by comma. For example, `X12,X21,X3`.
160
+ """
161
+ input = "infile:file, covfile:file, groupfile:file, fmlfile:file"
162
+ output = "outfile:file:{{in.infile | stem}}.liquidassoc.txt"
163
+ lang = config.lang.rscript
164
+ envs = {
165
+ "nvec": None,
166
+ "x": None,
167
+ "topn": 1e6,
168
+ "rvalue": 0.5,
169
+ "cut": 20,
170
+ "ncores": config.misc.ncores,
171
+ "padj": "none",
172
+ "transpose_input": False,
173
+ "transpose_group": False,
174
+ "transpose_cov": False,
175
+ "xyz_names": None,
176
+ }
177
+ script = "file://../scripts/stats/LiquidAssoc.R"
178
+
179
+
180
+ class DiffCoexpr(Proc):
181
+ """Differential co-expression analysis.
182
+
183
+ See also <https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-497>
184
+ and <https://github.com/DavisLaboratory/dcanr/blob/8958d61788937eef3b7e2b4118651cbd7af7469d/R/inference_methods.R#L199>.
185
+
186
+ Input:
187
+ infile: The input data file. The rows are samples and the columns are
188
+ features. It must be tab-delimited.
189
+ ```
190
+ Sample F1 F2 F3 ... Fn
191
+ S1 1.2 3.4 5.6 7.8
192
+ S2 2.3 4.5 6.7 8.9
193
+ ...
194
+ Sm 5.6 7.8 9.0 1.2
195
+ ```
196
+ groupfile: The group file. The rows are the samples and the columns
197
+ are the groupings. It must be tab-delimited.
198
+ ```
199
+ Sample G1 G2 G3 ... Gk
200
+ S1 0 1 0 0
201
+ S2 2 1 0 NA # exclude this sample
202
+ ...
203
+ Sm 1 0 0 0
204
+ ```
205
+
206
+ Output:
207
+ outfile: The output file. It is a tab-delimited file with the first
208
+ column as the feature pair and the second column as the p-value.
209
+ ```
210
+ Group Feature1 Feature2 Pval Padj
211
+ G1 F1 F2 0.123 0.123
212
+ G1 F1 F3 0.123 0.123
213
+ ...
214
+ ```
215
+
216
+ Envs:
217
+ method (choice): The method used to calculate the differential
218
+ co-expression.
219
+ - pearson: Pearson correlation.
220
+ - spearman: Spearman correlation.
221
+ beta: The beta value for the differential co-expression analysis.
222
+ padj (choice): The method for p-value adjustment.
223
+ - none: No p-value adjustment (no Padj column in outfile).
224
+ - holm: Holm-Bonferroni method.
225
+ - hochberg: Hochberg method.
226
+ - hommel: Hommel method.
227
+ - bonferroni: Bonferroni method.
228
+ - BH: Benjamini-Hochberg method.
229
+ - BY: Benjamini-Yekutieli method.
230
+ - fdr: FDR correction method.
231
+ perm_batch (type=int): The number of permutations to run in each batch
232
+ seed (type=int): The seed for random number generation
233
+ ncores (type=int): The number of cores to use for parallelization
234
+ transpose_input (flag): Whether to transpose the input file.
235
+ transpose_group (flag): Whether to transpose the group file.
236
+ """ # noqa: E501
237
+ input = "infile:file, groupfile:file"
238
+ output = "outfile:file:{{in.infile | stem}}.diffcoexpr.txt"
239
+ lang = config.lang.rscript
240
+ envs = {
241
+ "method": "pearson",
242
+ "beta": 6,
243
+ "padj": "none",
244
+ "perm_batch": 20,
245
+ "seed": 8525,
246
+ "ncores": config.misc.ncores,
247
+ "transpose_input": False,
248
+ "transpose_group": False,
249
+ }
250
+ script = "file://../scripts/stats/DiffCoexpr.R"
251
+
252
+
253
+ class MetaPvalue(Proc):
254
+ """Calculation of meta p-values.
255
+
256
+ If there is only one input file, only the p-value adjustment will be performed.
257
+
258
+ Input:
259
+ infiles: The input files. Each file is a tab-delimited file with multiple
260
+ columns. There should be ID column(s) to match the rows in other files and
261
+ p-value column(s) to be combined. The records will be full-joined by ID.
262
+ When only one file is provided, only the pvalue adjustment will be
263
+ performed when `envs.padj` is not `none`, otherwise the input file will
264
+ be copied to `out.outfile`.
265
+
266
+ Output:
267
+ outfile: The output file. It is a tab-delimited file with the first column as
268
+ the ID and the second column as the combined p-value.
269
+ ```
270
+ ID ID1 ... Pval Padj
271
+ a x ... 0.123 0.123
272
+ b y ... 0.123 0.123
273
+ ...
274
+ ```
275
+
276
+ Envs:
277
+ id_cols: The column names used in all `in.infiles` as ID columns. Multiple
278
+ columns can be specified by comma-separated values. For example, `ID1,ID2`.
279
+ If `id_exprs` is specified, this should be a single column name for the new
280
+ ID column in each `in.infiles` and the final `out.outfile`.
281
+ id_exprs: The R expressions for each `in.infiles` to get ID column(s).
282
+ pval_cols: The column names used in all `in.infiles` as p-value columns.
283
+ Different columns can be specified by comma-separated values for each
284
+ `in.infiles`. For example, `Pval1,Pval2`.
285
+ method (choice): The method used to calculate the meta-pvalue.
286
+ - fisher: Fisher's method.
287
+ - sumlog: Sum of logarithms (same as Fisher's method)
288
+ - logitp: Logit method.
289
+ - sumz: Sum of z method (Stouffer's method).
290
+ - meanz: Mean of z method.
291
+ - meanp: Mean of p method.
292
+ - invt: Inverse t method.
293
+ - sump: Sum of p method (Edgington's method).
294
+ - votep: Vote counting method.
295
+ - wilkinsonp: Wilkinson's method.
296
+ - invchisq: Inverse chi-square method.
297
+ na: The method to handle NA values. -1 to skip the record. Otherwise NA
298
+ will be replaced by the given value.
299
+ padj (choice): The method for p-value adjustment.
300
+ - none: No p-value adjustment (no Padj column in outfile).
301
+ - holm: Holm-Bonferroni method.
302
+ - hochberg: Hochberg method.
303
+ - hommel: Hommel method.
304
+ - bonferroni: Bonferroni method.
305
+ - BH: Benjamini-Hochberg method.
306
+ - BY: Benjamini-Yekutieli method.
307
+ - fdr: FDR correction method.
308
+ """
309
+ input = "infiles:files"
310
+ output = "outfile:file:{{in.infiles | first | stem}}.metapval.txt"
311
+ lang = config.lang.rscript
312
+ envs = {
313
+ "id_cols": None,
314
+ "id_exprs": None,
315
+ "pval_cols": None,
316
+ "method": "fisher",
317
+ "na": -1,
318
+ "padj": "none",
319
+ }
320
+ script = "file://../scripts/stats/MetaPvalue.R"