biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (85) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +204 -0
  11. biopipen/ns/regulatory.py +214 -0
  12. biopipen/ns/scrna.py +31 -5
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +167 -3
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/delim/SampleInfo.R +10 -5
  36. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  37. biopipen/scripts/gene/GenePromoters.R +61 -0
  38. biopipen/scripts/misc/Shell.sh +15 -0
  39. biopipen/scripts/plot/Manhattan.R +146 -0
  40. biopipen/scripts/plot/QQPlot.R +146 -0
  41. biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
  42. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
  43. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
  44. biopipen/scripts/regulatory/MotifScan.py +159 -0
  45. biopipen/scripts/regulatory/atSNP.R +33 -0
  46. biopipen/scripts/regulatory/motifBreakR.R +1594 -0
  47. biopipen/scripts/scrna/MarkersFinder.R +69 -67
  48. biopipen/scripts/scrna/SeuratClustering.R +71 -29
  49. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  50. biopipen/scripts/scrna/SeuratPreparing.R +252 -122
  51. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  52. biopipen/scripts/snp/MatrixEQTL.R +85 -44
  53. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  54. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  55. biopipen/scripts/snp/PlinkFilter.py +100 -0
  56. biopipen/scripts/snp/PlinkFreq.R +298 -0
  57. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  58. biopipen/scripts/snp/PlinkHWE.R +80 -0
  59. biopipen/scripts/snp/PlinkHet.R +92 -0
  60. biopipen/scripts/snp/PlinkIBD.R +200 -0
  61. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  62. biopipen/scripts/stats/Mediation.R +94 -0
  63. biopipen/scripts/stats/MetaPvalue.R +2 -1
  64. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  65. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  66. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  67. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  68. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  69. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  70. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  71. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  72. biopipen/utils/gene.R +83 -37
  73. biopipen/utils/gene.py +108 -60
  74. biopipen/utils/misc.R +56 -0
  75. biopipen/utils/misc.py +5 -2
  76. biopipen/utils/reference.py +54 -10
  77. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
  78. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
  79. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
  80. biopipen/ns/bcftools.py +0 -111
  81. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  82. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  83. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  84. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  85. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
biopipen/ns/snp.py CHANGED
@@ -5,9 +5,9 @@ from ..core.config import config
5
5
 
6
6
 
7
7
  class PlinkSimulation(Proc):
8
- """Simulate SNPs using PLINK v1.9
8
+ """Simulate SNPs using PLINK v2
9
9
 
10
- See also <https://www.cog-genomics.org/plink/1.9/input#simulate> and
10
+ See also <https://www.cog-genomics.org/plink/2.0/input#simulate> and
11
11
  <https://pwwang.github.io/biopipen/api/biopipen.ns.snp/#biopipen.ns.snp.PlinkSimulation>
12
12
 
13
13
  Input:
@@ -27,7 +27,7 @@ class PlinkSimulation(Proc):
27
27
  nsnps (type=int): Number of SNPs to simulate
28
28
  ncases (type=int): Number of cases to simulate
29
29
  nctrls (type=int): Number of controls to simulate
30
- plink: Path to PLINK v1.9
30
+ plink: Path to PLINK v2
31
31
  seed (type=int): Random seed. If not set, seed will not be set.
32
32
  label: Prefix label for the SNPs.
33
33
  prevalence (type=float): Disease prevalence.
@@ -37,7 +37,7 @@ class PlinkSimulation(Proc):
37
37
  homodds (type=float): Odds ratio for homozygous genotypes.
38
38
  missing (type=float): Proportion of missing genotypes.
39
39
  args (ns): Additional arguments to pass to PLINK.
40
- - <more>: see <https://www.cog-genomics.org/plink/1.9/input#simulate>.
40
+ - <more>: see <https://www.cog-genomics.org/plink/2.0/input#simulate>.
41
41
  transpose_gtmat (flag): If set, the genotype matrix (`out.gtmat`) will
42
42
  be transposed.
43
43
  sample_prefix: Use this prefix for the sample names. If not set, the sample
@@ -93,11 +93,15 @@ class MatrixEQTL(Proc):
93
93
 
94
94
  Envs:
95
95
  model (choice): The model to use.
96
- - `linear`: Linear model
97
- - `modelLINEAR`: Same as `linear`
98
- - `anova`: ANOVA model
99
- - `modelANOVA`: Same as `anova`
96
+ - linear: Linear model
97
+ - modelLINEAR: Same as `linear`
98
+ - anova: ANOVA model
99
+ - modelANOVA: Same as `anova`
100
100
  pval (type=float): P-value threshold for eQTLs
101
+ match_samples (flag): Match samples in the genotype and expression matrices.
102
+ If True, an error will be raised if samples from `in.geno`, `in.expr`,
103
+ and `in.cov` (if provided) are not the same.
104
+ If False, common samples will be used to subset the matrices.
101
105
  transp (type=float): P-value threshold for trans-eQTLs.
102
106
  If cis-eQTLs are not enabled (`snppos` and `genepos` are not set),
103
107
  this defaults to 1e-5.
@@ -126,6 +130,7 @@ class MatrixEQTL(Proc):
126
130
  envs = {
127
131
  "model": "linear",
128
132
  "pval": 1e-3,
133
+ "match_samples": False,
129
134
  "transp": None,
130
135
  "fdr": False,
131
136
  "snppos": None,
@@ -136,3 +141,506 @@ class MatrixEQTL(Proc):
136
141
  "transpose_cov": False,
137
142
  }
138
143
  script = "file://../scripts/snp/MatrixEQTL.R"
144
+
145
+
146
+ class PlinkFromVcf(Proc):
147
+ """Convert VCF to PLINK format.
148
+
149
+ The PLINK format consists of 3 files: `.bed`, `.bim`, and `.fam`.
150
+
151
+ Requires PLINK v2
152
+
153
+ TODO:
154
+ Handle sex when sex chromosomes are included.
155
+
156
+ Input:
157
+ invcf: VCF file
158
+
159
+ Output:
160
+ outdir: Output directory containing the PLINK files
161
+
162
+ Envs:
163
+ plink: Path to PLINK v2
164
+ tabix: Path to tabix
165
+ ncores (type=int): Number of cores/threads to use, will pass to plink
166
+ `--threads` option
167
+ vcf_half_call (choice): The current VCF standard does not specify
168
+ how '0/.' and similar GT values should be interpreted.
169
+ - error: error out and report the line number of the anomaly
170
+ - e: alias for `error`
171
+ - haploid: treat half-calls as haploid/homozygous
172
+ - h: alias for `haploid`
173
+ - missing: treat half-calls as missing
174
+ - m: alias for `missing`
175
+ - reference: treat the missing part as reference
176
+ - r: alias for `reference`
177
+ double_id (flag): set both FIDs and IIDs to the VCF/BCF sample ID.
178
+ vcf_filter (auto): skip variants which failed one or more filters tracked
179
+ by the FILTER field.
180
+ If True, only FILTER with `PASS` or `.` will be kept.
181
+ Multiple filters can be specified by separating them with space or
182
+ as a list.
183
+ vcf_idspace_to: convert all spaces in sample IDs to this character.
184
+ set_missing_var_ids: update variant IDs using a template string,
185
+ with a '@' where the chromosome code should go, and a '#' where the
186
+ base-pair position belongs. You can also specify `\$r` and `\$a` for
187
+ the reference and alternate alleles, respectively.
188
+ See <https://www.cog-genomics.org/plink/2.0/data#set_all_var_ids>
189
+ max_alleles (type=int): Maximum number of alleles per variant.
190
+ <more>: see <https://www.cog-genomics.org/plink/2.0/> for more options.
191
+ Note that `_` will be replaced by `-` in the argument names.
192
+ """ # noqa: E501
193
+ input = "invcf:file"
194
+ output = "outdir:dir:{{in.invcf | regex_replace: '\\.gz$', '' | stem}}"
195
+ lang = config.lang.python
196
+ envs = {
197
+ "plink": config.exe.plink2,
198
+ "tabix": config.exe.tabix,
199
+ "ncores": config.misc.ncores,
200
+ "vcf_half_call": "missing",
201
+ "double_id": True,
202
+ "vcf_filter": True,
203
+ "vcf_idspace_to": "_",
204
+ "set_missing_var_ids": "@_#",
205
+ "max_alleles": 2,
206
+ }
207
+ script = "file://../scripts/snp/PlinkFromVcf.py"
208
+
209
+
210
+ class Plink2GTMat(Proc):
211
+ """Convert PLINK files to genotype matrix.
212
+
213
+ Requires PLINK v2. The .raw/.traw file is generated by plink and then transformed
214
+ to a genotype matrix file.
215
+ See <https://www.cog-genomics.org/plink/2.0/formats#raw> and
216
+ <https://www.cog-genomics.org/plink/2.0/formats#traw> for more information.
217
+
218
+ The allelic dosage is used as the values of genotype matrix.
219
+ "--keep-allele-order" is used to keep the allele order consistent with the
220
+ reference allele first.
221
+
222
+ Input:
223
+ indir: Input directory containing the PLINK files.
224
+ Including `.bed`, `.bim`, and `.fam` files
225
+
226
+ Output:
227
+ outfile: Genotype matrix file with rows representing SNPs and columns
228
+ representing samples if `envs.transpose` is `False`.
229
+
230
+ Envs:
231
+ plink: Path to PLINK v2.0
232
+ ncores (type=int): Number of cores/threads to use, will pass to plink
233
+ `--threads` option
234
+ transpose (flag): If set, the genotype matrix (`out.outfile`) is transposed.
235
+ samid: what to use as sample ID.
236
+ Placeholders include `{fid}` and `{iid}` for family and individual IDs,
237
+ respectively.
238
+ varid: what to use as variant ID.
239
+ Placeholders include `{chr}`, `{pos}`, `{rs}`, `{ref}`, and `{alt}` for
240
+ chromosome, position, rsID, reference allele, and alternate allele,
241
+ respectively.
242
+ trans_chr: A dictionary to translate chromosome numbers to chromosome names.
243
+ missing_id: what to use as the rs if missing.
244
+ """
245
+ input = "indir:dir"
246
+ output = "outfile:file:{{in.indir | stem}}-gtmat.txt"
247
+ lang = config.lang.python
248
+ envs = {
249
+ "plink": config.exe.plink2,
250
+ "ncores": config.misc.ncores,
251
+ "transpose": False,
252
+ "samid": "{fid}_{iid}",
253
+ "varid": "{chr}_{pos}_{varid}_{ref}_{alt}",
254
+ "trans_chr": {"23": "X", "24": "Y", "25": "XY", "26": "M"},
255
+ "missing_id": "NA",
256
+ }
257
+ script = "file://../scripts/snp/Plink2GTMat.py"
258
+
259
+
260
+ class PlinkIBD(Proc):
261
+ """Run PLINK IBD analysis (identity by descent)
262
+
263
+ See also <https://www.cog-genomics.org/plink/1.9/ibd>
264
+ This has to run with PLINK v1.9. Plink v2 does not support IBD analysis yet.
265
+
266
+ Input:
267
+ indir: Input directory containing the PLINK files.
268
+ Including `.bed`, `.bim`, and `.fam` files
269
+
270
+ Output:
271
+ outdir: Output directory containing the IBD results.
272
+ Including [`.genome`](https://www.cog-genomics.org/plink/2.0/formats#genome)
273
+ file for the original IBD report from PLINK, and `.ibd.png` for the
274
+ heatmap of `PI_HAT` values.
275
+
276
+ Envs:
277
+ plink: Path to PLINK v1.9
278
+ ncores (type=int): Number of cores/threads to use, will pass to plink
279
+ `--threads` option
280
+ highld: High LD regions to be excluded from the analysis.
281
+ If not set, no regions will be excluded.
282
+ samid: what to use as sample ID.
283
+ Placeholders include `{fid}` and `{iid}` for family and individual IDs,
284
+ respectively
285
+ indep (type=auto): LD pruning parameters. Either a list of numerics or a string
286
+ concatenated by `,` to specify
287
+ 1) consider a window of N SNPs (e.g. 50),
288
+ 2) calculate LD between each pair of SNPs in the window (e.g. 5),
289
+ 3) remove one of a pair of SNPs if the LD is greater than X (e.g. 0.2).
290
+ pihat (type=float): PI_HAT threshold for IBD analysis.
291
+ See also <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5007749/>
292
+ plot (flag): If set, plot the heatmap of `PI_HAT` values.
293
+ anno: The annotation file for the samples, used to plot on the heatmap.
294
+ Names must match the ones that are transformed by `envs.samid`.
295
+ seed (type=int): Random seed for the analysis.
296
+ devpars (ns): The device parameters for the plot.
297
+ - width (type=int): Width of the plot
298
+ - height (type=int): Height of the plot
299
+ - res (type=int): Resolution of the plot
300
+ """
301
+ input = "indir:dir"
302
+ output = "outdir:dir:{{in.indir | stem}}.ibd"
303
+ lang = config.lang.rscript
304
+ envs = {
305
+ "plink": config.exe.plink,
306
+ "ncores": config.misc.ncores,
307
+ "highld": None,
308
+ "samid": "{fid}_{iid}",
309
+ "indep": [50, 5, 0.2],
310
+ "pihat": 0.1875,
311
+ "plot": True,
312
+ "anno": None,
313
+ "seed": 8525,
314
+ "devpars": {"width": 1000, "height": 1000, "res": 100},
315
+ }
316
+ script = "file://../scripts/snp/PlinkIBD.R"
317
+ plugin_opts = {"report": "file://../reports/snp/PlinkIBD.svelte"}
318
+
319
+
320
+ class PlinkHWE(Proc):
321
+ """Hardy-Weinberg Equilibrium report and filtering
322
+
323
+ See also <https://www.cog-genomics.org/plink/2.0/basic_stats#hardy>
324
+
325
+ Input:
326
+ indir: Input directory containing the PLINK files.
327
+ Including `.bed`, `.bim`, and `.fam` files
328
+
329
+ Output:
330
+ outdir: Output directory containing the HWE results.
331
+ Including [`.hwe`](https://www.cog-genomics.org/plink/2.0/formats#hwe)
332
+ file for the original HWE report from PLINK and
333
+ `.hardy.fail` for the variants that failed the HWE test.
334
+ It also includes binary files `.bed`, `.bim`, and `.fam`
335
+
336
+ Envs:
337
+ plink: Path to PLINK v2
338
+ ncores (type=int): Number of cores/threads to use, will pass to plink
339
+ `--threads` option
340
+ cutoff (type=float): P-value cutoff for HWE test
341
+ plot (flag): If set, plot the distribution of HWE p-values.
342
+ devpars (ns): The device parameters for the plot.
343
+ - width (type=int): Width of the plot
344
+ - height (type=int): Height of the plot
345
+ - res (type=int): Resolution of the plot
346
+ """
347
+ input = "indir:dir"
348
+ output = "outdir:dir:{{in.indir | stem}}.hwe"
349
+ lang = config.lang.rscript
350
+ envs = {
351
+ "plink": config.exe.plink2,
352
+ "ncores": config.misc.ncores,
353
+ "cutoff": 1e-5,
354
+ "plot": True,
355
+ "devpars": {"width": 1000, "height": 800, "res": 100},
356
+ }
357
+ script = "file://../scripts/snp/PlinkHWE.R"
358
+ plugin_opts = {"report": "file://../reports/snp/PlinkHWE.svelte"}
359
+
360
+
361
+ class PlinkHet(Proc):
362
+ """Calculation of sample heterozygosity.
363
+
364
+ Input:
365
+ indir: Input directory containing the PLINK files.
366
+ Including `.bed`, `.bim`, and `.fam` files
367
+
368
+ Output:
369
+ outdir: Output directory containing the heterozygosity results.
370
+ Including [`.het`](https://www.cog-genomics.org/plink/2.0/formats#het)
371
+ file for the original heterozygosity report from PLINK and
372
+ `.het.fail` for the samples that failed the heterozygosity test.
373
+ It also includes binary files `.bed`, `.bim`, and `.fam`
374
+
375
+ Envs:
376
+ plink: Path to PLINK v2, at least v2.00a5.10
377
+ ncores (type=int): Number of cores/threads to use, will pass to plink
378
+ `--threads` option
379
+ cutoff (type=float): Heterozygosity cutoff, samples with heterozygosity
380
+ beyond `mean - cutoff * sd` or `mean + cutoff * sd` will be considered
381
+ as outliers.
382
+ plot (flag): If set, plot the distribution of heterozygosity values.
383
+ devpars (ns): The device parameters for the plot.
384
+ - width (type=int): Width of the plot
385
+ - height (type=int): Height of the plot
386
+ - res (type=int): Resolution of the plot
387
+ """
388
+ input = "indir:dir"
389
+ output = "outdir:dir:{{in.indir | stem}}.het"
390
+ lang = config.lang.rscript
391
+ envs = {
392
+ "plink": config.exe.plink2,
393
+ "ncores": config.misc.ncores,
394
+ "cutoff": 3.0,
395
+ "plot": True,
396
+ "devpars": {"width": 1000, "height": 800, "res": 100},
397
+ }
398
+ script = "file://../scripts/snp/PlinkHet.R"
399
+ plugin_opts = {"report": "file://../reports/snp/PlinkHet.svelte"}
400
+
401
+
402
+ class PlinkCallRate(Proc):
403
+ """Calculation of call rate for the samples and variants.
404
+
405
+ Input:
406
+ indir: Input directory containing the PLINK files.
407
+ Including `.bed`, `.bim`, and `.fam` files
408
+
409
+ Output:
410
+ outdir: Output directory containing the call rate results.
411
+ Including [`.imiss`](https://www.cog-genomics.org/plink/2.0/formats#imiss)
412
+ file for missing calls for samples,
413
+ [`.lmiss`](https://www.cog-genomics.org/plink/2.0/formats#lmiss) for
414
+ missing calls for variants, `.samplecr.fail` for the samples that fail
415
+ the sample call rate cutoff (`envs.samplecr`), and `.varcr.fail` for the SNPs
416
+ that fail the variant call rate cutoff (`envs.varcr`).
417
+ It also includes binary files `.bed`, `.bim`, and `.fam`.
418
+
419
+ Envs:
420
+ plink: Path to PLINK v2
421
+ ncores (type=int): Number of cores/threads to use, will pass to plink
422
+ `--threads` option
423
+ samplecr (type=float): Sample call rate cutoff
424
+ varcr (type=float): Variant call rate cutoff
425
+ max_iter (type=int): Maximum number of iterations to run the call rate
426
+ calculation.
427
+ Since the sample and variant call rates are affected by each other,
428
+ it may be necessary to iterate the calculation to get the stable results.
429
+ plot (flag): If set, plot the distribution of call rates.
430
+ devpars (ns): The device parameters for the plot.
431
+ - width (type=int): Width of the plot
432
+ - height (type=int): Height of the plot
433
+ - res (type=int): Resolution of the plot
434
+ """
435
+ input = "indir:dir"
436
+ output = "outdir:dir:{{in.indir | stem}}.callrate"
437
+ lang = config.lang.rscript
438
+ envs = {
439
+ "plink": config.exe.plink2,
440
+ "ncores": config.misc.ncores,
441
+ "samplecr": 0.95,
442
+ "varcr": 0.95,
443
+ "max_iter": 3,
444
+ "plot": True,
445
+ "devpars": {"width": 1000, "height": 800, "res": 100},
446
+ }
447
+ script = "file://../scripts/snp/PlinkCallRate.R"
448
+ plugin_opts = {"report": "file://../reports/snp/PlinkCallRate.svelte"}
449
+
450
+
451
+ class PlinkFilter(Proc):
452
+ """Filter samples and variants for PLINK files.
453
+
454
+ Input:
455
+ indir: Input directory containing the PLINK files.
456
+ Including `.bed`, `.bim`, and `.fam` files
457
+ samples_file: File containing the sample IDs.
458
+ variants_file: File containing the variant IDs or regions.
459
+
460
+ Output:
461
+ outdir: Output directory containing the filtered PLINK files.
462
+ Including `.bed`, `.bim`, and `.fam` files
463
+
464
+ Envs:
465
+ plink: Path to PLINK v2
466
+ ncores (type=int): Number of cores/threads to use, will pass to plink
467
+ `--threads` option
468
+ samples (auto): Sample IDs.
469
+ If both FID and IID are provided, they should be separated by `/`. Otherwise,
470
+ assuming the same FID and IID.
471
+ A list of sample IDs or string concatenated by `,`.
472
+ If either `in.samples_file` or `envs.samples_file` is set,
473
+ this will be ignored.
474
+ variants (auto): Variant IDs.
475
+ A list of variant IDs or string concatenated by `,`.
476
+ If either `in.variants_file` or `envs.variants_file` is set,
477
+ this will be ignored.
478
+ samples_file: File containing the sample IDs.
479
+ If `in.samples_file` is set, this will be ignored.
480
+ variants_file: File containing the variant IDs.
481
+ If `in.variants_file` is set, this will be ignored.
482
+ keep (flag): Use `samples`/`variants`/`samples_file`/`variants_file` to
483
+ only keep the specified samples/variants, instead of removing them.
484
+ vfile_type (choice): The type of the variants file.
485
+ - id: Variant IDs
486
+ - bed0: 0-based BED file
487
+ - bed1: 1-based BED file
488
+ chr: Chromosome to keep.
489
+ For example, `1-4 22 XY` will keep chromosomes 1 to 4, 22, and XY.
490
+ not_chr: Chromosome to remove.
491
+ For example, `1-4 22 XY` will remove chromosomes 1 to 4, 22, and XY.
492
+ autosome (flag): Excludes all unplaced and non-autosomal variants
493
+ autosome_xy (flag): Does `autosome` but does not exclude the pseudo-autosomal
494
+ region of X.
495
+ snps_only (auto): Excludes all variants with one or more multi-character
496
+ allele codes. With 'just-acgt', variants with single-character allele codes
497
+ outside of {'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', <missing code>}
498
+ are also excluded.
499
+ """
500
+ input = [
501
+ "indir:dir",
502
+ "samples_file:file",
503
+ "variants_file:file",
504
+ ]
505
+ output = "outdir:dir:{{in.indir | stem}}.filtered"
506
+ lang = config.lang.python
507
+ envs = {
508
+ "plink": config.exe.plink2,
509
+ "ncores": config.misc.ncores,
510
+ "samples": None,
511
+ "variants": None,
512
+ "samples_file": None,
513
+ "variants_file": None,
514
+ "keep": False,
515
+ "vfile_type": "id",
516
+ "chr": None,
517
+ "not_chr": None,
518
+ "autosome": False,
519
+ "autosome_xy": False,
520
+ "snps_only": False,
521
+ }
522
+ script = "file://../scripts/snp/PlinkFilter.py"
523
+
524
+
525
+ class PlinkFreq(Proc):
526
+ """Calculate allele frequencies for the variants.
527
+
528
+ Input:
529
+ indir: Input directory containing the PLINK files.
530
+ Including `.bed`, `.bim`, and `.fam` files
531
+
532
+ Output:
533
+ outdir: Output directory containing the allele frequency results.
534
+ By default, it includes
535
+ [`.afreq`](https://www.cog-genomics.org/plink/2.0/formats#afreq)
536
+ file for the allele frequency report from PLINK.
537
+ Modifiers can be added to change this behavior.
538
+ See `envs.modifier` for more information.
539
+ When `envs.filter != no`, it also includes binary files `.bed`, `.bim`,
540
+ and `.fam` after filtering with `envs.cutoff`.
541
+
542
+ Envs:
543
+ plink: Path to PLINK v2
544
+ ncores (type=int): Number of cores/threads to use, will pass to plink
545
+ `--threads` option
546
+ modifier (choice): The modifier of `--freq` to control the output behavior.
547
+ - none: No modifier, only the `.afreq` file will be generated.
548
+ `MAF` (minor allele frequency) will be added in addition to the
549
+ `REF_FREQ` and `ALT1_FREQ` columns. Check `.afreqx` for the added
550
+ columns.
551
+ - counts: write allele count report to `.acount`.
552
+ See <https://www.cog-genomics.org/plink/2.0/formats#afreq>.
553
+ `ALT1`, `ALT1_CT`, and `REF_CT` are added. Check `.acountx` for
554
+ the added columns.
555
+ - x: write genotype count report to `.gcount`
556
+ Like `--freqx` in v1.9, `--geno-counts` will be run to generate
557
+ the genotype counts.
558
+ `ALT1`, `HET_REF_ALT1_CT`, and `HOM_ALT1_CT` are added. Check
559
+ `.gcountx` for the added columns.
560
+ gz (flag): If set, compress the output files.
561
+ cutoff (auto): Cutoffs to mark or filter the variants.
562
+ If a float is given, default column will be used based on the modifier.
563
+ For `modifier="none"`, it defaults to `MAF`.
564
+ For `modifier="counts"`, it defaults to `ALT1_CT`.
565
+ For `modifier="x"`, it defaults to `HOM_ALT1_CT`.
566
+ Or this could be a dictionary to specify the column names and cutoffs.
567
+ For example, `{"MAF": 0.05}`.
568
+ filter (auto): The direction of filtering variants based on `cutoff`.
569
+ If a single value is given, it will apply to all columns provided in
570
+ `cutoff`. If a dictionary is given, it will apply to the corresponding
571
+ column. If a column cannot be found in the dictionary, it defaults to
572
+ `no`.
573
+ no: Do not filter variants (no binary files are generated in outdir).
574
+ gt: Filter variants with MAF greater than `cutoff`.
575
+ lt: Filter variants with MAF less than `cutoff`.
576
+ ge: Filter variants with MAF greater than or equal to `cutoff`.
577
+ le: Filter variants with MAF less than or equal to `cutoff`.
578
+ plot (flag): If set, plot the distribution of allele frequencies.
579
+ devpars (ns): The device parameters for the plot.
580
+ - width (type=int): Width of the plot
581
+ - height (type=int): Height of the plot
582
+ - res (type=int): Resolution of the plot
583
+ """
584
+ input = "indir:dir"
585
+ output = "outdir:dir:{{in.indir | stem}}.freq"
586
+ lang = config.lang.rscript
587
+ envs = {
588
+ "plink": config.exe.plink2,
589
+ "ncores": config.misc.ncores,
590
+ "modifier": "none",
591
+ "gz": False,
592
+ "cutoff": {},
593
+ "filter": {},
594
+ "plot": True,
595
+ "devpars": {"width": 1000, "height": 800, "res": 100},
596
+ }
597
+ script = "file://../scripts/snp/PlinkFreq.R"
598
+ plugin_opts = {"report": "file://../reports/snp/PlinkFreq.svelte"}
599
+
600
+
601
+ class PlinkUpdateName(Proc):
602
+ """Update variant names in PLINK files.
603
+
604
+ See also <https://www.cog-genomics.org/plink/2.0/data#update_map>.
605
+
606
+ Input:
607
+ indir: Input directory containing the PLINK files.
608
+ Including `.bed`, `.bim`, and `.fam` files
609
+ namefile: File containing the variant names to update.
610
+ Either a file containing two columns, the first column is the old
611
+ variant name, and the second column is the new variant name.
612
+ Or a VCF file containing the variant names to update.
613
+ When a VCF file is given, the chromosome, position, and reference and
614
+ alternate alleles will be used to match the variants.
615
+
616
+ Output:
617
+ outdir: Output directory containing the updated PLINK files.
618
+ Including `.bed`, `.bim`, and `.fam` files
619
+
620
+ Envs:
621
+ ncores: Number of cores/threads to use, will pass to plink `--threads` option
622
+ plink: Path to PLINK v2
623
+ bcftools: Path to bcftools
624
+ match_alt (choice): How to match alternate alleles when `in.namefile`
625
+ is a VCF file.
626
+ - exact: Matches alternate alleles exactly.
627
+ - all: Matches alternate alleles regardless of the order.
628
+ `chr1:100:A:T,G` matches `chr1:100:A:G,T` or `chr1:100:A:T,G`.
629
+ - any: Matches any alternate allele.
630
+ For example, `chr1:100:A:T,G` matches `chr1:100:A:G,C`
631
+ - first_included: Matches when the first allele is included.
632
+ For example, `chr1:100:A:T,G` matches `chr1:100:A:C,T`.
633
+ - first: Match first alternate allele
634
+ For example, `chr1:100:A:T,G` matches `chr1:100:A:T`.
635
+ - none: Do not match alternate alleles
636
+ """
637
+ input = "indir:dir, namefile:file"
638
+ output = "outdir:dir:{{in.indir | stem}}.newnames"
639
+ lang = config.lang.python
640
+ envs = {
641
+ "ncores": config.misc.ncores,
642
+ "plink": config.exe.plink2,
643
+ "bcftools": config.exe.bcftools,
644
+ "match_alt": "exact",
645
+ }
646
+ script = "file://../scripts/snp/PlinkUpdateName.py"