biopipen-0.28.1-py3-none-any.whl → biopipen-0.29.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +8 -0
- biopipen/ns/bam.py +0 -2
- biopipen/ns/bed.py +35 -0
- biopipen/ns/cellranger_pipeline.py +5 -5
- biopipen/ns/cnv.py +18 -2
- biopipen/ns/cnvkit_pipeline.py +16 -11
- biopipen/ns/gene.py +68 -23
- biopipen/ns/misc.py +2 -15
- biopipen/ns/plot.py +204 -0
- biopipen/ns/regulatory.py +214 -0
- biopipen/ns/scrna.py +31 -5
- biopipen/ns/snp.py +516 -8
- biopipen/ns/stats.py +167 -3
- biopipen/ns/vcf.py +196 -0
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/scripts/bam/CNVpytor.py +144 -46
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMerge.py +1 -1
- biopipen/scripts/cnv/AneuploidyScore.R +30 -7
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
- biopipen/scripts/cnv/TMADScore.R +21 -5
- biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
- biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
- biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
- biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
- biopipen/scripts/delim/SampleInfo.R +10 -5
- biopipen/scripts/gene/GeneNameConversion.R +65 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/plot/Manhattan.R +146 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/atSNP.R +33 -0
- biopipen/scripts/regulatory/motifBreakR.R +1594 -0
- biopipen/scripts/scrna/MarkersFinder.R +69 -67
- biopipen/scripts/scrna/SeuratClustering.R +71 -29
- biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
- biopipen/scripts/scrna/SeuratPreparing.R +252 -122
- biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
- biopipen/scripts/snp/MatrixEQTL.R +85 -44
- biopipen/scripts/snp/Plink2GTMat.py +133 -0
- biopipen/scripts/snp/PlinkCallRate.R +190 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +298 -0
- biopipen/scripts/snp/PlinkFromVcf.py +78 -0
- biopipen/scripts/snp/PlinkHWE.R +80 -0
- biopipen/scripts/snp/PlinkHet.R +92 -0
- biopipen/scripts/snp/PlinkIBD.R +200 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/Mediation.R +94 -0
- biopipen/scripts/stats/MetaPvalue.R +2 -1
- biopipen/scripts/stats/MetaPvalue1.R +70 -0
- biopipen/scripts/tcr/TCRClusterStats.R +12 -7
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/VcfFix_utils.py +1 -1
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/utils/gene.R +83 -37
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.R +56 -0
- biopipen/utils/misc.py +5 -2
- biopipen/utils/reference.py +54 -10
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
- biopipen/ns/bcftools.py +0 -111
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
biopipen/scripts/bam/CNVpytor.py
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
|
|
3
|
+
import warnings
|
|
3
4
|
import pandas
|
|
4
|
-
from
|
|
5
|
+
from datetime import datetime
|
|
5
6
|
from biopipen.utils.reference import bam_index
|
|
6
|
-
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
7
|
+
from biopipen.utils.misc import run_command, dict_to_cli_args, logger
|
|
7
8
|
|
|
8
|
-
bamfile = {{in.bamfile | quote}} # pyright: ignore
|
|
9
|
+
bamfile = {{in.bamfile | quote}} # pyright: ignore # noqa
|
|
9
10
|
snpfile = {{in.snpfile | repr}} # pyright: ignore
|
|
10
11
|
outdir = Path({{out.outdir | quote}}) # pyright: ignore
|
|
11
12
|
cnvpytor = {{envs.cnvpytor | quote}} # pyright: ignore
|
|
12
|
-
cnvnator2vcf = {{envs.cnvnator2vcf | quote}} # pyright: ignore
|
|
13
13
|
samtools = {{envs.samtools | quote}} # pyright: ignore
|
|
14
14
|
ncores = {{envs.ncores | int}} # pyright: ignore
|
|
15
15
|
refdir = {{envs.refdir | quote}} # pyright: ignore
|
|
@@ -20,7 +20,6 @@ args = {{envs | repr}} # pyright: ignore
|
|
|
20
20
|
|
|
21
21
|
del args['cnvpytor']
|
|
22
22
|
del args['ncores']
|
|
23
|
-
del args['cnvnator2vcf']
|
|
24
23
|
del args['samtools']
|
|
25
24
|
del args['refdir']
|
|
26
25
|
del args['genome']
|
|
@@ -236,47 +235,138 @@ def load_chrsize():
|
|
|
236
235
|
yield chrom, int(size)
|
|
237
236
|
|
|
238
237
|
|
|
239
|
-
def
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
{
|
|
245
|
-
"": cnvpytor2vcf,
|
|
246
|
-
"reference": genome,
|
|
247
|
-
"_": [infile, refdir],
|
|
248
|
-
},
|
|
249
|
-
prefix="-",
|
|
250
|
-
),
|
|
251
|
-
stdout="return",
|
|
252
|
-
)
|
|
253
|
-
if fix:
|
|
254
|
-
unfixedfile.write_text(stdout)
|
|
238
|
+
def parse_chrom(chrom, chromdir):
|
|
239
|
+
file = Path(chromdir) / f"{chrom}.fa"
|
|
240
|
+
if not file.exists():
|
|
241
|
+
warnings.warn(f"Chromosome file not found in refdir: {chrom}")
|
|
242
|
+
return ""
|
|
255
243
|
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
244
|
+
seq = ""
|
|
245
|
+
with open(file) as f:
|
|
246
|
+
for line in f:
|
|
247
|
+
line = line.strip()
|
|
248
|
+
if not line:
|
|
249
|
+
continue
|
|
250
|
+
if line.startswith(">"):
|
|
251
|
+
seq = ""
|
|
252
|
+
else:
|
|
253
|
+
seq += line
|
|
254
|
+
return seq
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def cnvpytor2vcf(infile, snp):
|
|
258
|
+
# snp: in case to be used in the future
|
|
259
|
+
outfile = Path(infile).with_suffix(f".vcf")
|
|
260
|
+
# stdout = run_command(
|
|
261
|
+
# dict_to_cli_args(
|
|
262
|
+
# {
|
|
263
|
+
# "": cnvnator2vcf,
|
|
264
|
+
# "reference": genome,
|
|
265
|
+
# "_": [infile, refdir],
|
|
266
|
+
# },
|
|
267
|
+
# prefix="-",
|
|
268
|
+
# ),
|
|
269
|
+
# stdout="return",
|
|
270
|
+
# )
|
|
271
|
+
## command hangs
|
|
272
|
+
with open(infile) as fin, open(outfile, "w") as fout:
|
|
273
|
+
fout.write("##fileformat=VCFv4.2\n")
|
|
274
|
+
fout.write(f"##fileDate={datetime.now().strftime('%Y%m%d')}\n")
|
|
275
|
+
fout.write(f"##reference={genome}\n")
|
|
276
|
+
fout.write(f"##source=CNVpytor\n")
|
|
267
277
|
for chrom, size in load_chrsize():
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
278
|
+
fout.write(f"##contig=<ID={chrom},length={size}>\n")
|
|
279
|
+
fout.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n')
|
|
280
|
+
fout.write('##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">\n')
|
|
281
|
+
fout.write('##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">\n')
|
|
282
|
+
fout.write('##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">\n')
|
|
283
|
+
fout.write('##INFO=<ID=natorRD,Number=1,Type=Float,Description="Normalized RD">\n')
|
|
284
|
+
fout.write('##INFO=<ID=natorP1,Number=1,Type=Float,Description="e-val by t-test">\n')
|
|
285
|
+
fout.write('##INFO=<ID=natorP2,Number=1,Type=Float,Description="e-val by Gaussian tail">\n')
|
|
286
|
+
fout.write('##INFO=<ID=natorP3,Number=1,Type=Float,Description="e-val by t-test (middle)">\n')
|
|
287
|
+
fout.write('##INFO=<ID=natorP4,Number=1,Type=Float,Description="e-val by Gaussian tail (middle)">\n')
|
|
288
|
+
fout.write('##INFO=<ID=natorQ0,Number=1,Type=Float,Description="Fraction of reads with 0 mapping quality">\n')
|
|
289
|
+
fout.write('##INFO=<ID=natorPE,Number=1,Type=Integer,Description="Number of paired-ends support the event">\n')
|
|
290
|
+
fout.write('##INFO=<ID=SAMPLES,Number=.,Type=String,Description="Sample genotyped to have the variant">\n')
|
|
291
|
+
fout.write('##ALT=<ID=DEL,Description="Deletion">\n')
|
|
292
|
+
fout.write('##ALT=<ID=DUP,Description="Duplication">\n')
|
|
293
|
+
fout.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
|
|
294
|
+
fout.write('##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">\n')
|
|
295
|
+
fout.write('##FORMAT=<ID=PE,Number=1,Type=String,Description="Number of paired-ends that support the event">\n')
|
|
296
|
+
fout.write(f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{Path(bamfile).stem}\n")
|
|
297
|
+
prev_chrom, chrom_seq, count = "", "", 0
|
|
298
|
+
for line in fin:
|
|
299
|
+
# type, coor, length, rd, p1, p2, p3, p4, q0, pe = line.strip("\n").split()
|
|
300
|
+
items = line.strip("\n").split()
|
|
301
|
+
type, coor, length = items[:3]
|
|
302
|
+
rd = float(items[3]) if len(items) > 3 else False
|
|
303
|
+
p1 = items[4] if len(items) > 4 else ""
|
|
304
|
+
p2 = items[5] if len(items) > 5 else ""
|
|
305
|
+
p3 = items[6] if len(items) > 6 else ""
|
|
306
|
+
p4 = items[7] if len(items) > 7 else ""
|
|
307
|
+
q0 = items[8] if len(items) > 8 else ""
|
|
308
|
+
pe = items[9] if len(items) > 9 else ""
|
|
309
|
+
chrom, pos = coor.split(":")
|
|
310
|
+
start, end = pos.split("-")
|
|
311
|
+
start, end = int(start), int(end)
|
|
312
|
+
is_del = type == "deletion"
|
|
313
|
+
is_dup = type == "duplication"
|
|
314
|
+
|
|
315
|
+
if not is_del and not is_dup:
|
|
316
|
+
warnings.warn(f"Skipping unrecognized CNV type: {type}")
|
|
317
|
+
continue
|
|
276
318
|
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
319
|
+
if chrom != prev_chrom:
|
|
320
|
+
chrom_seq = parse_chrom(chrom, refdir)
|
|
321
|
+
prev_chrom = chrom
|
|
322
|
+
|
|
323
|
+
count += 1
|
|
324
|
+
info = f"END={end}"
|
|
325
|
+
info += f";SVTYPE=DEL;SVLEN=-{length}" if is_del else f";SVTYPE=DUP;SVLEN={length}"
|
|
326
|
+
info += ";IMPRECISE"
|
|
327
|
+
info += f";natorRD={rd}" if rd is not False else ""
|
|
328
|
+
info += f";natorP1={p1}" if p1 else ""
|
|
329
|
+
info += f";natorP2={p2}" if p2 else ""
|
|
330
|
+
info += f";natorP3={p3}" if p3 else ""
|
|
331
|
+
info += f";natorP4={p4}" if p4 else ""
|
|
332
|
+
info += f";natorQ0={q0}" if q0 else ""
|
|
333
|
+
info += f";natorPE={pe}" if pe else ""
|
|
334
|
+
|
|
335
|
+
gt = "GT"
|
|
336
|
+
if rd is not False:
|
|
337
|
+
gt += ":CN"
|
|
338
|
+
gt += ":PE" if pe else ""
|
|
339
|
+
gt += "\t"
|
|
340
|
+
if is_del and rd < 0.25:
|
|
341
|
+
gt += "1/1:0"
|
|
342
|
+
elif is_del and rd >= 0.25:
|
|
343
|
+
gt += "0/1:1"
|
|
344
|
+
elif rd <= 1.75:
|
|
345
|
+
gt += "0/1:2"
|
|
346
|
+
elif rd > 1.75 and rd <= 2.25:
|
|
347
|
+
gt += "1/1:2"
|
|
348
|
+
elif rd > 2.25:
|
|
349
|
+
gt += f"./2:{rd:.0f}"
|
|
350
|
+
else:
|
|
351
|
+
gt = "GT:PE\t./." if pe else "GT\t./."
|
|
352
|
+
|
|
353
|
+
gt += f":{pe}" if pe else ""
|
|
354
|
+
else:
|
|
355
|
+
gt += "\t./."
|
|
356
|
+
|
|
357
|
+
fout.write("\t".join(
|
|
358
|
+
[
|
|
359
|
+
chrom,
|
|
360
|
+
str(start),
|
|
361
|
+
f"CNVpytor_{'del_' if is_del else 'dup_'}{count}",
|
|
362
|
+
chrom_seq[start - 1] if start < len(chrom_seq) else "N",
|
|
363
|
+
"<DEL>" if is_del else "<DUP>",
|
|
364
|
+
".",
|
|
365
|
+
"PASS",
|
|
366
|
+
info,
|
|
367
|
+
gt,
|
|
368
|
+
]
|
|
369
|
+
) + "\n")
|
|
280
370
|
|
|
281
371
|
|
|
282
372
|
def do_case():
|
|
@@ -290,7 +380,7 @@ def do_case():
|
|
|
290
380
|
rootfile = outdir / "file.pytor"
|
|
291
381
|
case["j"] = case.get("j", ncores)
|
|
292
382
|
|
|
293
|
-
|
|
383
|
+
logger.info("Reading depth signals ...")
|
|
294
384
|
run_command(
|
|
295
385
|
dict_to_cli_args(
|
|
296
386
|
{
|
|
@@ -305,7 +395,7 @@ def do_case():
|
|
|
305
395
|
fg=True,
|
|
306
396
|
)
|
|
307
397
|
|
|
308
|
-
|
|
398
|
+
logger.info("Predicting CNVs ...")
|
|
309
399
|
run_command(
|
|
310
400
|
dict_to_cli_args(
|
|
311
401
|
{
|
|
@@ -314,6 +404,7 @@ def do_case():
|
|
|
314
404
|
"his": binsizes,
|
|
315
405
|
},
|
|
316
406
|
prefix="-",
|
|
407
|
+
dup_key=False,
|
|
317
408
|
),
|
|
318
409
|
fg=True,
|
|
319
410
|
)
|
|
@@ -326,6 +417,7 @@ def do_case():
|
|
|
326
417
|
"partition": binsizes,
|
|
327
418
|
},
|
|
328
419
|
prefix="-",
|
|
420
|
+
dup_key=False,
|
|
329
421
|
),
|
|
330
422
|
fg=True,
|
|
331
423
|
)
|
|
@@ -336,6 +428,7 @@ def do_case():
|
|
|
336
428
|
mask_snps = snp.pop("mask_snps", True)
|
|
337
429
|
baf_nomask = snp.pop("baf_nomask", False)
|
|
338
430
|
|
|
431
|
+
logger.info("Importing SNP data ...")
|
|
339
432
|
run_command(
|
|
340
433
|
dict_to_cli_args(
|
|
341
434
|
{
|
|
@@ -350,6 +443,7 @@ def do_case():
|
|
|
350
443
|
)
|
|
351
444
|
|
|
352
445
|
if mask_snps:
|
|
446
|
+
logger.info("Masking 1000 Genome SNPs ...")
|
|
353
447
|
run_command(
|
|
354
448
|
dict_to_cli_args(
|
|
355
449
|
{
|
|
@@ -362,6 +456,7 @@ def do_case():
|
|
|
362
456
|
fg=True,
|
|
363
457
|
)
|
|
364
458
|
|
|
459
|
+
logger.info("Calculating BAF histograms ...")
|
|
365
460
|
run_command(
|
|
366
461
|
dict_to_cli_args(
|
|
367
462
|
{
|
|
@@ -375,8 +470,9 @@ def do_case():
|
|
|
375
470
|
fg=True,
|
|
376
471
|
)
|
|
377
472
|
|
|
378
|
-
|
|
473
|
+
logger.info("Predicting CNV regions using joint caller ...")
|
|
379
474
|
for binsize in binsizes:
|
|
475
|
+
logger.info(f"- binsize: {binsize}")
|
|
380
476
|
outfile = outdir / f"calls{'.combined' if snp is not False else ''}.{binsize}.tsv"
|
|
381
477
|
outfile_filtered = outdir / f"calls{'.combined' if snp is not False else ''}.{binsize}.filtered.tsv"
|
|
382
478
|
run_command(
|
|
@@ -392,6 +488,7 @@ def do_case():
|
|
|
392
488
|
stdout=outfile,
|
|
393
489
|
)
|
|
394
490
|
|
|
491
|
+
logger.info(" Converting to other formats ...")
|
|
395
492
|
cnvpytor2other(outfile, bool(snp), "gff")
|
|
396
493
|
cnvpytor2other(outfile, bool(snp), "bed")
|
|
397
494
|
cnvpytor2vcf(outfile, bool(snp))
|
|
@@ -424,6 +521,7 @@ def do_case():
|
|
|
424
521
|
cnvpytor2vcf(outfile_filtered, bool(snp))
|
|
425
522
|
|
|
426
523
|
# plots
|
|
524
|
+
logger.info(" Plotting ...")
|
|
427
525
|
manplot = outdir / f"manhattan.{binsize}.png"
|
|
428
526
|
run_command(
|
|
429
527
|
dict_to_cli_args(
|
|
biopipen/scripts/bed/BedtoolsIntersect.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from biopipen.utils.misc import run_command, dict_to_cli_args, logger
|
|
3
|
+
|
|
4
|
+
afile = Path({{in.afile | repr}}) # pyright: ignore # noqa: #999
|
|
5
|
+
bfile = Path({{in.bfile | repr}}) # pyright: ignore
|
|
6
|
+
outfile = {{out.outfile | repr}} # pyright: ignore
|
|
7
|
+
envs = {{envs | repr}} # pyright: ignore
|
|
8
|
+
|
|
9
|
+
bedtools = envs.pop("bedtools")
|
|
10
|
+
sort = envs.pop("sort")
|
|
11
|
+
chrsize = envs.pop("chrsize")
|
|
12
|
+
postcmd = envs.pop("postcmd", None)
|
|
13
|
+
outdir = Path(outfile).parent
|
|
14
|
+
|
|
15
|
+
if chrsize and "g" in envs:
|
|
16
|
+
logger.warning("Ignoring envs.g because envs.chrsize is provided.")
|
|
17
|
+
envs["g"] = Path(chrsize).expanduser()
|
|
18
|
+
elif chrsize:
|
|
19
|
+
envs["g"] = Path(chrsize).expanduser()
|
|
20
|
+
|
|
21
|
+
if sort:
|
|
22
|
+
afile_sorted = outdir / f"{afile.stem}_sorted{afile.suffix}"
|
|
23
|
+
bfile_sorted = outdir / f"{bfile.stem}_sorted{bfile.suffix}"
|
|
24
|
+
run_command(
|
|
25
|
+
[bedtools, "sort", "-g", envs["g"], "-i", afile],
|
|
26
|
+
stdout=afile_sorted,
|
|
27
|
+
)
|
|
28
|
+
run_command(
|
|
29
|
+
[bedtools, "sort", "-g", envs["g"], "-i", bfile],
|
|
30
|
+
stdout=bfile_sorted,
|
|
31
|
+
)
|
|
32
|
+
afile = afile_sorted
|
|
33
|
+
bfile = bfile_sorted
|
|
34
|
+
|
|
35
|
+
envs[""] = [bedtools, "intersect"]
|
|
36
|
+
envs["a"] = afile
|
|
37
|
+
envs["b"] = bfile
|
|
38
|
+
envs.setdefault("sorted", True)
|
|
39
|
+
|
|
40
|
+
if envs["sorted"] and not "g" in envs:
|
|
41
|
+
raise ValueError("envs.g is required or manullay set envs.sorted to False.")
|
|
42
|
+
|
|
43
|
+
if postcmd:
|
|
44
|
+
ofile = Path(outfile).with_suffix(".prior.bt")
|
|
45
|
+
run_command(dict_to_cli_args(envs, prefix="-"), stdout=ofile)
|
|
46
|
+
postcmd_file = outdir / "_postcmd.sh"
|
|
47
|
+
postcmd_file.write_text(postcmd)
|
|
48
|
+
run_command(
|
|
49
|
+
["bash", postcmd_file],
|
|
50
|
+
env={"infile": ofile, "outfile": outfile, "outdir": outdir},
|
|
51
|
+
fg=True,
|
|
52
|
+
)
|
|
53
|
+
else:
|
|
54
|
+
run_command(dict_to_cli_args(envs, prefix="-"), stdout=outfile)
|
|
biopipen/scripts/bed/BedtoolsMerge.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from biopipen.utils import run_command, dict_to_cli_args
|
|
2
2
|
|
|
3
|
-
inbed = {{in.inbed | repr}} # pyright: ignore
|
|
3
|
+
inbed = {{in.inbed | repr}} # pyright: ignore # noqa: #999
|
|
4
4
|
outbed = {{out.outbed | repr}} # pyright: ignore
|
|
5
5
|
envs = {{envs | repr}} # pyright: ignore
|
|
6
6
|
bedtools = envs.pop("bedtools", "bedtools")
|
|
biopipen/scripts/cnv/AneuploidyScore.R
CHANGED
|
@@ -127,13 +127,32 @@ getCAA <- function(segf, cytoarm, tcn_col,
|
|
|
127
127
|
return(as(seg_cyto_chr, "GRangesList"))
|
|
128
128
|
}
|
|
129
129
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
130
|
+
if (endsWith(segfile, ".vcf") || endsWith(segfile, ".vcf.gz")) {
|
|
131
|
+
library(VariantAnnotation)
|
|
132
|
+
vcf = readVcf(segfile)
|
|
133
|
+
seg = data.frame(
|
|
134
|
+
seqnames = as.character(seqnames(vcf)),
|
|
135
|
+
start = start(vcf),
|
|
136
|
+
end = vcf@info[[end_col]],
|
|
137
|
+
seg.mean = vcf@info[[seg_col]]
|
|
138
|
+
)
|
|
139
|
+
} else if (endsWith(segfile, ".bed")) {
|
|
140
|
+
segments = read.table(segfile, header=F, row.names=NULL, sep="\t", stringsAsFactors=F)
|
|
141
|
+
seg = data.frame(
|
|
142
|
+
seqnames = segments[, 1],
|
|
143
|
+
start = segments[, 2],
|
|
144
|
+
end = segments[, 3],
|
|
145
|
+
seg.mean = segments[, 5]
|
|
146
|
+
)
|
|
147
|
+
} else {
|
|
148
|
+
segments = read.table(segfile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
|
|
149
|
+
seg = data.frame(
|
|
150
|
+
seqnames = segments[, chrom_col],
|
|
151
|
+
start = segments[, start_col],
|
|
152
|
+
end = segments[, end_col],
|
|
153
|
+
seg.mean = segments[, seg_col]
|
|
154
|
+
)
|
|
155
|
+
}
|
|
137
156
|
|
|
138
157
|
{% if envs.segmean_transform %}
|
|
139
158
|
segmean_transform = {{envs.segmean_transform}}
|
|
@@ -168,6 +187,10 @@ if (is.character(cn_transform)) {
|
|
|
168
187
|
}
|
|
169
188
|
{% endif %}
|
|
170
189
|
|
|
190
|
+
seg <- seg[
|
|
191
|
+
!is.na(seg$seg.mean) & !is.na(seg$TCN) & !is.infinite(seg$seg.mean) & !is.infinite(seg$TCN),,
|
|
192
|
+
drop=FALSE]
|
|
193
|
+
|
|
171
194
|
write.table(seg, file.path(outdir, "seg.txt"), sep="\t", quote=F, row.names=F, col.names=T)
|
|
172
195
|
|
|
173
196
|
wgd_ploidy = checkIfWGD(
|
|
biopipen/scripts/cnv/AneuploidyScoreSummary.R
CHANGED
|
@@ -52,8 +52,11 @@ if (!is.null(group_cols)) {
|
|
|
52
52
|
|
|
53
53
|
if (!is.null(metafile)) {
|
|
54
54
|
metadf = read.table(metafile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
if (!is.null(metadf$Sample)) {
|
|
56
|
+
metadf$Sample = as.character(metadf$Sample)
|
|
57
|
+
} else {
|
|
58
|
+
colnames(metadf)[1] = "Sample"
|
|
59
|
+
}
|
|
57
60
|
metadf = metadf[metadf$Sample %in% sams, c("Sample", meta_cols), drop=FALSE]
|
|
58
61
|
if (nrow(metadf) != length(sams)) {
|
|
59
62
|
stop(paste("Not all samples in metafile:", paste(setdiff(sams, metadf$Sample), collapse=", ")))
|
biopipen/scripts/cnv/TMADScore.R
CHANGED
|
@@ -11,11 +11,27 @@ if (is.character(segmean_transform)) {
|
|
|
11
11
|
segmean_transform = eval(parse(text=segmean_transform))
|
|
12
12
|
} # otherwise NULL
|
|
13
13
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
14
|
+
|
|
15
|
+
if (endsWith(segfile, ".vcf") || endsWith(segfile, ".vcf.gz")) {
|
|
16
|
+
library(VariantAnnotation)
|
|
17
|
+
segments = readVcf(segfile)
|
|
18
|
+
seg = data.frame(
|
|
19
|
+
chrom = as.character(seqnames(segments)),
|
|
20
|
+
log2 = segments@info[[seg_col]]
|
|
21
|
+
)
|
|
22
|
+
} else if (endsWith(segfile, ".bed")) {
|
|
23
|
+
segments = read.table(segfile, header=F, row.names=NULL, sep="\t", stringsAsFactors=F)
|
|
24
|
+
seg = data.frame(
|
|
25
|
+
chrom = segments[, 1],
|
|
26
|
+
log2 = segments[, 5]
|
|
27
|
+
)
|
|
28
|
+
} else {
|
|
29
|
+
segments = read.table(segfile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
|
|
30
|
+
seg = data.frame(
|
|
31
|
+
chrom = segments[, chrom_col],
|
|
32
|
+
log2 = segments[, seg_col]
|
|
33
|
+
)
|
|
34
|
+
}
|
|
19
35
|
rm(segments)
|
|
20
36
|
|
|
21
37
|
if (!is.null(excl_chroms) && length(excl_chroms) > 0) {
|
|
biopipen/scripts/cnv/TMADScoreSummary.R
CHANGED
|
@@ -49,8 +49,12 @@ if (!is.null(group_cols)) {
|
|
|
49
49
|
data = data.frame(Sample = sams, tMAD = tmads)
|
|
50
50
|
if (file.exists(metafile) && length(meta_cols) > 0) {
|
|
51
51
|
metadf = read.table(metafile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
|
|
52
|
-
|
|
53
|
-
|
|
52
|
+
if (!is.null(metadf$Sample)) {
|
|
53
|
+
metadf$Sample = as.character(metadf$Sample)
|
|
54
|
+
} else {
|
|
55
|
+
colnames(metadf)[1] = "Sample"
|
|
56
|
+
}
|
|
57
|
+
meta = metadf[, c("Sample", meta_cols), drop=FALSE]
|
|
54
58
|
colnames(meta) = c("Sample", meta_cols)
|
|
55
59
|
data = data %>% left_join(meta, by="Sample")
|
|
56
60
|
}
|
|
biopipen/scripts/cnvkit/CNVkitAccess.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from pathlib import Path
|
|
1
2
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
2
3
|
|
|
3
4
|
excfiles = {{in.excfiles | repr}} # pyright: ignore
|
|
@@ -12,7 +13,7 @@ def main():
|
|
|
12
13
|
"": [cnvkit, "access"],
|
|
13
14
|
"s": min_gap_size,
|
|
14
15
|
"o": outfile,
|
|
15
|
-
"_": reffile,
|
|
16
|
+
"_": Path(reffile).expanduser(),
|
|
16
17
|
}
|
|
17
18
|
if excfiles:
|
|
18
19
|
other_args["exclude"] = excfiles
|
|
biopipen/scripts/cnvkit/CNVkitAutobin.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from pathlib import Path
|
|
1
2
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
2
3
|
|
|
3
4
|
bamfiles = {{in.bamfiles | repr}} # pyright: ignore
|
|
@@ -20,7 +21,7 @@ short_names = {{envs.short_names | repr}} # pyright: ignore
|
|
|
20
21
|
def main():
|
|
21
22
|
|
|
22
23
|
args = dict(
|
|
23
|
-
f=reffile,
|
|
24
|
+
f=Path(reffile).expanduser(),
|
|
24
25
|
m=method,
|
|
25
26
|
g=accfile,
|
|
26
27
|
t=baitfile,
|
|
@@ -29,7 +30,7 @@ def main():
|
|
|
29
30
|
target_min_size=target_min_size,
|
|
30
31
|
antitarget_max_size=antitarget_max_size,
|
|
31
32
|
antitarget_min_size=antitarget_min_size,
|
|
32
|
-
annotate=annotate,
|
|
33
|
+
annotate=Path(annotate).expanduser(),
|
|
33
34
|
short_names=short_names,
|
|
34
35
|
target_output_bed=target_file,
|
|
35
36
|
antitarget_output_bed=antitarget_file,
|
|
biopipen/scripts/cnvkit/CNVkitCoverage.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from pathlib import Path
|
|
1
2
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
2
3
|
|
|
3
4
|
bamfile = {{in.bamfile | quote}} # pyright: ignore
|
|
@@ -13,7 +14,7 @@ ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
|
13
14
|
def main():
|
|
14
15
|
|
|
15
16
|
args = dict(
|
|
16
|
-
f=reffile,
|
|
17
|
+
f=Path(reffile).expanduser(),
|
|
17
18
|
c=count,
|
|
18
19
|
q=min_mapq,
|
|
19
20
|
p=ncores,
|
|
biopipen/scripts/cnvkit/CNVkitHeatmap.py
CHANGED
|
@@ -4,7 +4,7 @@ from diot import Diot
|
|
|
4
4
|
|
|
5
5
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
6
6
|
|
|
7
|
-
segfiles = {{in.segfiles | repr}} # pyright: ignore
|
|
7
|
+
segfiles = {{in.segfiles | repr}} # pyright: ignore # noqa
|
|
8
8
|
sample_sex = {{in.sample_sex | repr}} # pyright: ignore
|
|
9
9
|
outdir = {{out.outdir | repr}} # pyright: ignore
|
|
10
10
|
cnvkit = {{envs.cnvkit | quote}} # pyright: ignore
|
|
biopipen/scripts/cnvkit/CNVkitReference.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from pathlib import Path
|
|
1
2
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
2
3
|
|
|
3
4
|
covfiles = {{in.covfiles | repr}} # pyright: ignore
|
|
@@ -18,7 +19,7 @@ no_rmask = {{envs.no_rmask | repr}} # pyright: ignore
|
|
|
18
19
|
def main():
|
|
19
20
|
|
|
20
21
|
args = dict(
|
|
21
|
-
f=reffile,
|
|
22
|
+
f=Path(reffile).expanduser(),
|
|
22
23
|
o=outfile,
|
|
23
24
|
c=cluster,
|
|
24
25
|
min_cluster_size=min_cluster_size,
|
|
biopipen/scripts/delim/SampleInfo.R
CHANGED
|
@@ -88,7 +88,11 @@ for (name in names(stats)) {
|
|
|
88
88
|
group <- if (is.null(stat$group)) sym("..group") else sym(stat$group)
|
|
89
89
|
count_on <- paste0("..count.", stat$on)
|
|
90
90
|
if (!is_continuous) {
|
|
91
|
-
|
|
91
|
+
if (!is.null(stat$each)) {
|
|
92
|
+
data <- data %>% add_count(!!group, !!sym(stat$each), name = count_on)
|
|
93
|
+
} else {
|
|
94
|
+
data <- data %>% add_count(!!group, name = count_on)
|
|
95
|
+
}
|
|
92
96
|
}
|
|
93
97
|
|
|
94
98
|
if (is.null(stat$devpars)) {
|
|
@@ -141,18 +145,19 @@ for (name in names(stats)) {
|
|
|
141
145
|
} else {
|
|
142
146
|
data <- data %>%
|
|
143
147
|
distinct(!!group, !!sym(stat$each), .keep_all = TRUE) %>%
|
|
148
|
+
mutate(!!group := factor(!!group, levels = unique(!!group))) %>%
|
|
144
149
|
group_by(!!sym(stat$each))
|
|
145
150
|
}
|
|
146
151
|
p <- ggplot(
|
|
147
|
-
data %>%
|
|
148
|
-
aes(x =
|
|
152
|
+
data %>% mutate(.size = sum(!!sym(count_on))),
|
|
153
|
+
aes(x = sqrt(.size) / 2, width = sqrt(.size), y = !!sym(count_on), fill = !!group, label = !!sym(count_on))
|
|
149
154
|
) +
|
|
150
|
-
geom_bar(stat="identity",
|
|
155
|
+
geom_bar(stat="identity", color="white", position = position_fill(reverse = TRUE)) +
|
|
151
156
|
coord_polar("y", start = 0) +
|
|
152
157
|
theme_void() +
|
|
153
158
|
theme(plot.title = element_text(hjust = 0.5)) +
|
|
154
159
|
geom_label_repel(
|
|
155
|
-
position =
|
|
160
|
+
position = position_fill(reverse = TRUE,vjust = .5),
|
|
156
161
|
color="#333333",
|
|
157
162
|
fill="#EEEEEE",
|
|
158
163
|
size=4
|
|
biopipen/scripts/gene/GeneNameConversion.R
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
source("{{biopipen_dir}}/utils/gene.R")
|
|
3
|
+
|
|
4
|
+
infile <- {{in.infile | quote}}
|
|
5
|
+
outfile <- {{out.outfile | quote}}
|
|
6
|
+
notfound <- {{envs.notfound | r}}
|
|
7
|
+
genecol <- {{envs.genecol | r}}
|
|
8
|
+
output <- {{envs.output | r}}
|
|
9
|
+
dup <- {{envs.dup | r}}
|
|
10
|
+
infmt <- {{envs.infmt | r}}
|
|
11
|
+
outfmt <- {{envs.outfmt | r}}
|
|
12
|
+
species <- {{envs.species | r}}
|
|
13
|
+
|
|
14
|
+
if (is.na(notfound)) {
|
|
15
|
+
notfound = "na"
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
df <- read.table(infile, header=TRUE, sep="\t", check.names=FALSE)
|
|
19
|
+
|
|
20
|
+
if (genecol == 0) {
|
|
21
|
+
log_warn("envs.genecol should be 1-based, but 0 was given. Using 1 instead.")
|
|
22
|
+
genecol <- 1
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
if (is.numeric(genecol)) { genecol <- colnames(df)[genecol] }
|
|
26
|
+
if (dup == "combine") { dup <- ";" }
|
|
27
|
+
|
|
28
|
+
genes <- df[[genecol]]
|
|
29
|
+
converted <- gene_name_conversion(
|
|
30
|
+
genes=genes,
|
|
31
|
+
species=species,
|
|
32
|
+
infmt=infmt,
|
|
33
|
+
outfmt=outfmt,
|
|
34
|
+
notfound=notfound,
|
|
35
|
+
dup=dup
|
|
36
|
+
)
|
|
37
|
+
# <genecol> <outfmt>
|
|
38
|
+
# 1 1255_g_at GUCA1A
|
|
39
|
+
# 2 1316_at THRA
|
|
40
|
+
# 3 1320_at PTPN21
|
|
41
|
+
# 4 1294_at MIR5193
|
|
42
|
+
|
|
43
|
+
# order the converted dataframe by the original gene column
|
|
44
|
+
converted <- converted[order(match(converted$query, genes)), , drop=FALSE]
|
|
45
|
+
outcol <- outfmt
|
|
46
|
+
|
|
47
|
+
if (notfound == "skip" || notfound == "ignore") {
|
|
48
|
+
df <- df[df[[genecol]] %in% converted$query, , drop=FALSE]
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
if (output == "append") {
|
|
52
|
+
if (outfmt %in% colnames(df)) {
|
|
53
|
+
log_warn("The output column name already exists in the input dataframe. Appending with a suffix `_1`.")
|
|
54
|
+
outcol <- paste(outfmt, "_1", sep="")
|
|
55
|
+
}
|
|
56
|
+
df[[outcol]] <- converted[[outfmt]]
|
|
57
|
+
} else if (output == "replace") {
|
|
58
|
+
df[[genecol]] <- converted[[outfmt]]
|
|
59
|
+
} else if (output == "with-query") {
|
|
60
|
+
df <- converted
|
|
61
|
+
} else {
|
|
62
|
+
df <- converted[, outfmt, drop=FALSE]
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
write.table(df, file=outfile, sep="\t", quote=FALSE, row.names=FALSE)
|