biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (85) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +204 -0
  11. biopipen/ns/regulatory.py +214 -0
  12. biopipen/ns/scrna.py +31 -5
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +167 -3
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/delim/SampleInfo.R +10 -5
  36. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  37. biopipen/scripts/gene/GenePromoters.R +61 -0
  38. biopipen/scripts/misc/Shell.sh +15 -0
  39. biopipen/scripts/plot/Manhattan.R +146 -0
  40. biopipen/scripts/plot/QQPlot.R +146 -0
  41. biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
  42. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
  43. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
  44. biopipen/scripts/regulatory/MotifScan.py +159 -0
  45. biopipen/scripts/regulatory/atSNP.R +33 -0
  46. biopipen/scripts/regulatory/motifBreakR.R +1594 -0
  47. biopipen/scripts/scrna/MarkersFinder.R +69 -67
  48. biopipen/scripts/scrna/SeuratClustering.R +71 -29
  49. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  50. biopipen/scripts/scrna/SeuratPreparing.R +252 -122
  51. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  52. biopipen/scripts/snp/MatrixEQTL.R +85 -44
  53. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  54. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  55. biopipen/scripts/snp/PlinkFilter.py +100 -0
  56. biopipen/scripts/snp/PlinkFreq.R +298 -0
  57. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  58. biopipen/scripts/snp/PlinkHWE.R +80 -0
  59. biopipen/scripts/snp/PlinkHet.R +92 -0
  60. biopipen/scripts/snp/PlinkIBD.R +200 -0
  61. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  62. biopipen/scripts/stats/Mediation.R +94 -0
  63. biopipen/scripts/stats/MetaPvalue.R +2 -1
  64. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  65. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  66. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  67. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  68. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  69. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  70. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  71. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  72. biopipen/utils/gene.R +83 -37
  73. biopipen/utils/gene.py +108 -60
  74. biopipen/utils/misc.R +56 -0
  75. biopipen/utils/misc.py +5 -2
  76. biopipen/utils/reference.py +54 -10
  77. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
  78. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
  79. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
  80. biopipen/ns/bcftools.py +0 -111
  81. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  82. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  83. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  84. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  85. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
@@ -1,15 +1,15 @@
1
1
  from pathlib import Path
2
2
 
3
+ import warnings
3
4
  import pandas
4
- from biopipen.scripts.vcf.VcfFix_utils import HeaderContig, fix_vcffile
5
+ from datetime import datetime
5
6
  from biopipen.utils.reference import bam_index
6
- from biopipen.utils.misc import run_command, dict_to_cli_args
7
+ from biopipen.utils.misc import run_command, dict_to_cli_args, logger
7
8
 
8
- bamfile = {{in.bamfile | quote}} # pyright: ignore
9
+ bamfile = {{in.bamfile | quote}} # pyright: ignore # noqa
9
10
  snpfile = {{in.snpfile | repr}} # pyright: ignore
10
11
  outdir = Path({{out.outdir | quote}}) # pyright: ignore
11
12
  cnvpytor = {{envs.cnvpytor | quote}} # pyright: ignore
12
- cnvnator2vcf = {{envs.cnvnator2vcf | quote}} # pyright: ignore
13
13
  samtools = {{envs.samtools | quote}} # pyright: ignore
14
14
  ncores = {{envs.ncores | int}} # pyright: ignore
15
15
  refdir = {{envs.refdir | quote}} # pyright: ignore
@@ -20,7 +20,6 @@ args = {{envs | repr}} # pyright: ignore
20
20
 
21
21
  del args['cnvpytor']
22
22
  del args['ncores']
23
- del args['cnvnator2vcf']
24
23
  del args['samtools']
25
24
  del args['refdir']
26
25
  del args['genome']
@@ -236,47 +235,138 @@ def load_chrsize():
236
235
  yield chrom, int(size)
237
236
 
238
237
 
239
- def cnvpytor2vcf(infile, snp, fix=True):
240
- unfixedfile = Path(infile).with_suffix(f".unfixed.vcf")
241
- outfile = Path(infile).with_suffix(f".vcf")
242
- stdout = run_command(
243
- dict_to_cli_args(
244
- {
245
- "": cnvpytor2vcf,
246
- "reference": genome,
247
- "_": [infile, refdir],
248
- },
249
- prefix="-",
250
- ),
251
- stdout="return",
252
- )
253
- if fix:
254
- unfixedfile.write_text(stdout)
238
+ def parse_chrom(chrom, chromdir):
239
+ file = Path(chromdir) / f"{chrom}.fa"
240
+ if not file.exists():
241
+ warnings.warn(f"Chromosome file not found in refdir: {chrom}")
242
+ return ""
255
243
 
256
- fixes = [
257
- {
258
- "kind": "format",
259
- "id": "PE",
260
- "fix": lambda obj: setattr(obj, 'Type', 'String')
261
- },
262
- {
263
- "kind": "fields",
264
- "fix": lambda items: items.__setitem__(-1, Path(bamfile).stem)
265
- }
266
- ]
244
+ seq = ""
245
+ with open(file) as f:
246
+ for line in f:
247
+ line = line.strip()
248
+ if not line:
249
+ continue
250
+ if line.startswith(">"):
251
+ seq = ""
252
+ else:
253
+ seq += line
254
+ return seq
255
+
256
+
257
+ def cnvpytor2vcf(infile, snp):
258
+ # snp: in case to be used in the future
259
+ outfile = Path(infile).with_suffix(f".vcf")
260
+ # stdout = run_command(
261
+ # dict_to_cli_args(
262
+ # {
263
+ # "": cnvnator2vcf,
264
+ # "reference": genome,
265
+ # "_": [infile, refdir],
266
+ # },
267
+ # prefix="-",
268
+ # ),
269
+ # stdout="return",
270
+ # )
271
+ ## command hangs
272
+ with open(infile) as fin, open(outfile, "w") as fout:
273
+ fout.write("##fileformat=VCFv4.2\n")
274
+ fout.write(f"##fileDate={datetime.now().strftime('%Y%m%d')}\n")
275
+ fout.write(f"##reference={genome}\n")
276
+ fout.write(f"##source=CNVpytor\n")
267
277
  for chrom, size in load_chrsize():
268
- fixes.append({
269
- "kind": "contig",
270
- "append": True,
271
- "fix": (
272
- lambda obj, chrom=chrom, size=size:
273
- HeaderContig(ID=chrom, length=size)
274
- )
275
- })
278
+ fout.write(f"##contig=<ID={chrom},length={size}>\n")
279
+ fout.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n')
280
+ fout.write('##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">\n')
281
+ fout.write('##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">\n')
282
+ fout.write('##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">\n')
283
+ fout.write('##INFO=<ID=natorRD,Number=1,Type=Float,Description="Normalized RD">\n')
284
+ fout.write('##INFO=<ID=natorP1,Number=1,Type=Float,Description="e-val by t-test">\n')
285
+ fout.write('##INFO=<ID=natorP2,Number=1,Type=Float,Description="e-val by Gaussian tail">\n')
286
+ fout.write('##INFO=<ID=natorP3,Number=1,Type=Float,Description="e-val by t-test (middle)">\n')
287
+ fout.write('##INFO=<ID=natorP4,Number=1,Type=Float,Description="e-val by Gaussian tail (middle)">\n')
288
+ fout.write('##INFO=<ID=natorQ0,Number=1,Type=Float,Description="Fraction of reads with 0 mapping quality">\n')
289
+ fout.write('##INFO=<ID=natorPE,Number=1,Type=Integer,Description="Number of paired-ends support the event">\n')
290
+ fout.write('##INFO=<ID=SAMPLES,Number=.,Type=String,Description="Sample genotyped to have the variant">\n')
291
+ fout.write('##ALT=<ID=DEL,Description="Deletion">\n')
292
+ fout.write('##ALT=<ID=DUP,Description="Duplication">\n')
293
+ fout.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
294
+ fout.write('##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">\n')
295
+ fout.write('##FORMAT=<ID=PE,Number=1,Type=String,Description="Number of paired-ends that support the event">\n')
296
+ fout.write(f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{Path(bamfile).stem}\n")
297
+ prev_chrom, chrom_seq, count = "", "", 0
298
+ for line in fin:
299
+ # type, coor, length, rd, p1, p2, p3, p4, q0, pe = line.strip("\n").split()
300
+ items = line.strip("\n").split()
301
+ type, coor, length = items[:3]
302
+ rd = float(items[3]) if len(items) > 3 else False
303
+ p1 = items[4] if len(items) > 4 else ""
304
+ p2 = items[5] if len(items) > 5 else ""
305
+ p3 = items[6] if len(items) > 6 else ""
306
+ p4 = items[7] if len(items) > 7 else ""
307
+ q0 = items[8] if len(items) > 8 else ""
308
+ pe = items[9] if len(items) > 9 else ""
309
+ chrom, pos = coor.split(":")
310
+ start, end = pos.split("-")
311
+ start, end = int(start), int(end)
312
+ is_del = type == "deletion"
313
+ is_dup = type == "duplication"
314
+
315
+ if not is_del and not is_dup:
316
+ warnings.warn(f"Skipping unrecognized CNV type: {type}")
317
+ continue
276
318
 
277
- fix_vcffile(unfixedfile, outfile, fixes)
278
- else:
279
- outfile.write_text(stdout)
319
+ if chrom != prev_chrom:
320
+ chrom_seq = parse_chrom(chrom, refdir)
321
+ prev_chrom = chrom
322
+
323
+ count += 1
324
+ info = f"END={end}"
325
+ info += f";SVTYPE=DEL;SVLEN=-{length}" if is_del else f";SVTYPE=DUP;SVLEN={length}"
326
+ info += ";IMPRECISE"
327
+ info += f";natorRD={rd}" if rd is not False else ""
328
+ info += f";natorP1={p1}" if p1 else ""
329
+ info += f";natorP2={p2}" if p2 else ""
330
+ info += f";natorP3={p3}" if p3 else ""
331
+ info += f";natorP4={p4}" if p4 else ""
332
+ info += f";natorQ0={q0}" if q0 else ""
333
+ info += f";natorPE={pe}" if pe else ""
334
+
335
+ gt = "GT"
336
+ if rd is not False:
337
+ gt += ":CN"
338
+ gt += ":PE" if pe else ""
339
+ gt += "\t"
340
+ if is_del and rd < 0.25:
341
+ gt += "1/1:0"
342
+ elif is_del and rd >= 0.25:
343
+ gt += "0/1:1"
344
+ elif rd <= 1.75:
345
+ gt += "0/1:2"
346
+ elif rd > 1.75 and rd <= 2.25:
347
+ gt += "1/1:2"
348
+ elif rd > 2.25:
349
+ gt += f"./2:{rd:.0f}"
350
+ else:
351
+ gt = "GT:PE\t./." if pe else "GT\t./."
352
+
353
+ gt += f":{pe}" if pe else ""
354
+ else:
355
+ gt += "\t./."
356
+
357
+ fout.write("\t".join(
358
+ [
359
+ chrom,
360
+ str(start),
361
+ f"CNVpytor_{'del_' if is_del else 'dup_'}{count}",
362
+ chrom_seq[start - 1] if start < len(chrom_seq) else "N",
363
+ "<DEL>" if is_del else "<DUP>",
364
+ ".",
365
+ "PASS",
366
+ info,
367
+ gt,
368
+ ]
369
+ ) + "\n")
280
370
 
281
371
 
282
372
  def do_case():
@@ -290,7 +380,7 @@ def do_case():
290
380
  rootfile = outdir / "file.pytor"
291
381
  case["j"] = case.get("j", ncores)
292
382
 
293
- # read depth signal
383
+ logger.info("Reading depth signals ...")
294
384
  run_command(
295
385
  dict_to_cli_args(
296
386
  {
@@ -305,7 +395,7 @@ def do_case():
305
395
  fg=True,
306
396
  )
307
397
 
308
- # predicting cnv
398
+ logger.info("Predicting CNVs ...")
309
399
  run_command(
310
400
  dict_to_cli_args(
311
401
  {
@@ -314,6 +404,7 @@ def do_case():
314
404
  "his": binsizes,
315
405
  },
316
406
  prefix="-",
407
+ dup_key=False,
317
408
  ),
318
409
  fg=True,
319
410
  )
@@ -326,6 +417,7 @@ def do_case():
326
417
  "partition": binsizes,
327
418
  },
328
419
  prefix="-",
420
+ dup_key=False,
329
421
  ),
330
422
  fg=True,
331
423
  )
@@ -336,6 +428,7 @@ def do_case():
336
428
  mask_snps = snp.pop("mask_snps", True)
337
429
  baf_nomask = snp.pop("baf_nomask", False)
338
430
 
431
+ logger.info("Importing SNP data ...")
339
432
  run_command(
340
433
  dict_to_cli_args(
341
434
  {
@@ -350,6 +443,7 @@ def do_case():
350
443
  )
351
444
 
352
445
  if mask_snps:
446
+ logger.info("Masking 1000 Genome SNPs ...")
353
447
  run_command(
354
448
  dict_to_cli_args(
355
449
  {
@@ -362,6 +456,7 @@ def do_case():
362
456
  fg=True,
363
457
  )
364
458
 
459
+ logger.info("Calculating BAF histograms ...")
365
460
  run_command(
366
461
  dict_to_cli_args(
367
462
  {
@@ -375,8 +470,9 @@ def do_case():
375
470
  fg=True,
376
471
  )
377
472
 
378
- # call
473
+ logger.info("Predicting CNV regions using joint caller ...")
379
474
  for binsize in binsizes:
475
+ logger.info(f"- binsize: {binsize}")
380
476
  outfile = outdir / f"calls{'.combined' if snp is not False else ''}.{binsize}.tsv"
381
477
  outfile_filtered = outdir / f"calls{'.combined' if snp is not False else ''}.{binsize}.filtered.tsv"
382
478
  run_command(
@@ -392,6 +488,7 @@ def do_case():
392
488
  stdout=outfile,
393
489
  )
394
490
 
491
+ logger.info(" Converting to other formats ...")
395
492
  cnvpytor2other(outfile, bool(snp), "gff")
396
493
  cnvpytor2other(outfile, bool(snp), "bed")
397
494
  cnvpytor2vcf(outfile, bool(snp))
@@ -424,6 +521,7 @@ def do_case():
424
521
  cnvpytor2vcf(outfile_filtered, bool(snp))
425
522
 
426
523
  # plots
524
+ logger.info(" Plotting ...")
427
525
  manplot = outdir / f"manhattan.{binsize}.png"
428
526
  run_command(
429
527
  dict_to_cli_args(
@@ -0,0 +1,54 @@
1
+ from pathlib import Path
2
+ from biopipen.utils.misc import run_command, dict_to_cli_args, logger
3
+
4
+ afile = Path({{in.afile | repr}}) # pyright: ignore # noqa: #999
5
+ bfile = Path({{in.bfile | repr}}) # pyright: ignore
6
+ outfile = {{out.outfile | repr}} # pyright: ignore
7
+ envs = {{envs | repr}} # pyright: ignore
8
+
9
+ bedtools = envs.pop("bedtools")
10
+ sort = envs.pop("sort")
11
+ chrsize = envs.pop("chrsize")
12
+ postcmd = envs.pop("postcmd", None)
13
+ outdir = Path(outfile).parent
14
+
15
+ if chrsize and "g" in envs:
16
+ logger.warning("Ignoring envs.g because envs.chrsize is provided.")
17
+ envs["g"] = Path(chrsize).expanduser()
18
+ elif chrsize:
19
+ envs["g"] = Path(chrsize).expanduser()
20
+
21
+ if sort:
22
+ afile_sorted = outdir / f"{afile.stem}_sorted{afile.suffix}"
23
+ bfile_sorted = outdir / f"{bfile.stem}_sorted{bfile.suffix}"
24
+ run_command(
25
+ [bedtools, "sort", "-g", envs["g"], "-i", afile],
26
+ stdout=afile_sorted,
27
+ )
28
+ run_command(
29
+ [bedtools, "sort", "-g", envs["g"], "-i", bfile],
30
+ stdout=bfile_sorted,
31
+ )
32
+ afile = afile_sorted
33
+ bfile = bfile_sorted
34
+
35
+ envs[""] = [bedtools, "intersect"]
36
+ envs["a"] = afile
37
+ envs["b"] = bfile
38
+ envs.setdefault("sorted", True)
39
+
40
+ if envs["sorted"] and not "g" in envs:
41
+ raise ValueError("envs.g is required or manullay set envs.sorted to False.")
42
+
43
+ if postcmd:
44
+ ofile = Path(outfile).with_suffix(".prior.bt")
45
+ run_command(dict_to_cli_args(envs, prefix="-"), stdout=ofile)
46
+ postcmd_file = outdir / "_postcmd.sh"
47
+ postcmd_file.write_text(postcmd)
48
+ run_command(
49
+ ["bash", postcmd_file],
50
+ env={"infile": ofile, "outfile": outfile, "outdir": outdir},
51
+ fg=True,
52
+ )
53
+ else:
54
+ run_command(dict_to_cli_args(envs, prefix="-"), stdout=outfile)
@@ -1,6 +1,6 @@
1
1
  from biopipen.utils import run_command, dict_to_cli_args
2
2
 
3
- inbed = {{in.inbed | repr}} # pyright: ignore
3
+ inbed = {{in.inbed | repr}} # pyright: ignore # noqa: #999
4
4
  outbed = {{out.outbed | repr}} # pyright: ignore
5
5
  envs = {{envs | repr}} # pyright: ignore
6
6
  bedtools = envs.pop("bedtools", "bedtools")
@@ -127,13 +127,32 @@ getCAA <- function(segf, cytoarm, tcn_col,
127
127
  return(as(seg_cyto_chr, "GRangesList"))
128
128
  }
129
129
 
130
- segments = read.table(segfile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
131
- seg = data.frame(
132
- seqnames = segments[, chrom_col],
133
- start = segments[, start_col],
134
- end = segments[, end_col],
135
- seg.mean = segments[, seg_col]
136
- )
130
+ if (endsWith(segfile, ".vcf") || endsWith(segfile, ".vcf.gz")) {
131
+ library(VariantAnnotation)
132
+ vcf = readVcf(segfile)
133
+ seg = data.frame(
134
+ seqnames = as.character(seqnames(vcf)),
135
+ start = start(vcf),
136
+ end = vcf@info[[end_col]],
137
+ seg.mean = vcf@info[[seg_col]]
138
+ )
139
+ } else if (endsWith(segfile, ".bed")) {
140
+ segments = read.table(segfile, header=F, row.names=NULL, sep="\t", stringsAsFactors=F)
141
+ seg = data.frame(
142
+ seqnames = segments[, 1],
143
+ start = segments[, 2],
144
+ end = segments[, 3],
145
+ seg.mean = segments[, 5]
146
+ )
147
+ } else {
148
+ segments = read.table(segfile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
149
+ seg = data.frame(
150
+ seqnames = segments[, chrom_col],
151
+ start = segments[, start_col],
152
+ end = segments[, end_col],
153
+ seg.mean = segments[, seg_col]
154
+ )
155
+ }
137
156
 
138
157
  {% if envs.segmean_transform %}
139
158
  segmean_transform = {{envs.segmean_transform}}
@@ -168,6 +187,10 @@ if (is.character(cn_transform)) {
168
187
  }
169
188
  {% endif %}
170
189
 
190
+ seg <- seg[
191
+ !is.na(seg$seg.mean) & !is.na(seg$TCN) & !is.infinite(seg$seg.mean) & !is.infinite(seg$TCN),,
192
+ drop=FALSE]
193
+
171
194
  write.table(seg, file.path(outdir, "seg.txt"), sep="\t", quote=F, row.names=F, col.names=T)
172
195
 
173
196
  wgd_ploidy = checkIfWGD(
@@ -52,8 +52,11 @@ if (!is.null(group_cols)) {
52
52
 
53
53
  if (!is.null(metafile)) {
54
54
  metadf = read.table(metafile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
55
- sample_col = colnames(metadf)[1]
56
- colnames(metadf)[1] = "Sample"
55
+ if (!is.null(metadf$Sample)) {
56
+ metadf$Sample = as.character(metadf$Sample)
57
+ } else {
58
+ colnames(metadf)[1] = "Sample"
59
+ }
57
60
  metadf = metadf[metadf$Sample %in% sams, c("Sample", meta_cols), drop=FALSE]
58
61
  if (nrow(metadf) != length(sams)) {
59
62
  stop(paste("Not all samples in metafile:", paste(setdiff(sams, metadf$Sample), collapse=", ")))
@@ -11,11 +11,27 @@ if (is.character(segmean_transform)) {
11
11
  segmean_transform = eval(parse(text=segmean_transform))
12
12
  } # otherwise NULL
13
13
 
14
- segments = read.table(segfile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
15
- seg = data.frame(
16
- chrom = segments[, chrom_col],
17
- log2 = segments[, seg_col]
18
- )
14
+
15
+ if (endsWith(segfile, ".vcf") || endsWith(segfile, ".vcf.gz")) {
16
+ library(VariantAnnotation)
17
+ segments = readVcf(segfile)
18
+ seg = data.frame(
19
+ chrom = as.character(seqnames(segments)),
20
+ log2 = segments@info[[seg_col]]
21
+ )
22
+ } else if (endsWith(segfile, ".bed")) {
23
+ segments = read.table(segfile, header=F, row.names=NULL, sep="\t", stringsAsFactors=F)
24
+ seg = data.frame(
25
+ chrom = segments[, 1],
26
+ log2 = segments[, 5]
27
+ )
28
+ } else {
29
+ segments = read.table(segfile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
30
+ seg = data.frame(
31
+ chrom = segments[, chrom_col],
32
+ log2 = segments[, seg_col]
33
+ )
34
+ }
19
35
  rm(segments)
20
36
 
21
37
  if (!is.null(excl_chroms) && length(excl_chroms) > 0) {
@@ -49,8 +49,12 @@ if (!is.null(group_cols)) {
49
49
  data = data.frame(Sample = sams, tMAD = tmads)
50
50
  if (file.exists(metafile) && length(meta_cols) > 0) {
51
51
  metadf = read.table(metafile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
52
- sample_col = colnames(metadf)[1]
53
- meta = metadf[, c(sample_col, meta_cols), drop=FALSE]
52
+ if (!is.null(metadf$Sample)) {
53
+ metadf$Sample = as.character(metadf$Sample)
54
+ } else {
55
+ colnames(metadf)[1] = "Sample"
56
+ }
57
+ meta = metadf[, c("Sample", meta_cols), drop=FALSE]
54
58
  colnames(meta) = c("Sample", meta_cols)
55
59
  data = data %>% left_join(meta, by="Sample")
56
60
  }
@@ -1,3 +1,4 @@
1
+ from pathlib import Path
1
2
  from biopipen.utils.misc import run_command, dict_to_cli_args
2
3
 
3
4
  excfiles = {{in.excfiles | repr}} # pyright: ignore
@@ -12,7 +13,7 @@ def main():
12
13
  "": [cnvkit, "access"],
13
14
  "s": min_gap_size,
14
15
  "o": outfile,
15
- "_": reffile,
16
+ "_": Path(reffile).expanduser(),
16
17
  }
17
18
  if excfiles:
18
19
  other_args["exclude"] = excfiles
@@ -1,3 +1,4 @@
1
+ from pathlib import Path
1
2
  from biopipen.utils.misc import run_command, dict_to_cli_args
2
3
 
3
4
  bamfiles = {{in.bamfiles | repr}} # pyright: ignore
@@ -20,7 +21,7 @@ short_names = {{envs.short_names | repr}} # pyright: ignore
20
21
  def main():
21
22
 
22
23
  args = dict(
23
- f=reffile,
24
+ f=Path(reffile).expanduser(),
24
25
  m=method,
25
26
  g=accfile,
26
27
  t=baitfile,
@@ -29,7 +30,7 @@ def main():
29
30
  target_min_size=target_min_size,
30
31
  antitarget_max_size=antitarget_max_size,
31
32
  antitarget_min_size=antitarget_min_size,
32
- annotate=annotate,
33
+ annotate=Path(annotate).expanduser(),
33
34
  short_names=short_names,
34
35
  target_output_bed=target_file,
35
36
  antitarget_output_bed=antitarget_file,
@@ -42,7 +42,7 @@ def gen_access():
42
42
  exclude=access_excludes or False,
43
43
  s=access_min_gap_size or False,
44
44
  o=accessfile,
45
- _=ref,
45
+ _=Path(ref).expanduser(),
46
46
  )
47
47
  args[""] = [cnvkit, "access"]
48
48
  run_command(dict_to_cli_args(args, dashify=True), fg=True)
@@ -1,3 +1,4 @@
1
+ from pathlib import Path
1
2
  from biopipen.utils.misc import run_command, dict_to_cli_args
2
3
 
3
4
  bamfile = {{in.bamfile | quote}} # pyright: ignore
@@ -13,7 +14,7 @@ ncores = {{envs.ncores | repr}} # pyright: ignore
13
14
  def main():
14
15
 
15
16
  args = dict(
16
- f=reffile,
17
+ f=Path(reffile).expanduser(),
17
18
  c=count,
18
19
  q=min_mapq,
19
20
  p=ncores,
@@ -60,7 +60,7 @@ params.update({
60
60
  "o": targetfile,
61
61
  "c": covfile,
62
62
  "p": ncores,
63
- "f": ref,
63
+ "f": Path(ref).expanduser(),
64
64
  "s": samtools,
65
65
  "_": bamfiles,
66
66
  })
@@ -4,7 +4,7 @@ from diot import Diot
4
4
 
5
5
  from biopipen.utils.misc import run_command, dict_to_cli_args
6
6
 
7
- segfiles = {{in.segfiles | repr}} # pyright: ignore
7
+ segfiles = {{in.segfiles | repr}} # pyright: ignore # noqa
8
8
  sample_sex = {{in.sample_sex | repr}} # pyright: ignore
9
9
  outdir = {{out.outdir | repr}} # pyright: ignore
10
10
  cnvkit = {{envs.cnvkit | quote}} # pyright: ignore
@@ -1,3 +1,4 @@
1
+ from pathlib import Path
1
2
  from biopipen.utils.misc import run_command, dict_to_cli_args
2
3
 
3
4
  covfiles = {{in.covfiles | repr}} # pyright: ignore
@@ -18,7 +19,7 @@ no_rmask = {{envs.no_rmask | repr}} # pyright: ignore
18
19
  def main():
19
20
 
20
21
  args = dict(
21
- f=reffile,
22
+ f=Path(reffile).expanduser(),
22
23
  o=outfile,
23
24
  c=cluster,
24
25
  min_cluster_size=min_cluster_size,
@@ -88,7 +88,11 @@ for (name in names(stats)) {
88
88
  group <- if (is.null(stat$group)) sym("..group") else sym(stat$group)
89
89
  count_on <- paste0("..count.", stat$on)
90
90
  if (!is_continuous) {
91
- data <- data %>% add_count(!!group, name = count_on)
91
+ if (!is.null(stat$each)) {
92
+ data <- data %>% add_count(!!group, !!sym(stat$each), name = count_on)
93
+ } else {
94
+ data <- data %>% add_count(!!group, name = count_on)
95
+ }
92
96
  }
93
97
 
94
98
  if (is.null(stat$devpars)) {
@@ -141,18 +145,19 @@ for (name in names(stats)) {
141
145
  } else {
142
146
  data <- data %>%
143
147
  distinct(!!group, !!sym(stat$each), .keep_all = TRUE) %>%
148
+ mutate(!!group := factor(!!group, levels = unique(!!group))) %>%
144
149
  group_by(!!sym(stat$each))
145
150
  }
146
151
  p <- ggplot(
147
- data %>% arrange(!!group),
148
- aes(x = "", y = !!sym(count_on), fill = !!group, label = !!sym(count_on))
152
+ data %>% mutate(.size = sum(!!sym(count_on))),
153
+ aes(x = sqrt(.size) / 2, width = sqrt(.size), y = !!sym(count_on), fill = !!group, label = !!sym(count_on))
149
154
  ) +
150
- geom_bar(stat="identity", width=1, color="white", position = position_stack(reverse = TRUE)) +
155
+ geom_bar(stat="identity", color="white", position = position_fill(reverse = TRUE)) +
151
156
  coord_polar("y", start = 0) +
152
157
  theme_void() +
153
158
  theme(plot.title = element_text(hjust = 0.5)) +
154
159
  geom_label_repel(
155
- position = position_stack(vjust = 0.5),
160
+ position = position_fill(reverse = TRUE,vjust = .5),
156
161
  color="#333333",
157
162
  fill="#EEEEEE",
158
163
  size=4
@@ -0,0 +1,65 @@
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+ source("{{biopipen_dir}}/utils/gene.R")
3
+
4
+ infile <- {{in.infile | quote}}
5
+ outfile <- {{out.outfile | quote}}
6
+ notfound <- {{envs.notfound | r}}
7
+ genecol <- {{envs.genecol | r}}
8
+ output <- {{envs.output | r}}
9
+ dup <- {{envs.dup | r}}
10
+ infmt <- {{envs.infmt | r}}
11
+ outfmt <- {{envs.outfmt | r}}
12
+ species <- {{envs.species | r}}
13
+
14
+ if (is.na(notfound)) {
15
+ notfound = "na"
16
+ }
17
+
18
+ df <- read.table(infile, header=TRUE, sep="\t", check.names=FALSE)
19
+
20
+ if (genecol == 0) {
21
+ log_warn("envs.genecol should be 1-based, but 0 was given. Using 1 instead.")
22
+ genecol <- 1
23
+ }
24
+
25
+ if (is.numeric(genecol)) { genecol <- colnames(df)[genecol] }
26
+ if (dup == "combine") { dup <- ";" }
27
+
28
+ genes <- df[[genecol]]
29
+ converted <- gene_name_conversion(
30
+ genes=genes,
31
+ species=species,
32
+ infmt=infmt,
33
+ outfmt=outfmt,
34
+ notfound=notfound,
35
+ dup=dup
36
+ )
37
+ # <genecol> <outfmt>
38
+ # 1 1255_g_at GUCA1A
39
+ # 2 1316_at THRA
40
+ # 3 1320_at PTPN21
41
+ # 4 1294_at MIR5193
42
+
43
+ # order the converted dataframe by the original gene column
44
+ converted <- converted[order(match(converted$query, genes)), , drop=FALSE]
45
+ outcol <- outfmt
46
+
47
+ if (notfound == "skip" || notfound == "ignore") {
48
+ df <- df[df[[genecol]] %in% converted$query, , drop=FALSE]
49
+ }
50
+
51
+ if (output == "append") {
52
+ if (outfmt %in% colnames(df)) {
53
+ log_warn("The output column name already exists in the input dataframe. Appending with a suffix `_1`.")
54
+ outcol <- paste(outfmt, "_1", sep="")
55
+ }
56
+ df[[outcol]] <- converted[[outfmt]]
57
+ } else if (output == "replace") {
58
+ df[[genecol]] <- converted[[outfmt]]
59
+ } else if (output == "with-query") {
60
+ df <- converted
61
+ } else {
62
+ df <- converted[, outfmt, drop=FALSE]
63
+ }
64
+
65
+ write.table(df, file=outfile, sep="\t", quote=FALSE, row.names=FALSE)