biopipen 0.28.1__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (82)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +146 -0
  11. biopipen/ns/regulation.py +214 -0
  12. biopipen/ns/scrna.py +15 -3
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +74 -2
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  36. biopipen/scripts/gene/GenePromoters.R +61 -0
  37. biopipen/scripts/misc/Shell.sh +15 -0
  38. biopipen/scripts/plot/Manhattan.R +140 -0
  39. biopipen/scripts/plot/QQPlot.R +62 -0
  40. biopipen/scripts/regulation/MotifAffinityTest.R +226 -0
  41. biopipen/scripts/regulation/MotifAffinityTest_AtSNP.R +126 -0
  42. biopipen/scripts/regulation/MotifAffinityTest_MotifBreakR.R +96 -0
  43. biopipen/scripts/regulation/MotifScan.py +159 -0
  44. biopipen/scripts/regulation/atSNP.R +33 -0
  45. biopipen/scripts/regulation/motifBreakR.R +1594 -0
  46. biopipen/scripts/scrna/MarkersFinder.R +59 -67
  47. biopipen/scripts/scrna/SeuratClustering.R +63 -29
  48. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  49. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  50. biopipen/scripts/snp/MatrixEQTL.R +84 -43
  51. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  52. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  53. biopipen/scripts/snp/PlinkFilter.py +100 -0
  54. biopipen/scripts/snp/PlinkFreq.R +298 -0
  55. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  56. biopipen/scripts/snp/PlinkHWE.R +80 -0
  57. biopipen/scripts/snp/PlinkHet.R +92 -0
  58. biopipen/scripts/snp/PlinkIBD.R +197 -0
  59. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  60. biopipen/scripts/stats/MetaPvalue.R +2 -1
  61. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  62. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  63. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  64. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  65. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  66. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  67. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  68. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  69. biopipen/utils/gene.R +83 -37
  70. biopipen/utils/gene.py +108 -60
  71. biopipen/utils/misc.R +56 -0
  72. biopipen/utils/misc.py +5 -2
  73. biopipen/utils/reference.py +54 -10
  74. {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/METADATA +2 -2
  75. {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/RECORD +77 -49
  76. {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/entry_points.txt +1 -1
  77. biopipen/ns/bcftools.py +0 -111
  78. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  79. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  80. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  81. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  82. {biopipen-0.28.1.dist-info → biopipen-0.29.0.dist-info}/WHEEL +0 -0
biopipen/ns/vcf.py CHANGED
@@ -439,3 +439,199 @@ class TruvariConsistency(Proc):
439
439
  envs = {"truvari": config.exe.truvari, "heatmap": {}}
440
440
  script = "file://../scripts/vcf/TruvariConsistency.R"
441
441
  plugin_opts = {"report": "file://../reports/vcf/TruvariConsistency.svelte"}
442
+
443
+
444
+ class BcftoolsAnnotate(Proc):
445
+ """Add or remove annotations from VCF files
446
+
447
+ See also: <https://samtools.github.io/bcftools/bcftools.html#annotate>
448
+
449
+ Input:
450
+ infile: The input VCF file
451
+ annfile: The annotation file.
452
+ Currently only VCF files are supported.
453
+
454
+ Output:
455
+ outfile: The VCF file with annotations added or removed.
456
+
457
+ Envs:
458
+ bcftools: Path to bcftools
459
+ tabix: Path to tabix, used to index infile and annfile
460
+ annfile: The annotation file. If `in.annfile` is provided,
461
+ this is ignored
462
+ ncores (type=int): Number of cores (`--threads`) to use
463
+ columns (auto): Comma-separated or list of columns or tags to carry over from
464
+ the annotation file. Overrides `-c, --columns`
465
+ remove (auto): Remove the specified columns from the input file
466
+ header (type=list): Headers to be added
467
+ gz (flag): Whether to gzip the output file
468
+ index (flag): Whether to index the output file (tbi) (`envs.gz` forced to True)
469
+ <more>: Other arguments for `bcftools annotate`
470
+ See also <https://samtools.github.io/bcftools/bcftools.html#annotate>
471
+ Note that the underscore `_` will be replaced with dash `-` in the
472
+ argument name.
473
+ """
474
+ input = "infile:file, annfile:file"
475
+ output = (
476
+ "outfile:file:{{in.infile | stem: 'gz'}}.vcf"
477
+ "{{'.gz' if envs.index or envs.gz else ''}}"
478
+ )
479
+ lang = config.lang.python
480
+ envs = {
481
+ "bcftools": config.exe.bcftools,
482
+ "tabix": config.exe.tabix,
483
+ "annfile": None,
484
+ "columns": [],
485
+ "remove": [],
486
+ "header": [],
487
+ "gz": True,
488
+ "index": True,
489
+ "ncores": config.misc.ncores,
490
+ }
491
+ script = "file://../scripts/vcf/BcftoolsAnnotate.py"
492
+
493
+
494
+ class BcftoolsFilter(Proc):
495
+ """Apply fixed threshold filters to VCF files
496
+
497
+ Input:
498
+ infile: The input VCF file
499
+
500
+ Output:
501
+ outfile: The filtered VCF file. If the `in.infile` is gzipped, this is
502
+ gzipped as well.
503
+
504
+ Envs:
505
+ bcftools: Path to bcftools
506
+ tabix: Path to tabix, used to index infile/outfile
507
+ ncores (type=int): Number of cores (`--threads`) to use
508
+ keep: Whether we should keep the filtered variants or not.
509
+ If True, the filtered variants will be kept in the output file, but
510
+ with a new FILTER.
511
+ includes: and
512
+ excludes: include/exclude only sites for which EXPRESSION is true.
513
+ See: <https://samtools.github.io/bcftools/bcftools.html#expressions>
514
+ If provided, `envs.include/exclude` will be ignored.
515
+ If `str`/`list` used, The filter names will be `Filter_<type>_<index>`.
516
+ A dict is used where keys are filter names and values are expressions
517
+ gz (flag): Whether to gzip the output file
518
+ index (flag): Whether to index the output file (tbi) (`envs.gz` forced to True)
519
+ <more>: Other arguments for `bcftools filter`
520
+ See also <https://samtools.github.io/bcftools/bcftools.html#filter>
521
+ """
522
+ input = "infile:file"
523
+ output = (
524
+ "outfile:file:{{in.infile | stem: 'gz'}}.vcf"
525
+ "{{'.gz' if envs.index or envs.gz else ''}}"
526
+ )
527
+ lang = config.lang.python
528
+ envs = {
529
+ "bcftools": config.exe.bcftools,
530
+ "tabix": config.exe.tabix,
531
+ "ncores": config.misc.ncores,
532
+ "keep": True,
533
+ "includes": None,
534
+ "excludes": None,
535
+ "gz": True,
536
+ "index": True,
537
+ }
538
+ script = "file://../scripts/vcf/BcftoolsFilter.py"
539
+
540
+
541
+ class BcftoolsSort(Proc):
542
+ """Sort VCF files using `bcftools sort`.
543
+
544
+ `bcftools sort` is used to sort VCF files by chromosome and position based on the
545
+ order of contigs in the header.
546
+
547
+ Here we provide a chrsize file to first sort the contigs in the header and then
548
+ sort the VCF file using `bcftools sort`.
549
+
550
+ Input:
551
+ infile: The input VCF file
552
+
553
+ Output:
554
+ outfile: The sorted VCF file.
555
+
556
+ Envs:
557
+ bcftools: Path to bcftools
558
+ tabix: Path to tabix, used to index infile/outfile
559
+ ncores (type=int): Number of cores (`--threads`) to use
560
+ gz (flag): Whether to gzip the output file
561
+ index (flag): Whether to index the output file (tbi) (`envs.gz` forced to True)
562
+ chrsize: The chromosome size file, from which the chromosome order is used
563
+ to sort the contig in the header first.
564
+ If not provided, `bcftools sort` will be used directly.
565
+ notfound (choice): What if the contig in the VCF file is not found in the
566
+ `chrsize` file.
567
+ - error: Report error
568
+ - remove: Remove the contig from the header.
569
+ Note that if there are records with the removed contig, an error will
570
+ be raised by `bcftools sort`
571
+ - start: Move the contig to the start of the contigs from `chrsize`
572
+ - end: Move the contig to the end of the contigs from `chrsize`
573
+ <more>: Other arguments for `bcftools sort`. For example `max_mem`.
574
+ See also <https://samtools.github.io/bcftools/bcftools.html#sort>
575
+ """
576
+ input = "infile:file"
577
+ output = (
578
+ "outfile:file:{{in.infile | stem: 'gz'}}.vcf"
579
+ "{{'.gz' if envs.index or envs.gz else ''}}"
580
+ )
581
+ lang = config.lang.python
582
+ envs = {
583
+ "bcftools": config.exe.bcftools,
584
+ "tabix": config.exe.tabix,
585
+ "ncores": config.misc.ncores,
586
+ "chrsize": config.ref.chrsize,
587
+ "notfound": "remove",
588
+ "gz": True,
589
+ "index": True,
590
+ }
591
+ script = "file://../scripts/vcf/BcftoolsSort.py"
592
+
593
+
594
+ class BcftoolsView(Proc):
595
+ """View, subset and filter VCF files by position and filtering expression.
596
+
597
+ Also convert between VCF and BCF.
598
+
599
+ Input:
600
+ infile: The input VCF file
601
+ regions_file: The region file used to subset the input VCF file.
602
+ samples_file: The samples file used to subset the input VCF file.
603
+
604
+ Output:
605
+ outfile: The output VCF file.
606
+
607
+ Envs:
608
+ bcftools: Path to bcftools
609
+ tabix: Path to tabix, used to index infile/outfile
610
+ ncores (type=int): Number of cores (`--threads`) to use
611
+ regions_file: The region file used to subset the input VCF file.
612
+ If `in.regions_file` is provided, this is ignored.
613
+ samples_file: The samples file used to subset the input VCF file.
614
+ If `in.samples_file` is provided, this is ignored.
615
+ gz (flag): Whether to gzip the output file
616
+ index (flag): Whether to index the output file (tbi) (`envs.gz` forced to True)
617
+ <more>: Other arguments for `bcftools view`.
618
+ See also https://samtools.github.io/bcftools/bcftools.html#view
619
+ Note that the underscore `_` will be replaced with dash `-` in the
620
+ argument name.
621
+ """
622
+ input = "infile:file, regions_file:file, samples_file:file"
623
+ output = (
624
+ "outfile:file:{{in.infile | stem: 'gz'}}.vcf"
625
+ "{{'.gz' if envs.index or envs.gz else ''}}"
626
+ )
627
+ lang = config.lang.python
628
+ envs = {
629
+ "bcftools": config.exe.bcftools,
630
+ "tabix": config.exe.tabix,
631
+ "ncores": config.misc.ncores,
632
+ "regions_file": None,
633
+ "samples_file": None,
634
+ "gz": True,
635
+ "index": True,
636
+ }
637
+ script = "file://../scripts/vcf/BcftoolsView.py"
@@ -0,0 +1,24 @@
1
+ {% from "utils/misc.liq" import report_jobs -%}
2
+ <script>
3
+ import { Image, Descr } from "$libs";
4
+ </script>
5
+
6
+ {%- macro report_job(job, h=1) -%}
7
+ <h{{h+1}}>Sample Call Rate</h{{h+1}}>
8
+ {%- for pngfile in job.out.outdir | joinpaths: '*.samplecr.png' | glob -%}
9
+ <Descr>Cutoff: {{envs.samplecr}}</Descr>
10
+ <Image src="{{pngfile}}" />
11
+ {%- endfor -%}
12
+
13
+ <h{{h+1}}>Variant Call Rate</h{{h+1}}>
14
+ {%- for pngfile in job.out.outdir | joinpaths: '*.varcr.png' | glob -%}
15
+ <Descr>Cutoff: {{envs.varcr}}</Descr>
16
+ <Image src="{{pngfile}}" />
17
+ {%- endfor -%}
18
+ {%- endmacro -%}
19
+
20
+ {%- macro head_job(job) -%}
21
+ <h1>Sample: {{job.in.cnrfile | stem0 }}</h1>
22
+ {%- endmacro -%}
23
+
24
+ {{ report_jobs(jobs, head_job, report_job) }}
@@ -0,0 +1,18 @@
1
+ {% from "utils/misc.liq" import report_jobs -%}
2
+ <script>
3
+ import { Image, Descr } from "$libs";
4
+ </script>
5
+
6
+ {%- macro report_job(job, h=1) -%}
7
+ {%- for pngfile in job.out.outdir | joinpaths: '*.png' | glob -%}
8
+ {% set metric_col = pngfile | stem | ext0 %}
9
+ <h{{h+1}}>{{metric_col}} distribution</h{{h+1}}>
10
+ <Image src="{{pngfile}}" />
11
+ {%- endfor -%}
12
+ {%- endmacro -%}
13
+
14
+ {%- macro head_job(job) -%}
15
+ <h1>Sample: {{job.in.cnrfile | stem0 }}</h1>
16
+ {%- endmacro -%}
17
+
18
+ {{ report_jobs(jobs, head_job, report_job) }}
@@ -0,0 +1,18 @@
1
+ {% from "utils/misc.liq" import report_jobs -%}
2
+ <script>
3
+ import { Image, Descr } from "$libs";
4
+ </script>
5
+
6
+ {%- macro report_job(job, h=1) -%}
7
+ {%- for pngfile in job.out.outdir | joinpaths: '*.png' | glob -%}
8
+ <h{{h+1}}>Distribution</h{{h+1}}>
9
+ <Descr>Cutoff: {{envs.cutoff}}</Descr>
10
+ <Image src="{{pngfile}}" />
11
+ {%- endfor -%}
12
+ {%- endmacro -%}
13
+
14
+ {%- macro head_job(job) -%}
15
+ <h1>Sample: {{job.in.cnrfile | stem0 }}</h1>
16
+ {%- endmacro -%}
17
+
18
+ {{ report_jobs(jobs, head_job, report_job) }}
@@ -0,0 +1,18 @@
1
+ {% from "utils/misc.liq" import report_jobs -%}
2
+ <script>
3
+ import { Image, Descr } from "$libs";
4
+ </script>
5
+
6
+ {%- macro report_job(job, h=1) -%}
7
+ {%- for pngfile in job.out.outdir | joinpaths: '*.png' | glob -%}
8
+ <h{{h+1}}>Distribution</h{{h+1}}>
9
+ <Descr>Cutoff: [mean - {{envs.cutoff}} x sd, mean + {{envs.cutoff}} x sd]</Descr>
10
+ <Image src="{{pngfile}}" />
11
+ {%- endfor -%}
12
+ {%- endmacro -%}
13
+
14
+ {%- macro head_job(job) -%}
15
+ <h1>Sample: {{job.in.cnrfile | stem0 }}</h1>
16
+ {%- endmacro -%}
17
+
18
+ {{ report_jobs(jobs, head_job, report_job) }}
@@ -0,0 +1,18 @@
1
+ {% from "utils/misc.liq" import report_jobs -%}
2
+ <script>
3
+ import { Image, Descr } from "$libs";
4
+ </script>
5
+
6
+ {%- macro report_job(job, h=1) -%}
7
+ {%- for pngfile in job.out.outdir | joinpaths: '*.png' | glob -%}
8
+ <h{{h+1}}>Heatmap</h{{h+1}}>
9
+ <Descr>PI_HAT threshold = {{envs.pihat}}</Descr>
10
+ <Image src="{{pngfile}}" />
11
+ {%- endfor -%}
12
+ {%- endmacro -%}
13
+
14
+ {%- macro head_job(job) -%}
15
+ <h1>Sample: {{job.in.cnrfile | stem0 }}</h1>
16
+ {%- endmacro -%}
17
+
18
+ {{ report_jobs(jobs, head_job, report_job) }}
@@ -1,15 +1,15 @@
1
1
  from pathlib import Path
2
2
 
3
+ import warnings
3
4
  import pandas
4
- from biopipen.scripts.vcf.VcfFix_utils import HeaderContig, fix_vcffile
5
+ from datetime import datetime
5
6
  from biopipen.utils.reference import bam_index
6
- from biopipen.utils.misc import run_command, dict_to_cli_args
7
+ from biopipen.utils.misc import run_command, dict_to_cli_args, logger
7
8
 
8
- bamfile = {{in.bamfile | quote}} # pyright: ignore
9
+ bamfile = {{in.bamfile | quote}} # pyright: ignore # noqa
9
10
  snpfile = {{in.snpfile | repr}} # pyright: ignore
10
11
  outdir = Path({{out.outdir | quote}}) # pyright: ignore
11
12
  cnvpytor = {{envs.cnvpytor | quote}} # pyright: ignore
12
- cnvnator2vcf = {{envs.cnvnator2vcf | quote}} # pyright: ignore
13
13
  samtools = {{envs.samtools | quote}} # pyright: ignore
14
14
  ncores = {{envs.ncores | int}} # pyright: ignore
15
15
  refdir = {{envs.refdir | quote}} # pyright: ignore
@@ -20,7 +20,6 @@ args = {{envs | repr}} # pyright: ignore
20
20
 
21
21
  del args['cnvpytor']
22
22
  del args['ncores']
23
- del args['cnvnator2vcf']
24
23
  del args['samtools']
25
24
  del args['refdir']
26
25
  del args['genome']
@@ -236,47 +235,138 @@ def load_chrsize():
236
235
  yield chrom, int(size)
237
236
 
238
237
 
239
- def cnvpytor2vcf(infile, snp, fix=True):
240
- unfixedfile = Path(infile).with_suffix(f".unfixed.vcf")
241
- outfile = Path(infile).with_suffix(f".vcf")
242
- stdout = run_command(
243
- dict_to_cli_args(
244
- {
245
- "": cnvpytor2vcf,
246
- "reference": genome,
247
- "_": [infile, refdir],
248
- },
249
- prefix="-",
250
- ),
251
- stdout="return",
252
- )
253
- if fix:
254
- unfixedfile.write_text(stdout)
238
+ def parse_chrom(chrom, chromdir):
239
+ file = Path(chromdir) / f"{chrom}.fa"
240
+ if not file.exists():
241
+ warnings.warn(f"Chromosome file not found in refdir: {chrom}")
242
+ return ""
255
243
 
256
- fixes = [
257
- {
258
- "kind": "format",
259
- "id": "PE",
260
- "fix": lambda obj: setattr(obj, 'Type', 'String')
261
- },
262
- {
263
- "kind": "fields",
264
- "fix": lambda items: items.__setitem__(-1, Path(bamfile).stem)
265
- }
266
- ]
244
+ seq = ""
245
+ with open(file) as f:
246
+ for line in f:
247
+ line = line.strip()
248
+ if not line:
249
+ continue
250
+ if line.startswith(">"):
251
+ seq = ""
252
+ else:
253
+ seq += line
254
+ return seq
255
+
256
+
257
+ def cnvpytor2vcf(infile, snp):
258
+ # snp: in case to be used in the future
259
+ outfile = Path(infile).with_suffix(f".vcf")
260
+ # stdout = run_command(
261
+ # dict_to_cli_args(
262
+ # {
263
+ # "": cnvnator2vcf,
264
+ # "reference": genome,
265
+ # "_": [infile, refdir],
266
+ # },
267
+ # prefix="-",
268
+ # ),
269
+ # stdout="return",
270
+ # )
271
+ ## command hangs
272
+ with open(infile) as fin, open(outfile, "w") as fout:
273
+ fout.write("##fileformat=VCFv4.2\n")
274
+ fout.write(f"##fileDate={datetime.now().strftime('%Y%m%d')}\n")
275
+ fout.write(f"##reference={genome}\n")
276
+ fout.write(f"##source=CNVpytor\n")
267
277
  for chrom, size in load_chrsize():
268
- fixes.append({
269
- "kind": "contig",
270
- "append": True,
271
- "fix": (
272
- lambda obj, chrom=chrom, size=size:
273
- HeaderContig(ID=chrom, length=size)
274
- )
275
- })
278
+ fout.write(f"##contig=<ID={chrom},length={size}>\n")
279
+ fout.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n')
280
+ fout.write('##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">\n')
281
+ fout.write('##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">\n')
282
+ fout.write('##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">\n')
283
+ fout.write('##INFO=<ID=natorRD,Number=1,Type=Float,Description="Normalized RD">\n')
284
+ fout.write('##INFO=<ID=natorP1,Number=1,Type=Float,Description="e-val by t-test">\n')
285
+ fout.write('##INFO=<ID=natorP2,Number=1,Type=Float,Description="e-val by Gaussian tail">\n')
286
+ fout.write('##INFO=<ID=natorP3,Number=1,Type=Float,Description="e-val by t-test (middle)">\n')
287
+ fout.write('##INFO=<ID=natorP4,Number=1,Type=Float,Description="e-val by Gaussian tail (middle)">\n')
288
+ fout.write('##INFO=<ID=natorQ0,Number=1,Type=Float,Description="Fraction of reads with 0 mapping quality">\n')
289
+ fout.write('##INFO=<ID=natorPE,Number=1,Type=Integer,Description="Number of paired-ends support the event">\n')
290
+ fout.write('##INFO=<ID=SAMPLES,Number=.,Type=String,Description="Sample genotyped to have the variant">\n')
291
+ fout.write('##ALT=<ID=DEL,Description="Deletion">\n')
292
+ fout.write('##ALT=<ID=DUP,Description="Duplication">\n')
293
+ fout.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
294
+ fout.write('##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">\n')
295
+ fout.write('##FORMAT=<ID=PE,Number=1,Type=String,Description="Number of paired-ends that support the event">\n')
296
+ fout.write(f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{Path(bamfile).stem}\n")
297
+ prev_chrom, chrom_seq, count = "", "", 0
298
+ for line in fin:
299
+ # type, coor, length, rd, p1, p2, p3, p4, q0, pe = line.strip("\n").split()
300
+ items = line.strip("\n").split()
301
+ type, coor, length = items[:3]
302
+ rd = float(items[3]) if len(items) > 3 else False
303
+ p1 = items[4] if len(items) > 4 else ""
304
+ p2 = items[5] if len(items) > 5 else ""
305
+ p3 = items[6] if len(items) > 6 else ""
306
+ p4 = items[7] if len(items) > 7 else ""
307
+ q0 = items[8] if len(items) > 8 else ""
308
+ pe = items[9] if len(items) > 9 else ""
309
+ chrom, pos = coor.split(":")
310
+ start, end = pos.split("-")
311
+ start, end = int(start), int(end)
312
+ is_del = type == "deletion"
313
+ is_dup = type == "duplication"
314
+
315
+ if not is_del and not is_dup:
316
+ warnings.warn(f"Skipping unrecognized CNV type: {type}")
317
+ continue
276
318
 
277
- fix_vcffile(unfixedfile, outfile, fixes)
278
- else:
279
- outfile.write_text(stdout)
319
+ if chrom != prev_chrom:
320
+ chrom_seq = parse_chrom(chrom, refdir)
321
+ prev_chrom = chrom
322
+
323
+ count += 1
324
+ info = f"END={end}"
325
+ info += f";SVTYPE=DEL;SVLEN=-{length}" if is_del else f";SVTYPE=DUP;SVLEN={length}"
326
+ info += ";IMPRECISE"
327
+ info += f";natorRD={rd}" if rd is not False else ""
328
+ info += f";natorP1={p1}" if p1 else ""
329
+ info += f";natorP2={p2}" if p2 else ""
330
+ info += f";natorP3={p3}" if p3 else ""
331
+ info += f";natorP4={p4}" if p4 else ""
332
+ info += f";natorQ0={q0}" if q0 else ""
333
+ info += f";natorPE={pe}" if pe else ""
334
+
335
+ gt = "GT"
336
+ if rd is not False:
337
+ gt += ":CN"
338
+ gt += ":PE" if pe else ""
339
+ gt += "\t"
340
+ if is_del and rd < 0.25:
341
+ gt += "1/1:0"
342
+ elif is_del and rd >= 0.25:
343
+ gt += "0/1:1"
344
+ elif rd <= 1.75:
345
+ gt += "0/1:2"
346
+ elif rd > 1.75 and rd <= 2.25:
347
+ gt += "1/1:2"
348
+ elif rd > 2.25:
349
+ gt += f"./2:{rd:.0f}"
350
+ else:
351
+ gt = "GT:PE\t./." if pe else "GT\t./."
352
+
353
+ gt += f":{pe}" if pe else ""
354
+ else:
355
+ gt += "\t./."
356
+
357
+ fout.write("\t".join(
358
+ [
359
+ chrom,
360
+ str(start),
361
+ f"CNVpytor_{'del_' if is_del else 'dup_'}{count}",
362
+ chrom_seq[start - 1] if start < len(chrom_seq) else "N",
363
+ "<DEL>" if is_del else "<DUP>",
364
+ ".",
365
+ "PASS",
366
+ info,
367
+ gt,
368
+ ]
369
+ ) + "\n")
280
370
 
281
371
 
282
372
  def do_case():
@@ -290,7 +380,7 @@ def do_case():
290
380
  rootfile = outdir / "file.pytor"
291
381
  case["j"] = case.get("j", ncores)
292
382
 
293
- # read depth signal
383
+ logger.info("Reading depth signals ...")
294
384
  run_command(
295
385
  dict_to_cli_args(
296
386
  {
@@ -305,7 +395,7 @@ def do_case():
305
395
  fg=True,
306
396
  )
307
397
 
308
- # predicting cnv
398
+ logger.info("Predicting CNVs ...")
309
399
  run_command(
310
400
  dict_to_cli_args(
311
401
  {
@@ -314,6 +404,7 @@ def do_case():
314
404
  "his": binsizes,
315
405
  },
316
406
  prefix="-",
407
+ dup_key=False,
317
408
  ),
318
409
  fg=True,
319
410
  )
@@ -326,6 +417,7 @@ def do_case():
326
417
  "partition": binsizes,
327
418
  },
328
419
  prefix="-",
420
+ dup_key=False,
329
421
  ),
330
422
  fg=True,
331
423
  )
@@ -336,6 +428,7 @@ def do_case():
336
428
  mask_snps = snp.pop("mask_snps", True)
337
429
  baf_nomask = snp.pop("baf_nomask", False)
338
430
 
431
+ logger.info("Importing SNP data ...")
339
432
  run_command(
340
433
  dict_to_cli_args(
341
434
  {
@@ -350,6 +443,7 @@ def do_case():
350
443
  )
351
444
 
352
445
  if mask_snps:
446
+ logger.info("Masking 1000 Genome SNPs ...")
353
447
  run_command(
354
448
  dict_to_cli_args(
355
449
  {
@@ -362,6 +456,7 @@ def do_case():
362
456
  fg=True,
363
457
  )
364
458
 
459
+ logger.info("Calculating BAF histograms ...")
365
460
  run_command(
366
461
  dict_to_cli_args(
367
462
  {
@@ -375,8 +470,9 @@ def do_case():
375
470
  fg=True,
376
471
  )
377
472
 
378
- # call
473
+ logger.info("Predicting CNV regions using joint caller ...")
379
474
  for binsize in binsizes:
475
+ logger.info(f"- binsize: {binsize}")
380
476
  outfile = outdir / f"calls{'.combined' if snp is not False else ''}.{binsize}.tsv"
381
477
  outfile_filtered = outdir / f"calls{'.combined' if snp is not False else ''}.{binsize}.filtered.tsv"
382
478
  run_command(
@@ -392,6 +488,7 @@ def do_case():
392
488
  stdout=outfile,
393
489
  )
394
490
 
491
+ logger.info(" Converting to other formats ...")
395
492
  cnvpytor2other(outfile, bool(snp), "gff")
396
493
  cnvpytor2other(outfile, bool(snp), "bed")
397
494
  cnvpytor2vcf(outfile, bool(snp))
@@ -424,6 +521,7 @@ def do_case():
424
521
  cnvpytor2vcf(outfile_filtered, bool(snp))
425
522
 
426
523
  # plots
524
+ logger.info(" Plotting ...")
427
525
  manplot = outdir / f"manhattan.{binsize}.png"
428
526
  run_command(
429
527
  dict_to_cli_args(
@@ -0,0 +1,54 @@
1
+ from pathlib import Path
2
+ from biopipen.utils.misc import run_command, dict_to_cli_args, logger
3
+
4
+ afile = Path({{in.afile | repr}}) # pyright: ignore # noqa: #999
5
+ bfile = Path({{in.bfile | repr}}) # pyright: ignore
6
+ outfile = {{out.outfile | repr}} # pyright: ignore
7
+ envs = {{envs | repr}} # pyright: ignore
8
+
9
+ bedtools = envs.pop("bedtools")
10
+ sort = envs.pop("sort")
11
+ chrsize = envs.pop("chrsize")
12
+ postcmd = envs.pop("postcmd", None)
13
+ outdir = Path(outfile).parent
14
+
15
+ if chrsize and "g" in envs:
16
+ logger.warning("Ignoring envs.g because envs.chrsize is provided.")
17
+ envs["g"] = Path(chrsize).expanduser()
18
+ elif chrsize:
19
+ envs["g"] = Path(chrsize).expanduser()
20
+
21
+ if sort:
22
+ afile_sorted = outdir / f"{afile.stem}_sorted{afile.suffix}"
23
+ bfile_sorted = outdir / f"{bfile.stem}_sorted{bfile.suffix}"
24
+ run_command(
25
+ [bedtools, "sort", "-g", envs["g"], "-i", afile],
26
+ stdout=afile_sorted,
27
+ )
28
+ run_command(
29
+ [bedtools, "sort", "-g", envs["g"], "-i", bfile],
30
+ stdout=bfile_sorted,
31
+ )
32
+ afile = afile_sorted
33
+ bfile = bfile_sorted
34
+
35
+ envs[""] = [bedtools, "intersect"]
36
+ envs["a"] = afile
37
+ envs["b"] = bfile
38
+ envs.setdefault("sorted", True)
39
+
40
+ if envs["sorted"] and not "g" in envs:
41
+ raise ValueError("envs.g is required or manullay set envs.sorted to False.")
42
+
43
+ if postcmd:
44
+ ofile = Path(outfile).with_suffix(".prior.bt")
45
+ run_command(dict_to_cli_args(envs, prefix="-"), stdout=ofile)
46
+ postcmd_file = outdir / "_postcmd.sh"
47
+ postcmd_file.write_text(postcmd)
48
+ run_command(
49
+ ["bash", postcmd_file],
50
+ env={"infile": ofile, "outfile": outfile, "outdir": outdir},
51
+ fg=True,
52
+ )
53
+ else:
54
+ run_command(dict_to_cli_args(envs, prefix="-"), stdout=outfile)
@@ -1,6 +1,6 @@
1
1
  from biopipen.utils import run_command, dict_to_cli_args
2
2
 
3
- inbed = {{in.inbed | repr}} # pyright: ignore
3
+ inbed = {{in.inbed | repr}} # pyright: ignore # noqa: #999
4
4
  outbed = {{out.outbed | repr}} # pyright: ignore
5
5
  envs = {{envs | repr}} # pyright: ignore
6
6
  bedtools = envs.pop("bedtools", "bedtools")