biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +28 -0
  3. biopipen/core/filters.py +79 -4
  4. biopipen/core/proc.py +12 -3
  5. biopipen/core/testing.py +75 -3
  6. biopipen/ns/bam.py +148 -6
  7. biopipen/ns/bed.py +75 -0
  8. biopipen/ns/cellranger.py +186 -0
  9. biopipen/ns/cellranger_pipeline.py +126 -0
  10. biopipen/ns/cnv.py +19 -3
  11. biopipen/ns/cnvkit.py +1 -1
  12. biopipen/ns/cnvkit_pipeline.py +20 -12
  13. biopipen/ns/delim.py +34 -35
  14. biopipen/ns/gene.py +68 -23
  15. biopipen/ns/gsea.py +63 -37
  16. biopipen/ns/misc.py +39 -14
  17. biopipen/ns/plot.py +304 -1
  18. biopipen/ns/protein.py +183 -0
  19. biopipen/ns/regulatory.py +290 -0
  20. biopipen/ns/rnaseq.py +142 -5
  21. biopipen/ns/scrna.py +2053 -473
  22. biopipen/ns/scrna_metabolic_landscape.py +228 -382
  23. biopipen/ns/snp.py +659 -0
  24. biopipen/ns/stats.py +484 -0
  25. biopipen/ns/tcr.py +683 -98
  26. biopipen/ns/vcf.py +236 -2
  27. biopipen/ns/web.py +97 -6
  28. biopipen/reports/bam/CNVpytor.svelte +4 -9
  29. biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
  30. biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
  31. biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
  32. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  33. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  34. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  35. biopipen/reports/common.svelte +15 -0
  36. biopipen/reports/protein/ProdigySummary.svelte +16 -0
  37. biopipen/reports/scrna/CellsDistribution.svelte +4 -39
  38. biopipen/reports/scrna/DimPlots.svelte +1 -1
  39. biopipen/reports/scrna/MarkersFinder.svelte +6 -126
  40. biopipen/reports/scrna/MetaMarkers.svelte +3 -75
  41. biopipen/reports/scrna/RadarPlots.svelte +4 -20
  42. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
  43. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
  44. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
  45. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  46. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  47. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  48. biopipen/reports/snp/PlinkHet.svelte +18 -0
  49. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  50. biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
  51. biopipen/reports/tcr/ClonalStats.svelte +16 -0
  52. biopipen/reports/tcr/CloneResidency.svelte +3 -93
  53. biopipen/reports/tcr/Immunarch.svelte +4 -155
  54. biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
  55. biopipen/reports/tcr/TESSA.svelte +11 -28
  56. biopipen/reports/utils/misc.liq +22 -7
  57. biopipen/scripts/bam/BamMerge.py +11 -15
  58. biopipen/scripts/bam/BamSampling.py +90 -0
  59. biopipen/scripts/bam/BamSort.py +141 -0
  60. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  61. biopipen/scripts/bam/BamSubsetByBed.py +38 -0
  62. biopipen/scripts/bam/CNAClinic.R +41 -5
  63. biopipen/scripts/bam/CNVpytor.py +153 -54
  64. biopipen/scripts/bam/ControlFREEC.py +13 -14
  65. biopipen/scripts/bam/SamtoolsView.py +33 -0
  66. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  67. biopipen/scripts/bed/BedConsensus.py +5 -5
  68. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  69. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  70. biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
  71. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  72. biopipen/scripts/cellranger/CellRangerCount.py +138 -0
  73. biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
  74. biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
  75. biopipen/scripts/cnv/AneuploidyScore.R +55 -20
  76. biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
  77. biopipen/scripts/cnv/TMADScore.R +25 -9
  78. biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
  79. biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
  80. biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
  81. biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
  82. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  83. biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
  84. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  85. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  86. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
  87. biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
  88. biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
  89. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  90. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  91. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  92. biopipen/scripts/delim/RowsBinder.R +1 -1
  93. biopipen/scripts/delim/SampleInfo.R +116 -118
  94. biopipen/scripts/gene/GeneNameConversion.R +67 -0
  95. biopipen/scripts/gene/GenePromoters.R +61 -0
  96. biopipen/scripts/gsea/Enrichr.R +5 -5
  97. biopipen/scripts/gsea/FGSEA.R +184 -50
  98. biopipen/scripts/gsea/GSEA.R +2 -2
  99. biopipen/scripts/gsea/PreRank.R +5 -5
  100. biopipen/scripts/misc/Config2File.py +2 -2
  101. biopipen/scripts/misc/Plot.R +80 -0
  102. biopipen/scripts/misc/Shell.sh +15 -0
  103. biopipen/scripts/misc/Str2File.py +2 -2
  104. biopipen/scripts/plot/Heatmap.R +3 -3
  105. biopipen/scripts/plot/Manhattan.R +147 -0
  106. biopipen/scripts/plot/QQPlot.R +146 -0
  107. biopipen/scripts/plot/ROC.R +88 -0
  108. biopipen/scripts/plot/Scatter.R +112 -0
  109. biopipen/scripts/plot/VennDiagram.R +5 -9
  110. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  111. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  112. biopipen/scripts/protein/Prodigy.py +119 -0
  113. biopipen/scripts/protein/ProdigySummary.R +140 -0
  114. biopipen/scripts/protein/RMSD.py +178 -0
  115. biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
  116. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
  117. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
  118. biopipen/scripts/regulatory/MotifScan.py +159 -0
  119. biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
  120. biopipen/scripts/regulatory/motifs-common.R +324 -0
  121. biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
  122. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
  123. biopipen/scripts/rnaseq/Simulation.R +21 -0
  124. biopipen/scripts/rnaseq/UnitConversion.R +325 -54
  125. biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
  126. biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
  127. biopipen/scripts/scrna/CellCellCommunication.py +150 -0
  128. biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
  129. biopipen/scripts/scrna/CellSNPLite.py +30 -0
  130. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
  131. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
  132. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
  133. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
  134. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
  135. biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
  136. biopipen/scripts/scrna/CellsDistribution.R +456 -167
  137. biopipen/scripts/scrna/DimPlots.R +1 -1
  138. biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
  139. biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
  140. biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
  141. biopipen/scripts/scrna/ExprImputation.R +7 -0
  142. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  143. biopipen/scripts/scrna/MQuad.py +25 -0
  144. biopipen/scripts/scrna/MarkersFinder.R +679 -400
  145. biopipen/scripts/scrna/MetaMarkers.R +265 -161
  146. biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
  147. biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
  148. biopipen/scripts/scrna/RadarPlots.R +355 -134
  149. biopipen/scripts/scrna/ScFGSEA.R +298 -100
  150. biopipen/scripts/scrna/ScSimulation.R +65 -0
  151. biopipen/scripts/scrna/ScVelo.py +617 -0
  152. biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
  153. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
  154. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
  155. biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
  156. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
  157. biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
  158. biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
  159. biopipen/scripts/scrna/SeuratClustering.R +36 -233
  160. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  161. biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
  162. biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
  163. biopipen/scripts/scrna/SeuratPreparing.R +223 -173
  164. biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
  165. biopipen/scripts/scrna/SeuratTo10X.R +27 -0
  166. biopipen/scripts/scrna/Slingshot.R +65 -0
  167. biopipen/scripts/scrna/Subset10X.R +2 -2
  168. biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
  169. biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
  170. biopipen/scripts/scrna/scvelo_paga.py +313 -0
  171. biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
  172. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
  173. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
  174. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
  175. biopipen/scripts/snp/MatrixEQTL.R +217 -0
  176. biopipen/scripts/snp/Plink2GTMat.py +148 -0
  177. biopipen/scripts/snp/PlinkCallRate.R +199 -0
  178. biopipen/scripts/snp/PlinkFilter.py +100 -0
  179. biopipen/scripts/snp/PlinkFreq.R +291 -0
  180. biopipen/scripts/snp/PlinkFromVcf.py +81 -0
  181. biopipen/scripts/snp/PlinkHWE.R +85 -0
  182. biopipen/scripts/snp/PlinkHet.R +96 -0
  183. biopipen/scripts/snp/PlinkIBD.R +196 -0
  184. biopipen/scripts/snp/PlinkSimulation.py +124 -0
  185. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  186. biopipen/scripts/stats/ChowTest.R +146 -0
  187. biopipen/scripts/stats/DiffCoexpr.R +152 -0
  188. biopipen/scripts/stats/LiquidAssoc.R +135 -0
  189. biopipen/scripts/stats/Mediation.R +108 -0
  190. biopipen/scripts/stats/MetaPvalue.R +130 -0
  191. biopipen/scripts/stats/MetaPvalue1.R +74 -0
  192. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  193. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  194. biopipen/scripts/tcr/Attach2Seurat.R +3 -2
  195. biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
  196. biopipen/scripts/tcr/CDR3Clustering.R +343 -0
  197. biopipen/scripts/tcr/ClonalStats.R +526 -0
  198. biopipen/scripts/tcr/CloneResidency.R +255 -131
  199. biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
  200. biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
  201. biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
  202. biopipen/scripts/tcr/GIANA/query.py +164 -162
  203. biopipen/scripts/tcr/Immunarch-basic.R +31 -9
  204. biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
  205. biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
  206. biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
  207. biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
  208. biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
  209. biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
  210. biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
  211. biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
  212. biopipen/scripts/tcr/Immunarch.R +63 -11
  213. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  214. biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
  215. biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
  216. biopipen/scripts/tcr/SampleDiversity.R +1 -1
  217. biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
  218. biopipen/scripts/tcr/ScRepLoading.R +166 -0
  219. biopipen/scripts/tcr/TCRClusterStats.R +176 -22
  220. biopipen/scripts/tcr/TCRDock.py +110 -0
  221. biopipen/scripts/tcr/TESSA.R +102 -118
  222. biopipen/scripts/tcr/VJUsage.R +5 -5
  223. biopipen/scripts/tcr/immunarch-patched.R +142 -0
  224. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  225. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  226. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  227. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  228. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  229. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  230. biopipen/scripts/vcf/TruvariBench.sh +14 -7
  231. biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
  232. biopipen/scripts/vcf/TruvariConsistency.R +1 -1
  233. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  234. biopipen/scripts/vcf/VcfAnno.py +11 -11
  235. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  236. biopipen/scripts/vcf/VcfFilter.py +5 -5
  237. biopipen/scripts/vcf/VcfFix.py +7 -7
  238. biopipen/scripts/vcf/VcfFix_utils.py +13 -4
  239. biopipen/scripts/vcf/VcfIndex.py +3 -3
  240. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  241. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  242. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  243. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  244. biopipen/scripts/web/Download.py +8 -4
  245. biopipen/scripts/web/DownloadList.py +5 -5
  246. biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
  247. biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
  248. biopipen/scripts/web/gcloud_common.py +49 -0
  249. biopipen/utils/gene.py +108 -60
  250. biopipen/utils/misc.py +146 -20
  251. biopipen/utils/reference.py +64 -20
  252. biopipen/utils/reporter.py +177 -0
  253. biopipen/utils/vcf.py +1 -1
  254. biopipen-0.34.26.dist-info/METADATA +27 -0
  255. biopipen-0.34.26.dist-info/RECORD +292 -0
  256. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
  257. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
  258. biopipen/ns/bcftools.py +0 -111
  259. biopipen/ns/scrna_basic.py +0 -255
  260. biopipen/reports/delim/SampleInfo.svelte +0 -36
  261. biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
  262. biopipen/reports/scrna/ScFGSEA.svelte +0 -35
  263. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
  264. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
  265. biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
  266. biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
  267. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
  268. biopipen/reports/utils/gsea.liq +0 -110
  269. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  270. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  271. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  272. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  273. biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
  274. biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
  275. biopipen/scripts/scrna/ExprImpution.R +0 -7
  276. biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
  277. biopipen/scripts/scrna/Write10X.R +0 -11
  278. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
  279. biopipen/scripts/tcr/TCRClustering.R +0 -280
  280. biopipen/utils/common_docstrs.py +0 -61
  281. biopipen/utils/gene.R +0 -49
  282. biopipen/utils/gsea.R +0 -193
  283. biopipen/utils/io.R +0 -20
  284. biopipen/utils/misc.R +0 -114
  285. biopipen/utils/mutate_helpers.R +0 -433
  286. biopipen/utils/plot.R +0 -173
  287. biopipen/utils/rnaseq.R +0 -48
  288. biopipen/utils/single_cell.R +0 -115
  289. biopipen-0.21.0.dist-info/METADATA +0 -22
  290. biopipen-0.21.0.dist-info/RECORD +0 -218
@@ -0,0 +1,148 @@
1
+
2
+ from os import path
3
+ from glob import glob
4
+ from biopipen.utils.misc import run_command, logger
5
+
6
+ indir: str = {{in.indir | quote}} # noqa: E999 # pyright: ignore
7
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
8
+ plink: str = {{envs.plink | quote}} # pyright: ignore
9
+ ncores: int = {{envs.ncores | repr}} # pyright: ignore
10
+ transpose: bool = {{envs.transpose | repr}} # pyright: ignore
11
+ samid: str = {{envs.samid | repr}} # pyright: ignore
12
+ varid: str = {{envs.varid | repr}} # pyright: ignore
13
+ trans_chr: dict = {{envs.trans_chr | repr}} # pyright: ignore
14
+ missing_id: str = {{envs.missing_id | repr}} # pyright: ignore
15
+ gtcoding: str = {{envs.gtcoding | repr}} # pyright: ignore
16
+ trans_chr = trans_chr or {}
17
+
18
+ bedfile = glob(path.join(indir, '*.bed'))
19
+ if len(bedfile) == 0:
20
+ raise FileNotFoundError(f"No .bed file found in `in.indir`")
21
+ elif len(bedfile) > 1:
22
+ logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
23
+
24
+ bedfile = bedfile[0]
25
+ input = path.splitext(bedfile)[0]
26
+ output = path.splitext(outfile)[0]
27
+
28
+ cmd = [
29
+ plink,
30
+ "--bfile", input,
31
+ "--out", output,
32
+ "--threads", ncores,
33
+ "--keep-allele-order",
34
+ "--recode", "A-transpose" if not transpose else "A",
35
+ ]
36
+ # if transpose:
37
+ # cmd += ["tabx"]
38
+
39
+ run_command(cmd, fg=True, env={"cwd": path.dirname(outfile)})
40
+
41
+
42
+ def _vcf_gtcoding(gt):
43
+ try:
44
+ return str(2 - int(gt))
45
+ except (ValueError, TypeError):
46
+ return "NA"
47
+
48
+
49
def _variant_label(cols):
    """Render the `varid` template for one variant row.

    `cols` is a split .traw or .bim row. Only these positions are read:
    0 (chromosome, passed through `trans_chr`), 1 (variant ID, replaced by
    `missing_id` when it is "." or empty), 3 (position), 5 (fills `{ref}`)
    and 4 (fills `{alt}`).
    """
    chrom = trans_chr.get(cols[0], cols[0])
    var = missing_id if cols[1] in (".", "") else cols[1]
    # The replacement order is significant when a substituted value itself
    # contains a placeholder, so it is kept fixed.
    return (
        varid
        .replace('{chr}', chrom)
        .replace('{varid}', var)
        .replace('{pos}', cols[3])
        .replace('{ref}', cols[5])
        .replace('{alt}', cols[4])
    )


def _sample_label(fid, iid):
    """Render the `samid` template for one sample."""
    return samid.replace('{fid}', fid).replace('{iid}', iid)


def _dosage_fields(values):
    """Return dosages unchanged for "plink" coding, flipped otherwise."""
    if gtcoding == "plink":
        return values
    return [_vcf_gtcoding(value) for value in values]


if transpose:
    # Rows are samples, columns are variants.
    #
    # The tab-separated .raw file has the columns:
    #   FID             Family ID
    #   IID             Individual ID
    #   PAT             Paternal ID
    #   MAT             Maternal ID
    #   SEX             Sex (1 = male, 2 = female, 0 = unknown)
    #   PHENOTYPE       Main phenotype value
    #   <VariantID>...  Allelic dosage (0/1/2/NA for diploid variants,
    #                   0/2/NA for haploid)
    #
    # Variant information may not be included in <VariantID>, so the header
    # of the output is built from the .bim file instead.
    rawfile = output + ".raw"
    bimfile = input + ".bim"
    with open(rawfile, 'r') as fin, open(outfile, 'w') as fout:
        with open(bimfile, 'r') as fbim:
            variant_labels = [
                _variant_label(row.strip().split('\t')) for row in fbim
            ]
        fout.write('\t'.join(["Sample", *variant_labels]) + '\n')

        next(fin)  # skip the .raw header line
        for row in fin:
            cols = row.strip().split('\t')
            record = [_sample_label(cols[0], cols[1]), *_dosage_fields(cols[6:])]
            fout.write('\t'.join(record) + '\n')

else:
    # Rows are variants, columns are samples.
    #
    # The tab-separated .traw file has the columns:
    #   CHR             Chromosome code
    #   SNP             Variant identifier
    #   (C)M            Position in morgans or centimorgans
    #   POS             Base-pair coordinate
    #   COUNTED         Counted allele (defaults to A1), the actual alternative
    #                   allele with --keep-allele-order
    #   ALT             Other allele(s), comma-separated, the actual reference
    #                   allele
    #   <FID>_<IID>...  Allelic dosages
    #                   (0/1/2/'NA' for diploid variants, 0/2/'NA' for haploid)
    trawfile = output + ".traw"
    with open(trawfile, 'r') as fin, open(outfile, 'w') as fout:
        sample_cols = fin.readline().strip().split('\t')[6:]
        header = ["Variant"]
        for sample_col in sample_cols:
            # Sample columns are named <FID>_<IID>; exactly one underscore is
            # required to split them unambiguously.
            try:
                fid, iid = sample_col.split('_')
            except ValueError:
                raise ValueError(
                    f"Can't determine FID and IID from sample ID: {sample_col}, "
                    f"extra underscore (_) detected."
                ) from None
            header.append(_sample_label(fid, iid))
        fout.write('\t'.join(header) + '\n')

        for row in fin:
            cols = row.strip().split('\t')
            record = [_variant_label(cols), *_dosage_fields(cols[6:])]
            fout.write('\t'.join(record) + '\n')
@@ -0,0 +1,199 @@
1
+ library(plotthis)
2
+ library(biopipen.utils)
3
+
4
# Job parameters; the `{{ ... }}` placeholders are template expressions that
# are substituted with R literals before this script is executed.
indir <- {{in.indir | r}}
outdir <- {{out.outdir | r}}
plink <- {{envs.plink | r}}
ncores <- {{envs.ncores | r}}
doplot <- {{envs.plot | r}}
devpars <- {{envs.devpars | r}}
samplecr <- {{envs.samplecr | r}}
varcr <- {{envs.varcr | r}}
max_iter <- {{envs.max_iter | r}}

log <- get_logger()

# Locate the PLINK binary fileset in the input directory via its .bed file.
bed_matches <- Sys.glob(file.path(indir, '*.bed'))
n_bed <- length(bed_matches)
if (n_bed == 0) {
    stop("No bed files found in the input directory.")
}
if (n_bed > 1) {
    log$warn("Multiple bed files found in the input directory. Using the first one.")
}
bedfile <- bed_matches[1]

# `input` is the fileset prefix (path without the .bed extension);
# `output` is the same base name placed inside the output directory.
input <- tools::file_path_sans_ext(bedfile)
output <- file.path(outdir, basename(input))
26
# Cumulative outputs that aggregate the results of every iteration.
all_smiss_file <- paste0(output, '.smiss')
all_vmiss_file <- paste0(output, '.vmiss')
all_samplecr_fail_file <- paste0(output, '.samplecr.fail')
all_varcr_fail_file <- paste0(output, '.varcr.fail')

# Remove leftovers from a previous run; all cumulative files are rebuilt below.
for (stale_file in c(
    all_smiss_file, all_vmiss_file, all_samplecr_fail_file, all_varcr_fail_file
)) {
    if (file.exists(stale_file)) invisible(file.remove(stale_file))
}

# Write `ids` to `path`, one entry per line (an empty vector yields an empty
# file or, when appending, leaves the file untouched).
#
# The file is opened explicitly in "w" or "a" mode and always closed again.
# This matters because `write()`/`cat()` only honour `append` when `file` is a
# file name: handing them a `file()` connection silently overwrites the target.
write_id_list <- function(ids, path, append = FALSE) {
    con <- file(path, open = if (append) "a" else "w")
    on.exit(close(con), add = TRUE)
    writeLines(as.character(ids), con)
    invisible(path)
}

# Read a PLINK missingness report, keeping the header names verbatim.
read_miss_report <- function(path) {
    read.table(
        path,
        header = TRUE,
        row.names = NULL,
        check.names = FALSE,
        comment.char = ""
    )
}

# Repeatedly compute call rates and drop failing samples/variants until
# nothing fails any more or `max_iter` iterations have been run.
# NOTE(review): if `max_iter` is exhausted before a clean iteration is reached,
# no <output>.bed/.bim/.fam links are created - confirm this is intended.
for (i in seq_len(max_iter)) {
    log$info("Iteration {i} ...")
    # Each iteration writes into its own sub-directory of the output directory.
    iter_dir <- file.path(outdir, paste0("iter", i))
    dir.create(iter_dir, showWarnings = FALSE)
    iter_out <- file.path(iter_dir, basename(output))
    cmd <- c(
        plink,
        "--threads", ncores,
        "--bfile", input,
        "--missing",
        "--out", iter_out
    )
    run_command(cmd, fg = TRUE)

    # ---- per-sample missingness -------------------------------------------
    smissfile <- paste0(iter_out, '.smiss')
    smiss <- read_miss_report(smissfile)
    smiss$Iteration <- i
    # Append to the cumulative report; the header is written only once.
    write.table(
        smiss,
        all_smiss_file,
        append = i > 1,
        col.names = i == 1,
        row.names = FALSE,
        sep = "\t",
        quote = FALSE
    )
    callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
    # NOTE(review): the header is read with check.names = FALSE and
    # comment.char = "", so a leading "#" in the first column name is kept;
    # confirm that `smiss$FID` / `smiss$IID` resolve as expected.
    rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")
    callrate.sample.fail <- rownames(callrate.sample[
        callrate.sample$Callrate < samplecr, , drop = FALSE
    ])
    # Per-iteration list (fed to --remove below) and cumulative list.
    write_id_list(callrate.sample.fail, paste0(iter_out, '.samplecr.fail'))
    write_id_list(callrate.sample.fail, all_samplecr_fail_file, append = i > 1)

    # ---- per-variant missingness ------------------------------------------
    vmiss <- read_miss_report(paste0(iter_out, '.vmiss'))
    vmiss$Iteration <- i
    write.table(
        vmiss,
        all_vmiss_file,
        append = i > 1,
        col.names = i == 1,
        row.names = FALSE,
        sep = "\t",
        quote = FALSE
    )
    vmiss$Callrate <- 1 - vmiss$F_MISS
    callrate.var.fail <- vmiss[which(vmiss$Callrate < varcr), 'ID', drop = TRUE]
    # Per-iteration list (fed to --exclude below) and cumulative list.
    write_id_list(callrate.var.fail, paste0(iter_out, '.varcr.fail'))
    write_id_list(callrate.var.fail, all_varcr_fail_file, append = i > 1)

    if (length(callrate.sample.fail) == 0 && length(callrate.var.fail) == 0) {
        # Nothing left to filter: expose the current fileset as the final
        # output through symbolic links.
        file.symlink(paste0(input, '.bed'), paste0(output, '.bed'))
        file.symlink(paste0(input, '.bim'), paste0(output, '.bim'))
        file.symlink(paste0(input, '.fam'), paste0(output, '.fam'))
        break
    }

    # Remove the failing samples and variants; the filtered fileset becomes
    # the input of the next iteration.
    cmd <- c(
        plink,
        "--threads", ncores,
        "--bfile", input,
        "--remove", paste0(iter_out, '.samplecr.fail'),
        "--exclude", paste0(iter_out, '.varcr.fail'),
        "--make-bed",
        "--out", iter_out
    )
    run_command(cmd, fg = TRUE)
    input <- iter_out
}
134
+
135
# Read a PLINK missingness report, keeping the header names verbatim.
load_miss_report <- function(path) {
    read.table(
        path,
        header = TRUE,
        row.names = NULL,
        check.names = FALSE,
        comment.char = ""
    )
}

# Draw a call-rate histogram grouped by pass/fail status and save it as PNG.
# The pixel size is derived from the "width"/"height" attributes of the plot
# object at a fixed resolution of 70.
# NOTE(review): `devpars` is read from the job parameters but not used here.
save_callrate_histogram <- function(data, xlab, pngfile) {
    p <- Histogram(
        data,
        x = "Callrate",
        group_by = "Status",
        xlab = xlab,
        ylab = "Count",
        palette = "Set1",
        alpha = 0.8,
        bins = 50
    )
    res <- 70
    png(
        pngfile,
        width = attr(p, "width") * res,
        height = attr(p, "height") * res,
        res = res
    )
    print(p)
    dev.off()
}

# Reload the reports of the last iteration that was run.
smiss <- load_miss_report(smissfile)
callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")

vmiss <- load_miss_report(paste0(iter_out, '.vmiss'))
vmiss$Callrate <- 1 - vmiss$F_MISS

if (doplot) {
    log$info("Plotting ...")
    status_levels <- c("Fail", "Pass")

    # Samples: flag those listed as failing in the last iteration.
    callrate.sample$Status <- "Pass"
    callrate.sample[callrate.sample.fail, "Status"] <- "Fail"
    callrate.sample$Status <- factor(callrate.sample$Status, levels = status_levels)
    save_callrate_histogram(
        callrate.sample,
        xlab = "Sample Call Rate",
        pngfile = paste0(output, '.samplecr.png')
    )

    # Variants: flag those whose call rate is below the threshold.
    vmiss$Status <- "Pass"
    vmiss[which(vmiss$Callrate < varcr), "Status"] <- "Fail"
    vmiss$Status <- factor(vmiss$Status, levels = status_levels)
    save_callrate_histogram(
        vmiss,
        xlab = "Variant Call Rate",
        pngfile = paste0(output, '.varcr.png')
    )
}
@@ -0,0 +1,100 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from biopipen.utils.misc import run_command, dict_to_cli_args, logger
5
+
6
+ indir: str = {{in.indir | quote}} # pyright: ignore # noqa: #999
7
+ samples_file = {{in.samples_file | quote}} # pyright: ignore
8
+ variants_file = {{in.variants_file | quote}} # pyright: ignore
9
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
10
+
11
+ plink = {{envs.plink | repr}} # pyright: ignore
12
+ ncores = {{envs.ncores | repr}} # pyright: ignore
13
+ samples: list[str] | str = {{envs.samples | repr}} # pyright: ignore
14
+ variants: list[str] | str = {{envs.variants | repr}} # pyright: ignore
15
+ e_samples_file = {{envs.samples_file | repr}} # pyright: ignore
16
+ e_variants_file = {{envs.variants_file | repr}} # pyright: ignore
17
+ keep = {{envs.keep | repr}} # pyright: ignore
18
+ vfile_type = {{envs.vfile_type | repr}} # pyright: ignore
19
+ chr = {{envs.chr | repr}} # pyright: ignore
20
+ not_chr = {{envs.not_chr | repr}} # pyright: ignore
21
+ autosome = {{envs.autosome | repr}} # pyright: ignore
22
+ autosome_xy = {{envs.autosome_xy | repr}} # pyright: ignore
23
+ snps_only = {{envs.snps_only | repr}} # pyright: ignore
24
+
25
+ samples_file = samples_file or e_samples_file
26
+ if not samples_file and samples:
27
+ samples_file = Path(outdir) / "_samples.txt"
28
+ if isinstance(samples, str):
29
+ samples = [s.strip() for s in samples.split(",")]
30
+
31
+ with open(samples_file, "w") as fh:
32
+ fh.writelines(
33
+ [
34
+ line.replace("/", "\t") + "\n"
35
+ if "/" in line
36
+ else line + "\t" + line + "\n"
37
+ for line in samples
38
+ ]
39
+ )
40
+
41
+ variants_file = variants_file or e_variants_file
42
+ if not variants_file and variants:
43
+ if vfile_type != "id":
44
+ logger.warning(
45
+ "envs.vfile_type should be 'id' if only envs.variants is provided."
46
+ )
47
+ vfile_type = "id"
48
+
49
+ variants_file = Path(outdir) / "_variants.txt"
50
+ if isinstance(variants, str):
51
+ variants = [v.strip() for v in variants.split(",")]
52
+
53
+ with open(variants_file, "w") as fh:
54
+ fh.writelines([line + "\n" for line in variants])
55
+
56
# Locate the PLINK binary fileset inside the input directory via its .bed file.
bed_candidates = [*Path(indir).glob("*.bed")]
if not bed_candidates:
    raise FileNotFoundError("No .bed file found in `in.indir`")
if len(bed_candidates) > 1:
    logger.warning("Multiple .bed files found in `in.indir`, using the first one.")

bedfile = bed_candidates[0]
# `input` is the fileset prefix (the .bed path without its extension);
# `output` uses the same base name inside the output directory.
input = bedfile.with_name(bedfile.stem)
output = Path(outdir).joinpath(bedfile.stem)
65
+
66
# Command-line options for plink, keyed by option name. The "" key holds the
# executable itself; `dict_to_cli_args` turns the mapping into an argument list.
args = {
    "": [plink],
    "bfile": input,
    "out": output,
    "threads": ncores,
    "make-bed": True,
}

# `keep` decides whether the listed samples/variants are retained
# (keep/extract) or dropped (remove/exclude).
sample_opt, variant_opt = ("keep", "extract") if keep else ("remove", "exclude")
if samples_file:
    args[sample_opt] = samples_file
if variants_file:
    # For file types other than "id", the type is passed as a modifier in
    # front of the file name.
    args[variant_opt] = (
        variants_file if vfile_type == "id" else [vfile_type, variants_file]
    )

# Optional chromosome / variant-type filters.
if chr:
    args["chr"] = chr
if not_chr:
    args["not_chr"] = not_chr
if autosome:
    args["autosome"] = True
if autosome_xy:
    # Fix: this branch used to set "autosome" again, so `envs.autosome_xy`
    # behaved exactly like `envs.autosome`. It now gets its own option, named
    # like the other underscore keys that are passed with `dashify=True`.
    # NOTE(review): PLINK 2 documents this filter as `--autosome-par`; confirm
    # that the configured plink binary accepts `--autosome-xy`.
    args["autosome_xy"] = True
if snps_only:
    args["snps_only"] = snps_only

run_command(dict_to_cli_args(args, dashify=True, dup_key=False), fg=True)