biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +28 -0
  3. biopipen/core/filters.py +79 -4
  4. biopipen/core/proc.py +12 -3
  5. biopipen/core/testing.py +75 -3
  6. biopipen/ns/bam.py +148 -6
  7. biopipen/ns/bed.py +75 -0
  8. biopipen/ns/cellranger.py +186 -0
  9. biopipen/ns/cellranger_pipeline.py +126 -0
  10. biopipen/ns/cnv.py +19 -3
  11. biopipen/ns/cnvkit.py +1 -1
  12. biopipen/ns/cnvkit_pipeline.py +20 -12
  13. biopipen/ns/delim.py +34 -35
  14. biopipen/ns/gene.py +68 -23
  15. biopipen/ns/gsea.py +63 -37
  16. biopipen/ns/misc.py +39 -14
  17. biopipen/ns/plot.py +304 -1
  18. biopipen/ns/protein.py +183 -0
  19. biopipen/ns/regulatory.py +290 -0
  20. biopipen/ns/rnaseq.py +142 -5
  21. biopipen/ns/scrna.py +2053 -473
  22. biopipen/ns/scrna_metabolic_landscape.py +228 -382
  23. biopipen/ns/snp.py +659 -0
  24. biopipen/ns/stats.py +484 -0
  25. biopipen/ns/tcr.py +683 -98
  26. biopipen/ns/vcf.py +236 -2
  27. biopipen/ns/web.py +97 -6
  28. biopipen/reports/bam/CNVpytor.svelte +4 -9
  29. biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
  30. biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
  31. biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
  32. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  33. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  34. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  35. biopipen/reports/common.svelte +15 -0
  36. biopipen/reports/protein/ProdigySummary.svelte +16 -0
  37. biopipen/reports/scrna/CellsDistribution.svelte +4 -39
  38. biopipen/reports/scrna/DimPlots.svelte +1 -1
  39. biopipen/reports/scrna/MarkersFinder.svelte +6 -126
  40. biopipen/reports/scrna/MetaMarkers.svelte +3 -75
  41. biopipen/reports/scrna/RadarPlots.svelte +4 -20
  42. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
  43. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
  44. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
  45. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  46. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  47. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  48. biopipen/reports/snp/PlinkHet.svelte +18 -0
  49. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  50. biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
  51. biopipen/reports/tcr/ClonalStats.svelte +16 -0
  52. biopipen/reports/tcr/CloneResidency.svelte +3 -93
  53. biopipen/reports/tcr/Immunarch.svelte +4 -155
  54. biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
  55. biopipen/reports/tcr/TESSA.svelte +11 -28
  56. biopipen/reports/utils/misc.liq +22 -7
  57. biopipen/scripts/bam/BamMerge.py +11 -15
  58. biopipen/scripts/bam/BamSampling.py +90 -0
  59. biopipen/scripts/bam/BamSort.py +141 -0
  60. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  61. biopipen/scripts/bam/BamSubsetByBed.py +38 -0
  62. biopipen/scripts/bam/CNAClinic.R +41 -5
  63. biopipen/scripts/bam/CNVpytor.py +153 -54
  64. biopipen/scripts/bam/ControlFREEC.py +13 -14
  65. biopipen/scripts/bam/SamtoolsView.py +33 -0
  66. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  67. biopipen/scripts/bed/BedConsensus.py +5 -5
  68. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  69. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  70. biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
  71. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  72. biopipen/scripts/cellranger/CellRangerCount.py +138 -0
  73. biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
  74. biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
  75. biopipen/scripts/cnv/AneuploidyScore.R +55 -20
  76. biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
  77. biopipen/scripts/cnv/TMADScore.R +25 -9
  78. biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
  79. biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
  80. biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
  81. biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
  82. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  83. biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
  84. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  85. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  86. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
  87. biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
  88. biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
  89. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  90. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  91. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  92. biopipen/scripts/delim/RowsBinder.R +1 -1
  93. biopipen/scripts/delim/SampleInfo.R +116 -118
  94. biopipen/scripts/gene/GeneNameConversion.R +67 -0
  95. biopipen/scripts/gene/GenePromoters.R +61 -0
  96. biopipen/scripts/gsea/Enrichr.R +5 -5
  97. biopipen/scripts/gsea/FGSEA.R +184 -50
  98. biopipen/scripts/gsea/GSEA.R +2 -2
  99. biopipen/scripts/gsea/PreRank.R +5 -5
  100. biopipen/scripts/misc/Config2File.py +2 -2
  101. biopipen/scripts/misc/Plot.R +80 -0
  102. biopipen/scripts/misc/Shell.sh +15 -0
  103. biopipen/scripts/misc/Str2File.py +2 -2
  104. biopipen/scripts/plot/Heatmap.R +3 -3
  105. biopipen/scripts/plot/Manhattan.R +147 -0
  106. biopipen/scripts/plot/QQPlot.R +146 -0
  107. biopipen/scripts/plot/ROC.R +88 -0
  108. biopipen/scripts/plot/Scatter.R +112 -0
  109. biopipen/scripts/plot/VennDiagram.R +5 -9
  110. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  111. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  112. biopipen/scripts/protein/Prodigy.py +119 -0
  113. biopipen/scripts/protein/ProdigySummary.R +140 -0
  114. biopipen/scripts/protein/RMSD.py +178 -0
  115. biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
  116. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
  117. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
  118. biopipen/scripts/regulatory/MotifScan.py +159 -0
  119. biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
  120. biopipen/scripts/regulatory/motifs-common.R +324 -0
  121. biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
  122. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
  123. biopipen/scripts/rnaseq/Simulation.R +21 -0
  124. biopipen/scripts/rnaseq/UnitConversion.R +325 -54
  125. biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
  126. biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
  127. biopipen/scripts/scrna/CellCellCommunication.py +150 -0
  128. biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
  129. biopipen/scripts/scrna/CellSNPLite.py +30 -0
  130. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
  131. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
  132. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
  133. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
  134. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
  135. biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
  136. biopipen/scripts/scrna/CellsDistribution.R +456 -167
  137. biopipen/scripts/scrna/DimPlots.R +1 -1
  138. biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
  139. biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
  140. biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
  141. biopipen/scripts/scrna/ExprImputation.R +7 -0
  142. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  143. biopipen/scripts/scrna/MQuad.py +25 -0
  144. biopipen/scripts/scrna/MarkersFinder.R +679 -400
  145. biopipen/scripts/scrna/MetaMarkers.R +265 -161
  146. biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
  147. biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
  148. biopipen/scripts/scrna/RadarPlots.R +355 -134
  149. biopipen/scripts/scrna/ScFGSEA.R +298 -100
  150. biopipen/scripts/scrna/ScSimulation.R +65 -0
  151. biopipen/scripts/scrna/ScVelo.py +617 -0
  152. biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
  153. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
  154. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
  155. biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
  156. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
  157. biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
  158. biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
  159. biopipen/scripts/scrna/SeuratClustering.R +36 -233
  160. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  161. biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
  162. biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
  163. biopipen/scripts/scrna/SeuratPreparing.R +223 -173
  164. biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
  165. biopipen/scripts/scrna/SeuratTo10X.R +27 -0
  166. biopipen/scripts/scrna/Slingshot.R +65 -0
  167. biopipen/scripts/scrna/Subset10X.R +2 -2
  168. biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
  169. biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
  170. biopipen/scripts/scrna/scvelo_paga.py +313 -0
  171. biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
  172. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
  173. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
  174. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
  175. biopipen/scripts/snp/MatrixEQTL.R +217 -0
  176. biopipen/scripts/snp/Plink2GTMat.py +148 -0
  177. biopipen/scripts/snp/PlinkCallRate.R +199 -0
  178. biopipen/scripts/snp/PlinkFilter.py +100 -0
  179. biopipen/scripts/snp/PlinkFreq.R +291 -0
  180. biopipen/scripts/snp/PlinkFromVcf.py +81 -0
  181. biopipen/scripts/snp/PlinkHWE.R +85 -0
  182. biopipen/scripts/snp/PlinkHet.R +96 -0
  183. biopipen/scripts/snp/PlinkIBD.R +196 -0
  184. biopipen/scripts/snp/PlinkSimulation.py +124 -0
  185. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  186. biopipen/scripts/stats/ChowTest.R +146 -0
  187. biopipen/scripts/stats/DiffCoexpr.R +152 -0
  188. biopipen/scripts/stats/LiquidAssoc.R +135 -0
  189. biopipen/scripts/stats/Mediation.R +108 -0
  190. biopipen/scripts/stats/MetaPvalue.R +130 -0
  191. biopipen/scripts/stats/MetaPvalue1.R +74 -0
  192. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  193. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  194. biopipen/scripts/tcr/Attach2Seurat.R +3 -2
  195. biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
  196. biopipen/scripts/tcr/CDR3Clustering.R +343 -0
  197. biopipen/scripts/tcr/ClonalStats.R +526 -0
  198. biopipen/scripts/tcr/CloneResidency.R +255 -131
  199. biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
  200. biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
  201. biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
  202. biopipen/scripts/tcr/GIANA/query.py +164 -162
  203. biopipen/scripts/tcr/Immunarch-basic.R +31 -9
  204. biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
  205. biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
  206. biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
  207. biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
  208. biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
  209. biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
  210. biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
  211. biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
  212. biopipen/scripts/tcr/Immunarch.R +63 -11
  213. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  214. biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
  215. biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
  216. biopipen/scripts/tcr/SampleDiversity.R +1 -1
  217. biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
  218. biopipen/scripts/tcr/ScRepLoading.R +166 -0
  219. biopipen/scripts/tcr/TCRClusterStats.R +176 -22
  220. biopipen/scripts/tcr/TCRDock.py +110 -0
  221. biopipen/scripts/tcr/TESSA.R +102 -118
  222. biopipen/scripts/tcr/VJUsage.R +5 -5
  223. biopipen/scripts/tcr/immunarch-patched.R +142 -0
  224. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  225. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  226. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  227. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  228. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  229. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  230. biopipen/scripts/vcf/TruvariBench.sh +14 -7
  231. biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
  232. biopipen/scripts/vcf/TruvariConsistency.R +1 -1
  233. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  234. biopipen/scripts/vcf/VcfAnno.py +11 -11
  235. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  236. biopipen/scripts/vcf/VcfFilter.py +5 -5
  237. biopipen/scripts/vcf/VcfFix.py +7 -7
  238. biopipen/scripts/vcf/VcfFix_utils.py +13 -4
  239. biopipen/scripts/vcf/VcfIndex.py +3 -3
  240. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  241. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  242. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  243. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  244. biopipen/scripts/web/Download.py +8 -4
  245. biopipen/scripts/web/DownloadList.py +5 -5
  246. biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
  247. biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
  248. biopipen/scripts/web/gcloud_common.py +49 -0
  249. biopipen/utils/gene.py +108 -60
  250. biopipen/utils/misc.py +146 -20
  251. biopipen/utils/reference.py +64 -20
  252. biopipen/utils/reporter.py +177 -0
  253. biopipen/utils/vcf.py +1 -1
  254. biopipen-0.34.26.dist-info/METADATA +27 -0
  255. biopipen-0.34.26.dist-info/RECORD +292 -0
  256. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
  257. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
  258. biopipen/ns/bcftools.py +0 -111
  259. biopipen/ns/scrna_basic.py +0 -255
  260. biopipen/reports/delim/SampleInfo.svelte +0 -36
  261. biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
  262. biopipen/reports/scrna/ScFGSEA.svelte +0 -35
  263. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
  264. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
  265. biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
  266. biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
  267. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
  268. biopipen/reports/utils/gsea.liq +0 -110
  269. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  270. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  271. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  272. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  273. biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
  274. biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
  275. biopipen/scripts/scrna/ExprImpution.R +0 -7
  276. biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
  277. biopipen/scripts/scrna/Write10X.R +0 -11
  278. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
  279. biopipen/scripts/tcr/TCRClustering.R +0 -280
  280. biopipen/utils/common_docstrs.py +0 -61
  281. biopipen/utils/gene.R +0 -49
  282. biopipen/utils/gsea.R +0 -193
  283. biopipen/utils/io.R +0 -20
  284. biopipen/utils/misc.R +0 -114
  285. biopipen/utils/mutate_helpers.R +0 -433
  286. biopipen/utils/plot.R +0 -173
  287. biopipen/utils/rnaseq.R +0 -48
  288. biopipen/utils/single_cell.R +0 -115
  289. biopipen-0.21.0.dist-info/METADATA +0 -22
  290. biopipen-0.21.0.dist-info/RECORD +0 -218
@@ -0,0 +1,196 @@
1
+ suppressPackageStartupMessages({
2
+ library(dplyr)
3
+ library(tidyr)
4
+ library(tibble)
5
+ library(plotthis)
6
+ library(biopipen.utils)
7
+ })
8
+
9
+ indir <- {{in.indir | r}}
10
+ outdir <- {{out.outdir | r}}
11
+ plink <- {{envs.plink | r}}
12
+ indep <- {{envs.indep | r}}
13
+ highld <- {{envs.highld | r}}
14
+ devpars <- {{envs.devpars | r}}
15
+ pihat <- {{envs.pihat | r}}
16
+ samid <- {{envs.samid | r}}
17
+ annofile <- {{envs.anno | r}}
18
+ doplot <- {{envs.plot | r}}
19
+ seed <- {{envs.seed | r}}
20
+ ncores <- {{envs.ncores | r}}
21
+
22
+ log <- get_logger()
23
+
24
+ bedfile <- Sys.glob(file.path(indir, '*.bed'))
25
+ if (length(bedfile) == 0)
26
+ stop("No bed files found in the input directory.")
27
+ if (length(bedfile) > 1) {
28
+ log$warn("Multiple bed files found in the input directory. Using the first one.")
29
+ bedfile <- bedfile[1]
30
+ }
31
+ input <- tools::file_path_sans_ext(bedfile)
32
+ output <- file.path(outdir, basename(input))
33
+
34
+ cmd <- c(
35
+ plink,
36
+ "--threads", ncores,
37
+ "--bfile", input,
38
+ "--indep-pairwise", indep,
39
+ "--keep-allele-order",
40
+ # One should be mindful of running this with < 50 samples
41
+ # "--bad-ld",
42
+ "--out", output
43
+ )
44
+ if (!is.null(highld) && !isFALSE(highld)) {
45
+ cmd <- c(cmd, "--range", "--exclude", highld)
46
+ }
47
+ run_command(cmd, fg = TRUE)
48
+
49
+ prunein <- paste0(output, '.prune.in')
50
+ cmd <- c(
51
+ plink,
52
+ "--threads", ncores,
53
+ "--bfile", input,
54
+ "--extract", prunein,
55
+ "--keep-allele-order",
56
+ "--genome",
57
+ "--out", output
58
+ )
59
+ run_command(cmd, fg = TRUE)
60
+
61
+ genome <- read.table(
62
+ paste0(output, '.genome'),
63
+ row.names = NULL,
64
+ header = TRUE,
65
+ check.names = FALSE
66
+ )
67
+ # "unmelt" it
68
+ # FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT PHE DST PPC RATIO
69
+ # s1 s1 s2 s2 UN NA 1.0000 0.0000 0.0000 0.0000 -1 0.866584 0.0000 0.9194
70
+ # s1 s1 s2 s2 UN NA 0.4846 0.3724 0.1431 0.3293 -1 0.913945 0.7236 2.0375
71
+ # s1 s1 s3 s3 UN NA 1.0000 0.0000 0.0000 0.0000 -1 0.867186 0.0000 1.0791
72
+ genome$SAMPLE1 <- paste(genome$FID1, genome$IID1, sep = "\t")
73
+ genome$SAMPLE2 <- paste(genome$FID2, genome$IID2, sep = "\t")
74
+
75
+
76
+ # get all samples
77
+ samples <- unique(c(genome$SAMPLE1, genome$SAMPLE2))
78
+ # make paired into a distance-like matrix
79
+ similarity <- genome %>%
80
+ select(SAMPLE1, SAMPLE2, PI_HAT) %>%
81
+ pivot_wider(names_from = SAMPLE2, values_from = PI_HAT, values_fill = NA) %>%
82
+ as.data.frame() %>%
83
+ column_to_rownames("SAMPLE1")
84
+ rm(genome)
85
+ # get the rownames back
86
+ samids <- rownames(similarity)
87
+ # get samples that were not involved as SAMPLE1 (rows) or SAMPLE2 (columns)
88
+ missedrow <- setdiff(samples, rownames(similarity))
89
+ missedcol <- setdiff(samples, colnames(similarity))
90
+ similarity[missedrow, ] <- NA
91
+ similarity[, missedcol] <- NA
92
+ # order the matrix
93
+ similarity <- similarity[samples, samples, drop = FALSE]
94
+ # transpose the matrix to get the symmetric values
95
+ sim2 <- t(similarity)
96
+ isna <- is.na(similarity)
97
+ # fill the na's with their symmetric values
98
+ similarity[isna] <- sim2[isna]
99
+ rm(sim2)
100
+ # still missing after mirroring: set them to 0
101
+ similarity[is.na(similarity)] <- 0
102
+ # get the marks (samples that fail the pihat cutoff)
103
+ nsams <- length(samples)
104
+ fails <- which(similarity > pihat)
105
+ marks <- data.frame(x = (fails - 1)%%nsams + 1, y = ceiling(fails/nsams))
106
+ diag(similarity) <- 1
107
+
108
# Greedily choose the samples to remove so that every pair failing the PI_HAT
# cutoff loses at least one member. `marks` holds the (row, column) positions
# of the failing cells of `similarity`, i.e. 1-based indices into `samples`.
covered <- rep(FALSE, nrow(marks))
# Count in how many failing cells each sample index takes part.
counts <- table(as.vector(as.matrix(marks)))
# table() keeps the indices as names (factor levels). Convert them back to
# integers before subsetting `samples`: indexing with a factor uses the
# internal level codes, not the printed values, and selects the wrong samples
# whenever the involved indices are not exactly 1..k.
candidates <- as.integer(names(counts)[order(as.vector(counts), decreasing = TRUE)])
# character(0) rather than NULL: writeLines() only accepts character input,
# so an empty result (no pair above the cutoff) must still be a character
# vector for the .ibd.fail file to be written.
ibd.fail <- character(0)
for (samidx in candidates) {
    if (all(covered)) break
    hits <- marks$x == samidx | marks$y == samidx
    # Skip samples whose failing pairs are all resolved already; removing
    # them would not break up any remaining pair.
    if (!any(hits & !covered)) next
    covered <- covered | hits
    ibd.fail <- c(ibd.fail, samples[samidx])
}

# Each entry of `samples` is "FID<TAB>IID", one sample per line.
ibd_fail_file <- paste0(output, '.ibd.fail')
writeLines(ibd.fail, ibd_fail_file)
124
+ cmd <- c(
125
+ plink,
126
+ "--threads", ncores,
127
+ "--bfile", input,
128
+ "--remove", ibd_fail_file,
129
+ "--keep-allele-order",
130
+ "--make-bed",
131
+ "--out", output
132
+ )
133
+ run_command(cmd, fg = TRUE)
134
+
135
+ if (doplot) {
136
+ set.seed(seed)
137
+ library(ComplexHeatmap)
138
+ fontsize8 <- gpar(fontsize = 8)
139
+ fontsize9 <- gpar(fontsize = 9)
140
+ ht_opt$heatmap_row_names_gp <- fontsize8
141
+ ht_opt$heatmap_column_names_gp <- fontsize8
142
+ ht_opt$legend_title_gp <- fontsize9
143
+ ht_opt$legend_labels_gp <- fontsize8
144
+ ht_opt$simple_anno_size <- unit(3, "mm")
145
+
146
+ samids <- sapply(samples, function(sid) {
147
+ fidiid <- unlist(strsplit(sid, "\t", fixed = TRUE))
148
+ gsub(
149
+ "{fid}",
150
+ fidiid[1],
151
+ gsub("{iid}", fidiid[2], samid, fixed = TRUE),
152
+ fixed = TRUE
153
+ )
154
+ })
155
+ rownames(similarity) <- samids
156
+ colnames(similarity) <- samids
157
+
158
+ andata <- NULL
159
+ column_annotation <- NULL
160
+ if (!is.null(annofile) && !isFALSE(annofile)) {
161
+ options(stringsAsFactors = TRUE)
162
+ andata <- read.table(annofile, header = TRUE, row.names = 1, sep = "\t", check.names = FALSE)
163
+ andata <- andata[samids, , drop = FALSE]
164
+ for (anname in colnames(andata)) {
165
+ column_annotation <- c(column_annotation, anname)
166
+ }
167
+ }
168
+
169
+ p <- plotthis::Heatmap(
170
+ similarity,
171
+ name = "PI_HAT",
172
+ in_form = "matrix",
173
+ cell_type = "label",
174
+ rows_data = andata,
175
+ label = function(x) ifelse (x > pihat, '*', NA),
176
+ title = paste0("(*) PI_HAT > ", pihat),
177
+ clustering_distance_rows = function(m) as.dist(1-m),
178
+ clustering_distance_columns = function(m) as.dist(1-m),
179
+ show_row_names = TRUE,
180
+ show_column_names = TRUE,
181
+ column_annotation = column_annotation
182
+ )
183
+
184
+ res <- 100
185
+ height <- attr(p, "height") * res
186
+ width <- attr(p, "width") * res
187
+ png(
188
+ filename = paste0(output, '.ibd.png'),
189
+ width = width,
190
+ height = height,
191
+ res = res
192
+ )
193
+ print(p)
194
+ dev.off()
195
+
196
+ }
@@ -0,0 +1,124 @@
1
+ from pathlib import Path
2
+ from multiprocessing import Pool
3
+ from slugify import slugify
4
+ from simpleconf import Config
5
+ from biopipen.utils.misc import logger, run_command, dict_to_cli_args
6
+
7
+ configfile: str = {{in.configfile | quote}} # pyright: ignore # noqa: E999
8
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
9
+ gtmatfile: str = {{out.gtmat | quote}} # pyright: ignore
10
+ config = Config.load(configfile)
11
+
12
+ default_nsnps = {{envs.nsnps | repr}} # pyright: ignore
13
+ default_ncases = {{envs.ncases | repr}} # pyright: ignore
14
+ default_nctrls = {{envs.nctrls | repr}} # pyright: ignore
15
+ default_plink = {{envs.plink | repr}} # pyright: ignore
16
+ default_seed = {{envs.seed | repr}} # pyright: ignore
17
+ default_label = {{envs.label | repr}} # pyright: ignore
18
+ default_prevalence = {{envs.prevalence | repr}} # pyright: ignore
19
+ default_minfreq = {{envs.minfreq | repr}} # pyright: ignore
20
+ default_maxfreq = {{envs.maxfreq | repr}} # pyright: ignore
21
+ default_hetodds = {{envs.hetodds | repr}} # pyright: ignore
22
+ default_homodds = {{envs.homodds | repr}} # pyright: ignore
23
+ default_missing = {{envs.missing | repr}} # pyright: ignore
24
+ default_args: dict = {{envs.args | repr}} # pyright: ignore
25
+ default_transpose_gtmat = {{envs.transpose_gtmat | repr}} # pyright: ignore
26
+ default_sample_prefix = {{envs.sample_prefix | repr}} # pyright: ignore
27
+
28
+ defaults = {
29
+ "nsnps": default_nsnps,
30
+ "ncases": default_ncases,
31
+ "nctrls": default_nctrls,
32
+ "plink": default_plink,
33
+ "seed": default_seed,
34
+ "label": default_label,
35
+ "prevalence": default_prevalence,
36
+ "minfreq": default_minfreq,
37
+ "maxfreq": default_maxfreq,
38
+ "hetodds": default_hetodds,
39
+ "homodds": default_homodds,
40
+ "missing": default_missing,
41
+ # "args": default_args,
42
+ "transpose_gtmat": default_transpose_gtmat,
43
+ "sample_prefix": default_sample_prefix,
44
+ }
45
+
46
def do_one_simulation(confitems):
    """Run one PLINK SNP simulation and write the genotype matrix.

    The settings in ``confitems`` override the process-level ``defaults`` and
    ``default_args``. A ``--simulate`` parameter file is written into
    ``outdir``, PLINK simulates the data set, the result is recoded with
    ``--recode A`` / ``--recode A-transpose`` and finally trimmed with
    ``cut``/``sed`` into ``gtmatfile``.

    Args:
        confitems: Mapping with the settings of this run. An optional
            ``"args"`` entry (extra PLINK command-line options) is popped
            from the mapping.
    """
    import shlex

    args = default_args.copy()
    args.update(confitems.pop("args", {}))
    confs = defaults.copy()
    confs.update(confitems)
    transpose_gtmat = confs.pop("transpose_gtmat")
    sample_prefix = confs.pop("sample_prefix")

    logger.debug(" Generating parameters file")
    params_file = Path(outdir) / "params.txt"
    params_file.write_text(
        f"{confs['nsnps']}\t{confs['label']}\t{confs['minfreq']}\t"
        f"{confs['maxfreq']}\t{confs['hetodds']}\t{confs['homodds']}\n"
    )

    if confs.get('seed') is not None:
        args["seed"] = confs['seed']

    args["simulate"] = params_file
    args["out"] = Path(outdir) / "sim_snps"
    args["simulate-ncases"] = confs['ncases']
    args["simulate-ncontrols"] = confs['nctrls']
    args["simulate-prevalence"] = confs['prevalence']
    args["simulate-missing"] = confs['missing']

    cmd = [confs['plink']] + dict_to_cli_args(args)

    logger.debug(" Running PLINK simulation ...")
    run_command(cmd, fg=True)

    # Recode the simulated data into a tab-delimited genotype matrix.
    # "A-transpose" output (.traw) looks like:
    # CHR  SNP    (C)M  POS  COUNTED  ALT  per0_per0  per1_per1  per2_per2
    # 1    SNP_0  0     1    D        d    1          0          1
    # 1    SNP_1  0     2    d        D    0          1          0
    recoded_prefix = gtmatfile + ".plink.recoded"
    cmd = [
        confs['plink'],
        "--recode",
        "A" if transpose_gtmat else "A-transpose",
        "tab",
        "--bfile",
        args["out"],
        "--out",
        recoded_prefix,
    ]
    logger.debug("- Recoding into genotype matrix ...")
    run_command(cmd, fg=True)

    logger.debug(" Saving genotype matrix ...")
    # transpose_gtmat = False (from .traw: SNP, COUNTED, then the samples):
    # SNP_COUNTED  per0_per0  per1_per1  per2_per2
    # SNP_0_D      1          0          1
    # transpose_gtmat = True (from .raw: FID, IID, then the SNPs):
    # FID_IID      SNP_0_D    SNP_1_D    SNP_2_D
    # per0_per0    0          1          1
    if transpose_gtmat:
        recoded_file = recoded_prefix + ".raw"
        columns = "1,2,7-"
    else:
        recoded_file = recoded_prefix + ".traw"
        columns = "2,5,7-"

    # The pipeline is handed to the shell as one string, so quote the file
    # paths; otherwise a path with spaces or shell metacharacters breaks it.
    # The first sed joins the two leading columns with "_".
    cmd = f"cut -f{columns} {shlex.quote(recoded_file)} | sed 's/\\t/_/'"

    if sample_prefix:
        # NOTE(review): sample_prefix is spliced into the sed expression
        # verbatim; a prefix containing "/" or "'" would break it.
        cmd = f"{cmd} | sed 's/per[0-9]\\+_per/{sample_prefix}/g'"

    cmd = f"{cmd} > {shlex.quote(gtmatfile)}"
    run_command(cmd, fg=True)
122
+
123
+
124
+ do_one_simulation(config)
@@ -0,0 +1,124 @@
1
+ from pathlib import Path
2
+ from biopipen.utils.misc import run_command, dict_to_cli_args, logger
3
+
4
+ indir: str = {{in.indir | quote}} # pyright: ignore # noqa: #999
5
+ namefile: str = {{in.namefile | quote}} # pyright: ignore
6
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
7
+ plink = {{envs.plink | repr}} # pyright: ignore
8
+ bcftools = {{envs.bcftools | repr}} # pyright: ignore
9
+ ncores = {{envs.ncores | repr}} # pyright: ignore
10
+ match_alt = {{envs.match_alt | repr}} # pyright: ignore
11
+
12
+ bedfile = list(Path(indir).glob("*.bed"))
13
+ if len(bedfile) == 0:
14
+ raise FileNotFoundError(f"No .bed file found in `in.indir`")
15
+ elif len(bedfile) > 1:
16
+ logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
17
+
18
+ bedfile = bedfile[0]
19
+ input = bedfile.with_suffix("")
20
+ output = Path(outdir) / bedfile.stem
21
+
22
+ if namefile.endswith(".vcf") or namefile.endswith(".vcf.gz"):
23
+ logger.info("VCF file received, extracting names")
24
def alt_matched(bim_alt, vcf_alt, match_alt):
    """Decide whether the ALT allele(s) of a .bim record and a VCF record agree.

    Args:
        bim_alt: ALT field of the .bim record (comma-separated if several).
        vcf_alt: ALT field of the VCF record (comma-separated if several).
        match_alt: Matching mode:
            "none" - always a match;
            "exact" - the raw strings are equal;
            "all" - both sides hold the same set of alleles;
            "any" - the sides share at least one allele;
            "first_included" - the first .bim allele is among the VCF alleles;
            "first" - the first alleles of both sides are equal.

    Returns:
        True if the alleles match under the given mode, otherwise False.

    Raises:
        ValueError: If `match_alt` is not one of the modes above.
    """
    # Modes that work on the raw strings need no splitting.
    if match_alt == "none":
        return True
    if match_alt == "exact":
        return bim_alt == vcf_alt

    bim_alleles = bim_alt.split(",")
    vcf_alleles = vcf_alt.split(",")
    comparators = {
        "all": lambda: set(bim_alleles) == set(vcf_alleles),
        "any": lambda: not set(bim_alleles).isdisjoint(vcf_alleles),
        "first_included": lambda: bim_alleles[0] in vcf_alleles,
        "first": lambda: bim_alleles[0] == vcf_alleles[0],
    }
    try:
        compare = comparators[match_alt]
    except (KeyError, TypeError):
        raise ValueError(f"Unknown match_alt: {match_alt}") from None
    return compare()
42
+
43
def readline(f):
    """Read the next non-blank record from a tab-delimited text file.

    Blank lines are skipped instead of being mistaken for the end of the
    file, and tabs at either end of a line are preserved so that an empty
    first or last field does not shift the remaining columns. Surrounding
    spaces and the line terminator are still removed.

    Args:
        f: Text file object opened for reading.

    Returns:
        The list of tab-separated fields of the next non-blank line, or
        None once the end of the file is reached.
    """
    # readline() returns "" only at EOF; a blank line is "\n".
    for raw in iter(f.readline, ""):
        if raw.strip():
            return raw.strip(" \r\n").split("\t")
    return None
46
+
47
+ namefile_tmp = Path(outdir) / "_namefile_from_vcf.txt"
48
+ infofile = Path(outdir) / "_information_from_vcf_unsorted.txt"
49
+ sorted_infofile = Path(outdir) / "_information_from_vcf_sorted.txt"
50
+ sorted_bim = Path(outdir) / "_sorted_bim.txt"
51
+ bt_cmd = [
52
+ bcftools, "query",
53
+ "-f", "%CHROM\\t%ID\\t0\\t%POS\\t%ALT\\t%REF\\n",
54
+ "-o", infofile,
55
+ namefile,
56
+ ]
57
+ ## infofile
58
+ # 1 rs10492 0 10492 T C
59
+ logger.info("- Extracting information from VCF file ...")
60
+ run_command(bt_cmd, fg=True)
61
+ # sort infofile
62
+ logger.info("- Sorting the information from VCF file ...")
63
+ run_command(
64
+ [
65
+ "sort",
66
+ "-k1,1", "-k4,4n", "-k6,6",
67
+ infofile,
68
+ "--parallel", ncores,
69
+ "-o", sorted_infofile
70
+ ],
71
+ env={"LC_ALL": "C"},
72
+ fg=True,
73
+ )
74
+
75
+ ## .bim file
76
+ # 1 1_10492 0 10492 T C
77
+ # sort .bim file
78
+ logger.info("- Sorting the .bim file ...")
79
+ run_command(
80
+ [
81
+ "sort",
82
+ "-k1,1", "-k4,4n", "-k6,6",
83
+ input.with_suffix(".bim"),
84
+ "--parallel", ncores,
85
+ "-o", sorted_bim
86
+ ],
87
+ env={"LC_ALL": "C"},
88
+ fg=True,
89
+ )
90
+ # query namefile for records in sorted bim file
91
+ logger.info("- Matching and generating the name file ...")
92
def _record_key(rec):
    # Join key (chromosome, position, REF). It must order records exactly as
    # `sort -k1,1 -k4,4n -k6,6` did above: the position was sorted
    # numerically, so it has to be compared as an integer. Comparing it as a
    # string (e.g. "9" > "10") advances the wrong file and drops matches.
    return (rec[0], int(rec[3]), rec[5])


# Merge-join the sorted .bim records with the sorted VCF records and write
# "<bim variant id>\t<VCF variant id>" for every matching pair.
with sorted_bim.open() as fbim, sorted_infofile.open() as finfo, namefile_tmp.open("w") as fout:  # noqa: E501
    bim = readline(fbim)
    info = readline(finfo)
    while bim and info:
        bim_key = _record_key(bim)
        info_key = _record_key(info)
        if bim_key == info_key and alt_matched(bim[4], info[4], match_alt):
            fout.write(f"{bim[1]}\t{info[1]}\n")
            bim = readline(fbim)
            info = readline(finfo)
        elif bim_key < info_key:
            bim = readline(fbim)
        else:
            # Either the VCF record sorts first, or the keys are equal but
            # the ALT alleles do not match: try the next VCF record.
            info = readline(finfo)
113
+
114
+ namefile = str(namefile_tmp)
115
+
116
+ args = {
117
+ "": plink,
118
+ "bfile": input,
119
+ "out": output,
120
+ "make_bed": True,
121
+ "update_name": namefile,
122
+ }
123
+
124
+ run_command(dict_to_cli_args(args, dashify=True), fg=True)
@@ -0,0 +1,146 @@
1
+ library(rlang)
2
+ library(dplyr)
3
+ library(biopipen.utils)
4
+
5
+ infile <- {{in.infile | r}}
6
+ groupfile <- {{in.groupfile | r}}
7
+ fmlfile <- {{in.fmlfile | r}}
8
+ outfile <- {{out.outfile | r}}
9
+ padj <- {{envs.padj | r}}
10
+ transpose_input <- {{envs.transpose_input | r}}
11
+ transpose_group <- {{envs.transpose_group | r}}
12
+
13
+ log <- get_logger()
14
+
15
+ log$info("Reading input files ...")
16
# Input table; row names come from the first column.
indata <- read.table(infile, header = TRUE, sep = "\t", row.names = 1, check.names = FALSE)
if (transpose_input) {
    # t() returns a matrix, but lm(data = ) and split() used later need a
    # data frame, so convert back after transposing.
    indata <- as.data.frame(t(indata), stringsAsFactors = FALSE)
}
# Grouping table; row names come from the first column.
groupdata <- read.table(groupfile, header = TRUE, sep = "\t", row.names = 1, check.names = FALSE)
if (transpose_group) {
    # Keep a data frame here as well: unique(unlist()) on a matrix would
    # de-duplicate whole rows instead of collecting the group labels.
    groupdata <- as.data.frame(t(groupdata), stringsAsFactors = FALSE)
}
# All group labels found in any grouping column (NA excluded).
allgroups = na.omit(unique(unlist(groupdata)))
25
+
26
+ fmldata <- read.table(fmlfile, header = TRUE, sep = "\t", row.names = NULL, check.names = FALSE)
27
+ colnames(fmldata)[1:2] <- c("Group", "Formula")
28
+
29
# Chow test for one formula and one grouping column.
#
# Fits the formula on all rows of `indata` (pooled model) and separately on
# the rows of each group defined by column `grouping` of `groupdata`, then
# compares the pooled residual sum of squares with the sum over the groups.
#
# @param fml Model formula as a string.
# @param grouping Name of the column in `groupdata` that defines the groups.
# @return A list with the pooled fit (`pooled.lm`), the per-group fits
#   (`group.lms`), the F statistic (`Fstat`), the grouping (`group`), the
#   pooled and summed group SSR (`pooled.ssr`, `group.ssr`) and the p-value
#   (`Pval`). If the pooled fit fails, the numeric entries are NA and
#   `group.lms` is NULL; if any group fit fails, `group.ssr`, `Fstat` and
#   `Pval` are NA.
chow.test <- function(fml, grouping) {
    formula <- as.formula(fml)
    pooled_lm <- tryCatch(lm(formula, data = indata), error = function(e) NULL)
    if (is.null(pooled_lm)) {
        return(list(
            pooled.lm = NA,
            group.lms = NULL,
            Fstat = NA,
            group = grouping,
            pooled.ssr = NA,
            group.ssr = NA,
            Pval = NA
        ))
    }

    splitdata <- split(indata, groupdata[rownames(indata), grouping])
    group_lms <- lapply(names(splitdata), function(g) {
        tryCatch(lm(formula, data = splitdata[[g]]), error = function(e) NULL)
    })
    names(group_lms) <- names(splitdata)

    fmvars <- all.vars(formula)
    pooled.ssr <- sum(pooled_lm$residuals ^ 2)
    # Test each fit, not the list itself: a failed group fit is a NULL
    # element, which would otherwise silently contribute an SSR of 0.
    fit_failed <- vapply(group_lms, is.null, logical(1))
    subssr <- if (any(fit_failed)) {
        NA
    } else {
        sum(vapply(group_lms, function(x) sum(x$residuals ^ 2), numeric(1)))
    }
    ngroups <- length(splitdata)
    # Number of parameters per model: with "y ~ ." every column other than
    # the response is a regressor, plus the intercept.
    K <- ifelse(fmvars[2] == ".", ncol(indata), length(fmvars))
    J <- (ngroups - 1) * K
    DF <- nrow(indata) - ngroups * K
    FS <- (pooled.ssr - subssr) * DF / subssr / J
    list(
        pooled.lm = pooled_lm,
        group.lms = group_lms,
        Fstat = FS,
        group = grouping,
        pooled.ssr = pooled.ssr,
        group.ssr = subssr,
        Pval = pf(FS, J, DF, lower.tail = FALSE)
    )
}
68
+
69
# Format a fitted linear model as a one-line "term=value" summary.
#
# @param m An `lm` fit, or (when `g` is given) a named list of `lm` fits.
# @param g Optional name of the element of `m` to format.
# @param type "coeff" reports the coefficients (rounded to 3 digits),
#   "pval" reports the coefficient p-values (4 significant digits).
# @return A string such as "x=1.2, _=0.5, N=30", where "_" stands for the
#   intercept and N is the number of rows used in the fit. NA is returned
#   when the requested fit is missing or is not an `lm` object (e.g. the NA
#   placeholder of a failed fit).
formatlm <- function(m, g = NULL, type = "coeff") {
    if (!is.null(g)) {
        gm <- m[[as.character(g)]]
        if (is.null(gm)) {
            return(NA)
        }
        return(formatlm(gm, type = type))
    }
    # A failed pooled fit is passed in as NA; `$` on it would raise an error.
    if (!inherits(m, "lm")) {
        return(NA)
    }

    vars <- all.vars(m$terms)
    keys <- na.omit(c(vars[2:length(vars)], '(Intercept)', 'N'))
    if (type == "pval") {
        coefs <- summary(m)$coefficients
        # Look the term up by row name. Indexing a missing row would yield NA
        # rather than NULL, so the back-quoted fallback would never be tried
        # and absent terms would be reported as "term=NA".
        lookup <- function(x) {
            key <- if (x %in% rownames(coefs)) x else bQuote(x)
            if (!key %in% rownames(coefs)) {
                return(NULL)
            }
            signif(coefs[key, 4], digits = 4)
        }
    } else {
        coeff <- as.list(m$coefficients)
        lookup <- function(x) {
            ce <- coeff[[x]] %||% coeff[[bQuote(x)]]
            if (is.null(ce)) NULL else round(ce, 3)
        }
    }

    terms <- unlist(sapply(keys, function(x) {
        if (x == 'N') {
            return(paste0('N=', nrow(m$model)))
        }
        value <- lookup(x)
        if (is.null(value)) {
            # Terms without a coefficient of that name are left out.
            return(NULL)
        }
        l <- ifelse(x == '(Intercept)', '_', x)
        paste0(l, '=', value)
    }))
    paste(terms, collapse = ', ')
}
108
+
109
+ log$info("Running Chow tests ...")
110
+ ncases <- nrow(fmldata)
111
+ results <- do_call(rbind, lapply(
112
+ seq_len(ncases),
113
+ function(i) {
114
+ fmlrow <- fmldata[i, , drop=TRUE]
115
+ if (i %% 100 == 0) {
116
+ log$info("- {i} / {ncases} ...")
117
+ }
118
+ log$debug(" Running Chow test for formula: {fmlrow$Formula} (grouping = {fmlrow$Group})")
119
+
120
+ res <- chow.test(fmlrow$Formula, fmlrow$Group)
121
+ fmlrow$Pooled_Coef <- formatlm(res$pooled.lm)
122
+ for (g in allgroups) {
123
+ fmlrow[[paste0("Group_", g, "_Coef")]] <- formatlm(res$group.lms, g)
124
+ }
125
+ # fmlrow$Groups <- formatlm(res$group.lms)
126
+ fmlrow$Pooled_Pval <- formatlm(res$pooled.lm, type="pval")
127
+ for (g in allgroups) {
128
+ fmlrow[[paste0("Group_", g, "_Pval")]] <- formatlm(res$group.lms, g, type="pval")
129
+ }
130
+ fmlrow$SSR <- res$group.ssr
131
+ fmlrow$SumSSR <- res$pooled.ssr
132
+ fmlrow$Fstat <- res$Fstat
133
+ fmlrow$Pval <- res$Pval
134
+ fmlrow
135
+ }
136
+ )) %>% as.data.frame()
137
+
138
+ if (padj != "none") {
139
+ log$info("Adjusting p-values ...")
140
+ results$Padj <- p.adjust(results$Pval, method = padj)
141
+ }
142
+
143
+ log$info("Writing output ...")
144
# The rows were bound together from lists, so the columns of `results` can be
# list columns, which write.table() cannot encode ("unimplemented type 'list'
# in 'EncodeElement'"). Flatten every column to character. This is done
# column by column because apply(results, 2, as.character) collapses a
# one-row table into a plain vector, which produced a malformed output file
# when only a single formula was tested.
results[] <- lapply(results, as.character)
write.table(results, file = outfile, sep = "\t", quote = FALSE, row.names = FALSE)