biopipen 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (134)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +6 -0
  3. biopipen/core/filters.py +77 -26
  4. biopipen/core/testing.py +6 -1
  5. biopipen/ns/bam.py +39 -0
  6. biopipen/ns/cellranger.py +5 -0
  7. biopipen/ns/cellranger_pipeline.py +2 -2
  8. biopipen/ns/cnvkit_pipeline.py +4 -1
  9. biopipen/ns/delim.py +33 -27
  10. biopipen/ns/protein.py +99 -0
  11. biopipen/ns/scrna.py +411 -250
  12. biopipen/ns/snp.py +16 -3
  13. biopipen/ns/tcr.py +125 -1
  14. biopipen/ns/vcf.py +34 -0
  15. biopipen/ns/web.py +5 -1
  16. biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
  17. biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
  18. biopipen/reports/tcr/ClonalStats.svelte +15 -0
  19. biopipen/reports/utils/misc.liq +22 -7
  20. biopipen/scripts/bam/BamMerge.py +2 -2
  21. biopipen/scripts/bam/BamSampling.py +4 -4
  22. biopipen/scripts/bam/BamSort.py +141 -0
  23. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  24. biopipen/scripts/bam/BamSubsetByBed.py +3 -3
  25. biopipen/scripts/bam/CNVpytor.py +10 -10
  26. biopipen/scripts/bam/ControlFREEC.py +11 -11
  27. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  28. biopipen/scripts/bed/BedConsensus.py +5 -5
  29. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  30. biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
  31. biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
  32. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  33. biopipen/scripts/cellranger/CellRangerCount.py +20 -9
  34. biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
  35. biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
  36. biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
  37. biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
  38. biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
  39. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
  41. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  42. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  43. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
  44. biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
  45. biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
  46. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  47. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  48. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  49. biopipen/scripts/delim/SampleInfo.R +85 -139
  50. biopipen/scripts/misc/Config2File.py +2 -2
  51. biopipen/scripts/misc/Str2File.py +2 -2
  52. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  53. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  54. biopipen/scripts/protein/Prodigy.py +4 -4
  55. biopipen/scripts/protein/RMSD.py +178 -0
  56. biopipen/scripts/regulatory/MotifScan.py +8 -8
  57. biopipen/scripts/scrna/CellCellCommunication.py +59 -22
  58. biopipen/scripts/scrna/CellsDistribution.R +31 -6
  59. biopipen/scripts/scrna/MarkersFinder.R +272 -602
  60. biopipen/scripts/scrna/MetaMarkers.R +16 -7
  61. biopipen/scripts/scrna/RadarPlots.R +75 -35
  62. biopipen/scripts/scrna/SCP-plot.R +15202 -0
  63. biopipen/scripts/scrna/ScVelo.py +0 -0
  64. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -25
  65. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -47
  66. biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -385
  67. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +33 -13
  68. biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -228
  69. biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
  70. biopipen/scripts/scrna/SeuratMap2Ref.R +16 -6
  71. biopipen/scripts/scrna/SeuratPreparing.R +138 -81
  72. biopipen/scripts/scrna/SlingShot.R +71 -0
  73. biopipen/scripts/scrna/TopExpressingGenes.R +9 -7
  74. biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
  75. biopipen/scripts/snp/Plink2GTMat.py +26 -11
  76. biopipen/scripts/snp/PlinkFilter.py +7 -7
  77. biopipen/scripts/snp/PlinkFromVcf.py +8 -5
  78. biopipen/scripts/snp/PlinkSimulation.py +4 -4
  79. biopipen/scripts/snp/PlinkUpdateName.py +4 -4
  80. biopipen/scripts/stats/ChowTest.R +48 -22
  81. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  82. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  83. biopipen/scripts/tcr/CDR3AAPhyschem.R +12 -2
  84. biopipen/scripts/tcr/ClonalStats.R +484 -0
  85. biopipen/scripts/tcr/CloneResidency.R +23 -5
  86. biopipen/scripts/tcr/Immunarch-basic.R +8 -1
  87. biopipen/scripts/tcr/Immunarch-clonality.R +5 -0
  88. biopipen/scripts/tcr/Immunarch-diversity.R +25 -4
  89. biopipen/scripts/tcr/Immunarch-geneusage.R +15 -1
  90. biopipen/scripts/tcr/Immunarch-kmer.R +14 -1
  91. biopipen/scripts/tcr/Immunarch-overlap.R +15 -1
  92. biopipen/scripts/tcr/Immunarch-spectratyping.R +10 -1
  93. biopipen/scripts/tcr/Immunarch-tracking.R +6 -0
  94. biopipen/scripts/tcr/Immunarch-vjjunc.R +33 -0
  95. biopipen/scripts/tcr/ScRepLoading.R +127 -0
  96. biopipen/scripts/tcr/TCRClusterStats.R +24 -7
  97. biopipen/scripts/tcr/TCRDock.py +10 -6
  98. biopipen/scripts/tcr/TESSA.R +6 -1
  99. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  100. biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
  101. biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
  102. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  103. biopipen/scripts/vcf/BcftoolsSort.py +4 -4
  104. biopipen/scripts/vcf/BcftoolsView.py +5 -5
  105. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  106. biopipen/scripts/vcf/VcfAnno.py +11 -11
  107. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  108. biopipen/scripts/vcf/VcfFilter.py +5 -5
  109. biopipen/scripts/vcf/VcfFix.py +7 -7
  110. biopipen/scripts/vcf/VcfFix_utils.py +12 -3
  111. biopipen/scripts/vcf/VcfIndex.py +3 -3
  112. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  113. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  114. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  115. biopipen/scripts/vcf/bcftools_utils.py +3 -3
  116. biopipen/scripts/web/Download.py +8 -4
  117. biopipen/scripts/web/DownloadList.py +5 -5
  118. biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
  119. biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
  120. biopipen/scripts/web/gcloud_common.py +1 -1
  121. biopipen/utils/gsea.R +96 -42
  122. biopipen/utils/misc.R +205 -7
  123. biopipen/utils/misc.py +17 -8
  124. biopipen/utils/plot.R +53 -17
  125. biopipen/utils/reference.py +11 -11
  126. biopipen/utils/repr.R +146 -0
  127. biopipen/utils/vcf.py +1 -1
  128. {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/METADATA +9 -9
  129. {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/RECORD +131 -122
  130. {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/WHEEL +1 -1
  131. biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -139
  132. biopipen/scripts/scrna/SeuratPreparing-common.R +0 -452
  133. biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -201
  134. {biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/entry_points.txt +0 -0
@@ -1,12 +1,9 @@
1
- {{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
2
- {{ biopipen_dir | joinpaths: "utils", "caching.R" | source_r }}
3
-
4
1
  library(Seurat)
5
2
  library(future)
6
3
  library(bracer)
7
- library(ggplot2)
8
4
  library(dplyr)
9
- # library(tidyseurat)
5
+ library(glue)
6
+ library(biopipen.utils)
10
7
 
11
8
  metafile <- {{in.metafile | quote}}
12
9
  rdsfile <- {{out.rdsfile | quote}}
@@ -14,10 +11,9 @@ joboutdir <- {{job.outdir | quote}}
14
11
  envs <- {{envs | r: todot = "-", skip = 1}}
15
12
 
16
13
  if (isTRUE(envs$cache)) { envs$cache <- joboutdir }
17
- if (length(envs$cache) > 1) {
18
- log_warn("Multiple cache directories (envs.cache) detected, using the first one.")
19
- envs$cache <- envs$cache[1]
20
- }
14
+
15
+ log <- get_logger()
16
+ reporter <- get_reporter()
21
17
 
22
18
  set.seed(8525)
23
19
  # 8TB
@@ -26,15 +22,15 @@ options(future.rng.onMisuse="ignore")
26
22
  options(Seurat.object.assay.version = "v5")
27
23
  plan(strategy = "multicore", workers = envs$ncores)
28
24
 
29
- {{ biopipen_dir | joinpaths: "scripts", "scrna", "SeuratPreparing-common.R" | source_r }}
30
-
31
- add_report(
25
+ reporter$add(
32
26
  list(
33
27
  kind = "descr",
34
28
  name = "Filters applied",
35
29
  content = paste0(
36
30
  "<p>Cell filters: ", html_escape(envs$cell_qc), "</p>",
37
- "<p>Gene filters: ", html_escape(stringify_list(envs$gene_qc)), "</p>"
31
+ "<p>Gene filters: </p>",
32
+ "<p>- Min Cells: ", envs$gene_qc$min_cells, "</p>",
33
+ "<p>- Excludes: ", html_escape(envs$gene_qc$excludes %||% "Not set"), "</p>"
38
34
  )
39
35
  ),
40
36
  h1 = "Filters and QC"
@@ -48,16 +44,6 @@ metadata <- read.table(
48
44
  check.names = FALSE
49
45
  )
50
46
 
51
- cache_sig <- capture.output(str(metadata))
52
- dig_sig <- digest::digest(cache_sig, algo = "md5")
53
- dig_sig <- substr(dig_sig, 1, 8)
54
- cache_dir <- NULL
55
- if (is.character(envs$cache)) {
56
- cache_dir <- file.path(envs$cache, paste0(dig_sig, ".seuratpreparing_cache"))
57
- dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
58
- writeLines(cache_sig, file.path(cache_dir, "signature.txt"))
59
- }
60
-
61
47
  meta_cols = colnames(metadata)
62
48
  if (!"Sample" %in% meta_cols) {
63
49
  stop("Error: Column `Sample` is not found in metafile.")
@@ -66,77 +52,148 @@ if (!"RNAData" %in% meta_cols) {
66
52
  stop("Error: Column `RNAData` is not found in metafile.")
67
53
  }
68
54
 
69
- samples = as.character(metadata$Sample)
70
-
71
- # used for plotting
72
- cell_qc_df = NULL
73
-
74
- plotsdir = file.path(joboutdir, "plots")
75
- dir.create(plotsdir, showWarnings = FALSE, recursive = TRUE)
76
-
77
- # features for cell QC
78
- feats = c(
79
- "nFeature_RNA", "nCount_RNA",
80
- "percent.mt", "percent.ribo", "percent.hb", "percent.plat"
81
- )
82
-
83
- sobj <- run_cell_qc(sobj)
84
-
85
- # plot and report the QC
86
- log_info("Plotting and reporting QC ...")
87
- dim_df = report_cell_qc(nrow(sobj))
88
-
89
- if (is.list(envs$gene_qc)) {
90
- sobj <- run_gene_qc(sobj)
91
- }
92
-
93
- dim_df = rbind(
94
- dim_df,
95
- data.frame(
96
- when = "After_Gene_QC",
97
- nCells = ncol(sobj),
98
- nGenes = nrow(sobj)
99
- )
55
+ qcdir = file.path(joboutdir, "qc")
56
+ dir.create(qcdir, showWarnings = FALSE, recursive = TRUE)
57
+
58
+ sobj <- LoadSeuratAndPerformQC(
59
+ metadata,
60
+ per_sample_qc = envs$cell_qc_per_sample,
61
+ cell_qc = envs$cell_qc,
62
+ gene_qc = envs$gene_qc,
63
+ tmpdir = joboutdir,
64
+ log = log,
65
+ cache = envs$cache)
66
+
67
+ log$info("Saving dimension table ...")
68
+ dim_df <- data.frame(
69
+ when = c("Before QC", "After QC"),
70
+ nCells = c(nrow(sobj@misc$cell_qc_df), sum(sobj@misc$cell_qc_df$.QC)),
71
+ nGenes = c(sobj@misc$gene_qc$before, sobj@misc$gene_qc$after)
100
72
  )
101
-
102
- log_info("Saving dimension table ...")
103
- write.table(dim_df, file = file.path(plotsdir, "dim.txt"),
73
+ write.table(dim_df, file = file.path(qcdir, "dim.txt"),
104
74
  row.names = FALSE, quote = FALSE, sep = "\t")
105
75
 
106
- add_report(
76
+ reporter$add(
107
77
  list(
108
78
  kind = "descr",
109
- content = paste(
110
- "The dimension table for the Seurat object. The table contains the number of cells and genes before and after QC."
111
- )
79
+ content = "The dimension table for the Seurat object. The table contains the number of cells and genes before and after QC. Note that the cell QC is performed before gene QC."
112
80
  ),
113
81
  list(
114
82
  kind = "table",
115
- data = list(path = file.path(plotsdir, "dim.txt"))
83
+ data = list(path = file.path(qcdir, "dim.txt"))
116
84
  ),
117
- h1 = "Filters and QC"
85
+ h1 = "Filters and QC",
86
+ h2 = "Dimension table"
118
87
  )
119
88
 
120
- sobj <- run_transformation(sobj)
121
- sobj <- run_integration(sobj)
89
+ log$info("Visualizing QC metrics ...")
90
+ for (pname in names(envs$qc_plots)) {
91
+ args <- envs$qc_plots[[pname]]
92
+ args$kind <- args$kind %||% "cell"
93
+ args$devpars <- args$devpars %||% list()
94
+ args$more_formats <- args$more_formats %||% character()
95
+ args$save_code <- args$save_code %||% FALSE
96
+ extract_vars(args, "kind", "devpars", "more_formats", "save_code")
97
+ if (kind == "gene") kind <- "gene_qc"
98
+ if (kind == "cell") kind <- "cell_qc"
99
+ args$object <- sobj
100
+ plot_fn <- if (kind == "cell_qc") {
101
+ gglogger::register(VizSeuratCellQC)
102
+ } else {
103
+ gglogger::register(VizSeuratGeneQC)
104
+ }
105
+ p <- do_call(plot_fn, args)
106
+ prefix <- file.path(qcdir, paste0(slugify(pname), "_", kind))
107
+ save_plot(p, prefix, devpars, formats = c("png", more_formats))
108
+ if (save_code) {
109
+ save_plotcode(p, prefix,
110
+ setup = c("library(biopipen.utils)", "load('data.RData')", "invisible(list2env('args'))"),
111
+ "args",
112
+ auto_data_setup = FALSE)
113
+ }
114
+ reporter$add(
115
+ reporter$image(prefix, more_formats, save_code, kind = "image"),
116
+ h1 = "Filters and QC",
117
+ h2 = html_escape(pname)
118
+ )
119
+ }
120
+
121
+ sobj <- RunSeuratTransformation(
122
+ sobj,
123
+ use_sct = envs$use_sct,
124
+ SCTransformArgs = envs$SCTransform,
125
+ NormalizeDataArgs = envs$NormalizeData,
126
+ FindVariableFeaturesArgs = envs$FindVariableFeatures,
127
+ ScaleDataArgs = envs$ScaleData,
128
+ RunPCAArgs = envs$RunPCA,
129
+ log = log,
130
+ cache = envs$cache
131
+ )
132
+ sobj <- RunSeuratIntegration(
133
+ sobj,
134
+ no_integration = envs$no_integration,
135
+ IntegrateLayersArgs = envs$IntegrateLayers,
136
+ log = log,
137
+ cache = envs$cache
138
+ )
122
139
 
123
140
  # This is the last step, doesn't need to be cached
124
- if (!is.null(envs$doublet_detector) && envs$doublet_detector != "none") {
125
- {{* biopipen_dir | joinpaths: "scripts", "scrna", "SeuratPreparing-doublet_detection.R" | source_r }}
126
-
127
- detector <- tolower(envs$doublet_detector)
128
- if (detector == "doubletfinder") detector <- "DoubletFinder"
129
- if (detector == "scdblfinder") detector <- "scDblFinder"
130
- dd <- run_dd(detector)
131
- save_dd(dd, detector)
132
- sobj <- add_dd_to_seurat(sobj, dd)
133
- plot_dd(sobj, dd, detector)
134
- sobj <- filter_dd(sobj, dd, detector)
135
- report_dd(detector)
136
- }
141
+ if (!identical(envs$doublet_detector, "none")) {
142
+ dbldir <- file.path(joboutdir, "doublets")
143
+ dir.create(dbldir, showWarnings = FALSE, recursive = TRUE)
144
+
145
+ sobj <- RunSeuratDoubletDetection(
146
+ sobj,
147
+ tool = envs$doublet_detector,
148
+ DoubletFinderArgs = envs$DoubletFinder,
149
+ scDblFinderArgs = envs$scDblFinder,
150
+ filter = FALSE,
151
+ log = log,
152
+ cache = envs$cache
153
+ )
137
154
 
155
+ log$info("Visualizing doublet detection results ...")
156
+ if (identical(tolower(envs$doublet_detector), "doubletfinder")) {
157
+ p <- VizSeuratDoublets(sobj, plot_type = "pK", x_text_angle = 90)
158
+ save_plot(
159
+ p, file.path(dbldir, "doubletfinder_pk"),
160
+ devpars = list(res = 100, width = 800, height = 600),
161
+ formats = "png")
162
+ reporter$add(
163
+ list(
164
+ kind = "descr",
165
+ content = paste(
166
+ "The pK plot from DoubletFinder to select the optimal pK value.",
167
+ "See more at https://github.com/chris-mcginnis-ucsf/DoubletFinder"
168
+ )
169
+ ),
170
+ list(
171
+ kind = "image",
172
+ src = file.path(dbldir, "doubletfinder_pk.png")
173
+ ),
174
+ h1 = glue("Doublet detection using {envs$doublet_detector}"),
175
+ h2 = "BC metric vs pK"
176
+ )
177
+ }
178
+
179
+ for (pt in c("dim", "pie")) {
180
+ p <- VizSeuratDoublets(sobj, plot_type = pt)
181
+ save_plot(p, file.path(dbldir, paste0("doublets_", pt)), formats = "png")
182
+
183
+ reporter$add(
184
+ list(
185
+ src = file.path(dbldir, paste0("doublets_", pt, ".png")),
186
+ descr = ifelse(pt == "dim", "Dimention Reduction Plot", "Pie Chart")
187
+ ),
188
+ h1 = glue("Doublet detection using {envs$doublet_detector}"),
189
+ h2 = "Doublets distribution",
190
+ ui = "table_of_images"
191
+ )
192
+ }
138
193
 
139
- log_info("Saving QC'ed seurat object ...")
140
- saveRDS(sobj, rdsfile)
194
+ sobj <- subset(sobj, subset = !!sym(paste0(sobj@misc$doublets$tool, "_DropletType")) != "doublet")
195
+ }
141
196
 
142
- save_report(joboutdir)
197
+ log$info("Saving QC'ed seurat object ...")
198
+ reporter$save(joboutdir)
199
+ saveRDS(sobj, rdsfile)
@@ -0,0 +1,71 @@
1
+ {{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
2
+
3
+ library(rlang)
4
+ library(Seurat)
5
+ library(slingshot)
6
+
7
+ sobjfile <- {{in.sobjfile | r}}
8
+ outfile <- {{out.outfile | r}}
9
+ group_by <- {{envs.group_by | r}}
10
+ reduction <- {{envs.reduction | r}}
11
+ dims <- {{envs.dims | r}}
12
+ start <- {{envs.start | r}}
13
+ end <- {{envs.end | r}}
14
+ prefix <- {{envs.prefix | r}}
15
+ reverse <- {{envs.reverse | r}}
16
+ align_start <- {{envs.align_start | r}}
17
+ seed <- {{envs.seed | r}}
18
+
19
+ set.seed(seed)
20
+ if (is.null(group_by)) {
21
+ stop("envs.group_by is required")
22
+ }
23
+
24
+ log_info("Reading Seurat object ...")
25
+ srt <- readRDS(sobjfile)
26
+
27
+ if (!group_by %in% colnames(srt@meta.data)) {
28
+ stop(paste("Grouping column", group_by, "not found in the Seurat object"))
29
+ }
30
+
31
+ reduction <- reduction %||% DefaultDimReduc(srt)
32
+ dims <- expand_dims(dims)
33
+
34
+ if (is.null(prefix)) {
35
+ prefix <- ""
36
+ } else {
37
+ prefix <- paste0(prefix, "_")
38
+ }
39
+
40
+ log_info("Filtering cells in NA group_by ...")
41
+ srt_sub <- srt[, !is.na(srt[[group_by, drop = TRUE]])]
42
+
43
+ log_info("Running Slingshot ...")
44
+ sl <- slingshot(
45
+ data = as.data.frame(srt_sub[[reduction]]@cell.embeddings[, dims]),
46
+ clusterLabels = as.character(srt_sub[[group_by, drop = TRUE]]),
47
+ start.clus = start, end.clus = end
48
+ )
49
+
50
+ command <- pbmc_small@commands[[1]]
51
+ attr(command, "name") <- "SlingShot"
52
+ attr(command, "call.string") <- "slingshot(...)"
53
+ attr(command, "params") <- list()
54
+ srt@commands <- srt@commands %||% list()
55
+ srt@commands$Slingshot <- command
56
+
57
+ df <- as.data.frame(slingPseudotime(sl))
58
+ colnames(df) <- paste0(prefix, colnames(df))
59
+ if (isTRUE(reverse)) {
60
+ if (isTRUE(align_start)) {
61
+ df <- apply(df, 2, function(x) max(x, na.rm = TRUE) - x)
62
+ } else {
63
+ df <- max(df, na.rm = TRUE) - df
64
+ }
65
+ }
66
+
67
+ srt <- AddMetaData(srt, metadata = df)
68
+ srt <- AddMetaData(srt, metadata = slingBranchID(sl), col.name = paste0(prefix, "BranchID"))
69
+
70
+ log_info("Saving Seurat object ...")
71
+ saveRDS(srt, outfile)
@@ -161,14 +161,16 @@ do_enrich <- function(expr, odir) {
161
161
  next
162
162
  }
163
163
 
164
- png(
165
- file.path(odir, paste0("Enrichr-", db, ".png")),
166
- res = 100, height = 1000, width = 1000
167
- )
168
- print(
169
- plotEnrich(enriched[[db]], showTerms = 20, title = db) +
164
+ enrich_p <- plotEnrich(enriched[[db]], showTerms = 20, title = db) +
170
165
  theme_prism()
171
- )
166
+ enrich_plot <- file.path(odir, paste0("Enrichr-", db, ".png"))
167
+ png(enrich_plot, res = 100, height = 1000, width = 1000)
168
+ print(enrich_p)
169
+ dev.off()
170
+
171
+ enrich_plot_pdf <- file.path(odir, paste0("Enrichr-", db, ".pdf"))
172
+ pdf(enrich_plot_pdf, height = 10, width = 10)
173
+ print(enrich_p)
172
174
  dev.off()
173
175
  }
174
176
  }
@@ -7,14 +7,13 @@ parser.add_argument(
7
7
  parser.add_argument("-o", "--output", required=True, help="Output file")
8
8
  parser.add_argument("-m", "--model", required=True, help="Model file")
9
9
  parser.add_argument(
10
- "-v", "--majority_voting",
11
- action="store_true",
12
- help="Majority voting"
10
+ "-v", "--majority_voting", action="store_true", help="Majority voting"
13
11
  )
14
12
  parser.add_argument(
15
- "-c", "--over_clustering",
13
+ "-c",
14
+ "--over_clustering",
16
15
  default="seurat_clusters",
17
- help="Over clustering. Ignored if the column does not exist."
16
+ help="Over clustering. Ignored if the column does not exist.",
18
17
  )
19
18
 
20
19
 
@@ -44,7 +43,9 @@ if __name__ == "__main__":
44
43
 
45
44
  if args.output.endswith(".h5ad"):
46
45
  try:
47
- out_adata._raw._var.rename(columns={"_index": "features"}, inplace=True)
46
+ out_adata._raw._var.rename( # type: ignore
47
+ columns={"_index": "features"}, inplace=True
48
+ )
48
49
  del out_adata.raw
49
50
  except (KeyError, AttributeError):
50
51
  pass
@@ -3,15 +3,16 @@ from os import path
3
3
  from glob import glob
4
4
  from biopipen.utils.misc import run_command, logger
5
5
 
6
- indir = {{in.indir | repr}} # noqa: E999 # pyright: ignore
7
- outfile = {{out.outfile | repr}} # pyright: ignore
8
- plink = {{envs.plink | repr}} # pyright: ignore
9
- ncores = {{envs.ncores | repr}} # pyright: ignore
10
- transpose = {{envs.transpose | repr}} # pyright: ignore
11
- samid = {{envs.samid | repr}} # pyright: ignore
12
- varid = {{envs.varid | repr}} # pyright: ignore
13
- trans_chr = {{envs.trans_chr | repr}} # pyright: ignore
14
- missing_id = {{envs.missing_id | repr}} # pyright: ignore
6
+ indir: str = {{in.indir | quote}} # noqa: E999 # pyright: ignore
7
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
8
+ plink: str = {{envs.plink | quote}} # pyright: ignore
9
+ ncores: int = {{envs.ncores | repr}} # pyright: ignore
10
+ transpose: bool = {{envs.transpose | repr}} # pyright: ignore
11
+ samid: str = {{envs.samid | repr}} # pyright: ignore
12
+ varid: str = {{envs.varid | repr}} # pyright: ignore
13
+ trans_chr: dict = {{envs.trans_chr | repr}} # pyright: ignore
14
+ missing_id: str = {{envs.missing_id | repr}} # pyright: ignore
15
+ gtcoding: str = {{envs.gtcoding | repr}} # pyright: ignore
15
16
  trans_chr = trans_chr or {}
16
17
 
17
18
  bedfile = glob(path.join(indir, '*.bed'))
@@ -37,6 +38,14 @@ cmd = [
37
38
 
38
39
  run_command(cmd, fg=True, env={"cwd": path.dirname(outfile)})
39
40
 
41
+
42
+ def _vcf_gtcoding(gt):
43
+ try:
44
+ return str(2 - int(gt))
45
+ except (ValueError, TypeError):
46
+ return "NA"
47
+
48
+
40
49
  if not transpose: # rows are variants, columns are samples
41
50
  # .traw file is created, tab-separated, with the following columns:
42
51
  trawfile = output + ".traw"
@@ -82,7 +91,10 @@ if not transpose: # rows are variants, columns are samples
82
91
  .replace('{ref}', ref)
83
92
  .replace('{alt}', alt)
84
93
  )
85
- record = [variant] + line[6:]
94
+ if gtcoding == "plink":
95
+ record = [variant] + line[6:]
96
+ else: # vcf
97
+ record = [variant] + [_vcf_gtcoding(x) for x in line[6:]]
86
98
  fout.write('\t'.join(record) + '\n')
87
99
 
88
100
  else:
@@ -129,5 +141,8 @@ else:
129
141
  fid = line[0]
130
142
  iid = line[1]
131
143
  sam = samid.replace('{fid}', fid).replace('{iid}', iid)
132
- record = [sam] + line[6:]
144
+ if gtcoding == "plink":
145
+ record = [sam] + line[6:]
146
+ else: # vcf
147
+ record = [sam] + [_vcf_gtcoding(x) for x in line[6:]]
133
148
  fout.write('\t'.join(record) + '\n')
@@ -1,17 +1,17 @@
1
- """Script for snp.PlinkFilter"""
1
+ from __future__ import annotations
2
2
 
3
3
  from pathlib import Path
4
4
  from biopipen.utils.misc import run_command, dict_to_cli_args, logger
5
5
 
6
- indir = {{in.indir | repr}} # pyright: ignore # noqa: #999
7
- samples_file = {{in.samples_file | repr}} # pyright: ignore
8
- variants_file = {{in.variants_file | repr}} # pyright: ignore
9
- outdir = {{out.outdir | repr}} # pyright: ignore
6
+ indir: str = {{in.indir | quote}} # pyright: ignore # noqa: #999
7
+ samples_file = {{in.samples_file | quote}} # pyright: ignore
8
+ variants_file = {{in.variants_file | quote}} # pyright: ignore
9
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
10
10
 
11
11
  plink = {{envs.plink | repr}} # pyright: ignore
12
12
  ncores = {{envs.ncores | repr}} # pyright: ignore
13
- samples = {{envs.samples | repr}} # pyright: ignore
14
- variants = {{envs.variants | repr}} # pyright: ignore
13
+ samples: list[str] | str = {{envs.samples | repr}} # pyright: ignore
14
+ variants: list[str] | str = {{envs.variants | repr}} # pyright: ignore
15
15
  e_samples_file = {{envs.samples_file | repr}} # pyright: ignore
16
16
  e_variants_file = {{envs.variants_file | repr}} # pyright: ignore
17
17
  keep = {{envs.keep | repr}} # pyright: ignore
@@ -1,12 +1,14 @@
1
- from os import path
1
+ from __future__ import annotations
2
+
3
+ from os import path, PathLike
2
4
  from biopipen.core.filters import dict_to_cli_args
3
5
  from biopipen.utils.reference import tabix_index
4
6
  from biopipen.utils.misc import run_command
5
7
 
6
- invcf = {{in.invcf | repr}} # noqa: E999 # pyright: ignore
7
- outprefix = {{in.invcf | stem0 | repr}} # pyright: ignore
8
- outdir = {{out.outdir | repr}} # pyright: ignore
9
- args = {{envs | dict | repr}} # pyright: ignore
8
+ invcf: str | PathLike = {{in.invcf | quote}} # noqa: E999 # pyright: ignore
9
+ outprefix: str = {{in.invcf | stem0 | quote}} # pyright: ignore
10
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
11
+ args: dict = {{envs | dict}} # pyright: ignore
10
12
 
11
13
  plink = args.pop("plink")
12
14
  tabix = args.pop("tabix")
@@ -23,6 +25,7 @@ args.setdefault("max_alleles", 2)
23
25
  # This makes it possible to keep the allele order in the output
24
26
  # no need for plink2
25
27
  # args["keep_allele_order"] = True
28
+ args.setdefault("keep_allele_order", True)
26
29
 
27
30
  # resolve plink 1.x --set-missing-var-ids doesn't distinguish $1, $2,...
28
31
  # for ref and alts
@@ -4,9 +4,9 @@ from slugify import slugify
4
4
  from simpleconf import Config
5
5
  from biopipen.utils.misc import logger, run_command, dict_to_cli_args
6
6
 
7
- configfile = {{in.configfile | repr}} # pyright: ignore # noqa: E999
8
- outdir = {{out.outdir | repr}} # pyright: ignore
9
- gtmatfile = {{out.gtmat | repr}} # pyright: ignore
7
+ configfile: str = {{in.configfile | quote}} # pyright: ignore # noqa: E999
8
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
9
+ gtmatfile: str = {{out.gtmat | quote}} # pyright: ignore
10
10
  config = Config.load(configfile)
11
11
 
12
12
  default_nsnps = {{envs.nsnps | repr}} # pyright: ignore
@@ -21,7 +21,7 @@ default_maxfreq = {{envs.maxfreq | repr}} # pyright: ignore
21
21
  default_hetodds = {{envs.hetodds | repr}} # pyright: ignore
22
22
  default_homodds = {{envs.homodds | repr}} # pyright: ignore
23
23
  default_missing = {{envs.missing | repr}} # pyright: ignore
24
- default_args = {{envs.args | repr}} # pyright: ignore
24
+ default_args: dict = {{envs.args | repr}} # pyright: ignore
25
25
  default_transpose_gtmat = {{envs.transpose_gtmat | repr}} # pyright: ignore
26
26
  default_sample_prefix = {{envs.sample_prefix | repr}} # pyright: ignore
27
27
 
@@ -1,9 +1,9 @@
1
1
  from pathlib import Path
2
2
  from biopipen.utils.misc import run_command, dict_to_cli_args, logger
3
3
 
4
- indir = {{in.indir | repr}} # pyright: ignore # noqa: #999
5
- namefile = {{in.namefile | repr}} # pyright: ignore
6
- outdir = {{out.outdir | repr}} # pyright: ignore
4
+ indir: str = {{in.indir | quote}} # pyright: ignore # noqa: #999
5
+ namefile: str = {{in.namefile | quote}} # pyright: ignore
6
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
7
7
  plink = {{envs.plink | repr}} # pyright: ignore
8
8
  bcftools = {{envs.bcftools | repr}} # pyright: ignore
9
9
  ncores = {{envs.ncores | repr}} # pyright: ignore
@@ -111,7 +111,7 @@ if namefile.endswith(".vcf") or namefile.endswith(".vcf.gz"):
111
111
  else:
112
112
  info = readline(finfo)
113
113
 
114
- namefile = namefile_tmp
114
+ namefile = str(namefile_tmp)
115
115
 
116
116
  args = {
117
117
  "": plink,
@@ -12,15 +12,17 @@ transpose_input <- {{envs.transpose_input | r}}
12
12
  transpose_group <- {{envs.transpose_group | r}}
13
13
 
14
14
  log_info("Reading input files ...")
15
- indata <- read.table(infile, header = TRUE, sep = "\t", row.names = 1)
15
+ indata <- read.table(infile, header = TRUE, sep = "\t", row.names = 1, check.names = FALSE)
16
16
  if (transpose_input) {
17
17
  indata <- t(indata)
18
18
  }
19
- groupdata <- read.table(groupfile, header = TRUE, sep = "\t", row.names = 1)
19
+ groupdata <- read.table(groupfile, header = TRUE, sep = "\t", row.names = 1, check.names = FALSE)
20
20
  if (transpose_group) {
21
21
  groupdata <- t(groupdata)
22
22
  }
23
- fmldata <- read.table(fmlfile, header = TRUE, sep = "\t", row.names = NULL)
23
+ allgroups = na.omit(unique(unlist(groupdata)))
24
+
25
+ fmldata <- read.table(fmlfile, header = TRUE, sep = "\t", row.names = NULL, check.names = FALSE)
24
26
  colnames(fmldata)[1:2] <- c("Group", "Formula")
25
27
 
26
28
  chow.test <- function(fml, grouping) {
@@ -63,26 +65,43 @@ chow.test <- function(fml, grouping) {
63
65
  )
64
66
  }
65
67
 
66
- formatlm <- function(m) {
67
- if (class(m) == 'lm') {
68
- coeff <- as.list(m$coefficients)
68
+ formatlm <- function(m, g = NULL, type = "coeff") {
69
+ if (is.null(g)) {
69
70
  vars <- all.vars(m$terms)
70
- terms <- unlist(sapply(na.omit(c(vars[2:length(vars)], '(Intercept)', 'N')), function(x) {
71
- ce <- coeff[[x]] %||% coeff[[bQuote(x)]]
72
- if (x == 'N') {
73
- paste0('N=', nrow(m$model))
74
- } else if (is.null(ce)) {
75
- NULL
76
- } else {
77
- l <- ifelse(x == '(Intercept)', '_', x)
78
- paste0(l, '=', round(ce, 3))
79
- }
80
- }))
71
+ if (type == "pval") {
72
+ df <- as.data.frame(summary(m)$coefficients)
73
+ terms <- unlist(sapply(na.omit(c(vars[2:length(vars)], '(Intercept)', 'N')), function(x) {
74
+ pv <- df[x, 4] %||% df[bQuote(x), 4]
75
+ if (x == 'N') {
76
+ paste0('N=', nrow(m$model))
77
+ } else if (is.null(pv)) {
78
+ NULL
79
+ } else {
80
+ l <- ifelse(x == '(Intercept)', '_', x)
81
+ paste0(l, '=', signif(pv, digits = 4))
82
+ }
83
+ }))
84
+ } else {
85
+ coeff <- as.list(m$coefficients)
86
+ terms <- unlist(sapply(na.omit(c(vars[2:length(vars)], '(Intercept)', 'N')), function(x) {
87
+ ce <- coeff[[x]] %||% coeff[[bQuote(x)]]
88
+ if (x == 'N') {
89
+ paste0('N=', nrow(m$model))
90
+ } else if (is.null(ce)) {
91
+ NULL
92
+ } else {
93
+ l <- ifelse(x == '(Intercept)', '_', x)
94
+ paste0(l, '=', round(ce, 3))
95
+ }
96
+ }))
97
+ }
81
98
  paste(terms[!is.null(terms)], collapse = ', ')
82
99
  } else {
83
- paste(sapply(names(m), function(x) {
84
- paste0(x, ': ', formatlm(m[[x]]))
85
- }), collapse = ' // ')
100
+ gm <- m[[as.character(g)]]
101
+ if (is.null(gm)) {
102
+ return(NA)
103
+ }
104
+ formatlm(gm, type = type)
86
105
  }
87
106
  }
88
107
 
@@ -98,8 +117,15 @@ results <- do_call(rbind, lapply(
98
117
  log_debug(" Running Chow test for formula: {fmlrow$Formula} (grouping = {fmlrow$Group})")
99
118
 
100
119
  res <- chow.test(fmlrow$Formula, fmlrow$Group)
101
- fmlrow$Pooled <- formatlm(res$pooled.lm)
102
- fmlrow$Groups <- formatlm(res$group.lms)
120
+ fmlrow$Pooled_Coef <- formatlm(res$pooled.lm)
121
+ for (g in allgroups) {
122
+ fmlrow[[paste0("Group_", g, "_Coef")]] <- formatlm(res$group.lms, g)
123
+ }
124
+ # fmlrow$Groups <- formatlm(res$group.lms)
125
+ fmlrow$Pooled_Pval <- formatlm(res$pooled.lm, type="pval")
126
+ for (g in allgroups) {
127
+ fmlrow[[paste0("Group_", g, "_Pval")]] <- formatlm(res$group.lms, g, type="pval")
128
+ }
103
129
  fmlrow$SSR <- res$group.ssr
104
130
  fmlrow$SumSSR <- res$pooled.ssr
105
131
  fmlrow$Fstat <- res$Fstat