biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +28 -0
  3. biopipen/core/filters.py +79 -4
  4. biopipen/core/proc.py +12 -3
  5. biopipen/core/testing.py +75 -3
  6. biopipen/ns/bam.py +148 -6
  7. biopipen/ns/bed.py +75 -0
  8. biopipen/ns/cellranger.py +186 -0
  9. biopipen/ns/cellranger_pipeline.py +126 -0
  10. biopipen/ns/cnv.py +19 -3
  11. biopipen/ns/cnvkit.py +1 -1
  12. biopipen/ns/cnvkit_pipeline.py +20 -12
  13. biopipen/ns/delim.py +34 -35
  14. biopipen/ns/gene.py +68 -23
  15. biopipen/ns/gsea.py +63 -37
  16. biopipen/ns/misc.py +39 -14
  17. biopipen/ns/plot.py +304 -1
  18. biopipen/ns/protein.py +183 -0
  19. biopipen/ns/regulatory.py +290 -0
  20. biopipen/ns/rnaseq.py +142 -5
  21. biopipen/ns/scrna.py +2053 -473
  22. biopipen/ns/scrna_metabolic_landscape.py +228 -382
  23. biopipen/ns/snp.py +659 -0
  24. biopipen/ns/stats.py +484 -0
  25. biopipen/ns/tcr.py +683 -98
  26. biopipen/ns/vcf.py +236 -2
  27. biopipen/ns/web.py +97 -6
  28. biopipen/reports/bam/CNVpytor.svelte +4 -9
  29. biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
  30. biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
  31. biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
  32. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  33. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  34. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  35. biopipen/reports/common.svelte +15 -0
  36. biopipen/reports/protein/ProdigySummary.svelte +16 -0
  37. biopipen/reports/scrna/CellsDistribution.svelte +4 -39
  38. biopipen/reports/scrna/DimPlots.svelte +1 -1
  39. biopipen/reports/scrna/MarkersFinder.svelte +6 -126
  40. biopipen/reports/scrna/MetaMarkers.svelte +3 -75
  41. biopipen/reports/scrna/RadarPlots.svelte +4 -20
  42. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
  43. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
  44. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
  45. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  46. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  47. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  48. biopipen/reports/snp/PlinkHet.svelte +18 -0
  49. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  50. biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
  51. biopipen/reports/tcr/ClonalStats.svelte +16 -0
  52. biopipen/reports/tcr/CloneResidency.svelte +3 -93
  53. biopipen/reports/tcr/Immunarch.svelte +4 -155
  54. biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
  55. biopipen/reports/tcr/TESSA.svelte +11 -28
  56. biopipen/reports/utils/misc.liq +22 -7
  57. biopipen/scripts/bam/BamMerge.py +11 -15
  58. biopipen/scripts/bam/BamSampling.py +90 -0
  59. biopipen/scripts/bam/BamSort.py +141 -0
  60. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  61. biopipen/scripts/bam/BamSubsetByBed.py +38 -0
  62. biopipen/scripts/bam/CNAClinic.R +41 -5
  63. biopipen/scripts/bam/CNVpytor.py +153 -54
  64. biopipen/scripts/bam/ControlFREEC.py +13 -14
  65. biopipen/scripts/bam/SamtoolsView.py +33 -0
  66. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  67. biopipen/scripts/bed/BedConsensus.py +5 -5
  68. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  69. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  70. biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
  71. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  72. biopipen/scripts/cellranger/CellRangerCount.py +138 -0
  73. biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
  74. biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
  75. biopipen/scripts/cnv/AneuploidyScore.R +55 -20
  76. biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
  77. biopipen/scripts/cnv/TMADScore.R +25 -9
  78. biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
  79. biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
  80. biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
  81. biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
  82. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  83. biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
  84. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  85. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  86. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
  87. biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
  88. biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
  89. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  90. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  91. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  92. biopipen/scripts/delim/RowsBinder.R +1 -1
  93. biopipen/scripts/delim/SampleInfo.R +116 -118
  94. biopipen/scripts/gene/GeneNameConversion.R +67 -0
  95. biopipen/scripts/gene/GenePromoters.R +61 -0
  96. biopipen/scripts/gsea/Enrichr.R +5 -5
  97. biopipen/scripts/gsea/FGSEA.R +184 -50
  98. biopipen/scripts/gsea/GSEA.R +2 -2
  99. biopipen/scripts/gsea/PreRank.R +5 -5
  100. biopipen/scripts/misc/Config2File.py +2 -2
  101. biopipen/scripts/misc/Plot.R +80 -0
  102. biopipen/scripts/misc/Shell.sh +15 -0
  103. biopipen/scripts/misc/Str2File.py +2 -2
  104. biopipen/scripts/plot/Heatmap.R +3 -3
  105. biopipen/scripts/plot/Manhattan.R +147 -0
  106. biopipen/scripts/plot/QQPlot.R +146 -0
  107. biopipen/scripts/plot/ROC.R +88 -0
  108. biopipen/scripts/plot/Scatter.R +112 -0
  109. biopipen/scripts/plot/VennDiagram.R +5 -9
  110. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  111. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  112. biopipen/scripts/protein/Prodigy.py +119 -0
  113. biopipen/scripts/protein/ProdigySummary.R +140 -0
  114. biopipen/scripts/protein/RMSD.py +178 -0
  115. biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
  116. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
  117. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
  118. biopipen/scripts/regulatory/MotifScan.py +159 -0
  119. biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
  120. biopipen/scripts/regulatory/motifs-common.R +324 -0
  121. biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
  122. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
  123. biopipen/scripts/rnaseq/Simulation.R +21 -0
  124. biopipen/scripts/rnaseq/UnitConversion.R +325 -54
  125. biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
  126. biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
  127. biopipen/scripts/scrna/CellCellCommunication.py +150 -0
  128. biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
  129. biopipen/scripts/scrna/CellSNPLite.py +30 -0
  130. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
  131. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
  132. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
  133. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
  134. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
  135. biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
  136. biopipen/scripts/scrna/CellsDistribution.R +456 -167
  137. biopipen/scripts/scrna/DimPlots.R +1 -1
  138. biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
  139. biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
  140. biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
  141. biopipen/scripts/scrna/ExprImputation.R +7 -0
  142. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  143. biopipen/scripts/scrna/MQuad.py +25 -0
  144. biopipen/scripts/scrna/MarkersFinder.R +679 -400
  145. biopipen/scripts/scrna/MetaMarkers.R +265 -161
  146. biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
  147. biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
  148. biopipen/scripts/scrna/RadarPlots.R +355 -134
  149. biopipen/scripts/scrna/ScFGSEA.R +298 -100
  150. biopipen/scripts/scrna/ScSimulation.R +65 -0
  151. biopipen/scripts/scrna/ScVelo.py +617 -0
  152. biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
  153. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
  154. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
  155. biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
  156. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
  157. biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
  158. biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
  159. biopipen/scripts/scrna/SeuratClustering.R +36 -233
  160. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  161. biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
  162. biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
  163. biopipen/scripts/scrna/SeuratPreparing.R +223 -173
  164. biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
  165. biopipen/scripts/scrna/SeuratTo10X.R +27 -0
  166. biopipen/scripts/scrna/Slingshot.R +65 -0
  167. biopipen/scripts/scrna/Subset10X.R +2 -2
  168. biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
  169. biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
  170. biopipen/scripts/scrna/scvelo_paga.py +313 -0
  171. biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
  172. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
  173. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
  174. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
  175. biopipen/scripts/snp/MatrixEQTL.R +217 -0
  176. biopipen/scripts/snp/Plink2GTMat.py +148 -0
  177. biopipen/scripts/snp/PlinkCallRate.R +199 -0
  178. biopipen/scripts/snp/PlinkFilter.py +100 -0
  179. biopipen/scripts/snp/PlinkFreq.R +291 -0
  180. biopipen/scripts/snp/PlinkFromVcf.py +81 -0
  181. biopipen/scripts/snp/PlinkHWE.R +85 -0
  182. biopipen/scripts/snp/PlinkHet.R +96 -0
  183. biopipen/scripts/snp/PlinkIBD.R +196 -0
  184. biopipen/scripts/snp/PlinkSimulation.py +124 -0
  185. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  186. biopipen/scripts/stats/ChowTest.R +146 -0
  187. biopipen/scripts/stats/DiffCoexpr.R +152 -0
  188. biopipen/scripts/stats/LiquidAssoc.R +135 -0
  189. biopipen/scripts/stats/Mediation.R +108 -0
  190. biopipen/scripts/stats/MetaPvalue.R +130 -0
  191. biopipen/scripts/stats/MetaPvalue1.R +74 -0
  192. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  193. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  194. biopipen/scripts/tcr/Attach2Seurat.R +3 -2
  195. biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
  196. biopipen/scripts/tcr/CDR3Clustering.R +343 -0
  197. biopipen/scripts/tcr/ClonalStats.R +526 -0
  198. biopipen/scripts/tcr/CloneResidency.R +255 -131
  199. biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
  200. biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
  201. biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
  202. biopipen/scripts/tcr/GIANA/query.py +164 -162
  203. biopipen/scripts/tcr/Immunarch-basic.R +31 -9
  204. biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
  205. biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
  206. biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
  207. biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
  208. biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
  209. biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
  210. biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
  211. biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
  212. biopipen/scripts/tcr/Immunarch.R +63 -11
  213. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  214. biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
  215. biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
  216. biopipen/scripts/tcr/SampleDiversity.R +1 -1
  217. biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
  218. biopipen/scripts/tcr/ScRepLoading.R +166 -0
  219. biopipen/scripts/tcr/TCRClusterStats.R +176 -22
  220. biopipen/scripts/tcr/TCRDock.py +110 -0
  221. biopipen/scripts/tcr/TESSA.R +102 -118
  222. biopipen/scripts/tcr/VJUsage.R +5 -5
  223. biopipen/scripts/tcr/immunarch-patched.R +142 -0
  224. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  225. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  226. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  227. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  228. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  229. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  230. biopipen/scripts/vcf/TruvariBench.sh +14 -7
  231. biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
  232. biopipen/scripts/vcf/TruvariConsistency.R +1 -1
  233. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  234. biopipen/scripts/vcf/VcfAnno.py +11 -11
  235. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  236. biopipen/scripts/vcf/VcfFilter.py +5 -5
  237. biopipen/scripts/vcf/VcfFix.py +7 -7
  238. biopipen/scripts/vcf/VcfFix_utils.py +13 -4
  239. biopipen/scripts/vcf/VcfIndex.py +3 -3
  240. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  241. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  242. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  243. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  244. biopipen/scripts/web/Download.py +8 -4
  245. biopipen/scripts/web/DownloadList.py +5 -5
  246. biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
  247. biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
  248. biopipen/scripts/web/gcloud_common.py +49 -0
  249. biopipen/utils/gene.py +108 -60
  250. biopipen/utils/misc.py +146 -20
  251. biopipen/utils/reference.py +64 -20
  252. biopipen/utils/reporter.py +177 -0
  253. biopipen/utils/vcf.py +1 -1
  254. biopipen-0.34.26.dist-info/METADATA +27 -0
  255. biopipen-0.34.26.dist-info/RECORD +292 -0
  256. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
  257. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
  258. biopipen/ns/bcftools.py +0 -111
  259. biopipen/ns/scrna_basic.py +0 -255
  260. biopipen/reports/delim/SampleInfo.svelte +0 -36
  261. biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
  262. biopipen/reports/scrna/ScFGSEA.svelte +0 -35
  263. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
  264. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
  265. biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
  266. biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
  267. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
  268. biopipen/reports/utils/gsea.liq +0 -110
  269. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  270. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  271. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  272. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  273. biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
  274. biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
  275. biopipen/scripts/scrna/ExprImpution.R +0 -7
  276. biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
  277. biopipen/scripts/scrna/Write10X.R +0 -11
  278. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
  279. biopipen/scripts/tcr/TCRClustering.R +0 -280
  280. biopipen/utils/common_docstrs.py +0 -61
  281. biopipen/utils/gene.R +0 -49
  282. biopipen/utils/gsea.R +0 -193
  283. biopipen/utils/io.R +0 -20
  284. biopipen/utils/misc.R +0 -114
  285. biopipen/utils/mutate_helpers.R +0 -433
  286. biopipen/utils/plot.R +0 -173
  287. biopipen/utils/rnaseq.R +0 -48
  288. biopipen/utils/single_cell.R +0 -115
  289. biopipen-0.21.0.dist-info/METADATA +0 -22
  290. biopipen-0.21.0.dist-info/RECORD +0 -218
@@ -0,0 +1,82 @@
1
+ import concurrent.futures
2
+ from pathlib import Path
3
+
4
+ from biopipen.utils.misc import run_command, dict_to_cli_args, logger
5
+ from biopipen.scripts.web.gcloud_common import (
6
+ is_logged_in,
7
+ is_valid_gs_bucket_url,
8
+ get_file_path,
9
+ )
10
+
11
+ url: str = {{in.url | quote}} # pyright: ignore # noqa: E999
12
+ outdir = Path({{out.outdir | repr}}) # pyright: ignore
13
+ gcloud: str = {{envs.gcloud | quote}} # pyright: ignore
14
+ keep_structure = {{envs.keep_structure | repr}} # pyright: ignore
15
+ ncores: int = {{envs.ncores | repr}} # pyright: ignore
16
+ args: dict = {{envs.args | repr}} # pyright: ignore
17
+
18
+ if not is_valid_gs_bucket_url(url):
19
+ raise Exception(
20
+ f"Invalid Google Cloud Storage URL for a bucket: {url}. "
21
+ "URL should be in the format gs://bucket"
22
+ )
23
+
24
+ if not is_logged_in(gcloud):
25
+ raise Exception(
26
+ "You need to be logged in to gcloud to download files. "
27
+ "Please run `gcloud auth login` first."
28
+ )
29
+
30
+
31
def create_folders(folder_lines):
    """Create a local directory under `outdir` for each listed remote folder.

    Args:
        folder_lines: Folder URLs; each one is reduced to its in-bucket path
            with `get_file_path` and created below the module-level `outdir`.
            Missing parents are created and existing directories are accepted.
    """
    for relative_dir in map(get_file_path, folder_lines):
        (outdir / relative_dir).mkdir(parents=True, exist_ok=True)
36
+
37
+
38
def download_file(i: int, line: str, total: int):
    """Copy one remote object into `outdir` with `gcloud storage cp`.

    Args:
        i: Zero-based position of this object in the download list; used for
            progress messages and for disambiguating clashing file names.
        line: The object URL as printed by the bucket listing.
        total: Total number of objects being downloaded; controls how often
            progress is logged.
    """
    rel_path = get_file_path(line)

    # Log every file for small jobs, every 10th up to 500 files, and every
    # 100th beyond that.
    if total <= 50:
        logger.info(f"Downloading {rel_path}")
    else:
        step = 10 if total <= 500 else 100
        if i % step == 0:
            logger.info(f"Downloading {i}/{total} ...")

    if keep_structure:
        dest = outdir / rel_path
    else:
        # Flattened layout: only the base name is kept, so files from different
        # remote folders may collide; a clash gets an index-based prefix.
        basename = Path(rel_path).name
        dest = outdir / basename
        if dest.exists():
            renamed = f"g{i}-{basename}"
            logger.warning(f"{basename} already exists. Renaming to {renamed}.")
            dest = outdir / renamed

    # Work on a copy so the shared `args` mapping is never modified.
    cli_options = args.copy()
    cli_options[""] = [gcloud, "storage", "cp", line, dest]
    command = dict_to_cli_args(cli_options, dashify=True)
    run_command(command, fg=True)
63
+
64
+
65
def download_bucket():
    """Download every object of the bucket at `url` into `outdir`.

    The bucket is listed recursively with `gcloud storage ls`. When
    `keep_structure` is set, the remote folders are recreated locally first.
    The objects are then copied in parallel by `download_file` using up to
    `ncores` worker processes.

    Raises:
        Exception: Whatever a worker raised while downloading (for example the
            `RuntimeError` from `run_command` on a failed copy).
    """
    listing = run_command(
        [gcloud, "storage", "ls", "--recursive", url],
        stdout="RETURN",
    )
    # remove empty lines and skip the root
    lines = [ln for ln in listing.splitlines()[1:] if ln]  # type: ignore

    if keep_structure:
        # create folders first; folder entries are listed as `<url>/:`
        logger.info("Creating folders to keep structure.")
        create_folders([ln[:-2] for ln in lines if ln.endswith("/:")])

    file_lines = [ln for ln in lines if not ln.endswith("/:")]
    total = len(file_lines)
    with concurrent.futures.ProcessPoolExecutor(max_workers=ncores) as executor:
        # `Executor.map` only re-raises a worker's exception when its result is
        # retrieved. Consume the iterator so a failed download aborts the job
        # instead of being silently discarded.
        list(
            executor.map(
                download_file,
                range(total),
                file_lines,
                [total] * total,
            )
        )
79
+
80
+
81
+ if __name__ == "__main__":
82
+ download_bucket()
@@ -0,0 +1,23 @@
1
+ from biopipen.utils.misc import run_command, dict_to_cli_args
2
+ from biopipen.scripts.web.gcloud_common import is_logged_in, is_valid_gs_file_url
3
+
4
+ url: str = {{in.url | repr}} # pyright: ignore # noqa: E999
5
+ outfile = {{out.outfile | repr}} # pyright: ignore
6
+ gcloud: str = {{envs.gcloud | repr}} # pyright: ignore
7
+ args: dict = {{envs.args | repr}} # pyright: ignore
8
+
9
+ if not is_valid_gs_file_url(url):
10
+ raise Exception(
11
+ f"Invalid Google Cloud Storage URL for a file: {url}. "
12
+ "URL should be in the format gs://bucket/path/to/file"
13
+ )
14
+
15
+ if not is_logged_in(gcloud):
16
+ raise Exception(
17
+ "You need to be logged in to gcloud to download files. "
18
+ "Please run `gcloud auth login` first."
19
+ )
20
+
21
+ args[""] = [gcloud, "storage", "cp", url, outfile]
22
+
23
+ run_command(dict_to_cli_args(args, dashify=True), fg=True)
@@ -0,0 +1,49 @@
1
+ """Provides common functions for interacting with Google Cloud Storage."""
2
+ from biopipen.utils.misc import run_command
3
+
4
+
5
def is_logged_in(gcloud: str) -> bool:
    """Tell whether `gcloud auth list` reports a credentialed account.

    Args:
        gcloud: Path to the `gcloud` executable.

    Returns:
        bool: True when the text "ACTIVE" occurs in the standard output of
            `gcloud auth list`, False otherwise.
    """
    # NOTE(review): this is a plain substring test on the command output;
    # presumably "ACTIVE" is the column header printed when accounts exist --
    # confirm against the gcloud version in use.
    auth_listing = run_command([gcloud, "auth", "list"], stdout="RETURN")
    return "ACTIVE" in auth_listing  # type: ignore
16
+
17
+
18
def is_valid_gs_bucket_url(url: str) -> bool:
    """Check if a URL is a valid Google Cloud Storage bucket URL.

    Such as `gs://bucket`. Trailing slashes are tolerated (`gs://bucket/`).

    Args:
        url: The URL to check.

    Returns:
        bool: True if the URL has the `gs://` scheme followed by a non-empty
            bucket name and no object path, False otherwise.
    """
    scheme = "gs://"
    if not url.startswith(scheme):
        return False

    # Drop the scheme before trimming slashes, so the trim cannot eat into
    # `gs://` itself and let an empty bucket name slip through.
    bucket = url[len(scheme):].rstrip("/")
    return bool(bucket) and "/" not in bucket
28
+
29
+
30
def get_file_path(url: str) -> str:
    """Get the file path from a Google Cloud Storage file URL, without bucket.

    For example: gs://bucket/path/to/file -> path/to/file

    A URL that names only the bucket yields an empty path, with or without a
    trailing slash (`gs://bucket` and `gs://bucket/` both give `""`).

    Args:
        url: The Google Cloud Storage file URL. The first five characters are
            assumed to be the `gs://` scheme and are dropped unchecked.

    Returns:
        str: The file path.
    """
    # `partition` gives an empty tail when there is no "/" after the bucket,
    # where indexing the result of `split` would raise IndexError.
    _bucket, _sep, path = url[5:].partition("/")
    return path
42
+
43
+
44
def is_valid_gs_file_url(url: str) -> bool:
    """Check if a URL is a valid Google Cloud Storage file URL.

    Such as `gs://bucket/path/to/file`.

    Args:
        url: The URL to check.

    Returns:
        bool: True if the URL has the `gs://` scheme, a non-empty bucket name
            and a non-empty object path, False otherwise.
    """
    scheme = "gs://"
    if not url.startswith(scheme):
        return False

    # A file URL needs both parts; `gs://bucket` or `gs://bucket/` names a
    # bucket, not an object inside it.
    bucket, _sep, path = url[len(scheme):].partition("/")
    return bool(bucket) and bool(path)
biopipen/utils/gene.py CHANGED
@@ -1,86 +1,134 @@
1
1
  """Do gene name conversion"""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ import contextlib
6
+ import pandas as pd
2
7
  from mygene import MyGeneInfo
3
- from datar.all import (
4
- c,
5
- f,
6
- group_by,
7
- desc,
8
- arrange,
9
- slice_head,
10
- tibble,
11
- left_join,
12
- mutate,
13
- is_na,
14
- across,
15
- if_else,
16
- filter_,
17
- pull,
18
- select,
19
- )
20
8
 
21
9
  mygene = MyGeneInfo()
22
10
 
23
11
 
24
- class QueryGenesNotFound(Exception):
12
+ class QueryGenesNotFound(ValueError):
25
13
  """When genes cannot be found"""
26
14
 
27
15
 
28
16
  def gene_name_conversion(
29
- genes,
30
- species,
31
- infmt,
32
- outfmt,
33
- notfound,
17
+ genes: list[str],
18
+ infmt: str | list[str],
19
+ outfmt: str,
20
+ dup: str = "first",
21
+ species: str = "human",
22
+ notfound: str = "na",
23
+ suppress_messages: bool = False,
34
24
  ):
35
25
  """Convert gene names using MyGeneInfo
36
26
 
37
27
  Args:
38
- genes: A sequence of genes
39
- species: The species to limit the query
40
- Supported: human, mouse, rat, fruitfly, nematode, zebrafish,
41
- thale-cress, frog and pig
42
-
43
- infmt: What's the original gene name format
44
- Available fields
45
- https://docs.mygene.info/en/latest/doc/query_service.html#available-fields
46
- outfmt: What's the target gene name format
47
- notfound: What to do if a conversion cannot be done.
48
- use-query: Ignore the conversion and use the original name
49
- skip: Ignore the conversion and skip the entire row in input file
50
- error: Report error
28
+ genes: A character/integer vector of gene names/ids
29
+ species: A character vector of species names
30
+ infmt: A character vector of input gene name formats
31
+ See the available scopes at
32
+ https://docs.mygene.info/en/latest/doc/data.html#available-fields
33
+ You can use ensg as a shortcut for ensembl.gene
34
+ outfmt: A character vector of output gene name formats
35
+ dup: How to deal with duplicate gene names found.
36
+ first: keep the first one (default), sorted by score descendingly
37
+ last: keep the last one, sorted by score descendingly
38
+ all: keep all of them, each will be a separate row
39
+ <X>: combine them into a single string, separated by X
40
+ notfound: How to deal with gene names that are not found
41
+ error: stop with an error message
42
+ use-query: use the query gene name as the converted gene name
43
+ skip: skip the gene names that are not found
44
+ ignore: Same as "skip"
45
+ na: use NA as the converted gene name (default)
46
+ suppress_messages: Suppress the messages while querying
51
47
 
52
48
  Returns:
53
- A dataframe with two columns, query and `outfmt`.
49
+ A dataframe with the query gene names and the converted gene names
50
+ When a gene name is not found, the converted name will be "NA"
51
+ When duplicate gene names are found, the one with the highest score will be kept
54
52
  """
55
- out = (
56
- mygene.querymany(
53
+ notfound = notfound.lower()
54
+ if notfound not in ("error", "use-query", "skip", "ignore", "na"):
55
+ raise ValueError(
56
+ "`notfound` of `gene_name_conversion` must be one of "
57
+ "'error', 'use-query', 'skip', 'ignore', 'na'"
58
+ )
59
+
60
+ if infmt in ["ensg", "ensmusg"]:
61
+ infmt = "ensembl.gene"
62
+ if outfmt in ["ensg", "ensmusg"]:
63
+ outfmt = "ensembl.gene"
64
+
65
+ orig_genes = genes[:]
66
+ if infmt == "ensembl.gene":
67
+ # Remove version numbers from ensembl gene ids
68
+ genes = [re.sub("\\..*", "", gene) for gene in genes]
69
+
70
+ query_df = pd.DataFrame({"query": genes, "orig": orig_genes})
71
+
72
+ if suppress_messages:
73
+ with contextlib.redirect_stdout(None):
74
+ out = mygene.querymany(
75
+ genes,
76
+ scopes=infmt,
77
+ fields=outfmt,
78
+ species=species,
79
+ as_dataframe=True,
80
+ df_index=False,
81
+ )
82
+ else:
83
+ out = mygene.querymany(
57
84
  genes,
58
85
  scopes=infmt,
59
86
  fields=outfmt,
87
+ species=species,
60
88
  as_dataframe=True,
61
89
  df_index=False,
62
- species=species,
63
90
  )
64
- >> group_by(f.query)
65
- >> arrange(desc(f._score))
66
- >> slice_head(1)
67
- >> select(~c(f._id, f._score, f.notfound))
68
- )
69
- if isinstance(outfmt, str):
70
- outfmt = [of.strip() for of in outfmt.split(",")]
71
- out = tibble(query=genes) >> left_join(out, by=f.query)
72
- if notfound == "use-query":
73
- out = out >> mutate(
74
- across(
75
- outfmt,
76
- lambda col, query: if_else(is_na(col), query, col),
77
- query=f.query,
78
- )
91
+
92
+ if out.shape[0] == 0:
93
+ return pd.DataFrame({"query": genes, "converted": ["NA"] * len(genes)})
94
+
95
+ if dup == "first":
96
+ out = (
97
+ out
98
+ .sort_values("_score", ascending=False)
99
+ .groupby("query")
100
+ .head(1)
101
+ .reset_index(drop=True)
79
102
  )
80
- elif notfound == "error" and any(is_na(out[outfmt[0]])):
81
- nagenes = out >> filter_(is_na(f[outfmt[0]])) >> pull(f.query)
82
- raise QueryGenesNotFound(nagenes)
83
- elif notfound == "skip":
84
- out = out >> filter_(~is_na(f[outfmt[0]]))
103
+ elif dup == "last":
104
+ out = (
105
+ out
106
+ .sort_values("_score", ascending=False)
107
+ .groupby("query")
108
+ .tail(1)
109
+ .reset_index(drop=True)
110
+ )
111
+ elif dup != "all":
112
+ out = (
113
+ out
114
+ .sort_values("_score", ascending=False)
115
+ .groupby("query")
116
+ .agg({outfmt: lambda x: f"{dup}".join([str(x) for x in x.unique()])})
117
+ .reset_index()
118
+ )
119
+
120
+ out = pd.merge(query_df, out, on="query", how="left")
121
+ out = out.drop(columns=["query"]).rename(columns={"orig": "query"})
122
+
123
+ if notfound == "error":
124
+ if out[outfmt].isnull().any():
125
+ nagenes = out[out[outfmt].isnull()]["query"].tolist()
126
+ raise QueryGenesNotFound(f"Query genes not found: {','.join(nagenes)}")
127
+ elif notfound == "use-query":
128
+ out[outfmt] = out[outfmt].combine_first(out["query"])
129
+ elif notfound in ["skip", "ignore"]:
130
+ out = out.dropna(subset=[outfmt])
131
+ else: # notfound == "na"
132
+ out[outfmt] = out[outfmt].fillna("NA")
85
133
 
86
134
  return out
biopipen/utils/misc.py CHANGED
@@ -1,30 +1,126 @@
1
1
  from __future__ import annotations
2
2
  from pathlib import Path
3
3
 
4
+ import os
4
5
  import sys
5
- from typing import List
6
+ import logging
7
+ from subprocess import Popen
8
+ from typing import List, Callable, Any
6
9
  from biopipen.core.filters import dict_to_cli_args # noqa: F401
7
10
 
11
+ logger = logging.getLogger("biopipen_job")
12
+ logger.setLevel(logging.DEBUG)
13
+ _handler = logging.StreamHandler(sys.stdout)
14
+ # Use same log format as in R
15
+ # {sprintf("%-7s", level)} [{format(time, "%Y-%m-%d %H:%M:%S")}] {msg}
16
+ # so the logs can be populated by pipen-poplog
17
+ _handler.setFormatter(
18
+ logging.Formatter(
19
+ "%(levelname)-7s [%(asctime)s] %(message)s",
20
+ datefmt="%Y-%m-%d %H:%M:%S",
21
+ )
22
+ )
23
+ logger.addHandler(_handler)
24
+
25
+
26
+ def require_package(
27
+ package: str,
28
+ version: str | None = None,
29
+ python: str | None = None,
30
+ ) -> None:
31
+ """Require a Python package to be installed with optional version check.
32
+
33
+ The version specifier should follow the format used by pip, e.g., '>=1.2.3'.
34
+ Multiple version specifiers can be separated by commas, e.g., '>=1.2.3,<2.0.0'.
8
35
 
9
- def exec_code(code, global_vars=None, local_vars=None, return_var=None):
10
- global_vars = global_vars or {}
11
- local_vars = local_vars or {}
12
- exec(code, global_vars, local_vars)
13
-
14
- if return_var is not None:
15
- return local_vars[return_var]
16
-
17
- return None
36
+ Args:
37
+ package (str): The name of the package to check.
38
+ version (str | None): The version specifier string.
39
+ python (str | None): The Python interpreter to use.
40
+ """
41
+ if not python:
42
+ import importlib
43
+ from importlib.metadata import version as get_version
44
+ from packaging.specifiers import SpecifierSet
45
+
46
+ try:
47
+ importlib.import_module(package)
48
+ except ImportError:
49
+ raise ImportError(f"Package '{package}' is required but not installed.")
50
+
51
+ if version:
52
+ installed_version = get_version(package)
53
+ specifier = SpecifierSet(version)
54
+ if installed_version not in specifier:
55
+ raise ImportError(
56
+ f"Package '{package}' version '{installed_version}' does not "
57
+ f"satisfy the requirement '{package}{version}'."
58
+ )
59
+ else:
60
+ import subprocess
61
+ from packaging.specifiers import SpecifierSet
62
+
63
+ # Check if package is installed using the specified Python interpreter
64
+ try:
65
+ result = subprocess.run(
66
+ [python, "-c", f"import {package}"],
67
+ capture_output=True,
68
+ text=True,
69
+ timeout=10,
70
+ )
71
+ if result.returncode != 0:
72
+ raise ImportError(
73
+ f"Package '{package}' is required but not installed in {python}."
74
+ )
75
+ except subprocess.TimeoutExpired:
76
+ raise ImportError(
77
+ f"Timeout while checking if package '{package}' is "
78
+ f"installed in {python}."
79
+ )
80
+ except FileNotFoundError:
81
+ raise ImportError(f"Python interpreter '{python}' not found.")
82
+
83
+ if version:
84
+ # Get the installed version
85
+ try:
86
+ version_cmd = (
87
+ f"from importlib.metadata import version; "
88
+ f"print(version('{package}'))"
89
+ )
90
+ result = subprocess.run(
91
+ [python, "-c", version_cmd],
92
+ capture_output=True,
93
+ text=True,
94
+ timeout=10,
95
+ )
96
+ if result.returncode != 0:
97
+ raise ImportError(
98
+ f"Failed to get version of package '{package}' "
99
+ f"in {python}."
100
+ )
101
+ installed_version = result.stdout.strip()
102
+ specifier = SpecifierSet(version)
103
+ if installed_version not in specifier:
104
+ raise ImportError(
105
+ f"Package '{package}' version '{installed_version}' "
106
+ f"in {python} does not satisfy the requirement "
107
+ f"'{package}{version}'."
108
+ )
109
+ except subprocess.TimeoutExpired:
110
+ raise ImportError(
111
+ f"Timeout while checking version of package '{package}' "
112
+ f"in {python}."
113
+ )
18
114
 
19
115
 
20
116
  def run_command(
21
- cmd: str | List[str],
117
+ cmd: str | List[Any],
22
118
  fg: bool = False,
23
119
  wait: bool = True,
24
120
  print_command: bool = True,
25
- print_command_handler: callable = print,
121
+ print_command_handler: Callable = print,
26
122
  **kwargs,
27
- ):
123
+ ) -> Popen | str:
28
124
  """Run a command.
29
125
 
30
126
  Args:
@@ -41,7 +137,7 @@ def run_command(
41
137
  The `Popen` object, or str when `stdout` is `RETURN` or `return`.
42
138
  """
43
139
  import shlex
44
- from subprocess import Popen, PIPE, STDOUT
140
+ from subprocess import PIPE, STDOUT
45
141
 
46
142
  if isinstance(cmd, list):
47
143
  cmd = [str(c) for c in cmd]
@@ -49,9 +145,12 @@ def run_command(
49
145
  if print_command:
50
146
  print_command_handler("RUNNING COMMAND:")
51
147
  if isinstance(cmd, str):
52
- print_command_handler(f" {cmd}")
148
+ print_command_handler(f" {cmd}\n")
53
149
  else:
54
- print_command_handler(f" {shlex.join(cmd)}")
150
+ print_command_handler(f" {shlex.join(cmd)}\n")
151
+ # flush the output if print_command_handler is print
152
+ if print_command_handler is print:
153
+ sys.stdout.flush()
55
154
 
56
155
  if isinstance(cmd, str):
57
156
  kwargs["shell"] = True
@@ -60,6 +159,7 @@ def run_command(
60
159
  kwargs["stdin"] = PIPE
61
160
 
62
161
  return_stdout = False
162
+ stdout_file = None
63
163
  if kwargs.get("stdout") is True:
64
164
  kwargs["stdout"] = PIPE
65
165
  elif kwargs.get("stdout") in ("RETURN", "return"):
@@ -68,7 +168,8 @@ def run_command(
68
168
  elif isinstance(kwargs.get("stdout"), (str, Path)):
69
169
  if isinstance(kwargs["stdout"], str):
70
170
  kwargs["stdout"] = Path(kwargs["stdout"])
71
- kwargs["stdout"] = kwargs["stdout"].open("w")
171
+ stdout_file = kwargs["stdout"].open("w")
172
+ kwargs["stdout"] = stdout_file
72
173
  kwargs["close_fds"] = True
73
174
 
74
175
  if kwargs.get("stderr") is True:
@@ -76,6 +177,10 @@ def run_command(
76
177
  elif kwargs.get("stderr") in ("STDOUT", "stdout"):
77
178
  kwargs["stderr"] = STDOUT
78
179
 
180
+ # Enable line buffering for stdout/stderr when redirecting to files or pipes
181
+ if kwargs.get("bufsize") == 1:
182
+ kwargs.setdefault("universal_newlines", True)
183
+
79
184
  if fg:
80
185
  if kwargs.get("stdout") or kwargs.get("stderr"):
81
186
  raise ValueError(
@@ -85,18 +190,39 @@ def run_command(
85
190
  kwargs["stderr"] = sys.stderr
86
191
  kwargs["universal_newlines"] = True
87
192
 
193
+ if "env" in kwargs:
194
+ kwargs["env"] = {**os.environ, **kwargs["env"]}
195
+
88
196
  try:
89
197
  p = Popen(cmd, **kwargs)
90
198
  except Exception as e:
91
- raise RuntimeError(f"Failed to run command: {e}")
199
+ raise RuntimeError(
200
+ f"Failed to run command: {e}\n"
201
+ f"Command (list): {cmd}\n"
202
+ f"Command (str): {shlex.join(cmd)}"
203
+ )
92
204
 
93
205
  if fg or wait or return_stdout:
94
206
  rc = p.wait()
95
207
  if rc != 0:
96
- raise RuntimeError(f"Failed to run command: {cmd}")
208
+ if stdout_file:
209
+ stdout_file.close()
210
+ if return_stdout and p.stdout:
211
+ p.stdout.close()
212
+ raise RuntimeError(
213
+ f"Failed to run command: rc={rc}\n"
214
+ f"Command (list): {cmd}\n"
215
+ f"Command (str): {shlex.join(cmd)}"
216
+ )
97
217
 
98
218
  if return_stdout:
99
- return p.stdout.read().decode()
219
+ try:
220
+ return p.stdout.read().decode() # type: ignore
221
+ finally:
222
+ p.stdout.close() # type: ignore
223
+
224
+ if stdout_file:
225
+ stdout_file.close()
100
226
 
101
227
  return p
102
228