biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +28 -0
  3. biopipen/core/filters.py +79 -4
  4. biopipen/core/proc.py +12 -3
  5. biopipen/core/testing.py +75 -3
  6. biopipen/ns/bam.py +148 -6
  7. biopipen/ns/bed.py +75 -0
  8. biopipen/ns/cellranger.py +186 -0
  9. biopipen/ns/cellranger_pipeline.py +126 -0
  10. biopipen/ns/cnv.py +19 -3
  11. biopipen/ns/cnvkit.py +1 -1
  12. biopipen/ns/cnvkit_pipeline.py +20 -12
  13. biopipen/ns/delim.py +34 -35
  14. biopipen/ns/gene.py +68 -23
  15. biopipen/ns/gsea.py +63 -37
  16. biopipen/ns/misc.py +39 -14
  17. biopipen/ns/plot.py +304 -1
  18. biopipen/ns/protein.py +183 -0
  19. biopipen/ns/regulatory.py +290 -0
  20. biopipen/ns/rnaseq.py +142 -5
  21. biopipen/ns/scrna.py +2053 -473
  22. biopipen/ns/scrna_metabolic_landscape.py +228 -382
  23. biopipen/ns/snp.py +659 -0
  24. biopipen/ns/stats.py +484 -0
  25. biopipen/ns/tcr.py +683 -98
  26. biopipen/ns/vcf.py +236 -2
  27. biopipen/ns/web.py +97 -6
  28. biopipen/reports/bam/CNVpytor.svelte +4 -9
  29. biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
  30. biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
  31. biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
  32. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  33. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  34. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  35. biopipen/reports/common.svelte +15 -0
  36. biopipen/reports/protein/ProdigySummary.svelte +16 -0
  37. biopipen/reports/scrna/CellsDistribution.svelte +4 -39
  38. biopipen/reports/scrna/DimPlots.svelte +1 -1
  39. biopipen/reports/scrna/MarkersFinder.svelte +6 -126
  40. biopipen/reports/scrna/MetaMarkers.svelte +3 -75
  41. biopipen/reports/scrna/RadarPlots.svelte +4 -20
  42. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
  43. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
  44. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
  45. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  46. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  47. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  48. biopipen/reports/snp/PlinkHet.svelte +18 -0
  49. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  50. biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
  51. biopipen/reports/tcr/ClonalStats.svelte +16 -0
  52. biopipen/reports/tcr/CloneResidency.svelte +3 -93
  53. biopipen/reports/tcr/Immunarch.svelte +4 -155
  54. biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
  55. biopipen/reports/tcr/TESSA.svelte +11 -28
  56. biopipen/reports/utils/misc.liq +22 -7
  57. biopipen/scripts/bam/BamMerge.py +11 -15
  58. biopipen/scripts/bam/BamSampling.py +90 -0
  59. biopipen/scripts/bam/BamSort.py +141 -0
  60. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  61. biopipen/scripts/bam/BamSubsetByBed.py +38 -0
  62. biopipen/scripts/bam/CNAClinic.R +41 -5
  63. biopipen/scripts/bam/CNVpytor.py +153 -54
  64. biopipen/scripts/bam/ControlFREEC.py +13 -14
  65. biopipen/scripts/bam/SamtoolsView.py +33 -0
  66. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  67. biopipen/scripts/bed/BedConsensus.py +5 -5
  68. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  69. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  70. biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
  71. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  72. biopipen/scripts/cellranger/CellRangerCount.py +138 -0
  73. biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
  74. biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
  75. biopipen/scripts/cnv/AneuploidyScore.R +55 -20
  76. biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
  77. biopipen/scripts/cnv/TMADScore.R +25 -9
  78. biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
  79. biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
  80. biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
  81. biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
  82. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  83. biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
  84. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  85. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  86. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
  87. biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
  88. biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
  89. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  90. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  91. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  92. biopipen/scripts/delim/RowsBinder.R +1 -1
  93. biopipen/scripts/delim/SampleInfo.R +116 -118
  94. biopipen/scripts/gene/GeneNameConversion.R +67 -0
  95. biopipen/scripts/gene/GenePromoters.R +61 -0
  96. biopipen/scripts/gsea/Enrichr.R +5 -5
  97. biopipen/scripts/gsea/FGSEA.R +184 -50
  98. biopipen/scripts/gsea/GSEA.R +2 -2
  99. biopipen/scripts/gsea/PreRank.R +5 -5
  100. biopipen/scripts/misc/Config2File.py +2 -2
  101. biopipen/scripts/misc/Plot.R +80 -0
  102. biopipen/scripts/misc/Shell.sh +15 -0
  103. biopipen/scripts/misc/Str2File.py +2 -2
  104. biopipen/scripts/plot/Heatmap.R +3 -3
  105. biopipen/scripts/plot/Manhattan.R +147 -0
  106. biopipen/scripts/plot/QQPlot.R +146 -0
  107. biopipen/scripts/plot/ROC.R +88 -0
  108. biopipen/scripts/plot/Scatter.R +112 -0
  109. biopipen/scripts/plot/VennDiagram.R +5 -9
  110. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  111. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  112. biopipen/scripts/protein/Prodigy.py +119 -0
  113. biopipen/scripts/protein/ProdigySummary.R +140 -0
  114. biopipen/scripts/protein/RMSD.py +178 -0
  115. biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
  116. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
  117. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
  118. biopipen/scripts/regulatory/MotifScan.py +159 -0
  119. biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
  120. biopipen/scripts/regulatory/motifs-common.R +324 -0
  121. biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
  122. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
  123. biopipen/scripts/rnaseq/Simulation.R +21 -0
  124. biopipen/scripts/rnaseq/UnitConversion.R +325 -54
  125. biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
  126. biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
  127. biopipen/scripts/scrna/CellCellCommunication.py +150 -0
  128. biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
  129. biopipen/scripts/scrna/CellSNPLite.py +30 -0
  130. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
  131. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
  132. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
  133. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
  134. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
  135. biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
  136. biopipen/scripts/scrna/CellsDistribution.R +456 -167
  137. biopipen/scripts/scrna/DimPlots.R +1 -1
  138. biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
  139. biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
  140. biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
  141. biopipen/scripts/scrna/ExprImputation.R +7 -0
  142. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  143. biopipen/scripts/scrna/MQuad.py +25 -0
  144. biopipen/scripts/scrna/MarkersFinder.R +679 -400
  145. biopipen/scripts/scrna/MetaMarkers.R +265 -161
  146. biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
  147. biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
  148. biopipen/scripts/scrna/RadarPlots.R +355 -134
  149. biopipen/scripts/scrna/ScFGSEA.R +298 -100
  150. biopipen/scripts/scrna/ScSimulation.R +65 -0
  151. biopipen/scripts/scrna/ScVelo.py +617 -0
  152. biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
  153. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
  154. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
  155. biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
  156. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
  157. biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
  158. biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
  159. biopipen/scripts/scrna/SeuratClustering.R +36 -233
  160. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  161. biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
  162. biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
  163. biopipen/scripts/scrna/SeuratPreparing.R +223 -173
  164. biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
  165. biopipen/scripts/scrna/SeuratTo10X.R +27 -0
  166. biopipen/scripts/scrna/Slingshot.R +65 -0
  167. biopipen/scripts/scrna/Subset10X.R +2 -2
  168. biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
  169. biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
  170. biopipen/scripts/scrna/scvelo_paga.py +313 -0
  171. biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
  172. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
  173. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
  174. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
  175. biopipen/scripts/snp/MatrixEQTL.R +217 -0
  176. biopipen/scripts/snp/Plink2GTMat.py +148 -0
  177. biopipen/scripts/snp/PlinkCallRate.R +199 -0
  178. biopipen/scripts/snp/PlinkFilter.py +100 -0
  179. biopipen/scripts/snp/PlinkFreq.R +291 -0
  180. biopipen/scripts/snp/PlinkFromVcf.py +81 -0
  181. biopipen/scripts/snp/PlinkHWE.R +85 -0
  182. biopipen/scripts/snp/PlinkHet.R +96 -0
  183. biopipen/scripts/snp/PlinkIBD.R +196 -0
  184. biopipen/scripts/snp/PlinkSimulation.py +124 -0
  185. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  186. biopipen/scripts/stats/ChowTest.R +146 -0
  187. biopipen/scripts/stats/DiffCoexpr.R +152 -0
  188. biopipen/scripts/stats/LiquidAssoc.R +135 -0
  189. biopipen/scripts/stats/Mediation.R +108 -0
  190. biopipen/scripts/stats/MetaPvalue.R +130 -0
  191. biopipen/scripts/stats/MetaPvalue1.R +74 -0
  192. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  193. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  194. biopipen/scripts/tcr/Attach2Seurat.R +3 -2
  195. biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
  196. biopipen/scripts/tcr/CDR3Clustering.R +343 -0
  197. biopipen/scripts/tcr/ClonalStats.R +526 -0
  198. biopipen/scripts/tcr/CloneResidency.R +255 -131
  199. biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
  200. biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
  201. biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
  202. biopipen/scripts/tcr/GIANA/query.py +164 -162
  203. biopipen/scripts/tcr/Immunarch-basic.R +31 -9
  204. biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
  205. biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
  206. biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
  207. biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
  208. biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
  209. biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
  210. biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
  211. biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
  212. biopipen/scripts/tcr/Immunarch.R +63 -11
  213. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  214. biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
  215. biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
  216. biopipen/scripts/tcr/SampleDiversity.R +1 -1
  217. biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
  218. biopipen/scripts/tcr/ScRepLoading.R +166 -0
  219. biopipen/scripts/tcr/TCRClusterStats.R +176 -22
  220. biopipen/scripts/tcr/TCRDock.py +110 -0
  221. biopipen/scripts/tcr/TESSA.R +102 -118
  222. biopipen/scripts/tcr/VJUsage.R +5 -5
  223. biopipen/scripts/tcr/immunarch-patched.R +142 -0
  224. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  225. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  226. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  227. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  228. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  229. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  230. biopipen/scripts/vcf/TruvariBench.sh +14 -7
  231. biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
  232. biopipen/scripts/vcf/TruvariConsistency.R +1 -1
  233. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  234. biopipen/scripts/vcf/VcfAnno.py +11 -11
  235. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  236. biopipen/scripts/vcf/VcfFilter.py +5 -5
  237. biopipen/scripts/vcf/VcfFix.py +7 -7
  238. biopipen/scripts/vcf/VcfFix_utils.py +13 -4
  239. biopipen/scripts/vcf/VcfIndex.py +3 -3
  240. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  241. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  242. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  243. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  244. biopipen/scripts/web/Download.py +8 -4
  245. biopipen/scripts/web/DownloadList.py +5 -5
  246. biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
  247. biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
  248. biopipen/scripts/web/gcloud_common.py +49 -0
  249. biopipen/utils/gene.py +108 -60
  250. biopipen/utils/misc.py +146 -20
  251. biopipen/utils/reference.py +64 -20
  252. biopipen/utils/reporter.py +177 -0
  253. biopipen/utils/vcf.py +1 -1
  254. biopipen-0.34.26.dist-info/METADATA +27 -0
  255. biopipen-0.34.26.dist-info/RECORD +292 -0
  256. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
  257. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
  258. biopipen/ns/bcftools.py +0 -111
  259. biopipen/ns/scrna_basic.py +0 -255
  260. biopipen/reports/delim/SampleInfo.svelte +0 -36
  261. biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
  262. biopipen/reports/scrna/ScFGSEA.svelte +0 -35
  263. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
  264. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
  265. biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
  266. biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
  267. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
  268. biopipen/reports/utils/gsea.liq +0 -110
  269. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  270. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  271. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  272. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  273. biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
  274. biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
  275. biopipen/scripts/scrna/ExprImpution.R +0 -7
  276. biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
  277. biopipen/scripts/scrna/Write10X.R +0 -11
  278. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
  279. biopipen/scripts/tcr/TCRClustering.R +0 -280
  280. biopipen/utils/common_docstrs.py +0 -61
  281. biopipen/utils/gene.R +0 -49
  282. biopipen/utils/gsea.R +0 -193
  283. biopipen/utils/io.R +0 -20
  284. biopipen/utils/misc.R +0 -114
  285. biopipen/utils/mutate_helpers.R +0 -433
  286. biopipen/utils/plot.R +0 -173
  287. biopipen/utils/rnaseq.R +0 -48
  288. biopipen/utils/single_cell.R +0 -115
  289. biopipen-0.21.0.dist-info/METADATA +0 -22
  290. biopipen-0.21.0.dist-info/RECORD +0 -218
@@ -24,7 +24,6 @@
24
24
  import sys, os, re, resource
25
25
  from os import path
26
26
  import numpy as np
27
- from Bio.SubsMat.MatrixInfo import blosum62
28
27
  import time
29
28
  from time import gmtime, strftime
30
29
  from operator import itemgetter
@@ -36,255 +35,585 @@ from sklearn.decomposition import PCA
36
35
  from sklearn.manifold import MDS
37
36
  import faiss
38
37
  from query import *
38
+ try:
39
+ from Bio.Align import substitution_matrices
40
+ blosum62 = substitution_matrices.load("BLOSUM62")
41
+ _tmp = {}
42
+ for ab1 in blosum62.alphabet:
43
+ for ab2 in blosum62.alphabet:
44
+ _tmp[(ab1, ab2)] = int(blosum62[(ab1, ab2)])
45
+ blosum62 = _tmp
46
+ except ModuleNotFoundError:
47
+ from Bio.SubsMat.MatrixInfo import blosum62
39
48
 
40
- AAstring='ACDEFGHIKLMNPQRSTVWY'
41
- AAstringList=list(AAstring)
42
- cur_dir=os.path.dirname(os.path.realpath(__file__))+'/'
49
+ AAstring = "ACDEFGHIKLMNPQRSTVWY"
50
+ AAstringList = list(AAstring)
51
+ cur_dir = os.path.dirname(os.path.realpath(__file__)) + "/"
43
52
 
44
- blosum62n={}
53
+ blosum62n = {}
45
54
  for kk in blosum62:
46
- a1=kk[0]
47
- a2=kk[1]
48
- vv=blosum62[kk]
49
- if vv>4:
50
- vv=4
51
- blosum62n[(a1,a2)]=vv
55
+ a1 = kk[0]
56
+ a2 = kk[1]
57
+ vv = blosum62[kk]
58
+ if vv > 4:
59
+ vv = 4
60
+ blosum62n[(a1, a2)] = vv
52
61
  if a1 != a2:
53
- blosum62n[(a2,a1)]=vv
54
-
55
- bl62={'A':[4,-1,-2,-2,0,-1,-1,0,-2,-1,-1,-1,-1,-2,-1,1,0,-3,-2,0],
56
- 'R':[-1,4,0,-2,-3,1,0,-2,0,-3,-2,2,-1,-3,-2,-1,-1,-3,-2,-3],
57
- 'N':[-2,0,4,1,-3,0,0,0,1,-3,-3,0,-2,-3,-2,1,0,-4,-2,-3],
58
- 'D':[-2,-2,1,4,-3,0,2,-1,-1,-3,-4,-1,-3,-3,-1,0,-1,-4,-3,-3],
59
- 'C':[0,-3,-3,-3,4,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1],
60
- 'Q':[-1,1,0,0,-3,4,2,-2,0,-3,-2,1,0,-3,-1,0,-1,-2,-1,-2],
61
- 'E':[-1,0,0,2,-4,2,4,-2,0,-3,-3,1,-2,-3,-1,0,-1,-3,-2,-2],
62
- 'G':[0,-2,0,-1,-3,-2,-2,4,-2,-4,-4,-2,-3,-3,-2,0,-2,-2,-3,-3],
63
- 'H':[-2,0,1,-1,-3,0,0,-2,4,-3,-3,-1,-2,-1,-2,-1,-2,-2,2,-3],
64
- 'I':[-1,-3,-3,-3,-1,-3,-3,-4,-3,4,2,-3,1,0,-3,-2,-1,-3,-1,3],
65
- 'L':[-1,-2,-3,-4,-1,-2,-3,-4,-3,2,4,-2,2,0,-3,-2,-1,-2,-1,1],
66
- 'K':[-1,2,0,-1,-3,1,1,-2,-1,-3,-2,4,-1,-3,-1,0,-1,-3,-2,-2],
67
- 'M':[-1,-1,-2,-3,-1,0,-2,-3,-2,1,2,-1,4,0,-2,-1,-1,-1,-1,1],
68
- 'F':[-2,-3,-3,-3,-2,-3,-3,-3,-1,0,0,-3,0,4,-4,-2,-2,1,3,-1],
69
- 'P':[-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4,4,-1,-1,-4,-3,-2],
70
- 'S':[1,-1,1,0,-1,0,0,0,-1,-2,-2,0,-1,-2,-1,4,1,-3,-2,-2],
71
- 'T':[0,-1,0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1,1,4,-2,-2,0],
72
- 'W':[-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1,1,-4,-3,-2,4,2,-3],
73
- 'Y':[-2,-2,-2,-3,-2,-1,-2,-3,2,-1,-1,-2,-1,3,-3,-2,-2,2,4,-1],
74
- 'V':[0,-3,-3,-3,-1,-2,-2,-3,-3,3,1,-2,1,-1,-2,-2,0,-3,-1,4]}
75
-
76
- bl62c=np.array([np.array(x) for x in list(bl62.values())])
77
- bl62c=4-bl62c
78
-
79
- embedding=MDS(n_components=13, n_init=100, max_iter=1000, eps=0.00001, dissimilarity='precomputed')
80
- X=embedding.fit_transform(bl62c)
81
-
82
- bl62np={}
83
- vkk=list(bl62.keys())
62
+ blosum62n[(a2, a1)] = vv
63
+
64
+ bl62 = {
65
+ "A": [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],
66
+ "R": [-1, 4, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],
67
+ "N": [-2, 0, 4, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],
68
+ "D": [-2, -2, 1, 4, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],
69
+ "C": [0, -3, -3, -3, 4, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
70
+ "Q": [-1, 1, 0, 0, -3, 4, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],
71
+ "E": [-1, 0, 0, 2, -4, 2, 4, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],
72
+ "G": [0, -2, 0, -1, -3, -2, -2, 4, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],
73
+ "H": [-2, 0, 1, -1, -3, 0, 0, -2, 4, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],
74
+ "I": [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],
75
+ "L": [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],
76
+ "K": [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 4, -1, -3, -1, 0, -1, -3, -2, -2],
77
+ "M": [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 4, 0, -2, -1, -1, -1, -1, 1],
78
+ "F": [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 4, -4, -2, -2, 1, 3, -1],
79
+ "P": [
80
+ -1,
81
+ -2,
82
+ -2,
83
+ -1,
84
+ -3,
85
+ -1,
86
+ -1,
87
+ -2,
88
+ -2,
89
+ -3,
90
+ -3,
91
+ -1,
92
+ -2,
93
+ -4,
94
+ 4,
95
+ -1,
96
+ -1,
97
+ -4,
98
+ -3,
99
+ -2,
100
+ ],
101
+ "S": [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],
102
+ "T": [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 4, -2, -2, 0],
103
+ "W": [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 4, 2, -3],
104
+ "Y": [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 4, -1],
105
+ "V": [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],
106
+ }
107
+
108
+ bl62c = np.array([np.array(x) for x in list(bl62.values())])
109
+ bl62c = 4 - bl62c
110
+
111
+ embedding = MDS(
112
+ n_components=13, n_init=100, max_iter=1000, eps=0.00001, dissimilarity="precomputed"
113
+ )
114
+ X = embedding.fit_transform(bl62c)
115
+
116
+ bl62np = {}
117
+ vkk = list(bl62.keys())
84
118
  for ii in range(20):
85
- kk=vkk[ii]
86
- bl62np[kk]=np.array(list(X[ii,])+[0]*17)
119
+ kk = vkk[ii]
120
+ bl62np[kk] = np.array(list(X[ii,]) + [0] * 17)
87
121
 
88
-
89
- AAencodingDict={}
122
+
123
+ AAencodingDict = {}
90
124
  for ii in range(len(AAstringList)):
91
- aa=AAstringList[ii]
92
- CODE=[0]*(ii)+[1]+[0]*(20-ii)
93
- AAencodingDict[aa]=np.array(CODE)
94
-
95
- Ndim=16 ## optimized for isometric embedding
96
- n0=Ndim*6
97
- #M0=np.concatenate((np.concatenate((ZERO,M1),axis=1),np.concatenate((M1, ZERO),axis=1)))
98
- ZERO=np.zeros((Ndim,Ndim))
99
- II=np.eye(Ndim)
100
- M0=np.concatenate((np.concatenate((ZERO,ZERO, II),axis=1),np.concatenate((II, ZERO, ZERO),axis=1),np.concatenate((ZERO,II, ZERO),axis=1)))
125
+ aa = AAstringList[ii]
126
+ CODE = [0] * (ii) + [1] + [0] * (20 - ii)
127
+ AAencodingDict[aa] = np.array(CODE)
128
+
129
+ Ndim = 16 ## optimized for isometric embedding
130
+ n0 = Ndim * 6
131
+ # M0=np.concatenate((np.concatenate((ZERO,M1),axis=1),np.concatenate((M1, ZERO),axis=1)))
132
+ ZERO = np.zeros((Ndim, Ndim))
133
+ II = np.eye(Ndim)
134
+ M0 = np.concatenate(
135
+ (
136
+ np.concatenate((ZERO, ZERO, II), axis=1),
137
+ np.concatenate((II, ZERO, ZERO), axis=1),
138
+ np.concatenate((ZERO, II, ZERO), axis=1),
139
+ )
140
+ )
101
141
  ## Construct 6-th order cyclic group
102
- ZERO45=np.zeros((Ndim*3,Ndim*3))
103
- M6=np.concatenate((np.concatenate((ZERO45,M0),axis=1),np.concatenate((M0, ZERO45),axis=1)))
104
-
105
- X=np.array([[-0.31230882, -0.53572156, -0.01949946, -0.12211268, -0.70947917,
106
- -0.42211092, 0.02783931, 0.02637933, -0.41760305, 0.21809875,
107
- 0.53532768, 0.04833016, 0.07877711, 0.50464914, -0.26972087,
108
- -0.52416842],
109
- [ 0.29672002, 0.29005364, 0.18176298, -0.05103382, -0.34686519,
110
- 0.58024228, -0.49282931, 0.62304281, -0.09575202, 0.30115555,
111
- 0.09913529, 0.1577466 , -0.94391939, -0.10505925, 0.05482389,
112
- 0.38409897],
113
- [-0.42212537, 0.12225749, 0.16279646, 0.60099009, 0.19734216,
114
- 0.42819919, -0.33562418, 0.17036334, 0.4234109 , 0.46681561,
115
- -0.50347222, -0.37936876, 0.1494825 , 0.32176759, 0.28584684,
116
- 0.68469861],
117
- [ 0.18599294, -0.44017825, -0.4476952 , 0.34340976, 0.44603553,
118
- 0.40974629, -0.60045935, -0.09056728, 0.22147919, -0.33029418,
119
- 0.55635594, -0.54149972, 0.05459062, 0.57334159, -0.06227118,
120
- 0.65299872],
121
- [-0.19010428, 0.64418792, -0.85286762, 0.21380295, 0.37639516,
122
- -0.67753593, 0.38751609, 0.55746524, 0.01443766, 0.1776535 ,
123
- 0.62853954, -0.15048523, 0.55100206, -0.21426656, 0.3644061 ,
124
- -0.0018255 ],
125
- [ 0.7350723 , 0.10111267, 0.55640019, -0.18226966, 0.51658102,
126
- -0.19321508, -0.46599027, -0.02989911, 0.4036196 , -0.11978213,
127
- -0.29837524, -0.30232765, -0.36738065, -0.1379793 , 0.04362871,
128
- 0.33553714],
129
- [ 0.41134047, 0.13512443, 0.62492322, -0.10120261, -0.03093491,
130
- 0.23751917, -0.68338694, 0.05124762, 0.41533821, 0.46669353,
131
- 0.31467277, -0.02427587, 0.15361135, 0.70595112, -0.27952632,
132
- 0.32408931],
133
- [-0.33041265, -0.43860065, -0.5509376 , -0.04380843, -0.35160935,
134
- 0.25134855, 0.53409314, 0.54850824, 0.59490287, 0.32669345,
135
- -0.45355268, -0.56317041, -0.55416297, 0.18117841, -0.71600849,
136
- -0.08989825],
137
- [-0.40366849, 0.10978974, 0.0280101 , -0.46667987, -0.45607028,
138
- 0.54114052, -0.77552923, -0.10720425, 0.55252091, -0.34397153,
139
- -0.59813694, 0.15567728, 0.03071009, -0.02176143, 0.34442719,
140
- 0.14681541],
141
- [ 0.19280422, 0.35777863, 0.06139255, 0.20081699, -0.30546596,
142
- -0.56901549, -0.15290953, -0.31181573, -0.74523217, 0.22296016,
143
- -0.39143832, -0.16474685, 0.58064427, -0.77386654, 0.19713107,
144
- -0.49477418],
145
- [-0.16133903, 0.22112761, -0.53162136, 0.34764073, -0.08522381,
146
- -0.2510216 , 0.04699411, -0.25702389, -0.8739765 , -0.24171728,
147
- -0.24370533, 0.42193635, 0.41056913, -0.60378211, -0.65756832,
148
- 0.0845203 ],
149
- [-0.34792144, 0.18450939, 0.77038332, 0.63868511, -0.06221681,
150
- 0.11930421, 0.04895523, -0.22463059, -0.03268844, -0.58941354,
151
- 0.11640045, 0.32384901, -0.42952779, 0.58119471, 0.07288662,
152
- 0.26669673],
153
- [ 0.01834555, -0.16367754, 0.34900298, 0.45087949, 0.47073855,
154
- -0.37377404, 0.0606911 , 0.2455703 , -0.55182937, -0.20261009,
155
- 0.28325423, -0.04741146, 0.30565238, -0.62090653, 0.17528413,
156
- -0.60434975],
157
- [-0.55464981, 0.50918784, -0.21371646, -0.63996967, -0.37656862,
158
- 0.27852662, 0.3287838 , -0.56800869, 0.23260763, -0.20653106,
159
- 0.63261439, -0.22666691, 0.00726302, -0.60125196, 0.07139961,
160
- -0.35086639],
161
- [ 0.94039731, -0.25999326, 0.43922549, -0.485738 , -0.20492235,
162
- -0.26005626, 0.68776626, 0.57826888, -0.05973995, -0.1193658 ,
163
- -0.12102433, -0.22091354, 0.43427913, 0.71447886, 0.32745991,
164
- 0.03466398],
165
- [-0.13194625, -0.12262688, 0.18029209, 0.16555524, 0.39594125,
166
- -0.58110665, 0.16161717, 0.0839783 , 0.0911945 , 0.34546976,
167
- -0.29415349, 0.29891936, -0.60834721, 0.5943593 , -0.29473819,
168
- 0.4864154 ],
169
- [ 0.40850093, -0.4638894 , -0.39732987, -0.01972861, 0.51189582,
170
- 0.10176704, 0.37528519, -0.41479418, -0.1932531 , 0.54732221,
171
- -0.11876511, 0.32843973, -0.259283 , 0.59500132, 0.35168375,
172
- -0.21733727],
173
- [-0.50627723, -0.1973602 , -0.02339884, -0.66846048, 0.62696606,
174
- 0.60049717, 0.69143364, -0.48053591, 0.17812208, -0.58481821,
175
- -0.23551415, -0.06229112, 0.20993116, -0.72485884, 0.34375662,
176
- -0.23539168],
177
- [-0.51388312, -0.2788953 , 0.00859533, -0.5247195 , -0.18021544,
178
- 0.28372911, 0.10791359, 0.13033494, 0.34294013, -0.70310089,
179
- -0.13245433, 0.48661081, 0.08451644, -0.69990992, 0.0408274 ,
180
- -0.47204888],
181
- [ 0.68546275, 0.22581365, -0.32571833, 0.34394298, -0.43232367,
182
- -0.5041842 , 0.04784017, -0.53067936, -0.50049908, 0.36874221,
183
- 0.22429186, 0.4616482 , 0.11159174, -0.26827959, -0.39372848,
184
- -0.40987423]])
185
-
186
- bl62np={}
187
- vkk=list(bl62.keys())
142
+ ZERO45 = np.zeros((Ndim * 3, Ndim * 3))
143
+ M6 = np.concatenate(
144
+ (np.concatenate((ZERO45, M0), axis=1), np.concatenate((M0, ZERO45), axis=1))
145
+ )
146
+
147
+ X = np.array(
148
+ [
149
+ [
150
+ -0.31230882,
151
+ -0.53572156,
152
+ -0.01949946,
153
+ -0.12211268,
154
+ -0.70947917,
155
+ -0.42211092,
156
+ 0.02783931,
157
+ 0.02637933,
158
+ -0.41760305,
159
+ 0.21809875,
160
+ 0.53532768,
161
+ 0.04833016,
162
+ 0.07877711,
163
+ 0.50464914,
164
+ -0.26972087,
165
+ -0.52416842,
166
+ ],
167
+ [
168
+ 0.29672002,
169
+ 0.29005364,
170
+ 0.18176298,
171
+ -0.05103382,
172
+ -0.34686519,
173
+ 0.58024228,
174
+ -0.49282931,
175
+ 0.62304281,
176
+ -0.09575202,
177
+ 0.30115555,
178
+ 0.09913529,
179
+ 0.1577466,
180
+ -0.94391939,
181
+ -0.10505925,
182
+ 0.05482389,
183
+ 0.38409897,
184
+ ],
185
+ [
186
+ -0.42212537,
187
+ 0.12225749,
188
+ 0.16279646,
189
+ 0.60099009,
190
+ 0.19734216,
191
+ 0.42819919,
192
+ -0.33562418,
193
+ 0.17036334,
194
+ 0.4234109,
195
+ 0.46681561,
196
+ -0.50347222,
197
+ -0.37936876,
198
+ 0.1494825,
199
+ 0.32176759,
200
+ 0.28584684,
201
+ 0.68469861,
202
+ ],
203
+ [
204
+ 0.18599294,
205
+ -0.44017825,
206
+ -0.4476952,
207
+ 0.34340976,
208
+ 0.44603553,
209
+ 0.40974629,
210
+ -0.60045935,
211
+ -0.09056728,
212
+ 0.22147919,
213
+ -0.33029418,
214
+ 0.55635594,
215
+ -0.54149972,
216
+ 0.05459062,
217
+ 0.57334159,
218
+ -0.06227118,
219
+ 0.65299872,
220
+ ],
221
+ [
222
+ -0.19010428,
223
+ 0.64418792,
224
+ -0.85286762,
225
+ 0.21380295,
226
+ 0.37639516,
227
+ -0.67753593,
228
+ 0.38751609,
229
+ 0.55746524,
230
+ 0.01443766,
231
+ 0.1776535,
232
+ 0.62853954,
233
+ -0.15048523,
234
+ 0.55100206,
235
+ -0.21426656,
236
+ 0.3644061,
237
+ -0.0018255,
238
+ ],
239
+ [
240
+ 0.7350723,
241
+ 0.10111267,
242
+ 0.55640019,
243
+ -0.18226966,
244
+ 0.51658102,
245
+ -0.19321508,
246
+ -0.46599027,
247
+ -0.02989911,
248
+ 0.4036196,
249
+ -0.11978213,
250
+ -0.29837524,
251
+ -0.30232765,
252
+ -0.36738065,
253
+ -0.1379793,
254
+ 0.04362871,
255
+ 0.33553714,
256
+ ],
257
+ [
258
+ 0.41134047,
259
+ 0.13512443,
260
+ 0.62492322,
261
+ -0.10120261,
262
+ -0.03093491,
263
+ 0.23751917,
264
+ -0.68338694,
265
+ 0.05124762,
266
+ 0.41533821,
267
+ 0.46669353,
268
+ 0.31467277,
269
+ -0.02427587,
270
+ 0.15361135,
271
+ 0.70595112,
272
+ -0.27952632,
273
+ 0.32408931,
274
+ ],
275
+ [
276
+ -0.33041265,
277
+ -0.43860065,
278
+ -0.5509376,
279
+ -0.04380843,
280
+ -0.35160935,
281
+ 0.25134855,
282
+ 0.53409314,
283
+ 0.54850824,
284
+ 0.59490287,
285
+ 0.32669345,
286
+ -0.45355268,
287
+ -0.56317041,
288
+ -0.55416297,
289
+ 0.18117841,
290
+ -0.71600849,
291
+ -0.08989825,
292
+ ],
293
+ [
294
+ -0.40366849,
295
+ 0.10978974,
296
+ 0.0280101,
297
+ -0.46667987,
298
+ -0.45607028,
299
+ 0.54114052,
300
+ -0.77552923,
301
+ -0.10720425,
302
+ 0.55252091,
303
+ -0.34397153,
304
+ -0.59813694,
305
+ 0.15567728,
306
+ 0.03071009,
307
+ -0.02176143,
308
+ 0.34442719,
309
+ 0.14681541,
310
+ ],
311
+ [
312
+ 0.19280422,
313
+ 0.35777863,
314
+ 0.06139255,
315
+ 0.20081699,
316
+ -0.30546596,
317
+ -0.56901549,
318
+ -0.15290953,
319
+ -0.31181573,
320
+ -0.74523217,
321
+ 0.22296016,
322
+ -0.39143832,
323
+ -0.16474685,
324
+ 0.58064427,
325
+ -0.77386654,
326
+ 0.19713107,
327
+ -0.49477418,
328
+ ],
329
+ [
330
+ -0.16133903,
331
+ 0.22112761,
332
+ -0.53162136,
333
+ 0.34764073,
334
+ -0.08522381,
335
+ -0.2510216,
336
+ 0.04699411,
337
+ -0.25702389,
338
+ -0.8739765,
339
+ -0.24171728,
340
+ -0.24370533,
341
+ 0.42193635,
342
+ 0.41056913,
343
+ -0.60378211,
344
+ -0.65756832,
345
+ 0.0845203,
346
+ ],
347
+ [
348
+ -0.34792144,
349
+ 0.18450939,
350
+ 0.77038332,
351
+ 0.63868511,
352
+ -0.06221681,
353
+ 0.11930421,
354
+ 0.04895523,
355
+ -0.22463059,
356
+ -0.03268844,
357
+ -0.58941354,
358
+ 0.11640045,
359
+ 0.32384901,
360
+ -0.42952779,
361
+ 0.58119471,
362
+ 0.07288662,
363
+ 0.26669673,
364
+ ],
365
+ [
366
+ 0.01834555,
367
+ -0.16367754,
368
+ 0.34900298,
369
+ 0.45087949,
370
+ 0.47073855,
371
+ -0.37377404,
372
+ 0.0606911,
373
+ 0.2455703,
374
+ -0.55182937,
375
+ -0.20261009,
376
+ 0.28325423,
377
+ -0.04741146,
378
+ 0.30565238,
379
+ -0.62090653,
380
+ 0.17528413,
381
+ -0.60434975,
382
+ ],
383
+ [
384
+ -0.55464981,
385
+ 0.50918784,
386
+ -0.21371646,
387
+ -0.63996967,
388
+ -0.37656862,
389
+ 0.27852662,
390
+ 0.3287838,
391
+ -0.56800869,
392
+ 0.23260763,
393
+ -0.20653106,
394
+ 0.63261439,
395
+ -0.22666691,
396
+ 0.00726302,
397
+ -0.60125196,
398
+ 0.07139961,
399
+ -0.35086639,
400
+ ],
401
+ [
402
+ 0.94039731,
403
+ -0.25999326,
404
+ 0.43922549,
405
+ -0.485738,
406
+ -0.20492235,
407
+ -0.26005626,
408
+ 0.68776626,
409
+ 0.57826888,
410
+ -0.05973995,
411
+ -0.1193658,
412
+ -0.12102433,
413
+ -0.22091354,
414
+ 0.43427913,
415
+ 0.71447886,
416
+ 0.32745991,
417
+ 0.03466398,
418
+ ],
419
+ [
420
+ -0.13194625,
421
+ -0.12262688,
422
+ 0.18029209,
423
+ 0.16555524,
424
+ 0.39594125,
425
+ -0.58110665,
426
+ 0.16161717,
427
+ 0.0839783,
428
+ 0.0911945,
429
+ 0.34546976,
430
+ -0.29415349,
431
+ 0.29891936,
432
+ -0.60834721,
433
+ 0.5943593,
434
+ -0.29473819,
435
+ 0.4864154,
436
+ ],
437
+ [
438
+ 0.40850093,
439
+ -0.4638894,
440
+ -0.39732987,
441
+ -0.01972861,
442
+ 0.51189582,
443
+ 0.10176704,
444
+ 0.37528519,
445
+ -0.41479418,
446
+ -0.1932531,
447
+ 0.54732221,
448
+ -0.11876511,
449
+ 0.32843973,
450
+ -0.259283,
451
+ 0.59500132,
452
+ 0.35168375,
453
+ -0.21733727,
454
+ ],
455
+ [
456
+ -0.50627723,
457
+ -0.1973602,
458
+ -0.02339884,
459
+ -0.66846048,
460
+ 0.62696606,
461
+ 0.60049717,
462
+ 0.69143364,
463
+ -0.48053591,
464
+ 0.17812208,
465
+ -0.58481821,
466
+ -0.23551415,
467
+ -0.06229112,
468
+ 0.20993116,
469
+ -0.72485884,
470
+ 0.34375662,
471
+ -0.23539168,
472
+ ],
473
+ [
474
+ -0.51388312,
475
+ -0.2788953,
476
+ 0.00859533,
477
+ -0.5247195,
478
+ -0.18021544,
479
+ 0.28372911,
480
+ 0.10791359,
481
+ 0.13033494,
482
+ 0.34294013,
483
+ -0.70310089,
484
+ -0.13245433,
485
+ 0.48661081,
486
+ 0.08451644,
487
+ -0.69990992,
488
+ 0.0408274,
489
+ -0.47204888,
490
+ ],
491
+ [
492
+ 0.68546275,
493
+ 0.22581365,
494
+ -0.32571833,
495
+ 0.34394298,
496
+ -0.43232367,
497
+ -0.5041842,
498
+ 0.04784017,
499
+ -0.53067936,
500
+ -0.50049908,
501
+ 0.36874221,
502
+ 0.22429186,
503
+ 0.4616482,
504
+ 0.11159174,
505
+ -0.26827959,
506
+ -0.39372848,
507
+ -0.40987423,
508
+ ],
509
+ ]
510
+ )
511
+
512
+ bl62np = {}
513
+ vkk = list(bl62.keys())
188
514
  for ii in range(20):
189
- kk=vkk[ii]
190
- bl62np[kk]=np.array(list(X[ii,])+[0]*Ndim*5)
515
+ kk = vkk[ii]
516
+ bl62np[kk] = np.array(list(X[ii,]) + [0] * Ndim * 5)
517
+
191
518
 
192
519
  def EncodingCDR3(s, M, n0):
193
- sL=list(s)
194
- x=np.array([0]*n0)
520
+ sL = list(s)
521
+ x = np.array([0] * n0)
195
522
  for ii in range(len(sL)):
196
- x = np.dot(M, (x+bl62np[sL[ii]]))
523
+ x = np.dot(M, (x + bl62np[sL[ii]]))
197
524
  return x
198
525
 
526
+
199
527
  def BuildLengthDict(seqs, sIDs, vGene=[], INFO=[]):
200
- LLs=[10,11,12,13,14,15,16,17,18,19,20,21,22,23,24]
201
- LengthD={}
202
- SeqD={}
203
- VgeneD={}
204
- InfoD={}
205
- AAs=set(list(AAencodingDict.keys()))
206
- NAs=len(AAencodingDict)
207
- cNAs=0
528
+ LLs = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
529
+ LengthD = {}
530
+ SeqD = {}
531
+ VgeneD = {}
532
+ InfoD = {}
533
+ AAs = set(list(AAencodingDict.keys()))
534
+ NAs = len(AAencodingDict)
535
+ cNAs = 0
208
536
  for ii in range(len(seqs)):
209
- ID=sIDs[ii]
210
- ss=seqs[ii]
211
- ssAA=set(list(ss))
212
- TMP=list(ssAA | AAs)
537
+ ID = sIDs[ii]
538
+ ss = seqs[ii]
539
+ ssAA = set(list(ss))
540
+ TMP = list(ssAA | AAs)
213
541
  if len(TMP) > NAs:
214
542
  ## CDR3 containing non amino acid letter
215
- #print('Warning: CDR3: '+ss + ' contains non amino acid letter!')
216
- cNAs+=1
543
+ # print('Warning: CDR3: '+ss + ' contains non amino acid letter!')
544
+ cNAs += 1
217
545
  continue
218
- if len(vGene)>0:
219
- vv=vGene[ii]
220
- if len(INFO)>0:
221
- info=INFO[ii]
222
- L=len(ss)
546
+ if len(vGene) > 0:
547
+ vv = vGene[ii]
548
+ if len(INFO) > 0:
549
+ info = INFO[ii]
550
+ L = len(ss)
223
551
  if L not in LLs:
224
552
  continue
225
553
  if L not in LengthD:
226
- LengthD[L]=[ID]
227
- SeqD[L]=[ss]
228
- if len(vGene)>0:
229
- VgeneD[L]=[vv]
230
- if len(INFO)>0:
231
- InfoD[L]=[info]
554
+ LengthD[L] = [ID]
555
+ SeqD[L] = [ss]
556
+ if len(vGene) > 0:
557
+ VgeneD[L] = [vv]
558
+ if len(INFO) > 0:
559
+ InfoD[L] = [info]
232
560
  else:
233
561
  LengthD[L].append(ID)
234
562
  SeqD[L].append(ss)
235
- if len(vGene)>0:
563
+ if len(vGene) > 0:
236
564
  VgeneD[L].append(vv)
237
- if len(INFO)>0:
565
+ if len(INFO) > 0:
238
566
  InfoD[L].append(info)
239
- if cNAs>0:
240
- print("Warning: Skipped %d sequences with non AA letter!" %(cNAs))
567
+ if cNAs > 0:
568
+ print("Warning: Skipped %d sequences with non AA letter!" % (cNAs))
241
569
  return LengthD, VgeneD, InfoD, SeqD
242
570
 
571
+
243
572
  def CollapseUnique(LD, VD, ID, SD):
244
- kks=LD.keys()
245
- LDu={}
246
- VDu={}
247
- IDu={}
248
- SDu={}
573
+ kks = LD.keys()
574
+ LDu = {}
575
+ VDu = {}
576
+ IDu = {}
577
+ SDu = {}
249
578
  for kk in kks:
250
- vvL=list(LD[kk])
251
- if len(VD)>0:
252
- vvV=list(VD[kk])
579
+ vvL = list(LD[kk])
580
+ if len(VD) > 0:
581
+ vvV = list(VD[kk])
253
582
  else:
254
- vvV=['TRBV2-1*01']*len(vvL)
255
- vvI=list(ID[kk])
256
- vvS=list(SD[kk])
257
- zz=zip(vvL, vvS, vvV, vvI)
258
- zzs=sorted(zz, key = lambda x: (x[1], x[2]))
259
- nz=len(zzs)
260
- pointer_pre=0
261
- pointer_cur=1
262
- s_pre=zzs[pointer_pre][1]
263
- v_pre=zzs[pointer_pre][2]
264
- uS=[s_pre]
265
- uV=[v_pre]
266
- uI=[[zzs[pointer_pre][3]]]
583
+ vvV = ["TRBV2-1*01"] * len(vvL)
584
+ vvI = list(ID[kk])
585
+ vvS = list(SD[kk])
586
+ zz = zip(vvL, vvS, vvV, vvI)
587
+ zzs = sorted(zz, key=lambda x: (x[1], x[2]))
588
+ nz = len(zzs)
589
+ pointer_pre = 0
590
+ pointer_cur = 1
591
+ s_pre = zzs[pointer_pre][1]
592
+ v_pre = zzs[pointer_pre][2]
593
+ uS = [s_pre]
594
+ uV = [v_pre]
595
+ uI = [[zzs[pointer_pre][3]]]
267
596
  while pointer_cur < nz:
268
- s_cur=zzs[pointer_cur][1]
269
- v_cur=zzs[pointer_cur][2]
597
+ s_cur = zzs[pointer_cur][1]
598
+ v_cur = zzs[pointer_cur][2]
270
599
  if s_cur == s_pre and v_cur == v_pre:
271
- uI[len(uI)-1].append(zzs[pointer_cur][3])
600
+ uI[len(uI) - 1].append(zzs[pointer_cur][3])
272
601
  pointer_cur += 1
273
602
  continue
274
603
  else:
275
604
  uS.append(s_cur)
276
605
  uV.append(v_cur)
277
606
  uI.append([zzs[pointer_cur][3]])
278
- s_pre=s_cur
279
- v_pre=v_cur
280
- pointer_pre=pointer_cur
607
+ s_pre = s_cur
608
+ v_pre = v_cur
609
+ pointer_pre = pointer_cur
281
610
  pointer_cur += 1
282
- uL=[x for x in range(len(uS))]
283
- LDu[kk]=uL
284
- SDu[kk]=uS
285
- if len(VD)>0:
286
- VDu[kk]=uV
287
- IDu[kk]=uI
611
+ uL = [x for x in range(len(uS))]
612
+ LDu[kk] = uL
613
+ SDu[kk] = uS
614
+ if len(VD) > 0:
615
+ VDu[kk] = uV
616
+ IDu[kk] = uI
288
617
  return LDu, VDu, IDu, SDu
289
618
 
290
619
 
@@ -296,14 +625,15 @@ class CDR3:
296
625
  ## KS: Kmer size
297
626
  ## st: the first 0:(st-1) amino acids will not be included in K-merization
298
627
  ## ed: the last L-ed amino acids will be skipped
299
- self.s=s
300
- self.ID=sID
301
- L=len(s)
302
- self.L=L
303
- sub_s=s[st: (L-ed)]
304
- Ls=len(sub_s)
305
- Kmer=[sub_s[x:(x+KS)] for x in range(0,Ls-KS+1)]
306
- self.Kmer=Kmer
628
+ self.s = s
629
+ self.ID = sID
630
+ L = len(s)
631
+ self.L = L
632
+ sub_s = s[st : (L - ed)]
633
+ Ls = len(sub_s)
634
+ Kmer = [sub_s[x : (x + KS)] for x in range(0, Ls - KS + 1)]
635
+ self.Kmer = Kmer
636
+
307
637
 
308
638
  class KmerSet:
309
639
  ## Kmer set for fast read searching based on mismatch-allowed Kmer index
@@ -312,263 +642,277 @@ class KmerSet:
312
642
  ## Seqs and sIDs must have the same length
313
643
  if len(Seqs) != len(sIDs):
314
644
  raise "Sequence and ID lists have different length. Please check input."
315
- KmerDict={}
316
- N=len(Seqs)
317
- self.N=N
318
- CDR3Dict={}
319
- LLs=[]
320
- for ii in range(0,N):
321
- s=Seqs[ii]
322
- sID=sIDs[ii]
323
- cc=CDR3(s,sID,KS,st,ed)
324
- CDR3Dict[cc.ID]=cc.Kmer
325
- KK=cc.Kmer
645
+ KmerDict = {}
646
+ N = len(Seqs)
647
+ self.N = N
648
+ CDR3Dict = {}
649
+ LLs = []
650
+ for ii in range(0, N):
651
+ s = Seqs[ii]
652
+ sID = sIDs[ii]
653
+ cc = CDR3(s, sID, KS, st, ed)
654
+ CDR3Dict[cc.ID] = cc.Kmer
655
+ KK = cc.Kmer
326
656
  LLs.append(cc.L)
327
657
  for kk in KK:
328
658
  if kk not in KmerDict:
329
- KmerDict[kk]=[sID]
659
+ KmerDict[kk] = [sID]
330
660
  else:
331
661
  KmerDict[kk].append(sID)
332
- self.KD=KmerDict
333
- self.KS=KS
334
- self.CD=CDR3Dict
335
- self.LL=LLs
336
- def FindKmerNeighbor(self,kk):
337
- KS=self.KS
338
- KS_n1=[]
662
+ self.KD = KmerDict
663
+ self.KS = KS
664
+ self.CD = CDR3Dict
665
+ self.LL = LLs
666
+
667
+ def FindKmerNeighbor(self, kk):
668
+ KS = self.KS
669
+ KS_n1 = []
339
670
  for jj in range(KS):
340
- kk_pre=[kk[0:jj]]*20
341
- kk_suf=[kk[(jj+1):KS]]*20
342
- kkn=list(zip(kk_pre,AAstringList,kk_suf))
343
- KS_n1+=[''.join(list(x)) for x in kkn]
671
+ kk_pre = [kk[0:jj]] * 20
672
+ kk_suf = [kk[(jj + 1) : KS]] * 20
673
+ kkn = list(zip(kk_pre, AAstringList, kk_suf))
674
+ KS_n1 += ["".join(list(x)) for x in kkn]
344
675
  return KS_n1
345
- def FindKmerNeighbor2(self,kk):
676
+
677
+ def FindKmerNeighbor2(self, kk):
346
678
  ## KS>=6, allowing 2 mismatches. CDR3 length must be >= 10
347
- KS=self.KS
348
- KS_n1=[]
679
+ KS = self.KS
680
+ KS_n1 = []
349
681
  for jj in range(KS):
350
682
  for ii in range(KS):
351
- if ii<=jj:
683
+ if ii <= jj:
352
684
  continue
353
- kk_pre=[kk[0:jj]]*20
354
- kk_mid=[kk[(jj+1):ii]]*20
355
- kk_suf=[kk[(ii+1):KS]]*400
356
- kkn=list(zip(kk_pre,AAstringList,kk_mid))
357
- kkn=[''.join(list(x)) for x in kkn]
358
- kkn=[[x]*20 for x in kkn]
359
- kkn=list(chain(*kkn))
360
- kkn2=list(zip(kkn, AAstringList*20, kk_suf))
361
- kkn2=[''.join(list(x)) for x in kkn2]
362
- KS_n1+=kkn2
685
+ kk_pre = [kk[0:jj]] * 20
686
+ kk_mid = [kk[(jj + 1) : ii]] * 20
687
+ kk_suf = [kk[(ii + 1) : KS]] * 400
688
+ kkn = list(zip(kk_pre, AAstringList, kk_mid))
689
+ kkn = ["".join(list(x)) for x in kkn]
690
+ kkn = [[x] * 20 for x in kkn]
691
+ kkn = list(chain(*kkn))
692
+ kkn2 = list(zip(kkn, AAstringList * 20, kk_suf))
693
+ kkn2 = ["".join(list(x)) for x in kkn2]
694
+ KS_n1 += kkn2
363
695
  return KS_n1
696
+
364
697
  def KmerIndex(self):
365
698
  ## For each K-mer, find its nearest neighbor with 1 character mismatch
366
- KKs=list(self.KD.keys())
367
- KS=self.KS
368
- KKs_set=set(KKs)
369
- Skk='_'.join(KKs)
370
- KI_Dict={}
699
+ KKs = list(self.KD.keys())
700
+ KS = self.KS
701
+ KKs_set = set(KKs)
702
+ Skk = "_".join(KKs)
703
+ KI_Dict = {}
371
704
  for kk in KKs:
372
- ## kk_neighbor=[]
373
- ## for jj in range(KS):
374
- ## kk_pre=kk[0:jj]
375
- ## kk_suf=kk[(jj+1):KS]
376
- ## pat=kk_pre+'['+AAstring+']{1}'+kk_suf
377
- ## p=re.compile(pat)
378
- ## mm=[m.group() for m in p.finditer(Skk)]
379
- ## kk_neighbor+=mm
380
- KS_n=set(self.FindKmerNeighbor(kk))
705
+ ## kk_neighbor=[]
706
+ ## for jj in range(KS):
707
+ ## kk_pre=kk[0:jj]
708
+ ## kk_suf=kk[(jj+1):KS]
709
+ ## pat=kk_pre+'['+AAstring+']{1}'+kk_suf
710
+ ## p=re.compile(pat)
711
+ ## mm=[m.group() for m in p.finditer(Skk)]
712
+ ## kk_neighbor+=mm
713
+ KS_n = set(self.FindKmerNeighbor(kk))
381
714
  kk_neighbor = KS_n & KKs_set
382
- KI_Dict[kk]=list(kk_neighbor)
715
+ KI_Dict[kk] = list(kk_neighbor)
383
716
  return KI_Dict
717
+
384
718
  def updateKD(self, KI):
385
719
  ## group sequences sharing motifs with 1-2 mismatches
386
- KD=self.KD
387
- KDnew={}
720
+ KD = self.KD
721
+ KDnew = {}
388
722
  for kk in KD:
389
- kkm=KI[kk]
390
- vvL=itemgetter(*kkm)(KD)
391
- if isinstance(vvL[0],list):
392
- vvL=list(chain(*vvL))
393
- KDnew[kk]=vvL
723
+ kkm = KI[kk]
724
+ vvL = itemgetter(*kkm)(KD)
725
+ if isinstance(vvL[0], list):
726
+ vvL = list(chain(*vvL))
727
+ KDnew[kk] = vvL
394
728
  return KDnew
395
729
 
396
- def GenerateMotifGraph(mD,seqs,seqID):
397
- SeqShareGraph={}
398
- mDL={}
730
+
731
+ def GenerateMotifGraph(mD, seqs, seqID):
732
+ SeqShareGraph = {}
733
+ mDL = {}
399
734
  for kk in mD:
400
- vv=mD[kk]
401
- LL=[]
735
+ vv = mD[kk]
736
+ LL = []
402
737
  for v in vv:
403
738
  LL.append(len(seqs[v]))
404
- mDL[kk]=LL
739
+ mDL[kk] = LL
405
740
  for kk in mD:
406
- vv=mD[kk]
407
- LL=mDL[kk]
408
- nv=len(vv)
409
- for ii in range(0,nv):
410
- id_1=vv[ii]
411
- L1=LL[ii]
412
- for jj in range(ii,nv):
413
- if jj==ii:
741
+ vv = mD[kk]
742
+ LL = mDL[kk]
743
+ nv = len(vv)
744
+ for ii in range(0, nv):
745
+ id_1 = vv[ii]
746
+ L1 = LL[ii]
747
+ for jj in range(ii, nv):
748
+ if jj == ii:
414
749
  continue
415
- id_2=vv[jj]
416
- L2=LL[jj]
750
+ id_2 = vv[jj]
751
+ L2 = LL[jj]
417
752
  if L2 != L1:
418
753
  continue
419
754
  if id_1 not in SeqShareGraph:
420
- SeqShareGraph[id_1]=[id_2]
755
+ SeqShareGraph[id_1] = [id_2]
421
756
  elif id_2 not in SeqShareGraph[id_1]:
422
757
  SeqShareGraph[id_1].append(id_2)
423
758
  if id_2 not in SeqShareGraph:
424
- SeqShareGraph[id_2]=[id_1]
759
+ SeqShareGraph[id_2] = [id_1]
425
760
  elif id_1 not in SeqShareGraph[id_2]:
426
761
  SeqShareGraph[id_2].append(id_1)
427
762
  return SeqShareGraph
428
763
 
764
+
429
765
  def generateSSG(Kset, CDR3s, k_thr=2):
430
- KD=Kset.KD
431
- KI=Kset.KmerIndex()
432
- KDnew=Kset.updateKD(KI)
433
- CD=Kset.CD
434
- LL=np.array(Kset.LL)
435
- SSG={}
766
+ KD = Kset.KD
767
+ KI = Kset.KmerIndex()
768
+ KDnew = Kset.updateKD(KI)
769
+ CD = Kset.CD
770
+ LL = np.array(Kset.LL)
771
+ SSG = {}
436
772
  for kk in CD:
437
- vv=itemgetter(*CD[kk])(KDnew)
438
- if isinstance(vv[0],list):
439
- vv=list(chain(*vv))
440
- vv1=[]
441
- c=Counter(vv)
773
+ vv = itemgetter(*CD[kk])(KDnew)
774
+ if isinstance(vv[0], list):
775
+ vv = list(chain(*vv))
776
+ vv1 = []
777
+ c = Counter(vv)
442
778
  for k in c:
443
- if c[k]>=k_thr:
779
+ if c[k] >= k_thr:
444
780
  vv1.append(k)
445
- vv1=np.array(vv1)
446
- if len(vv1)==0:
781
+ vv1 = np.array(vv1)
782
+ if len(vv1) == 0:
447
783
  continue
448
- cdr3=CDR3s[kk]
449
- L0=len(cdr3)
450
- idx=np.where(LL[vv1]==L0)[0]
451
- if len(idx)==0:
784
+ cdr3 = CDR3s[kk]
785
+ L0 = len(cdr3)
786
+ idx = np.where(LL[vv1] == L0)[0]
787
+ if len(idx) == 0:
452
788
  continue
453
- vvs=list(vv1[idx])
789
+ vvs = list(vv1[idx])
454
790
  vvs.remove(kk)
455
- if len(vvs)>0:
456
- SSG[kk]=vvs
791
+ if len(vvs) > 0:
792
+ SSG[kk] = vvs
457
793
  return SSG
458
794
 
459
- def SeqComparison(s1,s2,gap=-6):
460
- n=len(s1)
461
- CorList=[]
462
- score=0
463
- for kk in range(0,n):
464
- aa=s1[kk]
465
- bb=s2[kk]
466
- if aa in ['.','-','*'] or bb in ['.','-','*']:
467
- if aa!=bb:
795
+
796
+ def SeqComparison(s1, s2, gap=-6):
797
+ n = len(s1)
798
+ CorList = []
799
+ score = 0
800
+ for kk in range(0, n):
801
+ aa = s1[kk]
802
+ bb = s2[kk]
803
+ if aa in [".", "-", "*"] or bb in [".", "-", "*"]:
804
+ if aa != bb:
468
805
  score += gap
469
806
  continue
470
- if aa==bb:
471
- # score += min(4,blosum62[(aa,aa)])
472
- score += blosum62n[(aa,aa)]
807
+ if aa == bb:
808
+ # score += min(4,blosum62[(aa,aa)])
809
+ score += blosum62n[(aa, aa)]
473
810
  continue
474
- KEY=(aa,bb)
475
- # if KEY not in blosum62:
476
- # KEY=(bb,aa)
477
- # if KEY not in blosum62:
478
- # raise "Non-standard amino acid coding!"
479
- score+=blosum62n[KEY]
811
+ KEY = (aa, bb)
812
+ # if KEY not in blosum62:
813
+ # KEY=(bb,aa)
814
+ # if KEY not in blosum62:
815
+ # raise "Non-standard amino acid coding!"
816
+ score += blosum62n[KEY]
480
817
  return score
481
818
 
482
- def NHLocalAlignment(Seq1,Seq2,gap_thr=1,gap=-6):
483
- n1=len(Seq1)
484
- n2=len(Seq2)
485
- if n1<n2:
486
- Seq=Seq1
487
- Seq1=Seq2
488
- Seq2=Seq
489
- nn=n2-n1
819
+
820
+ def NHLocalAlignment(Seq1, Seq2, gap_thr=1, gap=-6):
821
+ n1 = len(Seq1)
822
+ n2 = len(Seq2)
823
+ if n1 < n2:
824
+ Seq = Seq1
825
+ Seq1 = Seq2
826
+ Seq2 = Seq
827
+ nn = n2 - n1
490
828
  else:
491
- nn=n1-n2
492
- if nn>gap_thr:
829
+ nn = n1 - n2
830
+ if nn > gap_thr:
493
831
  return -1
494
- SeqList1=[Seq1]
495
- SeqList2=InsertGap(Seq2,nn)
496
- alns=[]
497
- SCOREList=[]
832
+ SeqList1 = [Seq1]
833
+ SeqList2 = InsertGap(Seq2, nn)
834
+ alns = []
835
+ SCOREList = []
498
836
  for s1 in SeqList1:
499
837
  for s2 in SeqList2:
500
- SCOREList.append(SeqComparison(s1,s2,gap))
501
- maxS=max(SCOREList)
838
+ SCOREList.append(SeqComparison(s1, s2, gap))
839
+ maxS = max(SCOREList)
502
840
  return maxS
503
841
 
504
- def InsertGap(Seq,n):
842
+
843
+ def InsertGap(Seq, n):
505
844
  ## Insert n gaps to Seq; n<=2
506
- if n==0:
845
+ if n == 0:
507
846
  return [Seq]
508
- ns=len(Seq)
509
- SeqList=[]
510
- if(n==1):
511
- for kk in range(0,ns+1):
512
- SeqNew=Seq[0:kk]+'-'+Seq[kk:]
847
+ ns = len(Seq)
848
+ SeqList = []
849
+ if n == 1:
850
+ for kk in range(0, ns + 1):
851
+ SeqNew = Seq[0:kk] + "-" + Seq[kk:]
513
852
  SeqList.append(SeqNew)
514
- if(n==2):
515
- for kk in range(0,ns+1):
516
- SeqNew=Seq[0:kk]+'-'+Seq[kk:]
517
- for jj in range(0,ns+2):
518
- SeqNew0=SeqNew[0:jj]+'-'+SeqNew[jj:]
853
+ if n == 2:
854
+ for kk in range(0, ns + 1):
855
+ SeqNew = Seq[0:kk] + "-" + Seq[kk:]
856
+ for jj in range(0, ns + 2):
857
+ SeqNew0 = SeqNew[0:jj] + "-" + SeqNew[jj:]
519
858
  SeqList.append(SeqNew0)
520
859
  return SeqList
521
860
 
522
- def falign(s1, s2, V1, V2 ,st,VScore={}, UseV=True, gapn=1, gap=-6):
523
- mid1=s1[st:-2]
524
- mid2=s2[st:-2]
861
+
862
+ def falign(s1, s2, V1, V2, st, VScore={}, UseV=True, gapn=1, gap=-6):
863
+ mid1 = s1[st:-2]
864
+ mid2 = s2[st:-2]
525
865
  if UseV:
526
- if V2==V1:
527
- V_score=4
866
+ if V2 == V1:
867
+ V_score = 4
528
868
  else:
529
- Vkey=(V1,V2)
869
+ Vkey = (V1, V2)
530
870
  if Vkey not in VScore:
531
- Vkey=(V2,V1)
871
+ Vkey = (V2, V1)
532
872
  if Vkey not in VScore:
533
- #print("V gene not found!")
873
+ # print("V gene not found!")
534
874
  return 0
535
875
  else:
536
- V_score=VScore[Vkey]/20.0
876
+ V_score = VScore[Vkey] / 20.0
537
877
  else:
538
- V_score=4.0
539
- aln=NHLocalAlignment(mid1,mid2,gapn,gap)
540
- score=aln/float(max(len(mid1),len(mid2)))+V_score
878
+ V_score = 4.0
879
+ aln = NHLocalAlignment(mid1, mid2, gapn, gap)
880
+ score = aln / float(max(len(mid1), len(mid2))) + V_score
541
881
  return score
542
882
 
883
+
543
884
  def UpdateSSG(SSG, seqs, Vgenes, Vscore={}, UseV=True, gap=-6, gapn=1, cutoff=7.5):
544
- SSGnew={}
545
- count=0
546
- t1=time.time()
547
- N=len(list(chain(*list(SSG.values()))))
548
- # print("Number of pairs to be processed: %d" %N)
885
+ SSGnew = {}
886
+ count = 0
887
+ t1 = time.time()
888
+ N = len(list(chain(*list(SSG.values()))))
889
+ # print("Number of pairs to be processed: %d" %N)
549
890
  for kk in SSG:
550
- s1=seqs[kk]
551
- V1=Vgenes[kk]
552
- VV=SSG[kk]
891
+ s1 = seqs[kk]
892
+ V1 = Vgenes[kk]
893
+ VV = SSG[kk]
553
894
  for vv in VV:
554
- s2=seqs[vv]
555
- V2=Vgenes[vv]
556
- score=falign(s1, s2, V1, V2, st=3, VScore=Vscore, UseV=UseV, gap=-6, gapn=1)
557
- count+=1
558
- if count % 1000000 ==0:
559
- t2=time.time()
560
- # print("Processed %d pairs. Elapsed time %f" %(count, t2-t1))
561
- if score>=cutoff:
895
+ s2 = seqs[vv]
896
+ V2 = Vgenes[vv]
897
+ score = falign(
898
+ s1, s2, V1, V2, st=3, VScore=Vscore, UseV=UseV, gap=-6, gapn=1
899
+ )
900
+ count += 1
901
+ if count % 1000000 == 0:
902
+ t2 = time.time()
903
+ # print("Processed %d pairs. Elapsed time %f" %(count, t2-t1))
904
+ if score >= cutoff:
562
905
  if kk not in SSGnew:
563
- SSGnew[kk]=[vv]
906
+ SSGnew[kk] = [vv]
564
907
  else:
565
908
  SSGnew[kk].append(vv)
566
909
  return SSGnew
567
910
 
911
+
568
912
  def dfs(graph, start):
569
- '''
913
+ """
570
914
  Non-resursive depth first search
571
- '''
915
+ """
572
916
  visited = set()
573
917
  stack = [start]
574
918
  while stack:
@@ -576,443 +920,503 @@ def dfs(graph, start):
576
920
  if vertex not in visited:
577
921
  visited.add(vertex)
578
922
  stack.extend(set(graph[vertex]) - visited)
579
-
923
+
580
924
  return visited
581
925
 
926
+
582
927
  def IdentifyMotifCluster(SSG):
583
928
  ## Input SeqShareGraph dictionary representation of sparse matrix
584
- POS=set(SSG.keys())
585
- NP=len(POS)
586
- ClusterList=[]
587
- tmpL=set(chain(*ClusterList))
588
- count=0
929
+ POS = set(SSG.keys())
930
+ NP = len(POS)
931
+ ClusterList = []
932
+ tmpL = set(chain(*ClusterList))
933
+ count = 0
589
934
  while 1:
590
- xx=POS ^ tmpL
591
- if len(xx)==0:
592
- break
593
- for ii in xx:
594
- # STACK=LoadComm([],ii)
595
- STACK=dfs(SSG,ii)
596
- tmpL = tmpL | STACK
597
- ClusterList.append(list(STACK))
598
- # tmpL=set(chain(*ClusterList))
599
- count+=1
600
- if count % 200 ==0:
601
- print (" Solved %d clusters" %(count))
602
- break
935
+ xx = POS ^ tmpL
936
+ if len(xx) == 0:
937
+ break
938
+ for ii in xx:
939
+ # STACK=LoadComm([],ii)
940
+ STACK = dfs(SSG, ii)
941
+ tmpL = tmpL | STACK
942
+ ClusterList.append(list(STACK))
943
+ # tmpL=set(chain(*ClusterList))
944
+ count += 1
945
+ if count % 200 == 0:
946
+ print(" Solved %d clusters" % (count))
947
+ break
603
948
  return ClusterList
604
949
 
950
+
605
951
  def IdentifyVgeneCluster(sMat):
606
952
  ## Input Vgene score matrix
607
- vG={}
608
- n=len(sMat)
609
- IDs=[x for x in range(n)]
953
+ vG = {}
954
+ n = len(sMat)
955
+ IDs = [x for x in range(n)]
610
956
  for kk in IDs:
611
- LL=sMat[:,kk]
612
- vL=np.where(LL>=thr_v)[0]
613
- if len(vL)>0:
614
- vG[kk]=vL
615
- CL=IdentifyMotifCluster(vG)
957
+ LL = sMat[:, kk]
958
+ vL = np.where(LL >= thr_v)[0]
959
+ if len(vL) > 0:
960
+ vG[kk] = vL
961
+ CL = IdentifyMotifCluster(vG)
616
962
  return CL
617
-
963
+
964
+
618
965
  def ParseFa(fname):
619
- InputStr=open(fname).readlines()
620
- FaDict={}
621
- seq=''
966
+ InputStr = open(fname).readlines()
967
+ FaDict = {}
968
+ seq = ""
622
969
  for line in InputStr:
623
- if line.startswith('>'):
624
- if len(seq)>0:
625
- FaDict[seqHead]=seq
626
- seq=''
627
- seqHead=line.strip()
970
+ if line.startswith(">"):
971
+ if len(seq) > 0:
972
+ FaDict[seqHead] = seq
973
+ seq = ""
974
+ seqHead = line.strip()
628
975
  else:
629
- seq+=line.strip()
976
+ seq += line.strip()
630
977
  if seqHead not in FaDict:
631
- FaDict[seqHead]=seq
978
+ FaDict[seqHead] = seq
632
979
  return FaDict
633
980
 
981
+
634
982
  def PreCalculateVgeneDist(VgeneFa="Imgt_Human_TRBV.fasta"):
635
983
  ## Only run one time if needed
636
- FaDict=ParseFa(cur_dir+VgeneFa)
637
- VScore={}
638
- CDR1Dict={}
639
- CDR2Dict={}
984
+ FaDict = ParseFa(cur_dir + VgeneFa)
985
+ VScore = {}
986
+ CDR1Dict = {}
987
+ CDR2Dict = {}
640
988
  for kk in FaDict:
641
- if '|' in kk:
642
- VV=kk.split('|')[1]
989
+ if "|" in kk:
990
+ VV = kk.split("|")[1]
643
991
  else:
644
- VV=kk[1:]
645
- CDR1Dict[VV]=FaDict[kk][26:37] ## Imgt CDR1: 27 - 38
646
- CDR2Dict[VV]=FaDict[kk][55:64] ## Imgt CDR2: 56 - 65
647
- Vkeys=list(CDR1Dict.keys())
648
- nn=len(Vkeys)
649
- for ii in range(0,nn):
650
- V1=Vkeys[ii]
651
- s1_CDR1=CDR1Dict[V1]
652
- s1_CDR2=CDR2Dict[V1]
653
- for jj in range(ii,nn):
654
- V2=Vkeys[jj]
655
- s2_CDR1=CDR1Dict[V2]
656
- s2_CDR2=CDR2Dict[V2]
657
- score1=SeqComparison(s1_CDR1,s2_CDR1)
658
- score2=SeqComparison(s2_CDR2,s2_CDR2)
659
- #print score1+score2
660
- VScore[(V1,V2)]=score1+score2
661
- gg=open('VgeneScores.txt','w')
992
+ VV = kk[1:]
993
+ CDR1Dict[VV] = FaDict[kk][26:37] ## Imgt CDR1: 27 - 38
994
+ CDR2Dict[VV] = FaDict[kk][55:64] ## Imgt CDR2: 56 - 65
995
+ Vkeys = list(CDR1Dict.keys())
996
+ nn = len(Vkeys)
997
+ for ii in range(0, nn):
998
+ V1 = Vkeys[ii]
999
+ s1_CDR1 = CDR1Dict[V1]
1000
+ s1_CDR2 = CDR2Dict[V1]
1001
+ for jj in range(ii, nn):
1002
+ V2 = Vkeys[jj]
1003
+ s2_CDR1 = CDR1Dict[V2]
1004
+ s2_CDR2 = CDR2Dict[V2]
1005
+ score1 = SeqComparison(s1_CDR1, s2_CDR1)
1006
+ score2 = SeqComparison(s2_CDR2, s2_CDR2)
1007
+ # print score1+score2
1008
+ VScore[(V1, V2)] = score1 + score2
1009
+ gg = open("VgeneScores.txt", "w")
662
1010
  for kk in VScore:
663
- vv=VScore[kk]
664
- line=kk[0]+'\t'+kk[1]+'\t'+str(vv)+'\n'
1011
+ vv = VScore[kk]
1012
+ line = kk[0] + "\t" + kk[1] + "\t" + str(vv) + "\n"
665
1013
  gg.write(line)
666
1014
  gg.close()
667
1015
 
668
- def EncodeRepertoire(inputfile, outdir, outfile='',exact=True, ST=3, thr_v=3.7, thr_s=3.5, VDict={},Vgene=True,thr_iso=10, gap=-6, GPU=False,Mat=False, verbose=False):
1016
+
1017
+ def EncodeRepertoire(
1018
+ inputfile,
1019
+ outdir,
1020
+ outfile="",
1021
+ exact=True,
1022
+ ST=3,
1023
+ thr_v=3.7,
1024
+ thr_s=3.5,
1025
+ VDict={},
1026
+ Vgene=True,
1027
+ thr_iso=10,
1028
+ gap=-6,
1029
+ GPU=False,
1030
+ Mat=False,
1031
+ verbose=False,
1032
+ ):
669
1033
  ## No V gene version
670
1034
  ## Encode CDR3 sequences into 96 dimensional space and perform k-means clustering
671
1035
  ## If exact is True, SW alignment will be performed within each cluster after isometric encoding and clustering
672
- h=open(inputfile)
673
- t1=time.time()
674
- alines=h.readlines()
675
- ww=alines[0].strip().split('\t')
676
- if not ww[0].startswith('C'):
1036
+ h = open(inputfile)
1037
+ t1 = time.time()
1038
+ alines = h.readlines()
1039
+ ww = alines[0].strip().split("\t")
1040
+ if not ww[0].startswith("C"):
677
1041
  ## header line
678
- hline=alines[0]
679
- alines=alines[1:]
680
- elif 'CDR3' in ww[0]:
681
- hline=alines[0]
682
- alines=alines[1:]
1042
+ hline = alines[0]
1043
+ alines = alines[1:]
1044
+ elif "CDR3" in ww[0]:
1045
+ hline = alines[0]
1046
+ alines = alines[1:]
683
1047
  else:
684
- hline='CDR3\t'+'\t'.join(['Info'+str(x) for x in range(len(ww)-1)])
685
- seqs=[]
686
- vgs=[]
687
- infoList=[]
688
- count=0
1048
+ hline = "CDR3\t" + "\t".join(["Info" + str(x) for x in range(len(ww) - 1)])
1049
+ seqs = []
1050
+ vgs = []
1051
+ infoList = []
1052
+ count = 0
689
1053
  if verbose:
690
- print('Creating CDR3 list')
1054
+ print("Creating CDR3 list")
691
1055
  for ll in alines:
692
- ww=ll.strip().split('\t')
693
- cdr3=ww[0]
694
- if '*' in cdr3:
1056
+ ww = ll.strip().split("\t")
1057
+ cdr3 = ww[0]
1058
+ if "*" in cdr3:
695
1059
  continue
696
- if '_' in cdr3:
1060
+ if "_" in cdr3:
697
1061
  continue
698
1062
  seqs.append(ww[0])
699
1063
  if Vgene:
700
1064
  vgs.append(ww[1])
701
- infoList.append('\t'.join(ww[1:]))
1065
+ infoList.append("\t".join(ww[1:]))
702
1066
  else:
703
- infoList.append('\t'.join(ww[1:]))
704
- count+=1
705
- if len(outfile)==0:
706
- outfile=inputfile.split('/')
707
- outfile=outfile[len(outfile)-1]
708
- outfile=outdir+'/'+re.sub('\\.[txcsv]+','',outfile)+'-'+'-RotationEncodingBL62.txt'
709
- g=open(outfile,'w')
710
- tm=strftime("%Y-%m-%d %H:%M:%S", gmtime())
711
- InfoLine='##TIME:'+tm+'|cmd: '+sys.argv[0]+'|'+inputfile+'|IsometricDistance_Thr='+str(thr_iso)+'|thr_v='+str(thr_v)+'|thr_s='+str(thr_s)+'|exact='+str(exact)+'|Vgene='+str(Vgene)+'|ST='+str(ST)
712
- g.write(InfoLine+'\n')
713
- g.write("##Column Info: CDR3 aa sequence, cluster id, other information in the input file\n")
714
- gr=0
1067
+ infoList.append("\t".join(ww[1:]))
1068
+ count += 1
1069
+ if len(outfile) == 0:
1070
+ outfile = inputfile.split("/")
1071
+ outfile = outfile[len(outfile) - 1]
1072
+ outfile = (
1073
+ outdir
1074
+ + "/"
1075
+ + re.sub("\\.[txcsv]+", "", outfile)
1076
+ + "-"
1077
+ + "-RotationEncodingBL62.txt"
1078
+ )
1079
+ g = open(outfile, "w")
1080
+ tm = strftime("%Y-%m-%d %H:%M:%S", gmtime())
1081
+ InfoLine = (
1082
+ "##TIME:"
1083
+ + tm
1084
+ + "|cmd: "
1085
+ + sys.argv[0]
1086
+ + "|"
1087
+ + inputfile
1088
+ + "|IsometricDistance_Thr="
1089
+ + str(thr_iso)
1090
+ + "|thr_v="
1091
+ + str(thr_v)
1092
+ + "|thr_s="
1093
+ + str(thr_s)
1094
+ + "|exact="
1095
+ + str(exact)
1096
+ + "|Vgene="
1097
+ + str(Vgene)
1098
+ + "|ST="
1099
+ + str(ST)
1100
+ )
1101
+ g.write(InfoLine + "\n")
1102
+ g.write(
1103
+ "##Column Info: CDR3 aa sequence, cluster id, other information in the input file\n"
1104
+ )
1105
+ gr = 0
715
1106
  ## Split into different lengths
716
- LD,VD, ID,SD= BuildLengthDict(seqs, vGene=vgs,INFO=infoList,sIDs=[x for x in range(len(seqs))])
1107
+ LD, VD, ID, SD = BuildLengthDict(
1108
+ seqs, vGene=vgs, INFO=infoList, sIDs=[x for x in range(len(seqs))]
1109
+ )
717
1110
  LDu, VDu, IDu, SDu = CollapseUnique(LD, VD, ID, SD)
718
1111
  if Mat:
719
- Mfile=outfile+'_EncodingMatrix.txt'
720
- h=open(Mfile, 'w')
1112
+ Mfile = outfile + "_EncodingMatrix.txt"
1113
+ h = open(Mfile, "w")
721
1114
  for kk in LDu:
722
1115
  if verbose:
723
- print("---Process CDR3s with length %d ---" %(kk))
724
- vSD=LDu[kk]
725
- vSD0=[x for x in range(len(vSD))]
726
- vss=SDu[kk]
727
- vInfo=IDu[kk]
728
- flagL=[len(x)-1 for x in vInfo]
1116
+ print("---Process CDR3s with length %d ---" % (kk))
1117
+ vSD = LDu[kk]
1118
+ vSD0 = [x for x in range(len(vSD))]
1119
+ vss = SDu[kk]
1120
+ vInfo = IDu[kk]
1121
+ flagL = [len(x) - 1 for x in vInfo]
729
1122
  if verbose:
730
- print(' Performing CDR3 encoding')
731
- dM=np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
732
- dM=dM.astype("float32")
1123
+ print(" Performing CDR3 encoding")
1124
+ dM = np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
1125
+ dM = dM.astype("float32")
733
1126
  if verbose:
734
- print(" The number of sequences is %d" %(dM.shape[0]))
1127
+ print(" The number of sequences is %d" % (dM.shape[0]))
735
1128
  if Mat:
736
1129
  for ii in range(len(vss)):
737
- line=vss[ii]+'\t'+vInfo[ii][0]+'\t'
738
- NUMs=[str(xx) for xx in dM[ii,:]]
739
- line += '\t'.join(NUMs) + '\n'
1130
+ line = vss[ii] + "\t" + vInfo[ii][0] + "\t"
1131
+ NUMs = [str(xx) for xx in dM[ii, :]]
1132
+ line += "\t".join(NUMs) + "\n"
740
1133
  h.write(line)
741
- sID=[x for x in range(dM.shape[0])]
742
- t2=time.time()
1134
+ sID = [x for x in range(dM.shape[0])]
1135
+ t2 = time.time()
743
1136
  if verbose:
744
- print(' Done! Total time elapsed %f' %(t2-t1))
745
- Cls = ClusterCDR3(dM, flagL, thr=thr_iso - 0.5*(15-kk), verbose=verbose) ## change cutoff with different lengths
1137
+ print(" Done! Total time elapsed %f" % (t2 - t1))
1138
+ Cls = ClusterCDR3(
1139
+ dM, flagL, thr=thr_iso - 0.5 * (15 - kk), verbose=verbose
1140
+ ) ## change cutoff with different lengths
746
1141
  if verbose:
747
1142
  print(" Handling identical CDR3 groups")
748
- Cls_u=[]
1143
+ Cls_u = []
749
1144
  for ii in range(len(Cls)):
750
- cc=Cls[ii]
1145
+ cc = Cls[ii]
751
1146
  if len(cc) == 1:
752
1147
  ## Handle identical CDR3 groups first
753
- if flagL[cc[0]]>0:
1148
+ if flagL[cc[0]] > 0:
754
1149
  gr += 1
755
- jj=cc[0]
1150
+ jj = cc[0]
756
1151
  for v_info in vInfo[jj]:
757
- line=vss[jj]+'\t'+str(gr)+'\t'+v_info+'\n'
758
- _=g.write(line)
1152
+ line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
1153
+ _ = g.write(line)
759
1154
  else:
760
1155
  Cls_u.append(cc)
761
- Cls=Cls_u
762
- t2=time.time()
1156
+ Cls = Cls_u
1157
+ t2 = time.time()
763
1158
  if verbose:
764
- print(' Done! Total time elapsed %f' %(t2-t1))
1159
+ print(" Done! Total time elapsed %f" % (t2 - t1))
765
1160
  if Vgene:
766
- vVgene=VDu[kk]
1161
+ vVgene = VDu[kk]
767
1162
  if verbose:
768
- print(' Matching variable genes')
769
- Cls_v=[]
1163
+ print(" Matching variable genes")
1164
+ Cls_v = []
770
1165
  for cc in Cls:
771
- Nc=len(cc)
772
- sMat={}
1166
+ Nc = len(cc)
1167
+ sMat = {}
773
1168
  for ii in range(Nc):
774
- v1=vVgene[cc[ii]]
775
- for jj in range(ii,Nc):
776
- if jj==ii:
1169
+ v1 = vVgene[cc[ii]]
1170
+ for jj in range(ii, Nc):
1171
+ if jj == ii:
777
1172
  continue
778
- v2=vVgene[cc[jj]]
1173
+ v2 = vVgene[cc[jj]]
779
1174
  if (v1, v2) not in VDict:
780
1175
  if v1 == v2:
781
1176
  if ii not in sMat:
782
- sMat[ii]=[jj]
1177
+ sMat[ii] = [jj]
783
1178
  else:
784
1179
  sMat[ii].append(jj)
785
1180
  if jj not in sMat:
786
- sMat[jj]=[ii]
1181
+ sMat[jj] = [ii]
787
1182
  else:
788
1183
  sMat[jj].append(ii)
789
1184
  continue
790
- if VDict[(v1,v2)] >= thr_v:
791
- if ii not in sMat:
792
- sMat[ii]=[jj]
793
- else:
794
- sMat[ii].append(jj)
795
- if jj not in sMat:
796
- sMat[jj]=[ii]
797
- else:
798
- sMat[jj].append(ii)
799
- vCL=IdentifyMotifCluster(sMat)
800
- vCL_List=list(chain(*vCL))
1185
+ if VDict[(v1, v2)] >= thr_v:
1186
+ if ii not in sMat:
1187
+ sMat[ii] = [jj]
1188
+ else:
1189
+ sMat[ii].append(jj)
1190
+ if jj not in sMat:
1191
+ sMat[jj] = [ii]
1192
+ else:
1193
+ sMat[jj].append(ii)
1194
+ vCL = IdentifyMotifCluster(sMat)
1195
+ vCL_List = list(chain(*vCL))
801
1196
  for ii in range(Nc):
802
- uu=flagL[cc[ii]]
803
- if uu>0 and ii not in vCL_List:
1197
+ uu = flagL[cc[ii]]
1198
+ if uu > 0 and ii not in vCL_List:
804
1199
  vCL.append([ii])
805
1200
  for vcc in vCL:
806
1201
  Cls_v.append(list(np.array(cc)[np.array(vcc)]))
807
- Cls=[]
1202
+ Cls = []
808
1203
  for ii in range(len(Cls_v)):
809
- cc=Cls_v[ii]
1204
+ cc = Cls_v[ii]
810
1205
  if len(cc) == 1:
811
1206
  ## Handle identical CDR3 groups first
812
1207
  gr += 1
813
- jj=cc[0]
1208
+ jj = cc[0]
814
1209
  for v_info in vInfo[jj]:
815
- line=vss[jj]+'\t'+str(gr)+'\t'+v_info+'\n'
816
- _=g.write(line)
1210
+ line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
1211
+ _ = g.write(line)
817
1212
  else:
818
1213
  Cls.append(cc)
819
1214
  if exact:
820
1215
  if verbose:
821
- print(' Performing Smith-Waterman alignment')
822
- Cls_s=[]
1216
+ print(" Performing Smith-Waterman alignment")
1217
+ Cls_s = []
823
1218
  for cc in Cls:
824
- Nc=len(cc)
825
- if len(cc)<=3:
826
- sMat=np.zeros((Nc,Nc))
1219
+ Nc = len(cc)
1220
+ if len(cc) <= 3:
1221
+ sMat = np.zeros((Nc, Nc))
827
1222
  for ii in range(Nc):
828
- s1=vss[cc[ii]]
829
- for jj in range(ii,Nc):
830
- if jj==ii:
1223
+ s1 = vss[cc[ii]]
1224
+ for jj in range(ii, Nc):
1225
+ if jj == ii:
831
1226
  continue
832
- s2=vss[cc[jj]]
1227
+ s2 = vss[cc[jj]]
833
1228
  if len(s1) != len(s2):
834
1229
  continue
835
- if len(s1)<=5:
1230
+ if len(s1) <= 5:
836
1231
  continue
837
- sw=SeqComparison(s1[ST:-2],s2[ST:-2],gap=gap)
838
- sw=sw/(len(s1)-ST-2)
839
- sMat[ii,jj]=sw
840
- sMat[jj,ii]=sw
841
- s_max=[]
1232
+ sw = SeqComparison(s1[ST:-2], s2[ST:-2], gap=gap)
1233
+ sw = sw / (len(s1) - ST - 2)
1234
+ sMat[ii, jj] = sw
1235
+ sMat[jj, ii] = sw
1236
+ s_max = []
842
1237
  for ii in range(Nc):
843
- s_max.append(np.max(sMat[:,ii]))
844
- cc_new=[]
1238
+ s_max.append(np.max(sMat[:, ii]))
1239
+ cc_new = []
845
1240
  for ii in range(Nc):
846
- if s_max[ii]>=thr_s:
1241
+ if s_max[ii] >= thr_s:
847
1242
  cc_new.append(cc[ii])
848
- if len(cc_new)>1:
1243
+ if len(cc_new) > 1:
849
1244
  Cls_s.append(cc_new)
850
1245
  else:
851
1246
  for ii in range(Nc):
852
- uu=flagL[cc[ii]]
853
- if uu>0:
1247
+ uu = flagL[cc[ii]]
1248
+ if uu > 0:
854
1249
  Cls_s.append([cc[ii]])
855
- # print(Cls_s)
856
- Cls_sList=list(chain(*Cls_s))
1250
+ # print(Cls_s)
1251
+ Cls_sList = list(chain(*Cls_s))
857
1252
  for ii in range(len(cc)):
858
- uu=flagL[cc[ii]]
859
- if uu>0 and cc[ii] not in Cls_sList:
1253
+ uu = flagL[cc[ii]]
1254
+ if uu > 0 and cc[ii] not in Cls_sList:
860
1255
  Cls_s.append([cc[ii]])
861
1256
  else:
862
- CDR3s=[vss[x] for x in cc]
863
- sIDs=np.array([vSD0[x] for x in cc])
864
- sIDs0=[x for x in range(len(cc))]
865
- Kset=KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
866
- SSG=generateSSG(Kset, CDR3s, k_thr=1)
867
- tmpVgenes=['TRBV2']*len(CDR3s)
868
- SSGnew=UpdateSSG(SSG, CDR3s, tmpVgenes, Vscore=VDict, cutoff=thr_s+4)
869
- CLall=IdentifyMotifCluster(SSGnew)
870
- CLall_list=list(chain(*CLall))
1257
+ CDR3s = [vss[x] for x in cc]
1258
+ sIDs = np.array([vSD0[x] for x in cc])
1259
+ sIDs0 = [x for x in range(len(cc))]
1260
+ Kset = KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
1261
+ SSG = generateSSG(Kset, CDR3s, k_thr=1)
1262
+ tmpVgenes = ["TRBV2"] * len(CDR3s)
1263
+ SSGnew = UpdateSSG(
1264
+ SSG, CDR3s, tmpVgenes, Vscore=VDict, cutoff=thr_s + 4
1265
+ )
1266
+ CLall = IdentifyMotifCluster(SSGnew)
1267
+ CLall_list = list(chain(*CLall))
871
1268
  for ii in range(len(cc)):
872
- uu=flagL[cc[ii]]
873
- if uu>0 and ii not in CLall_list:
1269
+ uu = flagL[cc[ii]]
1270
+ if uu > 0 and ii not in CLall_list:
874
1271
  CLall.append([ii])
875
1272
  for cl in CLall:
876
- ccs=list(sIDs[np.array(cl)])
1273
+ ccs = list(sIDs[np.array(cl)])
877
1274
  Cls_s.append(ccs)
878
- Cls=Cls_s
1275
+ Cls = Cls_s
879
1276
  if verbose:
880
- print(' Writing results into file')
1277
+ print(" Writing results into file")
881
1278
  for ii in range(len(Cls)):
882
- # if ii % 100000 == 0 and ii>0:
883
- #print(' %d sequences written' %(ii))
884
- cc=Cls[ii]
885
- gr+=1
1279
+ # if ii % 100000 == 0 and ii>0:
1280
+ # print(' %d sequences written' %(ii))
1281
+ cc = Cls[ii]
1282
+ gr += 1
886
1283
  for jj in cc:
887
1284
  for v_info in vInfo[jj]:
888
- line=vss[jj]+'\t'+str(gr)+'\t'+v_info+'\n'
889
- _=g.write(line)
1285
+ line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
1286
+ _ = g.write(line)
890
1287
  g.close()
891
1288
  if Mat:
892
1289
  h.close()
893
1290
 
1291
+
894
1292
def OrderUnique(Ig):
    """Drop cliques whose leading index pair repeats that of a neighbour.

    Entries are ordered by (first relative index, number of absolute
    indices); within that order, only the first entry of each run sharing
    the same first two relative indices is kept.

    Args:
        Ig: dict mapping a clique key to a pair ``(relative, absolute)`` of
            index lists. ``relative`` must hold at least two entries; only
            its first two are inspected.

    Returns:
        A tuple ``(Igs, filtered_id)``. ``Igs`` maps each kept key to its
        original value, ``filtered_id`` is a NumPy array of the kept keys in
        sorted order. An empty ``Ig`` yields ``({}, empty integer array)``.
    """
    if not Ig:
        # Nothing to deduplicate; return an index array that is still
        # usable for fancy indexing by the caller.
        return {}, np.array([], dtype=int)

    entries = [
        (key, value[0][0], value[0][1], len(value[1]))
        for key, value in Ig.items()
    ]
    # sorted() is stable, so ties keep the dict's insertion order.
    entries.sort(key=lambda entry: (entry[1], entry[3]))

    kept_keys = []
    previous_pair = None
    for key, first, second, _ in entries:
        pair = (first, second)
        if pair != previous_pair:
            kept_keys.append(key)
            previous_pair = pair

    # Look values up by key rather than by list position, so the result is
    # correct even when the keys are not 0..n-1 in insertion order.
    Igs = {key: Ig[key] for key in kept_keys}
    filtered_id = np.array(kept_keys)
    return Igs, filtered_id
922
1320
 
1321
+
923
1322
def ClusterCDR3(dM, flagL, thr=10, GPU=False, verbose=False):
    """Iteratively merge nearest-neighbour pairs of encoded CDR3s into clusters.

    Each cycle builds a flat L2 faiss index over the current vectors and
    searches the two nearest entries of every row. Rows whose second hit is
    within ``thr`` are merged pairwise (their vectors are averaged) and carried
    into the next cycle; rows whose second hit is farther than ``thr`` are
    emitted as finished clusters. The loop stops when no row can be merged or
    when a cycle leaves the groups unchanged.

    Args:
        dM: matrix of encoded sequences, one row per sequence, passed to faiss
            as is.
        flagL: flag vector for identical CDR3 groups, >0 for grouped
            non-identical CDR3s. In the first cycle only unmerged rows with a
            positive flag are emitted as singleton clusters.
        thr: distance cut-off applied to the second search hit.
        GPU: when true, the index is moved to GPU 0.
        verbose: when true, progress markers are printed.

    Returns:
        List of clusters, each a list of row indices into ``dM``.
    """
    clusters = []
    flags = np.array(flagL)
    width = Ndim * 6
    if GPU:
        res = faiss.StandardGpuResources()
    vectors = dM
    groups = None
    cycle = 0
    while True:
        if verbose:
            print("=", end="")
        index = faiss.IndexFlatL2(width)
        if GPU:
            index = faiss.index_cpu_to_gpu(res, 0, index)
        index.add(vectors)
        D, I = index.search(vectors, 2)
        merge_rows = np.where(D[:, 1] <= thr)[0]
        merged = np.zeros((len(merge_rows), width))
        candidate = {}

        if cycle == 0:
            # First cycle: rows are raw sequences, so relative and absolute
            # indices coincide.
            for v in np.where((D[:, 1] > thr) & (flags > 0))[0]:
                clusters.append([v])
            for pos, v in enumerate(merge_rows):
                pair = I[v]
                if v not in pair:
                    # Make sure the row itself is part of its own pair.
                    pair[0] = v
                candidate[pos] = (sorted(set(pair)), sorted(set(pair)))
                merged[pos] = (vectors[pair[0]] + vectors[pair[1]]) / 2
            if not candidate:
                if verbose:
                    print("type 0 break")
                break
        else:
            keys = list(groups.keys())
            ## move groups that have no close neighbour to the result
            for v in np.where(D[:, 1] > thr)[0]:
                clusters.append(list(groups[keys[v]][1]))
            for pos, v in enumerate(merge_rows):
                left = I[v, 0]
                right = I[v, 1]
                if v not in I[v]:
                    left = v
                ## first entry: relative indices of the merged cliques
                relative = sorted(set([left, right]))
                ## second entry: absolute indices of the member sequences
                absolute = sorted(
                    set(list(groups[keys[left]][1]) + list(groups[keys[right]][1]))
                )
                candidate[pos] = (relative, absolute)
                merged[pos] = (vectors[left] + vectors[right]) / 2
            if not candidate:
                if verbose:
                    print("\ntype I break")
                for key in list(groups.keys()):
                    members = list(groups[key][1])
                    if members not in clusters:
                        clusters.append(members)
                break

        candidate, kept = OrderUnique(candidate)
        merged = merged[kept]

        if cycle > 0 and groups == candidate:
            # Fixed point: another cycle would not change anything.
            if verbose:
                print("\ntype II break")
            for key in list(groups.keys()):
                members = list(groups[key][1])
                if members in clusters:
                    continue
                clusters.append(members)
            break

        groups = candidate
        vectors = merged.astype("float32")
        cycle += 1
    return clusters
1013
1416
 
1014
def ClusterCDR3r(dM, flagL, thr=10, verbose=False):
    """Cluster encoded CDR3s by growing regions over a faiss range search.

    A flat L2 index is built over ``dM`` and every row's neighbours within
    ``thr`` are collected. Starting from the unlabelled row with the most
    neighbours, the current label is spread to unlabelled neighbours ring by
    ring. A new label starts when a ring finds no unlabelled neighbour, or
    once the same label has been spread over four successive rings.

    Args:
        dM: matrix of encoded sequences, one row per sequence.
        flagL: accepted for signature parity with ``ClusterCDR3``; not read.
        thr: radius passed to ``index.range_search``.
        verbose: accepted for signature parity; not read.

    Returns:
        List of clusters, each a list of row indices into ``dM``. Empty input
        yields an empty list.
    """
    N = dM.shape[0]
    if N == 0:
        # np.argmax below would raise on an empty neighbour-count array.
        return []

    index = faiss.IndexFlatL2(Ndim * 6)
    index.add(dM)
    lims, D, I = index.range_search(dM, thr)

    # lims[i]:lims[i + 1] delimits row i's hits in I, so consecutive
    # differences give the neighbour count of each row.
    neighborSize = np.diff(lims)
    clusterNo = 0
    cluster = -np.ones((N,), dtype=np.int32)
    frontier = [np.argmax(neighborSize)]
    depth = 0
    while True:
        cluster[frontier] = clusterNo  # assign cluster

        # np.concatenate keeps the integer dtype of I, so the result is
        # always usable as an index.
        neighbor = np.unique(
            np.concatenate([I[lims[row] : lims[row + 1]] for row in frontier])
        )
        fresh = neighbor[cluster[neighbor] < 0]
        if len(fresh) == 0:
            # Region exhausted: close this label and reseed from the densest
            # row that is still unlabelled.
            depth = 0
            clusterNo += 1
            remaining = np.where(cluster < 0)[0]
            if len(remaining) == 0:
                break
            frontier = [remaining[np.argmax(neighborSize[remaining])]]
        else:
            if depth > 3:
                # Cap how far a single label may spread from its seed.
                depth = 0
                clusterNo += 1
            frontier = fresh
            depth += 1

    Cls = [[] for _ in range(clusterNo)]
    for member, label in enumerate(cluster):
        Cls[label].append(member)
    return Cls
1066
1485
 
1486
+
1067
1487
  def CommandLineParser():
1068
- parser=OptionParser()
1069
- print ('''
1488
+ parser = OptionParser()
1489
+ print(
1490
+ """
1070
1491
  GIANA: Geometric Isometry based ANtigen-specific tcr Alignment
1071
1492
  Ultrafast short peptide alignment exclusively designed for large-scale adaptome analysis
1072
1493
 
@@ -1079,130 +1500,282 @@ Input columns:
1079
1500
 
1080
1501
  !!! ALL amino acid letters must be CAPITAL !!!
1081
1502
 
1082
- ''')
1083
- parser.add_option("-d","--directory",dest="Directory",help="Input repertoire sequencing file directory. Please make sure that all the files in the directory are input files.",default="")
1084
- parser.add_option("-f","--file",dest="File",default='',help="Input single file of CDR3 sequences for grouping")
1085
- parser.add_option("-F","--fileList",dest="files",default='',help='Alternative input: a file containing the full path to all the files. If given, overwrite -d and -f option')
1086
- parser.add_option("-t","--threshold",dest="thr",default=7,help="Isometric distance threshold for calling similar CDR3 groups. Without -E, smaller value will increase speed. With -E, smaller value will increase specificity. Must be smaller than 12.")
1087
- parser.add_option("-S","--threshold_score",dest="thr_s",default=3.6, help="Threshold for Smith-Waterman alignment score (normalized by CDR3 length). Default 3.6")
1088
- parser.add_option("-G","--threshold_vgene",dest="thr_v",default=3.7,help="Threshold for variable gene comparison. Default 3.7.")
1089
- parser.add_option("-o","--output",dest="OutDir",default='./',help="Output directory for intermediate and final outputs.")
1090
- parser.add_option("-O","--outfile",dest="OutFile",default='',help="Output file name. If not given, a file with --RotationEncoding will be added to the input file as the output file name.")
1091
- parser.add_option("-T","--startPosition",dest='ST',default=3, help="Starting position of CDR3 sequence. The first ST letters are omitted. CDR3 sequence length L must be >= ST+7 ")
1092
- parser.add_option("-g","--GapPenalty",dest="Gap",default= -6,help="Gap penalty,default= -6. Not used.")
1093
- parser.add_option("-n","--GapNumber",dest="GapN",default=1,help="Maximum number of gaps allowed when performing alignment. Max=1, default=1. Not used.")
1094
- parser.add_option("-V","--VariableGeneFa",dest="VFa",default="Imgt_Human_TRBV.fasta",help="IMGT Human beta variable gene sequences")
1095
- parser.add_option("-v","--VariableGene",dest="V",default=True,action="store_false",help="If False, GIANA will omit variable gene information and use CDR3 sequences only. This will yield reduced specificity. The cut-off will automatically become the current value-4.0")
1096
- parser.add_option("-e","--Exact",dest="E",default=True,action="store_false",help="If False, GIANA will not perform Smith-Waterman alignment after isometric encoding.")
1097
- parser.add_option("-N","--NumberOfThreads",dest="NN",default=1,help="Number of threads for multiple processing. Not working so well.")
1098
- parser.add_option("-M","--EncodingMatrix", dest="Mat", default=False,action="store_true", help="If true, GIANA will export the isometric encoding matrix for each TCR. Default: False.")
1099
- parser.add_option("-U","--UseGPU",dest="GPU", default=False, action="store_true",help="Use GPU for Faiss indexing. Must be CUDA GPUs.")
1100
- parser.add_option("-q","--queryFile",dest="Query",default='',help="Input query file, if given, GIANA will run in query mode, also need to provide -r option.")
1101
- parser.add_option("-r","--refFile",dest="ref", default='',help="Input reference file. Query model required.")
1102
- parser.add_option("-b","--Verbose", dest='v', default=False, action="store_true", help="Verbose option: if given, GIANA will print intermediate messages.")
1503
+ """
1504
+ )
1505
+ parser.add_option(
1506
+ "-d",
1507
+ "--directory",
1508
+ dest="Directory",
1509
+ help="Input repertoire sequencing file directory. Please make sure that all the files in the directory are input files.",
1510
+ default="",
1511
+ )
1512
+ parser.add_option(
1513
+ "-f",
1514
+ "--file",
1515
+ dest="File",
1516
+ default="",
1517
+ help="Input single file of CDR3 sequences for grouping",
1518
+ )
1519
+ parser.add_option(
1520
+ "-F",
1521
+ "--fileList",
1522
+ dest="files",
1523
+ default="",
1524
+ help="Alternative input: a file containing the full path to all the files. If given, overwrite -d and -f option",
1525
+ )
1526
+ parser.add_option(
1527
+ "-t",
1528
+ "--threshold",
1529
+ dest="thr",
1530
+ default=7,
1531
+ help="Isometric distance threshold for calling similar CDR3 groups. Without -E, smaller value will increase speed. With -E, smaller value will increase specificity. Must be smaller than 12.",
1532
+ )
1533
+ parser.add_option(
1534
+ "-S",
1535
+ "--threshold_score",
1536
+ dest="thr_s",
1537
+ default=3.6,
1538
+ help="Threshold for Smith-Waterman alignment score (normalized by CDR3 length). Default 3.6",
1539
+ )
1540
+ parser.add_option(
1541
+ "-G",
1542
+ "--threshold_vgene",
1543
+ dest="thr_v",
1544
+ default=3.7,
1545
+ help="Threshold for variable gene comparison. Default 3.7.",
1546
+ )
1547
+ parser.add_option(
1548
+ "-o",
1549
+ "--output",
1550
+ dest="OutDir",
1551
+ default="./",
1552
+ help="Output directory for intermediate and final outputs.",
1553
+ )
1554
+ parser.add_option(
1555
+ "-O",
1556
+ "--outfile",
1557
+ dest="OutFile",
1558
+ default="",
1559
+ help="Output file name. If not given, a file with --RotationEncoding will be added to the input file as the output file name.",
1560
+ )
1561
+ parser.add_option(
1562
+ "-T",
1563
+ "--startPosition",
1564
+ dest="ST",
1565
+ default=3,
1566
+ help="Starting position of CDR3 sequence. The first ST letters are omitted. CDR3 sequence length L must be >= ST+7 ",
1567
+ )
1568
+ parser.add_option(
1569
+ "-g",
1570
+ "--GapPenalty",
1571
+ dest="Gap",
1572
+ default=-6,
1573
+ help="Gap penalty,default= -6. Not used.",
1574
+ )
1575
+ parser.add_option(
1576
+ "-n",
1577
+ "--GapNumber",
1578
+ dest="GapN",
1579
+ default=1,
1580
+ help="Maximum number of gaps allowed when performing alignment. Max=1, default=1. Not used.",
1581
+ )
1582
+ parser.add_option(
1583
+ "-V",
1584
+ "--VariableGeneFa",
1585
+ dest="VFa",
1586
+ default="Imgt_Human_TRBV.fasta",
1587
+ help="IMGT Human beta variable gene sequences",
1588
+ )
1589
+ parser.add_option(
1590
+ "-v",
1591
+ "--VariableGene",
1592
+ dest="V",
1593
+ default=True,
1594
+ action="store_false",
1595
+ help="If False, GIANA will omit variable gene information and use CDR3 sequences only. This will yield reduced specificity. The cut-off will automatically become the current value-4.0",
1596
+ )
1597
+ parser.add_option(
1598
+ "-e",
1599
+ "--Exact",
1600
+ dest="E",
1601
+ default=True,
1602
+ action="store_false",
1603
+ help="If False, GIANA will not perform Smith-Waterman alignment after isometric encoding.",
1604
+ )
1605
+ parser.add_option(
1606
+ "-N",
1607
+ "--NumberOfThreads",
1608
+ dest="NN",
1609
+ default=1,
1610
+ help="Number of threads for multiple processing. Not working so well.",
1611
+ )
1612
+ parser.add_option(
1613
+ "-M",
1614
+ "--EncodingMatrix",
1615
+ dest="Mat",
1616
+ default=False,
1617
+ action="store_true",
1618
+ help="If true, GIANA will export the isometric encoding matrix for each TCR. Default: False.",
1619
+ )
1620
+ parser.add_option(
1621
+ "-U",
1622
+ "--UseGPU",
1623
+ dest="GPU",
1624
+ default=False,
1625
+ action="store_true",
1626
+ help="Use GPU for Faiss indexing. Must be CUDA GPUs.",
1627
+ )
1628
+ parser.add_option(
1629
+ "-q",
1630
+ "--queryFile",
1631
+ dest="Query",
1632
+ default="",
1633
+ help="Input query file, if given, GIANA will run in query mode, also need to provide -r option.",
1634
+ )
1635
+ parser.add_option(
1636
+ "-r",
1637
+ "--refFile",
1638
+ dest="ref",
1639
+ default="",
1640
+ help="Input reference file. Query model required.",
1641
+ )
1642
+ parser.add_option(
1643
+ "-b",
1644
+ "--Verbose",
1645
+ dest="v",
1646
+ default=False,
1647
+ action="store_true",
1648
+ help="Verbose option: if given, GIANA will print intermediate messages.",
1649
+ )
1103
1650
  return parser.parse_args()
1104
1651
 
1652
+
1105
1653
def main():
    """Run GIANA from the command line in query mode or clustering mode.

    Query mode is selected when ``-q`` is given: a reference is built from
    ``-r``, each query file is matched against it with ``MakeQuery`` and the
    result is merged with the existing reference clustering file. Otherwise
    the input files named by ``-d``, ``-f`` or ``-F`` (later options override
    earlier ones) are clustered one by one with ``EncodeRepertoire``.

    Raises:
        ValueError: query mode was requested without a reference file.
        FileNotFoundError: the reference clustering file
            ``<ref>--RotationEncodingBL62.txt`` does not exist.
    """
    (opt, _) = CommandLineParser()
    cutoff = float(opt.thr)
    OutDir = opt.OutDir
    thr_s = float(opt.thr_s)
    ## Check if query mode first
    qFile = opt.Query
    if len(qFile) > 0:
        ## query mode
        t1 = time.time()
        if qFile.endswith("/"):
            ## input query is a directory
            qFileList = [qFile + ff for ff in os.listdir(qFile)]
        else:
            qFileList = [qFile]
        rFile = opt.ref
        if len(rFile) == 0:
            # Raising a bare string is a TypeError in Python 3 and hides the
            # message, so raise real exception objects instead.
            raise ValueError("Must provide reference file in query mode!")
        ## check if reference cluster file exists
        rFile0 = re.sub("\\.txt", "", rFile)
        refClusterFile = rFile0 + "--RotationEncodingBL62.txt"
        if not os.path.exists(refClusterFile):
            raise FileNotFoundError(
                "Must run clustering on reference file first! Did you forget to put the clustering file in this directory?"
            )
        rData = CreateReference(rFile)
        t2 = time.time()
        print("Reference created. Elapsed %f" % (t2 - t1))
        for qf in qFileList:
            t2_0 = time.time()
            print("Querying " + qf)
            qf_s = qf.split("/")[-1]
            outFile = (
                os.path.splitext(qf_s)[0]
                + "_query_"
                + os.path.basename(rFile0)
                + ".txt"
            )
            of = OutDir + "/" + outFile
            if os.path.exists(of):
                print(of + " already exits. Skipping.")
                continue
            MakeQuery(qf, rData, thr=cutoff, thr_s=thr_s)
            t2 = time.time()
            print(" Build query clustering file. Elapsed %f" % (t2 - t1))
            print("Now mering with reference cluster")
            MergeExist(refClusterFile, of)
            t2 = time.time()
            print(" Time of elapsed for query %s: %f" % (qf, t2 - t2_0))
    else:
        ## regular clustering mode
        FileDir = opt.Directory
        if len(FileDir) > 0:
            files = [FileDir + "/" + ff for ff in os.listdir(FileDir)]
        else:
            files = []
        File = opt.File
        if len(File) > 0:
            files = [File]
        FileList = opt.files
        if len(FileList) > 0:
            # One path per line; blank lines (e.g. a trailing newline) are
            # skipped so they are not treated as empty file names.
            with open(FileList) as fL:
                files = [ff.strip() for ff in fL if ff.strip()]
        VFa = opt.VFa
        PreCalculateVgeneDist(VFa)
        VScore = {}
        VV = opt.V
        EE = opt.E
        Mat = opt.Mat
        ST = int(opt.ST)
        thr_v = float(opt.thr_v)
        verbose = opt.v
        if VV:
            ## Use tcrDist's Vgene 80-score calculation
            with open("./VgeneScores.txt") as vf:
                for line in vf:
                    ww = line.strip().split("\t")
                    # Scores are stored symmetrically, scaled down by 20.
                    VScore[(ww[0], ww[1])] = int(ww[2]) / 20
                    VScore[(ww[1], ww[0])] = int(ww[2]) / 20
        Gap = int(opt.Gap)
        OutFile = opt.OutFile
        GPU = opt.GPU
        NT = int(opt.NN)
        faiss.omp_set_num_threads(NT)
        for ff in files:
            print("Processing %s" % ff)
            EncodeRepertoire(
                ff,
                OutDir,
                OutFile,
                ST=ST,
                thr_s=thr_s,
                thr_v=thr_v,
                exact=EE,
                VDict=VScore,
                Vgene=VV,
                thr_iso=cutoff,
                gap=Gap,
                GPU=GPU,
                Mat=Mat,
                verbose=verbose,
            )
1772
+
1773
+
1203
1774
if __name__ == "__main__":
    import sys

    t0 = time.time()
    main()
    print("Total time elapsed: %f" % (time.time() - t0))
    # getrusage reports ru_maxrss in bytes on macOS but in kilobytes on Linux
    # (and the BSDs), so normalise to bytes before converting to MB.
    max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    bytes_per_unit = 1 if sys.platform == "darwin" else 1024
    print("Maximum memory usage: %f MB" % (max_rss * bytes_per_unit / 1000000))