biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +28 -0
- biopipen/core/filters.py +79 -4
- biopipen/core/proc.py +12 -3
- biopipen/core/testing.py +75 -3
- biopipen/ns/bam.py +148 -6
- biopipen/ns/bed.py +75 -0
- biopipen/ns/cellranger.py +186 -0
- biopipen/ns/cellranger_pipeline.py +126 -0
- biopipen/ns/cnv.py +19 -3
- biopipen/ns/cnvkit.py +1 -1
- biopipen/ns/cnvkit_pipeline.py +20 -12
- biopipen/ns/delim.py +34 -35
- biopipen/ns/gene.py +68 -23
- biopipen/ns/gsea.py +63 -37
- biopipen/ns/misc.py +39 -14
- biopipen/ns/plot.py +304 -1
- biopipen/ns/protein.py +183 -0
- biopipen/ns/regulatory.py +290 -0
- biopipen/ns/rnaseq.py +142 -5
- biopipen/ns/scrna.py +2053 -473
- biopipen/ns/scrna_metabolic_landscape.py +228 -382
- biopipen/ns/snp.py +659 -0
- biopipen/ns/stats.py +484 -0
- biopipen/ns/tcr.py +683 -98
- biopipen/ns/vcf.py +236 -2
- biopipen/ns/web.py +97 -6
- biopipen/reports/bam/CNVpytor.svelte +4 -9
- biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
- biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
- biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
- biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
- biopipen/reports/common.svelte +15 -0
- biopipen/reports/protein/ProdigySummary.svelte +16 -0
- biopipen/reports/scrna/CellsDistribution.svelte +4 -39
- biopipen/reports/scrna/DimPlots.svelte +1 -1
- biopipen/reports/scrna/MarkersFinder.svelte +6 -126
- biopipen/reports/scrna/MetaMarkers.svelte +3 -75
- biopipen/reports/scrna/RadarPlots.svelte +4 -20
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
- biopipen/reports/tcr/ClonalStats.svelte +16 -0
- biopipen/reports/tcr/CloneResidency.svelte +3 -93
- biopipen/reports/tcr/Immunarch.svelte +4 -155
- biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
- biopipen/reports/tcr/TESSA.svelte +11 -28
- biopipen/reports/utils/misc.liq +22 -7
- biopipen/scripts/bam/BamMerge.py +11 -15
- biopipen/scripts/bam/BamSampling.py +90 -0
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +38 -0
- biopipen/scripts/bam/CNAClinic.R +41 -5
- biopipen/scripts/bam/CNVpytor.py +153 -54
- biopipen/scripts/bam/ControlFREEC.py +13 -14
- biopipen/scripts/bam/SamtoolsView.py +33 -0
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +138 -0
- biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
- biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
- biopipen/scripts/cnv/AneuploidyScore.R +55 -20
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
- biopipen/scripts/cnv/TMADScore.R +25 -9
- biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
- biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
- biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/RowsBinder.R +1 -1
- biopipen/scripts/delim/SampleInfo.R +116 -118
- biopipen/scripts/gene/GeneNameConversion.R +67 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/gsea/Enrichr.R +5 -5
- biopipen/scripts/gsea/FGSEA.R +184 -50
- biopipen/scripts/gsea/GSEA.R +2 -2
- biopipen/scripts/gsea/PreRank.R +5 -5
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Plot.R +80 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/plot/Heatmap.R +3 -3
- biopipen/scripts/plot/Manhattan.R +147 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/plot/ROC.R +88 -0
- biopipen/scripts/plot/Scatter.R +112 -0
- biopipen/scripts/plot/VennDiagram.R +5 -9
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +119 -0
- biopipen/scripts/protein/ProdigySummary.R +140 -0
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
- biopipen/scripts/regulatory/motifs-common.R +324 -0
- biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
- biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
- biopipen/scripts/rnaseq/Simulation.R +21 -0
- biopipen/scripts/rnaseq/UnitConversion.R +325 -54
- biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
- biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
- biopipen/scripts/scrna/CellCellCommunication.py +150 -0
- biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
- biopipen/scripts/scrna/CellSNPLite.py +30 -0
- biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
- biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
- biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
- biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
- biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
- biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
- biopipen/scripts/scrna/CellsDistribution.R +456 -167
- biopipen/scripts/scrna/DimPlots.R +1 -1
- biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
- biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
- biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
- biopipen/scripts/scrna/ExprImputation.R +7 -0
- biopipen/scripts/scrna/LoomTo10X.R +51 -0
- biopipen/scripts/scrna/MQuad.py +25 -0
- biopipen/scripts/scrna/MarkersFinder.R +679 -400
- biopipen/scripts/scrna/MetaMarkers.R +265 -161
- biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
- biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
- biopipen/scripts/scrna/RadarPlots.R +355 -134
- biopipen/scripts/scrna/ScFGSEA.R +298 -100
- biopipen/scripts/scrna/ScSimulation.R +65 -0
- biopipen/scripts/scrna/ScVelo.py +617 -0
- biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
- biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
- biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
- biopipen/scripts/scrna/SeuratClustering.R +36 -233
- biopipen/scripts/scrna/SeuratLoading.R +2 -2
- biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
- biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
- biopipen/scripts/scrna/SeuratPreparing.R +223 -173
- biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
- biopipen/scripts/scrna/SeuratTo10X.R +27 -0
- biopipen/scripts/scrna/Slingshot.R +65 -0
- biopipen/scripts/scrna/Subset10X.R +2 -2
- biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
- biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
- biopipen/scripts/scrna/scvelo_paga.py +313 -0
- biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
- biopipen/scripts/snp/MatrixEQTL.R +217 -0
- biopipen/scripts/snp/Plink2GTMat.py +148 -0
- biopipen/scripts/snp/PlinkCallRate.R +199 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +291 -0
- biopipen/scripts/snp/PlinkFromVcf.py +81 -0
- biopipen/scripts/snp/PlinkHWE.R +85 -0
- biopipen/scripts/snp/PlinkHet.R +96 -0
- biopipen/scripts/snp/PlinkIBD.R +196 -0
- biopipen/scripts/snp/PlinkSimulation.py +124 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/ChowTest.R +146 -0
- biopipen/scripts/stats/DiffCoexpr.R +152 -0
- biopipen/scripts/stats/LiquidAssoc.R +135 -0
- biopipen/scripts/stats/Mediation.R +108 -0
- biopipen/scripts/stats/MetaPvalue.R +130 -0
- biopipen/scripts/stats/MetaPvalue1.R +74 -0
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/Attach2Seurat.R +3 -2
- biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
- biopipen/scripts/tcr/CDR3Clustering.R +343 -0
- biopipen/scripts/tcr/ClonalStats.R +526 -0
- biopipen/scripts/tcr/CloneResidency.R +255 -131
- biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
- biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
- biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
- biopipen/scripts/tcr/GIANA/query.py +164 -162
- biopipen/scripts/tcr/Immunarch-basic.R +31 -9
- biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
- biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
- biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
- biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
- biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
- biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
- biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
- biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
- biopipen/scripts/tcr/Immunarch.R +63 -11
- biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
- biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
- biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
- biopipen/scripts/tcr/SampleDiversity.R +1 -1
- biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
- biopipen/scripts/tcr/ScRepLoading.R +166 -0
- biopipen/scripts/tcr/TCRClusterStats.R +176 -22
- biopipen/scripts/tcr/TCRDock.py +110 -0
- biopipen/scripts/tcr/TESSA.R +102 -118
- biopipen/scripts/tcr/VJUsage.R +5 -5
- biopipen/scripts/tcr/immunarch-patched.R +142 -0
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/TruvariBench.sh +14 -7
- biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
- biopipen/scripts/vcf/TruvariConsistency.R +1 -1
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +13 -4
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
- biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
- biopipen/scripts/web/gcloud_common.py +49 -0
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.py +146 -20
- biopipen/utils/reference.py +64 -20
- biopipen/utils/reporter.py +177 -0
- biopipen/utils/vcf.py +1 -1
- biopipen-0.34.26.dist-info/METADATA +27 -0
- biopipen-0.34.26.dist-info/RECORD +292 -0
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
- biopipen/ns/bcftools.py +0 -111
- biopipen/ns/scrna_basic.py +0 -255
- biopipen/reports/delim/SampleInfo.svelte +0 -36
- biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
- biopipen/reports/scrna/ScFGSEA.svelte +0 -35
- biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
- biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
- biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
- biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
- biopipen/reports/utils/gsea.liq +0 -110
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
- biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
- biopipen/scripts/scrna/ExprImpution.R +0 -7
- biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
- biopipen/scripts/scrna/Write10X.R +0 -11
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
- biopipen/scripts/tcr/TCRClustering.R +0 -280
- biopipen/utils/common_docstrs.py +0 -61
- biopipen/utils/gene.R +0 -49
- biopipen/utils/gsea.R +0 -193
- biopipen/utils/io.R +0 -20
- biopipen/utils/misc.R +0 -114
- biopipen/utils/mutate_helpers.R +0 -433
- biopipen/utils/plot.R +0 -173
- biopipen/utils/rnaseq.R +0 -48
- biopipen/utils/single_cell.R +0 -115
- biopipen-0.21.0.dist-info/METADATA +0 -22
- biopipen-0.21.0.dist-info/RECORD +0 -218
|
@@ -24,7 +24,6 @@
|
|
|
24
24
|
import sys, os, re, resource
|
|
25
25
|
from os import path
|
|
26
26
|
import numpy as np
|
|
27
|
-
from Bio.SubsMat.MatrixInfo import blosum62
|
|
28
27
|
import time
|
|
29
28
|
from time import gmtime, strftime
|
|
30
29
|
from operator import itemgetter
|
|
@@ -36,255 +35,585 @@ from sklearn.decomposition import PCA
|
|
|
36
35
|
from sklearn.manifold import MDS
|
|
37
36
|
import faiss
|
|
38
37
|
from query import *
|
|
38
|
+
try:
|
|
39
|
+
from Bio.Align import substitution_matrices
|
|
40
|
+
blosum62 = substitution_matrices.load("BLOSUM62")
|
|
41
|
+
_tmp = {}
|
|
42
|
+
for ab1 in blosum62.alphabet:
|
|
43
|
+
for ab2 in blosum62.alphabet:
|
|
44
|
+
_tmp[(ab1, ab2)] = int(blosum62[(ab1, ab2)])
|
|
45
|
+
blosum62 = _tmp
|
|
46
|
+
except ModuleNotFoundError:
|
|
47
|
+
from Bio.SubsMat.MatrixInfo import blosum62
|
|
39
48
|
|
|
40
|
-
AAstring=
|
|
41
|
-
AAstringList=list(AAstring)
|
|
42
|
-
cur_dir=os.path.dirname(os.path.realpath(__file__))+
|
|
49
|
+
AAstring = "ACDEFGHIKLMNPQRSTVWY"
|
|
50
|
+
AAstringList = list(AAstring)
|
|
51
|
+
cur_dir = os.path.dirname(os.path.realpath(__file__)) + "/"
|
|
43
52
|
|
|
44
|
-
blosum62n={}
|
|
53
|
+
blosum62n = {}
|
|
45
54
|
for kk in blosum62:
|
|
46
|
-
a1=kk[0]
|
|
47
|
-
a2=kk[1]
|
|
48
|
-
vv=blosum62[kk]
|
|
49
|
-
if vv>4:
|
|
50
|
-
vv=4
|
|
51
|
-
blosum62n[(a1,a2)]=vv
|
|
55
|
+
a1 = kk[0]
|
|
56
|
+
a2 = kk[1]
|
|
57
|
+
vv = blosum62[kk]
|
|
58
|
+
if vv > 4:
|
|
59
|
+
vv = 4
|
|
60
|
+
blosum62n[(a1, a2)] = vv
|
|
52
61
|
if a1 != a2:
|
|
53
|
-
blosum62n[(a2,a1)]=vv
|
|
54
|
-
|
|
55
|
-
bl62={
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
62
|
+
blosum62n[(a2, a1)] = vv
|
|
63
|
+
|
|
64
|
+
bl62 = {
|
|
65
|
+
"A": [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],
|
|
66
|
+
"R": [-1, 4, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],
|
|
67
|
+
"N": [-2, 0, 4, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],
|
|
68
|
+
"D": [-2, -2, 1, 4, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],
|
|
69
|
+
"C": [0, -3, -3, -3, 4, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
|
|
70
|
+
"Q": [-1, 1, 0, 0, -3, 4, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],
|
|
71
|
+
"E": [-1, 0, 0, 2, -4, 2, 4, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],
|
|
72
|
+
"G": [0, -2, 0, -1, -3, -2, -2, 4, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],
|
|
73
|
+
"H": [-2, 0, 1, -1, -3, 0, 0, -2, 4, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],
|
|
74
|
+
"I": [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],
|
|
75
|
+
"L": [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],
|
|
76
|
+
"K": [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 4, -1, -3, -1, 0, -1, -3, -2, -2],
|
|
77
|
+
"M": [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 4, 0, -2, -1, -1, -1, -1, 1],
|
|
78
|
+
"F": [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 4, -4, -2, -2, 1, 3, -1],
|
|
79
|
+
"P": [
|
|
80
|
+
-1,
|
|
81
|
+
-2,
|
|
82
|
+
-2,
|
|
83
|
+
-1,
|
|
84
|
+
-3,
|
|
85
|
+
-1,
|
|
86
|
+
-1,
|
|
87
|
+
-2,
|
|
88
|
+
-2,
|
|
89
|
+
-3,
|
|
90
|
+
-3,
|
|
91
|
+
-1,
|
|
92
|
+
-2,
|
|
93
|
+
-4,
|
|
94
|
+
4,
|
|
95
|
+
-1,
|
|
96
|
+
-1,
|
|
97
|
+
-4,
|
|
98
|
+
-3,
|
|
99
|
+
-2,
|
|
100
|
+
],
|
|
101
|
+
"S": [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],
|
|
102
|
+
"T": [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 4, -2, -2, 0],
|
|
103
|
+
"W": [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 4, 2, -3],
|
|
104
|
+
"Y": [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 4, -1],
|
|
105
|
+
"V": [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
bl62c = np.array([np.array(x) for x in list(bl62.values())])
|
|
109
|
+
bl62c = 4 - bl62c
|
|
110
|
+
|
|
111
|
+
embedding = MDS(
|
|
112
|
+
n_components=13, n_init=100, max_iter=1000, eps=0.00001, dissimilarity="precomputed"
|
|
113
|
+
)
|
|
114
|
+
X = embedding.fit_transform(bl62c)
|
|
115
|
+
|
|
116
|
+
bl62np = {}
|
|
117
|
+
vkk = list(bl62.keys())
|
|
84
118
|
for ii in range(20):
|
|
85
|
-
kk=vkk[ii]
|
|
86
|
-
bl62np[kk]=np.array(list(X[ii,])+[0]*17)
|
|
119
|
+
kk = vkk[ii]
|
|
120
|
+
bl62np[kk] = np.array(list(X[ii,]) + [0] * 17)
|
|
87
121
|
|
|
88
|
-
|
|
89
|
-
AAencodingDict={}
|
|
122
|
+
|
|
123
|
+
AAencodingDict = {}
|
|
90
124
|
for ii in range(len(AAstringList)):
|
|
91
|
-
aa=AAstringList[ii]
|
|
92
|
-
CODE=[0]*(ii)+[1]+[0]*(20-ii)
|
|
93
|
-
AAencodingDict[aa]=np.array(CODE)
|
|
94
|
-
|
|
95
|
-
Ndim=16 ## optimized for isometric embedding
|
|
96
|
-
n0=Ndim*6
|
|
97
|
-
#M0=np.concatenate((np.concatenate((ZERO,M1),axis=1),np.concatenate((M1, ZERO),axis=1)))
|
|
98
|
-
ZERO=np.zeros((Ndim,Ndim))
|
|
99
|
-
II=np.eye(Ndim)
|
|
100
|
-
M0
|
|
125
|
+
aa = AAstringList[ii]
|
|
126
|
+
CODE = [0] * (ii) + [1] + [0] * (20 - ii)
|
|
127
|
+
AAencodingDict[aa] = np.array(CODE)
|
|
128
|
+
|
|
129
|
+
Ndim = 16 ## optimized for isometric embedding
|
|
130
|
+
n0 = Ndim * 6
|
|
131
|
+
# M0=np.concatenate((np.concatenate((ZERO,M1),axis=1),np.concatenate((M1, ZERO),axis=1)))
|
|
132
|
+
ZERO = np.zeros((Ndim, Ndim))
|
|
133
|
+
II = np.eye(Ndim)
|
|
134
|
+
M0 = np.concatenate(
|
|
135
|
+
(
|
|
136
|
+
np.concatenate((ZERO, ZERO, II), axis=1),
|
|
137
|
+
np.concatenate((II, ZERO, ZERO), axis=1),
|
|
138
|
+
np.concatenate((ZERO, II, ZERO), axis=1),
|
|
139
|
+
)
|
|
140
|
+
)
|
|
101
141
|
## Construct 6-th order cyclic group
|
|
102
|
-
ZERO45=np.zeros((Ndim*3,Ndim*3))
|
|
103
|
-
M6=np.concatenate(
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
142
|
+
ZERO45 = np.zeros((Ndim * 3, Ndim * 3))
|
|
143
|
+
M6 = np.concatenate(
|
|
144
|
+
(np.concatenate((ZERO45, M0), axis=1), np.concatenate((M0, ZERO45), axis=1))
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
X = np.array(
|
|
148
|
+
[
|
|
149
|
+
[
|
|
150
|
+
-0.31230882,
|
|
151
|
+
-0.53572156,
|
|
152
|
+
-0.01949946,
|
|
153
|
+
-0.12211268,
|
|
154
|
+
-0.70947917,
|
|
155
|
+
-0.42211092,
|
|
156
|
+
0.02783931,
|
|
157
|
+
0.02637933,
|
|
158
|
+
-0.41760305,
|
|
159
|
+
0.21809875,
|
|
160
|
+
0.53532768,
|
|
161
|
+
0.04833016,
|
|
162
|
+
0.07877711,
|
|
163
|
+
0.50464914,
|
|
164
|
+
-0.26972087,
|
|
165
|
+
-0.52416842,
|
|
166
|
+
],
|
|
167
|
+
[
|
|
168
|
+
0.29672002,
|
|
169
|
+
0.29005364,
|
|
170
|
+
0.18176298,
|
|
171
|
+
-0.05103382,
|
|
172
|
+
-0.34686519,
|
|
173
|
+
0.58024228,
|
|
174
|
+
-0.49282931,
|
|
175
|
+
0.62304281,
|
|
176
|
+
-0.09575202,
|
|
177
|
+
0.30115555,
|
|
178
|
+
0.09913529,
|
|
179
|
+
0.1577466,
|
|
180
|
+
-0.94391939,
|
|
181
|
+
-0.10505925,
|
|
182
|
+
0.05482389,
|
|
183
|
+
0.38409897,
|
|
184
|
+
],
|
|
185
|
+
[
|
|
186
|
+
-0.42212537,
|
|
187
|
+
0.12225749,
|
|
188
|
+
0.16279646,
|
|
189
|
+
0.60099009,
|
|
190
|
+
0.19734216,
|
|
191
|
+
0.42819919,
|
|
192
|
+
-0.33562418,
|
|
193
|
+
0.17036334,
|
|
194
|
+
0.4234109,
|
|
195
|
+
0.46681561,
|
|
196
|
+
-0.50347222,
|
|
197
|
+
-0.37936876,
|
|
198
|
+
0.1494825,
|
|
199
|
+
0.32176759,
|
|
200
|
+
0.28584684,
|
|
201
|
+
0.68469861,
|
|
202
|
+
],
|
|
203
|
+
[
|
|
204
|
+
0.18599294,
|
|
205
|
+
-0.44017825,
|
|
206
|
+
-0.4476952,
|
|
207
|
+
0.34340976,
|
|
208
|
+
0.44603553,
|
|
209
|
+
0.40974629,
|
|
210
|
+
-0.60045935,
|
|
211
|
+
-0.09056728,
|
|
212
|
+
0.22147919,
|
|
213
|
+
-0.33029418,
|
|
214
|
+
0.55635594,
|
|
215
|
+
-0.54149972,
|
|
216
|
+
0.05459062,
|
|
217
|
+
0.57334159,
|
|
218
|
+
-0.06227118,
|
|
219
|
+
0.65299872,
|
|
220
|
+
],
|
|
221
|
+
[
|
|
222
|
+
-0.19010428,
|
|
223
|
+
0.64418792,
|
|
224
|
+
-0.85286762,
|
|
225
|
+
0.21380295,
|
|
226
|
+
0.37639516,
|
|
227
|
+
-0.67753593,
|
|
228
|
+
0.38751609,
|
|
229
|
+
0.55746524,
|
|
230
|
+
0.01443766,
|
|
231
|
+
0.1776535,
|
|
232
|
+
0.62853954,
|
|
233
|
+
-0.15048523,
|
|
234
|
+
0.55100206,
|
|
235
|
+
-0.21426656,
|
|
236
|
+
0.3644061,
|
|
237
|
+
-0.0018255,
|
|
238
|
+
],
|
|
239
|
+
[
|
|
240
|
+
0.7350723,
|
|
241
|
+
0.10111267,
|
|
242
|
+
0.55640019,
|
|
243
|
+
-0.18226966,
|
|
244
|
+
0.51658102,
|
|
245
|
+
-0.19321508,
|
|
246
|
+
-0.46599027,
|
|
247
|
+
-0.02989911,
|
|
248
|
+
0.4036196,
|
|
249
|
+
-0.11978213,
|
|
250
|
+
-0.29837524,
|
|
251
|
+
-0.30232765,
|
|
252
|
+
-0.36738065,
|
|
253
|
+
-0.1379793,
|
|
254
|
+
0.04362871,
|
|
255
|
+
0.33553714,
|
|
256
|
+
],
|
|
257
|
+
[
|
|
258
|
+
0.41134047,
|
|
259
|
+
0.13512443,
|
|
260
|
+
0.62492322,
|
|
261
|
+
-0.10120261,
|
|
262
|
+
-0.03093491,
|
|
263
|
+
0.23751917,
|
|
264
|
+
-0.68338694,
|
|
265
|
+
0.05124762,
|
|
266
|
+
0.41533821,
|
|
267
|
+
0.46669353,
|
|
268
|
+
0.31467277,
|
|
269
|
+
-0.02427587,
|
|
270
|
+
0.15361135,
|
|
271
|
+
0.70595112,
|
|
272
|
+
-0.27952632,
|
|
273
|
+
0.32408931,
|
|
274
|
+
],
|
|
275
|
+
[
|
|
276
|
+
-0.33041265,
|
|
277
|
+
-0.43860065,
|
|
278
|
+
-0.5509376,
|
|
279
|
+
-0.04380843,
|
|
280
|
+
-0.35160935,
|
|
281
|
+
0.25134855,
|
|
282
|
+
0.53409314,
|
|
283
|
+
0.54850824,
|
|
284
|
+
0.59490287,
|
|
285
|
+
0.32669345,
|
|
286
|
+
-0.45355268,
|
|
287
|
+
-0.56317041,
|
|
288
|
+
-0.55416297,
|
|
289
|
+
0.18117841,
|
|
290
|
+
-0.71600849,
|
|
291
|
+
-0.08989825,
|
|
292
|
+
],
|
|
293
|
+
[
|
|
294
|
+
-0.40366849,
|
|
295
|
+
0.10978974,
|
|
296
|
+
0.0280101,
|
|
297
|
+
-0.46667987,
|
|
298
|
+
-0.45607028,
|
|
299
|
+
0.54114052,
|
|
300
|
+
-0.77552923,
|
|
301
|
+
-0.10720425,
|
|
302
|
+
0.55252091,
|
|
303
|
+
-0.34397153,
|
|
304
|
+
-0.59813694,
|
|
305
|
+
0.15567728,
|
|
306
|
+
0.03071009,
|
|
307
|
+
-0.02176143,
|
|
308
|
+
0.34442719,
|
|
309
|
+
0.14681541,
|
|
310
|
+
],
|
|
311
|
+
[
|
|
312
|
+
0.19280422,
|
|
313
|
+
0.35777863,
|
|
314
|
+
0.06139255,
|
|
315
|
+
0.20081699,
|
|
316
|
+
-0.30546596,
|
|
317
|
+
-0.56901549,
|
|
318
|
+
-0.15290953,
|
|
319
|
+
-0.31181573,
|
|
320
|
+
-0.74523217,
|
|
321
|
+
0.22296016,
|
|
322
|
+
-0.39143832,
|
|
323
|
+
-0.16474685,
|
|
324
|
+
0.58064427,
|
|
325
|
+
-0.77386654,
|
|
326
|
+
0.19713107,
|
|
327
|
+
-0.49477418,
|
|
328
|
+
],
|
|
329
|
+
[
|
|
330
|
+
-0.16133903,
|
|
331
|
+
0.22112761,
|
|
332
|
+
-0.53162136,
|
|
333
|
+
0.34764073,
|
|
334
|
+
-0.08522381,
|
|
335
|
+
-0.2510216,
|
|
336
|
+
0.04699411,
|
|
337
|
+
-0.25702389,
|
|
338
|
+
-0.8739765,
|
|
339
|
+
-0.24171728,
|
|
340
|
+
-0.24370533,
|
|
341
|
+
0.42193635,
|
|
342
|
+
0.41056913,
|
|
343
|
+
-0.60378211,
|
|
344
|
+
-0.65756832,
|
|
345
|
+
0.0845203,
|
|
346
|
+
],
|
|
347
|
+
[
|
|
348
|
+
-0.34792144,
|
|
349
|
+
0.18450939,
|
|
350
|
+
0.77038332,
|
|
351
|
+
0.63868511,
|
|
352
|
+
-0.06221681,
|
|
353
|
+
0.11930421,
|
|
354
|
+
0.04895523,
|
|
355
|
+
-0.22463059,
|
|
356
|
+
-0.03268844,
|
|
357
|
+
-0.58941354,
|
|
358
|
+
0.11640045,
|
|
359
|
+
0.32384901,
|
|
360
|
+
-0.42952779,
|
|
361
|
+
0.58119471,
|
|
362
|
+
0.07288662,
|
|
363
|
+
0.26669673,
|
|
364
|
+
],
|
|
365
|
+
[
|
|
366
|
+
0.01834555,
|
|
367
|
+
-0.16367754,
|
|
368
|
+
0.34900298,
|
|
369
|
+
0.45087949,
|
|
370
|
+
0.47073855,
|
|
371
|
+
-0.37377404,
|
|
372
|
+
0.0606911,
|
|
373
|
+
0.2455703,
|
|
374
|
+
-0.55182937,
|
|
375
|
+
-0.20261009,
|
|
376
|
+
0.28325423,
|
|
377
|
+
-0.04741146,
|
|
378
|
+
0.30565238,
|
|
379
|
+
-0.62090653,
|
|
380
|
+
0.17528413,
|
|
381
|
+
-0.60434975,
|
|
382
|
+
],
|
|
383
|
+
[
|
|
384
|
+
-0.55464981,
|
|
385
|
+
0.50918784,
|
|
386
|
+
-0.21371646,
|
|
387
|
+
-0.63996967,
|
|
388
|
+
-0.37656862,
|
|
389
|
+
0.27852662,
|
|
390
|
+
0.3287838,
|
|
391
|
+
-0.56800869,
|
|
392
|
+
0.23260763,
|
|
393
|
+
-0.20653106,
|
|
394
|
+
0.63261439,
|
|
395
|
+
-0.22666691,
|
|
396
|
+
0.00726302,
|
|
397
|
+
-0.60125196,
|
|
398
|
+
0.07139961,
|
|
399
|
+
-0.35086639,
|
|
400
|
+
],
|
|
401
|
+
[
|
|
402
|
+
0.94039731,
|
|
403
|
+
-0.25999326,
|
|
404
|
+
0.43922549,
|
|
405
|
+
-0.485738,
|
|
406
|
+
-0.20492235,
|
|
407
|
+
-0.26005626,
|
|
408
|
+
0.68776626,
|
|
409
|
+
0.57826888,
|
|
410
|
+
-0.05973995,
|
|
411
|
+
-0.1193658,
|
|
412
|
+
-0.12102433,
|
|
413
|
+
-0.22091354,
|
|
414
|
+
0.43427913,
|
|
415
|
+
0.71447886,
|
|
416
|
+
0.32745991,
|
|
417
|
+
0.03466398,
|
|
418
|
+
],
|
|
419
|
+
[
|
|
420
|
+
-0.13194625,
|
|
421
|
+
-0.12262688,
|
|
422
|
+
0.18029209,
|
|
423
|
+
0.16555524,
|
|
424
|
+
0.39594125,
|
|
425
|
+
-0.58110665,
|
|
426
|
+
0.16161717,
|
|
427
|
+
0.0839783,
|
|
428
|
+
0.0911945,
|
|
429
|
+
0.34546976,
|
|
430
|
+
-0.29415349,
|
|
431
|
+
0.29891936,
|
|
432
|
+
-0.60834721,
|
|
433
|
+
0.5943593,
|
|
434
|
+
-0.29473819,
|
|
435
|
+
0.4864154,
|
|
436
|
+
],
|
|
437
|
+
[
|
|
438
|
+
0.40850093,
|
|
439
|
+
-0.4638894,
|
|
440
|
+
-0.39732987,
|
|
441
|
+
-0.01972861,
|
|
442
|
+
0.51189582,
|
|
443
|
+
0.10176704,
|
|
444
|
+
0.37528519,
|
|
445
|
+
-0.41479418,
|
|
446
|
+
-0.1932531,
|
|
447
|
+
0.54732221,
|
|
448
|
+
-0.11876511,
|
|
449
|
+
0.32843973,
|
|
450
|
+
-0.259283,
|
|
451
|
+
0.59500132,
|
|
452
|
+
0.35168375,
|
|
453
|
+
-0.21733727,
|
|
454
|
+
],
|
|
455
|
+
[
|
|
456
|
+
-0.50627723,
|
|
457
|
+
-0.1973602,
|
|
458
|
+
-0.02339884,
|
|
459
|
+
-0.66846048,
|
|
460
|
+
0.62696606,
|
|
461
|
+
0.60049717,
|
|
462
|
+
0.69143364,
|
|
463
|
+
-0.48053591,
|
|
464
|
+
0.17812208,
|
|
465
|
+
-0.58481821,
|
|
466
|
+
-0.23551415,
|
|
467
|
+
-0.06229112,
|
|
468
|
+
0.20993116,
|
|
469
|
+
-0.72485884,
|
|
470
|
+
0.34375662,
|
|
471
|
+
-0.23539168,
|
|
472
|
+
],
|
|
473
|
+
[
|
|
474
|
+
-0.51388312,
|
|
475
|
+
-0.2788953,
|
|
476
|
+
0.00859533,
|
|
477
|
+
-0.5247195,
|
|
478
|
+
-0.18021544,
|
|
479
|
+
0.28372911,
|
|
480
|
+
0.10791359,
|
|
481
|
+
0.13033494,
|
|
482
|
+
0.34294013,
|
|
483
|
+
-0.70310089,
|
|
484
|
+
-0.13245433,
|
|
485
|
+
0.48661081,
|
|
486
|
+
0.08451644,
|
|
487
|
+
-0.69990992,
|
|
488
|
+
0.0408274,
|
|
489
|
+
-0.47204888,
|
|
490
|
+
],
|
|
491
|
+
[
|
|
492
|
+
0.68546275,
|
|
493
|
+
0.22581365,
|
|
494
|
+
-0.32571833,
|
|
495
|
+
0.34394298,
|
|
496
|
+
-0.43232367,
|
|
497
|
+
-0.5041842,
|
|
498
|
+
0.04784017,
|
|
499
|
+
-0.53067936,
|
|
500
|
+
-0.50049908,
|
|
501
|
+
0.36874221,
|
|
502
|
+
0.22429186,
|
|
503
|
+
0.4616482,
|
|
504
|
+
0.11159174,
|
|
505
|
+
-0.26827959,
|
|
506
|
+
-0.39372848,
|
|
507
|
+
-0.40987423,
|
|
508
|
+
],
|
|
509
|
+
]
|
|
510
|
+
)
|
|
511
|
+
|
|
512
|
+
bl62np = {}
|
|
513
|
+
vkk = list(bl62.keys())
|
|
188
514
|
for ii in range(20):
|
|
189
|
-
kk=vkk[ii]
|
|
190
|
-
bl62np[kk]=np.array(list(X[ii,])+[0]*Ndim*5)
|
|
515
|
+
kk = vkk[ii]
|
|
516
|
+
bl62np[kk] = np.array(list(X[ii,]) + [0] * Ndim * 5)
|
|
517
|
+
|
|
191
518
|
|
|
192
519
|
def EncodingCDR3(s, M, n0):
|
|
193
|
-
sL=list(s)
|
|
194
|
-
x=np.array([0]*n0)
|
|
520
|
+
sL = list(s)
|
|
521
|
+
x = np.array([0] * n0)
|
|
195
522
|
for ii in range(len(sL)):
|
|
196
|
-
x = np.dot(M, (x+bl62np[sL[ii]]))
|
|
523
|
+
x = np.dot(M, (x + bl62np[sL[ii]]))
|
|
197
524
|
return x
|
|
198
525
|
|
|
526
|
+
|
|
199
527
|
def BuildLengthDict(seqs, sIDs, vGene=[], INFO=[]):
|
|
200
|
-
LLs=[10,11,12,13,14,15,16,17,18,19,20,21,22,23,24]
|
|
201
|
-
LengthD={}
|
|
202
|
-
SeqD={}
|
|
203
|
-
VgeneD={}
|
|
204
|
-
InfoD={}
|
|
205
|
-
AAs=set(list(AAencodingDict.keys()))
|
|
206
|
-
NAs=len(AAencodingDict)
|
|
207
|
-
cNAs=0
|
|
528
|
+
LLs = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
|
|
529
|
+
LengthD = {}
|
|
530
|
+
SeqD = {}
|
|
531
|
+
VgeneD = {}
|
|
532
|
+
InfoD = {}
|
|
533
|
+
AAs = set(list(AAencodingDict.keys()))
|
|
534
|
+
NAs = len(AAencodingDict)
|
|
535
|
+
cNAs = 0
|
|
208
536
|
for ii in range(len(seqs)):
|
|
209
|
-
ID=sIDs[ii]
|
|
210
|
-
ss=seqs[ii]
|
|
211
|
-
ssAA=set(list(ss))
|
|
212
|
-
TMP=list(ssAA | AAs)
|
|
537
|
+
ID = sIDs[ii]
|
|
538
|
+
ss = seqs[ii]
|
|
539
|
+
ssAA = set(list(ss))
|
|
540
|
+
TMP = list(ssAA | AAs)
|
|
213
541
|
if len(TMP) > NAs:
|
|
214
542
|
## CDR3 containing non amino acid letter
|
|
215
|
-
#print('Warning: CDR3: '+ss + ' contains non amino acid letter!')
|
|
216
|
-
cNAs+=1
|
|
543
|
+
# print('Warning: CDR3: '+ss + ' contains non amino acid letter!')
|
|
544
|
+
cNAs += 1
|
|
217
545
|
continue
|
|
218
|
-
if len(vGene)>0:
|
|
219
|
-
vv=vGene[ii]
|
|
220
|
-
if len(INFO)>0:
|
|
221
|
-
info=INFO[ii]
|
|
222
|
-
L=len(ss)
|
|
546
|
+
if len(vGene) > 0:
|
|
547
|
+
vv = vGene[ii]
|
|
548
|
+
if len(INFO) > 0:
|
|
549
|
+
info = INFO[ii]
|
|
550
|
+
L = len(ss)
|
|
223
551
|
if L not in LLs:
|
|
224
552
|
continue
|
|
225
553
|
if L not in LengthD:
|
|
226
|
-
LengthD[L]=[ID]
|
|
227
|
-
SeqD[L]=[ss]
|
|
228
|
-
if len(vGene)>0:
|
|
229
|
-
VgeneD[L]=[vv]
|
|
230
|
-
if len(INFO)>0:
|
|
231
|
-
InfoD[L]=[info]
|
|
554
|
+
LengthD[L] = [ID]
|
|
555
|
+
SeqD[L] = [ss]
|
|
556
|
+
if len(vGene) > 0:
|
|
557
|
+
VgeneD[L] = [vv]
|
|
558
|
+
if len(INFO) > 0:
|
|
559
|
+
InfoD[L] = [info]
|
|
232
560
|
else:
|
|
233
561
|
LengthD[L].append(ID)
|
|
234
562
|
SeqD[L].append(ss)
|
|
235
|
-
if len(vGene)>0:
|
|
563
|
+
if len(vGene) > 0:
|
|
236
564
|
VgeneD[L].append(vv)
|
|
237
|
-
if len(INFO)>0:
|
|
565
|
+
if len(INFO) > 0:
|
|
238
566
|
InfoD[L].append(info)
|
|
239
|
-
if cNAs>0:
|
|
240
|
-
print("Warning: Skipped %d sequences with non AA letter!" %(cNAs))
|
|
567
|
+
if cNAs > 0:
|
|
568
|
+
print("Warning: Skipped %d sequences with non AA letter!" % (cNAs))
|
|
241
569
|
return LengthD, VgeneD, InfoD, SeqD
|
|
242
570
|
|
|
571
|
+
|
|
243
572
|
def CollapseUnique(LD, VD, ID, SD):
|
|
244
|
-
kks=LD.keys()
|
|
245
|
-
LDu={}
|
|
246
|
-
VDu={}
|
|
247
|
-
IDu={}
|
|
248
|
-
SDu={}
|
|
573
|
+
kks = LD.keys()
|
|
574
|
+
LDu = {}
|
|
575
|
+
VDu = {}
|
|
576
|
+
IDu = {}
|
|
577
|
+
SDu = {}
|
|
249
578
|
for kk in kks:
|
|
250
|
-
vvL=list(LD[kk])
|
|
251
|
-
if len(VD)>0:
|
|
252
|
-
vvV=list(VD[kk])
|
|
579
|
+
vvL = list(LD[kk])
|
|
580
|
+
if len(VD) > 0:
|
|
581
|
+
vvV = list(VD[kk])
|
|
253
582
|
else:
|
|
254
|
-
vvV=[
|
|
255
|
-
vvI=list(ID[kk])
|
|
256
|
-
vvS=list(SD[kk])
|
|
257
|
-
zz=zip(vvL, vvS, vvV, vvI)
|
|
258
|
-
zzs=sorted(zz, key
|
|
259
|
-
nz=len(zzs)
|
|
260
|
-
pointer_pre=0
|
|
261
|
-
pointer_cur=1
|
|
262
|
-
s_pre=zzs[pointer_pre][1]
|
|
263
|
-
v_pre=zzs[pointer_pre][2]
|
|
264
|
-
uS=[s_pre]
|
|
265
|
-
uV=[v_pre]
|
|
266
|
-
uI=[[zzs[pointer_pre][3]]]
|
|
583
|
+
vvV = ["TRBV2-1*01"] * len(vvL)
|
|
584
|
+
vvI = list(ID[kk])
|
|
585
|
+
vvS = list(SD[kk])
|
|
586
|
+
zz = zip(vvL, vvS, vvV, vvI)
|
|
587
|
+
zzs = sorted(zz, key=lambda x: (x[1], x[2]))
|
|
588
|
+
nz = len(zzs)
|
|
589
|
+
pointer_pre = 0
|
|
590
|
+
pointer_cur = 1
|
|
591
|
+
s_pre = zzs[pointer_pre][1]
|
|
592
|
+
v_pre = zzs[pointer_pre][2]
|
|
593
|
+
uS = [s_pre]
|
|
594
|
+
uV = [v_pre]
|
|
595
|
+
uI = [[zzs[pointer_pre][3]]]
|
|
267
596
|
while pointer_cur < nz:
|
|
268
|
-
s_cur=zzs[pointer_cur][1]
|
|
269
|
-
v_cur=zzs[pointer_cur][2]
|
|
597
|
+
s_cur = zzs[pointer_cur][1]
|
|
598
|
+
v_cur = zzs[pointer_cur][2]
|
|
270
599
|
if s_cur == s_pre and v_cur == v_pre:
|
|
271
|
-
uI[len(uI)-1].append(zzs[pointer_cur][3])
|
|
600
|
+
uI[len(uI) - 1].append(zzs[pointer_cur][3])
|
|
272
601
|
pointer_cur += 1
|
|
273
602
|
continue
|
|
274
603
|
else:
|
|
275
604
|
uS.append(s_cur)
|
|
276
605
|
uV.append(v_cur)
|
|
277
606
|
uI.append([zzs[pointer_cur][3]])
|
|
278
|
-
s_pre=s_cur
|
|
279
|
-
v_pre=v_cur
|
|
280
|
-
pointer_pre=pointer_cur
|
|
607
|
+
s_pre = s_cur
|
|
608
|
+
v_pre = v_cur
|
|
609
|
+
pointer_pre = pointer_cur
|
|
281
610
|
pointer_cur += 1
|
|
282
|
-
uL=[x for x in range(len(uS))]
|
|
283
|
-
LDu[kk]=uL
|
|
284
|
-
SDu[kk]=uS
|
|
285
|
-
if len(VD)>0:
|
|
286
|
-
VDu[kk]=uV
|
|
287
|
-
IDu[kk]=uI
|
|
611
|
+
uL = [x for x in range(len(uS))]
|
|
612
|
+
LDu[kk] = uL
|
|
613
|
+
SDu[kk] = uS
|
|
614
|
+
if len(VD) > 0:
|
|
615
|
+
VDu[kk] = uV
|
|
616
|
+
IDu[kk] = uI
|
|
288
617
|
return LDu, VDu, IDu, SDu
|
|
289
618
|
|
|
290
619
|
|
|
@@ -296,14 +625,15 @@ class CDR3:
|
|
|
296
625
|
## KS: Kmer size
|
|
297
626
|
## st: the first 0:(st-1) amino acids will not be included in K-merization
|
|
298
627
|
## ed: the last L-ed amino acids will be skipped
|
|
299
|
-
self.s=s
|
|
300
|
-
self.ID=sID
|
|
301
|
-
L=len(s)
|
|
302
|
-
self.L=L
|
|
303
|
-
sub_s=s[st: (L-ed)]
|
|
304
|
-
Ls=len(sub_s)
|
|
305
|
-
Kmer=[sub_s[x:(x+KS)] for x in range(0,Ls-KS+1)]
|
|
306
|
-
self.Kmer=Kmer
|
|
628
|
+
self.s = s
|
|
629
|
+
self.ID = sID
|
|
630
|
+
L = len(s)
|
|
631
|
+
self.L = L
|
|
632
|
+
sub_s = s[st : (L - ed)]
|
|
633
|
+
Ls = len(sub_s)
|
|
634
|
+
Kmer = [sub_s[x : (x + KS)] for x in range(0, Ls - KS + 1)]
|
|
635
|
+
self.Kmer = Kmer
|
|
636
|
+
|
|
307
637
|
|
|
308
638
|
class KmerSet:
|
|
309
639
|
## Kmer set for fast read searching based on mismatch-allowed Kmer index
|
|
@@ -312,263 +642,277 @@ class KmerSet:
|
|
|
312
642
|
## Seqs and sIDs must have the same length
|
|
313
643
|
if len(Seqs) != len(sIDs):
|
|
314
644
|
raise "Sequence and ID lists have different length. Please check input."
|
|
315
|
-
KmerDict={}
|
|
316
|
-
N=len(Seqs)
|
|
317
|
-
self.N=N
|
|
318
|
-
CDR3Dict={}
|
|
319
|
-
LLs=[]
|
|
320
|
-
for ii in range(0,N):
|
|
321
|
-
s=Seqs[ii]
|
|
322
|
-
sID=sIDs[ii]
|
|
323
|
-
cc=CDR3(s,sID,KS,st,ed)
|
|
324
|
-
CDR3Dict[cc.ID]=cc.Kmer
|
|
325
|
-
KK=cc.Kmer
|
|
645
|
+
KmerDict = {}
|
|
646
|
+
N = len(Seqs)
|
|
647
|
+
self.N = N
|
|
648
|
+
CDR3Dict = {}
|
|
649
|
+
LLs = []
|
|
650
|
+
for ii in range(0, N):
|
|
651
|
+
s = Seqs[ii]
|
|
652
|
+
sID = sIDs[ii]
|
|
653
|
+
cc = CDR3(s, sID, KS, st, ed)
|
|
654
|
+
CDR3Dict[cc.ID] = cc.Kmer
|
|
655
|
+
KK = cc.Kmer
|
|
326
656
|
LLs.append(cc.L)
|
|
327
657
|
for kk in KK:
|
|
328
658
|
if kk not in KmerDict:
|
|
329
|
-
KmerDict[kk]=[sID]
|
|
659
|
+
KmerDict[kk] = [sID]
|
|
330
660
|
else:
|
|
331
661
|
KmerDict[kk].append(sID)
|
|
332
|
-
self.KD=KmerDict
|
|
333
|
-
self.KS=KS
|
|
334
|
-
self.CD=CDR3Dict
|
|
335
|
-
self.LL=LLs
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
662
|
+
self.KD = KmerDict
|
|
663
|
+
self.KS = KS
|
|
664
|
+
self.CD = CDR3Dict
|
|
665
|
+
self.LL = LLs
|
|
666
|
+
|
|
667
|
+
def FindKmerNeighbor(self, kk):
|
|
668
|
+
KS = self.KS
|
|
669
|
+
KS_n1 = []
|
|
339
670
|
for jj in range(KS):
|
|
340
|
-
kk_pre=[kk[0:jj]]*20
|
|
341
|
-
kk_suf=[kk[(jj+1):KS]]*20
|
|
342
|
-
kkn=list(zip(kk_pre,AAstringList,kk_suf))
|
|
343
|
-
KS_n1+=[
|
|
671
|
+
kk_pre = [kk[0:jj]] * 20
|
|
672
|
+
kk_suf = [kk[(jj + 1) : KS]] * 20
|
|
673
|
+
kkn = list(zip(kk_pre, AAstringList, kk_suf))
|
|
674
|
+
KS_n1 += ["".join(list(x)) for x in kkn]
|
|
344
675
|
return KS_n1
|
|
345
|
-
|
|
676
|
+
|
|
677
|
+
def FindKmerNeighbor2(self, kk):
|
|
346
678
|
## KS>=6, allowing 2 mismatches. CDR3 length must be >= 10
|
|
347
|
-
KS=self.KS
|
|
348
|
-
KS_n1=[]
|
|
679
|
+
KS = self.KS
|
|
680
|
+
KS_n1 = []
|
|
349
681
|
for jj in range(KS):
|
|
350
682
|
for ii in range(KS):
|
|
351
|
-
if ii<=jj:
|
|
683
|
+
if ii <= jj:
|
|
352
684
|
continue
|
|
353
|
-
kk_pre=[kk[0:jj]]*20
|
|
354
|
-
kk_mid=[kk[(jj+1):ii]]*20
|
|
355
|
-
kk_suf=[kk[(ii+1):KS]]*400
|
|
356
|
-
kkn=list(zip(kk_pre,AAstringList,kk_mid))
|
|
357
|
-
kkn=[
|
|
358
|
-
kkn=[[x]*20 for x in kkn]
|
|
359
|
-
kkn=list(chain(*kkn))
|
|
360
|
-
kkn2=list(zip(kkn, AAstringList*20, kk_suf))
|
|
361
|
-
kkn2=[
|
|
362
|
-
KS_n1+=kkn2
|
|
685
|
+
kk_pre = [kk[0:jj]] * 20
|
|
686
|
+
kk_mid = [kk[(jj + 1) : ii]] * 20
|
|
687
|
+
kk_suf = [kk[(ii + 1) : KS]] * 400
|
|
688
|
+
kkn = list(zip(kk_pre, AAstringList, kk_mid))
|
|
689
|
+
kkn = ["".join(list(x)) for x in kkn]
|
|
690
|
+
kkn = [[x] * 20 for x in kkn]
|
|
691
|
+
kkn = list(chain(*kkn))
|
|
692
|
+
kkn2 = list(zip(kkn, AAstringList * 20, kk_suf))
|
|
693
|
+
kkn2 = ["".join(list(x)) for x in kkn2]
|
|
694
|
+
KS_n1 += kkn2
|
|
363
695
|
return KS_n1
|
|
696
|
+
|
|
364
697
|
def KmerIndex(self):
|
|
365
698
|
## For each K-mer, find its nearest neighbor with 1 character mismatch
|
|
366
|
-
KKs=list(self.KD.keys())
|
|
367
|
-
KS=self.KS
|
|
368
|
-
KKs_set=set(KKs)
|
|
369
|
-
Skk=
|
|
370
|
-
KI_Dict={}
|
|
699
|
+
KKs = list(self.KD.keys())
|
|
700
|
+
KS = self.KS
|
|
701
|
+
KKs_set = set(KKs)
|
|
702
|
+
Skk = "_".join(KKs)
|
|
703
|
+
KI_Dict = {}
|
|
371
704
|
for kk in KKs:
|
|
372
|
-
## kk_neighbor=[]
|
|
373
|
-
## for jj in range(KS):
|
|
374
|
-
## kk_pre=kk[0:jj]
|
|
375
|
-
## kk_suf=kk[(jj+1):KS]
|
|
376
|
-
## pat=kk_pre+'['+AAstring+']{1}'+kk_suf
|
|
377
|
-
## p=re.compile(pat)
|
|
378
|
-
## mm=[m.group() for m in p.finditer(Skk)]
|
|
379
|
-
## kk_neighbor+=mm
|
|
380
|
-
KS_n=set(self.FindKmerNeighbor(kk))
|
|
705
|
+
## kk_neighbor=[]
|
|
706
|
+
## for jj in range(KS):
|
|
707
|
+
## kk_pre=kk[0:jj]
|
|
708
|
+
## kk_suf=kk[(jj+1):KS]
|
|
709
|
+
## pat=kk_pre+'['+AAstring+']{1}'+kk_suf
|
|
710
|
+
## p=re.compile(pat)
|
|
711
|
+
## mm=[m.group() for m in p.finditer(Skk)]
|
|
712
|
+
## kk_neighbor+=mm
|
|
713
|
+
KS_n = set(self.FindKmerNeighbor(kk))
|
|
381
714
|
kk_neighbor = KS_n & KKs_set
|
|
382
|
-
KI_Dict[kk]=list(kk_neighbor)
|
|
715
|
+
KI_Dict[kk] = list(kk_neighbor)
|
|
383
716
|
return KI_Dict
|
|
717
|
+
|
|
384
718
|
def updateKD(self, KI):
|
|
385
719
|
## group sequences sharing motifs with 1-2 mismatches
|
|
386
|
-
KD=self.KD
|
|
387
|
-
KDnew={}
|
|
720
|
+
KD = self.KD
|
|
721
|
+
KDnew = {}
|
|
388
722
|
for kk in KD:
|
|
389
|
-
kkm=KI[kk]
|
|
390
|
-
vvL=itemgetter(*kkm)(KD)
|
|
391
|
-
if isinstance(vvL[0],list):
|
|
392
|
-
vvL=list(chain(*vvL))
|
|
393
|
-
KDnew[kk]=vvL
|
|
723
|
+
kkm = KI[kk]
|
|
724
|
+
vvL = itemgetter(*kkm)(KD)
|
|
725
|
+
if isinstance(vvL[0], list):
|
|
726
|
+
vvL = list(chain(*vvL))
|
|
727
|
+
KDnew[kk] = vvL
|
|
394
728
|
return KDnew
|
|
395
729
|
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
730
|
+
|
|
731
|
+
def GenerateMotifGraph(mD, seqs, seqID):
|
|
732
|
+
SeqShareGraph = {}
|
|
733
|
+
mDL = {}
|
|
399
734
|
for kk in mD:
|
|
400
|
-
vv=mD[kk]
|
|
401
|
-
LL=[]
|
|
735
|
+
vv = mD[kk]
|
|
736
|
+
LL = []
|
|
402
737
|
for v in vv:
|
|
403
738
|
LL.append(len(seqs[v]))
|
|
404
|
-
mDL[kk]=LL
|
|
739
|
+
mDL[kk] = LL
|
|
405
740
|
for kk in mD:
|
|
406
|
-
vv=mD[kk]
|
|
407
|
-
LL=mDL[kk]
|
|
408
|
-
nv=len(vv)
|
|
409
|
-
for ii in range(0,nv):
|
|
410
|
-
id_1=vv[ii]
|
|
411
|
-
L1=LL[ii]
|
|
412
|
-
for jj in range(ii,nv):
|
|
413
|
-
if jj==ii:
|
|
741
|
+
vv = mD[kk]
|
|
742
|
+
LL = mDL[kk]
|
|
743
|
+
nv = len(vv)
|
|
744
|
+
for ii in range(0, nv):
|
|
745
|
+
id_1 = vv[ii]
|
|
746
|
+
L1 = LL[ii]
|
|
747
|
+
for jj in range(ii, nv):
|
|
748
|
+
if jj == ii:
|
|
414
749
|
continue
|
|
415
|
-
id_2=vv[jj]
|
|
416
|
-
L2=LL[jj]
|
|
750
|
+
id_2 = vv[jj]
|
|
751
|
+
L2 = LL[jj]
|
|
417
752
|
if L2 != L1:
|
|
418
753
|
continue
|
|
419
754
|
if id_1 not in SeqShareGraph:
|
|
420
|
-
SeqShareGraph[id_1]=[id_2]
|
|
755
|
+
SeqShareGraph[id_1] = [id_2]
|
|
421
756
|
elif id_2 not in SeqShareGraph[id_1]:
|
|
422
757
|
SeqShareGraph[id_1].append(id_2)
|
|
423
758
|
if id_2 not in SeqShareGraph:
|
|
424
|
-
SeqShareGraph[id_2]=[id_1]
|
|
759
|
+
SeqShareGraph[id_2] = [id_1]
|
|
425
760
|
elif id_1 not in SeqShareGraph[id_2]:
|
|
426
761
|
SeqShareGraph[id_2].append(id_1)
|
|
427
762
|
return SeqShareGraph
|
|
428
763
|
|
|
764
|
+
|
|
429
765
|
def generateSSG(Kset, CDR3s, k_thr=2):
|
|
430
|
-
KD=Kset.KD
|
|
431
|
-
KI=Kset.KmerIndex()
|
|
432
|
-
KDnew=Kset.updateKD(KI)
|
|
433
|
-
CD=Kset.CD
|
|
434
|
-
LL=np.array(Kset.LL)
|
|
435
|
-
SSG={}
|
|
766
|
+
KD = Kset.KD
|
|
767
|
+
KI = Kset.KmerIndex()
|
|
768
|
+
KDnew = Kset.updateKD(KI)
|
|
769
|
+
CD = Kset.CD
|
|
770
|
+
LL = np.array(Kset.LL)
|
|
771
|
+
SSG = {}
|
|
436
772
|
for kk in CD:
|
|
437
|
-
vv=itemgetter(*CD[kk])(KDnew)
|
|
438
|
-
if isinstance(vv[0],list):
|
|
439
|
-
vv=list(chain(*vv))
|
|
440
|
-
vv1=[]
|
|
441
|
-
c=Counter(vv)
|
|
773
|
+
vv = itemgetter(*CD[kk])(KDnew)
|
|
774
|
+
if isinstance(vv[0], list):
|
|
775
|
+
vv = list(chain(*vv))
|
|
776
|
+
vv1 = []
|
|
777
|
+
c = Counter(vv)
|
|
442
778
|
for k in c:
|
|
443
|
-
if c[k]>=k_thr:
|
|
779
|
+
if c[k] >= k_thr:
|
|
444
780
|
vv1.append(k)
|
|
445
|
-
vv1=np.array(vv1)
|
|
446
|
-
if len(vv1)==0:
|
|
781
|
+
vv1 = np.array(vv1)
|
|
782
|
+
if len(vv1) == 0:
|
|
447
783
|
continue
|
|
448
|
-
cdr3=CDR3s[kk]
|
|
449
|
-
L0=len(cdr3)
|
|
450
|
-
idx=np.where(LL[vv1]==L0)[0]
|
|
451
|
-
if len(idx)==0:
|
|
784
|
+
cdr3 = CDR3s[kk]
|
|
785
|
+
L0 = len(cdr3)
|
|
786
|
+
idx = np.where(LL[vv1] == L0)[0]
|
|
787
|
+
if len(idx) == 0:
|
|
452
788
|
continue
|
|
453
|
-
vvs=list(vv1[idx])
|
|
789
|
+
vvs = list(vv1[idx])
|
|
454
790
|
vvs.remove(kk)
|
|
455
|
-
if len(vvs)>0:
|
|
456
|
-
SSG[kk]=vvs
|
|
791
|
+
if len(vvs) > 0:
|
|
792
|
+
SSG[kk] = vvs
|
|
457
793
|
return SSG
|
|
458
794
|
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
795
|
+
|
|
796
|
+
def SeqComparison(s1, s2, gap=-6):
|
|
797
|
+
n = len(s1)
|
|
798
|
+
CorList = []
|
|
799
|
+
score = 0
|
|
800
|
+
for kk in range(0, n):
|
|
801
|
+
aa = s1[kk]
|
|
802
|
+
bb = s2[kk]
|
|
803
|
+
if aa in [".", "-", "*"] or bb in [".", "-", "*"]:
|
|
804
|
+
if aa != bb:
|
|
468
805
|
score += gap
|
|
469
806
|
continue
|
|
470
|
-
if aa==bb:
|
|
471
|
-
# score += min(4,blosum62[(aa,aa)])
|
|
472
|
-
score += blosum62n[(aa,aa)]
|
|
807
|
+
if aa == bb:
|
|
808
|
+
# score += min(4,blosum62[(aa,aa)])
|
|
809
|
+
score += blosum62n[(aa, aa)]
|
|
473
810
|
continue
|
|
474
|
-
KEY=(aa,bb)
|
|
475
|
-
# if KEY not in blosum62:
|
|
476
|
-
# KEY=(bb,aa)
|
|
477
|
-
# if KEY not in blosum62:
|
|
478
|
-
# raise "Non-standard amino acid coding!"
|
|
479
|
-
score+=blosum62n[KEY]
|
|
811
|
+
KEY = (aa, bb)
|
|
812
|
+
# if KEY not in blosum62:
|
|
813
|
+
# KEY=(bb,aa)
|
|
814
|
+
# if KEY not in blosum62:
|
|
815
|
+
# raise "Non-standard amino acid coding!"
|
|
816
|
+
score += blosum62n[KEY]
|
|
480
817
|
return score
|
|
481
818
|
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
Seq1
|
|
488
|
-
Seq2
|
|
489
|
-
|
|
819
|
+
|
|
820
|
+
def NHLocalAlignment(Seq1, Seq2, gap_thr=1, gap=-6):
|
|
821
|
+
n1 = len(Seq1)
|
|
822
|
+
n2 = len(Seq2)
|
|
823
|
+
if n1 < n2:
|
|
824
|
+
Seq = Seq1
|
|
825
|
+
Seq1 = Seq2
|
|
826
|
+
Seq2 = Seq
|
|
827
|
+
nn = n2 - n1
|
|
490
828
|
else:
|
|
491
|
-
nn=n1-n2
|
|
492
|
-
if nn>gap_thr:
|
|
829
|
+
nn = n1 - n2
|
|
830
|
+
if nn > gap_thr:
|
|
493
831
|
return -1
|
|
494
|
-
SeqList1=[Seq1]
|
|
495
|
-
SeqList2=InsertGap(Seq2,nn)
|
|
496
|
-
alns=[]
|
|
497
|
-
SCOREList=[]
|
|
832
|
+
SeqList1 = [Seq1]
|
|
833
|
+
SeqList2 = InsertGap(Seq2, nn)
|
|
834
|
+
alns = []
|
|
835
|
+
SCOREList = []
|
|
498
836
|
for s1 in SeqList1:
|
|
499
837
|
for s2 in SeqList2:
|
|
500
|
-
|
|
501
|
-
maxS=max(SCOREList)
|
|
838
|
+
SCOREList.append(SeqComparison(s1, s2, gap))
|
|
839
|
+
maxS = max(SCOREList)
|
|
502
840
|
return maxS
|
|
503
841
|
|
|
504
|
-
|
|
842
|
+
|
|
843
|
+
def InsertGap(Seq, n):
|
|
505
844
|
## Insert n gaps to Seq; n<=2
|
|
506
|
-
if n==0:
|
|
845
|
+
if n == 0:
|
|
507
846
|
return [Seq]
|
|
508
|
-
ns=len(Seq)
|
|
509
|
-
SeqList=[]
|
|
510
|
-
if
|
|
511
|
-
for kk in range(0,ns+1):
|
|
512
|
-
SeqNew=Seq[0:kk]+
|
|
847
|
+
ns = len(Seq)
|
|
848
|
+
SeqList = []
|
|
849
|
+
if n == 1:
|
|
850
|
+
for kk in range(0, ns + 1):
|
|
851
|
+
SeqNew = Seq[0:kk] + "-" + Seq[kk:]
|
|
513
852
|
SeqList.append(SeqNew)
|
|
514
|
-
if
|
|
515
|
-
for kk in range(0,ns+1):
|
|
516
|
-
SeqNew=Seq[0:kk]+
|
|
517
|
-
for jj in range(0,ns+2):
|
|
518
|
-
SeqNew0=SeqNew[0:jj]+
|
|
853
|
+
if n == 2:
|
|
854
|
+
for kk in range(0, ns + 1):
|
|
855
|
+
SeqNew = Seq[0:kk] + "-" + Seq[kk:]
|
|
856
|
+
for jj in range(0, ns + 2):
|
|
857
|
+
SeqNew0 = SeqNew[0:jj] + "-" + SeqNew[jj:]
|
|
519
858
|
SeqList.append(SeqNew0)
|
|
520
859
|
return SeqList
|
|
521
860
|
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
861
|
+
|
|
862
|
+
def falign(s1, s2, V1, V2, st, VScore={}, UseV=True, gapn=1, gap=-6):
|
|
863
|
+
mid1 = s1[st:-2]
|
|
864
|
+
mid2 = s2[st:-2]
|
|
525
865
|
if UseV:
|
|
526
|
-
if V2==V1:
|
|
527
|
-
V_score=4
|
|
866
|
+
if V2 == V1:
|
|
867
|
+
V_score = 4
|
|
528
868
|
else:
|
|
529
|
-
Vkey=(V1,V2)
|
|
869
|
+
Vkey = (V1, V2)
|
|
530
870
|
if Vkey not in VScore:
|
|
531
|
-
Vkey=(V2,V1)
|
|
871
|
+
Vkey = (V2, V1)
|
|
532
872
|
if Vkey not in VScore:
|
|
533
|
-
#print("V gene not found!")
|
|
873
|
+
# print("V gene not found!")
|
|
534
874
|
return 0
|
|
535
875
|
else:
|
|
536
|
-
V_score=VScore[Vkey]/20.0
|
|
876
|
+
V_score = VScore[Vkey] / 20.0
|
|
537
877
|
else:
|
|
538
|
-
V_score=4.0
|
|
539
|
-
aln=NHLocalAlignment(mid1,mid2,gapn,gap)
|
|
540
|
-
score=aln/float(max(len(mid1),len(mid2)))+V_score
|
|
878
|
+
V_score = 4.0
|
|
879
|
+
aln = NHLocalAlignment(mid1, mid2, gapn, gap)
|
|
880
|
+
score = aln / float(max(len(mid1), len(mid2))) + V_score
|
|
541
881
|
return score
|
|
542
882
|
|
|
883
|
+
|
|
543
884
|
def UpdateSSG(SSG, seqs, Vgenes, Vscore={}, UseV=True, gap=-6, gapn=1, cutoff=7.5):
|
|
544
|
-
SSGnew={}
|
|
545
|
-
count=0
|
|
546
|
-
t1=time.time()
|
|
547
|
-
N=len(list(chain(*list(SSG.values()))))
|
|
548
|
-
# print("Number of pairs to be processed: %d" %N)
|
|
885
|
+
SSGnew = {}
|
|
886
|
+
count = 0
|
|
887
|
+
t1 = time.time()
|
|
888
|
+
N = len(list(chain(*list(SSG.values()))))
|
|
889
|
+
# print("Number of pairs to be processed: %d" %N)
|
|
549
890
|
for kk in SSG:
|
|
550
|
-
s1=seqs[kk]
|
|
551
|
-
V1=Vgenes[kk]
|
|
552
|
-
VV=SSG[kk]
|
|
891
|
+
s1 = seqs[kk]
|
|
892
|
+
V1 = Vgenes[kk]
|
|
893
|
+
VV = SSG[kk]
|
|
553
894
|
for vv in VV:
|
|
554
|
-
s2=seqs[vv]
|
|
555
|
-
V2=Vgenes[vv]
|
|
556
|
-
score=falign(
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
895
|
+
s2 = seqs[vv]
|
|
896
|
+
V2 = Vgenes[vv]
|
|
897
|
+
score = falign(
|
|
898
|
+
s1, s2, V1, V2, st=3, VScore=Vscore, UseV=UseV, gap=-6, gapn=1
|
|
899
|
+
)
|
|
900
|
+
count += 1
|
|
901
|
+
if count % 1000000 == 0:
|
|
902
|
+
t2 = time.time()
|
|
903
|
+
# print("Processed %d pairs. Elapsed time %f" %(count, t2-t1))
|
|
904
|
+
if score >= cutoff:
|
|
562
905
|
if kk not in SSGnew:
|
|
563
|
-
SSGnew[kk]=[vv]
|
|
906
|
+
SSGnew[kk] = [vv]
|
|
564
907
|
else:
|
|
565
908
|
SSGnew[kk].append(vv)
|
|
566
909
|
return SSGnew
|
|
567
910
|
|
|
911
|
+
|
|
568
912
|
def dfs(graph, start):
|
|
569
|
-
|
|
913
|
+
"""
|
|
570
914
|
Non-resursive depth first search
|
|
571
|
-
|
|
915
|
+
"""
|
|
572
916
|
visited = set()
|
|
573
917
|
stack = [start]
|
|
574
918
|
while stack:
|
|
@@ -576,443 +920,503 @@ def dfs(graph, start):
|
|
|
576
920
|
if vertex not in visited:
|
|
577
921
|
visited.add(vertex)
|
|
578
922
|
stack.extend(set(graph[vertex]) - visited)
|
|
579
|
-
|
|
923
|
+
|
|
580
924
|
return visited
|
|
581
925
|
|
|
926
|
+
|
|
582
927
|
def IdentifyMotifCluster(SSG):
|
|
583
928
|
## Input SeqShareGraph dictionary representation of sparse matrix
|
|
584
|
-
POS=set(SSG.keys())
|
|
585
|
-
NP=len(POS)
|
|
586
|
-
ClusterList=[]
|
|
587
|
-
tmpL=set(chain(*ClusterList))
|
|
588
|
-
count=0
|
|
929
|
+
POS = set(SSG.keys())
|
|
930
|
+
NP = len(POS)
|
|
931
|
+
ClusterList = []
|
|
932
|
+
tmpL = set(chain(*ClusterList))
|
|
933
|
+
count = 0
|
|
589
934
|
while 1:
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
# STACK=LoadComm([],ii)
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
# tmpL=set(chain(*ClusterList))
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
935
|
+
xx = POS ^ tmpL
|
|
936
|
+
if len(xx) == 0:
|
|
937
|
+
break
|
|
938
|
+
for ii in xx:
|
|
939
|
+
# STACK=LoadComm([],ii)
|
|
940
|
+
STACK = dfs(SSG, ii)
|
|
941
|
+
tmpL = tmpL | STACK
|
|
942
|
+
ClusterList.append(list(STACK))
|
|
943
|
+
# tmpL=set(chain(*ClusterList))
|
|
944
|
+
count += 1
|
|
945
|
+
if count % 200 == 0:
|
|
946
|
+
print(" Solved %d clusters" % (count))
|
|
947
|
+
break
|
|
603
948
|
return ClusterList
|
|
604
949
|
|
|
950
|
+
|
|
605
951
|
def IdentifyVgeneCluster(sMat):
|
|
606
952
|
## Input Vgene score matrix
|
|
607
|
-
vG={}
|
|
608
|
-
n=len(sMat)
|
|
609
|
-
IDs=[x for x in range(n)]
|
|
953
|
+
vG = {}
|
|
954
|
+
n = len(sMat)
|
|
955
|
+
IDs = [x for x in range(n)]
|
|
610
956
|
for kk in IDs:
|
|
611
|
-
LL=sMat[:,kk]
|
|
612
|
-
vL=np.where(LL>=thr_v)[0]
|
|
613
|
-
if len(vL)>0:
|
|
614
|
-
vG[kk]=vL
|
|
615
|
-
CL=IdentifyMotifCluster(vG)
|
|
957
|
+
LL = sMat[:, kk]
|
|
958
|
+
vL = np.where(LL >= thr_v)[0]
|
|
959
|
+
if len(vL) > 0:
|
|
960
|
+
vG[kk] = vL
|
|
961
|
+
CL = IdentifyMotifCluster(vG)
|
|
616
962
|
return CL
|
|
617
|
-
|
|
963
|
+
|
|
964
|
+
|
|
618
965
|
def ParseFa(fname):
|
|
619
|
-
InputStr=open(fname).readlines()
|
|
620
|
-
FaDict={}
|
|
621
|
-
seq=
|
|
966
|
+
InputStr = open(fname).readlines()
|
|
967
|
+
FaDict = {}
|
|
968
|
+
seq = ""
|
|
622
969
|
for line in InputStr:
|
|
623
|
-
if line.startswith(
|
|
624
|
-
if len(seq)>0:
|
|
625
|
-
FaDict[seqHead]=seq
|
|
626
|
-
seq=
|
|
627
|
-
seqHead=line.strip()
|
|
970
|
+
if line.startswith(">"):
|
|
971
|
+
if len(seq) > 0:
|
|
972
|
+
FaDict[seqHead] = seq
|
|
973
|
+
seq = ""
|
|
974
|
+
seqHead = line.strip()
|
|
628
975
|
else:
|
|
629
|
-
seq+=line.strip()
|
|
976
|
+
seq += line.strip()
|
|
630
977
|
if seqHead not in FaDict:
|
|
631
|
-
FaDict[seqHead]=seq
|
|
978
|
+
FaDict[seqHead] = seq
|
|
632
979
|
return FaDict
|
|
633
980
|
|
|
981
|
+
|
|
634
982
|
def PreCalculateVgeneDist(VgeneFa="Imgt_Human_TRBV.fasta"):
|
|
635
983
|
## Only run one time if needed
|
|
636
|
-
FaDict=ParseFa(cur_dir+VgeneFa)
|
|
637
|
-
VScore={}
|
|
638
|
-
CDR1Dict={}
|
|
639
|
-
CDR2Dict={}
|
|
984
|
+
FaDict = ParseFa(cur_dir + VgeneFa)
|
|
985
|
+
VScore = {}
|
|
986
|
+
CDR1Dict = {}
|
|
987
|
+
CDR2Dict = {}
|
|
640
988
|
for kk in FaDict:
|
|
641
|
-
if
|
|
642
|
-
VV=kk.split(
|
|
989
|
+
if "|" in kk:
|
|
990
|
+
VV = kk.split("|")[1]
|
|
643
991
|
else:
|
|
644
|
-
VV=kk[1:]
|
|
645
|
-
CDR1Dict[VV]=FaDict[kk][26:37] ## Imgt CDR1: 27 - 38
|
|
646
|
-
CDR2Dict[VV]=FaDict[kk][55:64] ## Imgt CDR2: 56 - 65
|
|
647
|
-
Vkeys=list(CDR1Dict.keys())
|
|
648
|
-
nn=len(Vkeys)
|
|
649
|
-
for ii in range(0,nn):
|
|
650
|
-
V1=Vkeys[ii]
|
|
651
|
-
s1_CDR1=CDR1Dict[V1]
|
|
652
|
-
s1_CDR2=CDR2Dict[V1]
|
|
653
|
-
for jj in range(ii,nn):
|
|
654
|
-
V2=Vkeys[jj]
|
|
655
|
-
s2_CDR1=CDR1Dict[V2]
|
|
656
|
-
s2_CDR2=CDR2Dict[V2]
|
|
657
|
-
score1=SeqComparison(s1_CDR1,s2_CDR1)
|
|
658
|
-
score2=SeqComparison(s2_CDR2,s2_CDR2)
|
|
659
|
-
#print score1+score2
|
|
660
|
-
VScore[(V1,V2)]=score1+score2
|
|
661
|
-
gg=open(
|
|
992
|
+
VV = kk[1:]
|
|
993
|
+
CDR1Dict[VV] = FaDict[kk][26:37] ## Imgt CDR1: 27 - 38
|
|
994
|
+
CDR2Dict[VV] = FaDict[kk][55:64] ## Imgt CDR2: 56 - 65
|
|
995
|
+
Vkeys = list(CDR1Dict.keys())
|
|
996
|
+
nn = len(Vkeys)
|
|
997
|
+
for ii in range(0, nn):
|
|
998
|
+
V1 = Vkeys[ii]
|
|
999
|
+
s1_CDR1 = CDR1Dict[V1]
|
|
1000
|
+
s1_CDR2 = CDR2Dict[V1]
|
|
1001
|
+
for jj in range(ii, nn):
|
|
1002
|
+
V2 = Vkeys[jj]
|
|
1003
|
+
s2_CDR1 = CDR1Dict[V2]
|
|
1004
|
+
s2_CDR2 = CDR2Dict[V2]
|
|
1005
|
+
score1 = SeqComparison(s1_CDR1, s2_CDR1)
|
|
1006
|
+
score2 = SeqComparison(s2_CDR2, s2_CDR2)
|
|
1007
|
+
# print score1+score2
|
|
1008
|
+
VScore[(V1, V2)] = score1 + score2
|
|
1009
|
+
gg = open("VgeneScores.txt", "w")
|
|
662
1010
|
for kk in VScore:
|
|
663
|
-
vv=VScore[kk]
|
|
664
|
-
line=kk[0]+
|
|
1011
|
+
vv = VScore[kk]
|
|
1012
|
+
line = kk[0] + "\t" + kk[1] + "\t" + str(vv) + "\n"
|
|
665
1013
|
gg.write(line)
|
|
666
1014
|
gg.close()
|
|
667
1015
|
|
|
668
|
-
|
|
1016
|
+
|
|
1017
|
+
def EncodeRepertoire(
|
|
1018
|
+
inputfile,
|
|
1019
|
+
outdir,
|
|
1020
|
+
outfile="",
|
|
1021
|
+
exact=True,
|
|
1022
|
+
ST=3,
|
|
1023
|
+
thr_v=3.7,
|
|
1024
|
+
thr_s=3.5,
|
|
1025
|
+
VDict={},
|
|
1026
|
+
Vgene=True,
|
|
1027
|
+
thr_iso=10,
|
|
1028
|
+
gap=-6,
|
|
1029
|
+
GPU=False,
|
|
1030
|
+
Mat=False,
|
|
1031
|
+
verbose=False,
|
|
1032
|
+
):
|
|
669
1033
|
## No V gene version
|
|
670
1034
|
## Encode CDR3 sequences into 96 dimensional space and perform k-means clustering
|
|
671
1035
|
## If exact is True, SW alignment will be performed within each cluster after isometric encoding and clustering
|
|
672
|
-
h=open(inputfile)
|
|
673
|
-
t1=time.time()
|
|
674
|
-
alines=h.readlines()
|
|
675
|
-
ww=alines[0].strip().split(
|
|
676
|
-
if not ww[0].startswith(
|
|
1036
|
+
h = open(inputfile)
|
|
1037
|
+
t1 = time.time()
|
|
1038
|
+
alines = h.readlines()
|
|
1039
|
+
ww = alines[0].strip().split("\t")
|
|
1040
|
+
if not ww[0].startswith("C"):
|
|
677
1041
|
## header line
|
|
678
|
-
hline=alines[0]
|
|
679
|
-
alines=alines[1:]
|
|
680
|
-
elif
|
|
681
|
-
hline=alines[0]
|
|
682
|
-
alines=alines[1:]
|
|
1042
|
+
hline = alines[0]
|
|
1043
|
+
alines = alines[1:]
|
|
1044
|
+
elif "CDR3" in ww[0]:
|
|
1045
|
+
hline = alines[0]
|
|
1046
|
+
alines = alines[1:]
|
|
683
1047
|
else:
|
|
684
|
-
hline=
|
|
685
|
-
seqs=[]
|
|
686
|
-
vgs=[]
|
|
687
|
-
infoList=[]
|
|
688
|
-
count=0
|
|
1048
|
+
hline = "CDR3\t" + "\t".join(["Info" + str(x) for x in range(len(ww) - 1)])
|
|
1049
|
+
seqs = []
|
|
1050
|
+
vgs = []
|
|
1051
|
+
infoList = []
|
|
1052
|
+
count = 0
|
|
689
1053
|
if verbose:
|
|
690
|
-
print(
|
|
1054
|
+
print("Creating CDR3 list")
|
|
691
1055
|
for ll in alines:
|
|
692
|
-
ww=ll.strip().split(
|
|
693
|
-
cdr3=ww[0]
|
|
694
|
-
if
|
|
1056
|
+
ww = ll.strip().split("\t")
|
|
1057
|
+
cdr3 = ww[0]
|
|
1058
|
+
if "*" in cdr3:
|
|
695
1059
|
continue
|
|
696
|
-
if
|
|
1060
|
+
if "_" in cdr3:
|
|
697
1061
|
continue
|
|
698
1062
|
seqs.append(ww[0])
|
|
699
1063
|
if Vgene:
|
|
700
1064
|
vgs.append(ww[1])
|
|
701
|
-
infoList.append(
|
|
1065
|
+
infoList.append("\t".join(ww[1:]))
|
|
702
1066
|
else:
|
|
703
|
-
infoList.append(
|
|
704
|
-
count+=1
|
|
705
|
-
if len(outfile)==0:
|
|
706
|
-
outfile=inputfile.split(
|
|
707
|
-
outfile=outfile[len(outfile)-1]
|
|
708
|
-
outfile=
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
1067
|
+
infoList.append("\t".join(ww[1:]))
|
|
1068
|
+
count += 1
|
|
1069
|
+
if len(outfile) == 0:
|
|
1070
|
+
outfile = inputfile.split("/")
|
|
1071
|
+
outfile = outfile[len(outfile) - 1]
|
|
1072
|
+
outfile = (
|
|
1073
|
+
outdir
|
|
1074
|
+
+ "/"
|
|
1075
|
+
+ re.sub("\\.[txcsv]+", "", outfile)
|
|
1076
|
+
+ "-"
|
|
1077
|
+
+ "-RotationEncodingBL62.txt"
|
|
1078
|
+
)
|
|
1079
|
+
g = open(outfile, "w")
|
|
1080
|
+
tm = strftime("%Y-%m-%d %H:%M:%S", gmtime())
|
|
1081
|
+
InfoLine = (
|
|
1082
|
+
"##TIME:"
|
|
1083
|
+
+ tm
|
|
1084
|
+
+ "|cmd: "
|
|
1085
|
+
+ sys.argv[0]
|
|
1086
|
+
+ "|"
|
|
1087
|
+
+ inputfile
|
|
1088
|
+
+ "|IsometricDistance_Thr="
|
|
1089
|
+
+ str(thr_iso)
|
|
1090
|
+
+ "|thr_v="
|
|
1091
|
+
+ str(thr_v)
|
|
1092
|
+
+ "|thr_s="
|
|
1093
|
+
+ str(thr_s)
|
|
1094
|
+
+ "|exact="
|
|
1095
|
+
+ str(exact)
|
|
1096
|
+
+ "|Vgene="
|
|
1097
|
+
+ str(Vgene)
|
|
1098
|
+
+ "|ST="
|
|
1099
|
+
+ str(ST)
|
|
1100
|
+
)
|
|
1101
|
+
g.write(InfoLine + "\n")
|
|
1102
|
+
g.write(
|
|
1103
|
+
"##Column Info: CDR3 aa sequence, cluster id, other information in the input file\n"
|
|
1104
|
+
)
|
|
1105
|
+
gr = 0
|
|
715
1106
|
## Split into different lengths
|
|
716
|
-
LD,VD, ID,SD= BuildLengthDict(
|
|
1107
|
+
LD, VD, ID, SD = BuildLengthDict(
|
|
1108
|
+
seqs, vGene=vgs, INFO=infoList, sIDs=[x for x in range(len(seqs))]
|
|
1109
|
+
)
|
|
717
1110
|
LDu, VDu, IDu, SDu = CollapseUnique(LD, VD, ID, SD)
|
|
718
1111
|
if Mat:
|
|
719
|
-
Mfile=outfile+
|
|
720
|
-
h=open(Mfile,
|
|
1112
|
+
Mfile = outfile + "_EncodingMatrix.txt"
|
|
1113
|
+
h = open(Mfile, "w")
|
|
721
1114
|
for kk in LDu:
|
|
722
1115
|
if verbose:
|
|
723
|
-
print("---Process CDR3s with length %d ---" %(kk))
|
|
724
|
-
vSD=LDu[kk]
|
|
725
|
-
vSD0=[x for x in range(len(vSD))]
|
|
726
|
-
vss=SDu[kk]
|
|
727
|
-
vInfo=IDu[kk]
|
|
728
|
-
flagL=[len(x)-1 for x in vInfo]
|
|
1116
|
+
print("---Process CDR3s with length %d ---" % (kk))
|
|
1117
|
+
vSD = LDu[kk]
|
|
1118
|
+
vSD0 = [x for x in range(len(vSD))]
|
|
1119
|
+
vss = SDu[kk]
|
|
1120
|
+
vInfo = IDu[kk]
|
|
1121
|
+
flagL = [len(x) - 1 for x in vInfo]
|
|
729
1122
|
if verbose:
|
|
730
|
-
print(
|
|
731
|
-
dM=np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
|
|
732
|
-
dM=dM.astype("float32")
|
|
1123
|
+
print(" Performing CDR3 encoding")
|
|
1124
|
+
dM = np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
|
|
1125
|
+
dM = dM.astype("float32")
|
|
733
1126
|
if verbose:
|
|
734
|
-
print(" The number of sequences is %d" %(dM.shape[0]))
|
|
1127
|
+
print(" The number of sequences is %d" % (dM.shape[0]))
|
|
735
1128
|
if Mat:
|
|
736
1129
|
for ii in range(len(vss)):
|
|
737
|
-
line=vss[ii]+
|
|
738
|
-
NUMs=[str(xx) for xx in dM[ii
|
|
739
|
-
line +=
|
|
1130
|
+
line = vss[ii] + "\t" + vInfo[ii][0] + "\t"
|
|
1131
|
+
NUMs = [str(xx) for xx in dM[ii, :]]
|
|
1132
|
+
line += "\t".join(NUMs) + "\n"
|
|
740
1133
|
h.write(line)
|
|
741
|
-
sID=[x for x in range(dM.shape[0])]
|
|
742
|
-
t2=time.time()
|
|
1134
|
+
sID = [x for x in range(dM.shape[0])]
|
|
1135
|
+
t2 = time.time()
|
|
743
1136
|
if verbose:
|
|
744
|
-
print(
|
|
745
|
-
Cls = ClusterCDR3(
|
|
1137
|
+
print(" Done! Total time elapsed %f" % (t2 - t1))
|
|
1138
|
+
Cls = ClusterCDR3(
|
|
1139
|
+
dM, flagL, thr=thr_iso - 0.5 * (15 - kk), verbose=verbose
|
|
1140
|
+
) ## change cutoff with different lengths
|
|
746
1141
|
if verbose:
|
|
747
1142
|
print(" Handling identical CDR3 groups")
|
|
748
|
-
Cls_u=[]
|
|
1143
|
+
Cls_u = []
|
|
749
1144
|
for ii in range(len(Cls)):
|
|
750
|
-
cc=Cls[ii]
|
|
1145
|
+
cc = Cls[ii]
|
|
751
1146
|
if len(cc) == 1:
|
|
752
1147
|
## Handle identical CDR3 groups first
|
|
753
|
-
if flagL[cc[0]]>0:
|
|
1148
|
+
if flagL[cc[0]] > 0:
|
|
754
1149
|
gr += 1
|
|
755
|
-
jj=cc[0]
|
|
1150
|
+
jj = cc[0]
|
|
756
1151
|
for v_info in vInfo[jj]:
|
|
757
|
-
line=vss[jj]+
|
|
758
|
-
_=g.write(line)
|
|
1152
|
+
line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
|
|
1153
|
+
_ = g.write(line)
|
|
759
1154
|
else:
|
|
760
1155
|
Cls_u.append(cc)
|
|
761
|
-
Cls=Cls_u
|
|
762
|
-
t2=time.time()
|
|
1156
|
+
Cls = Cls_u
|
|
1157
|
+
t2 = time.time()
|
|
763
1158
|
if verbose:
|
|
764
|
-
print(
|
|
1159
|
+
print(" Done! Total time elapsed %f" % (t2 - t1))
|
|
765
1160
|
if Vgene:
|
|
766
|
-
vVgene=VDu[kk]
|
|
1161
|
+
vVgene = VDu[kk]
|
|
767
1162
|
if verbose:
|
|
768
|
-
print(
|
|
769
|
-
Cls_v=[]
|
|
1163
|
+
print(" Matching variable genes")
|
|
1164
|
+
Cls_v = []
|
|
770
1165
|
for cc in Cls:
|
|
771
|
-
Nc=len(cc)
|
|
772
|
-
sMat={}
|
|
1166
|
+
Nc = len(cc)
|
|
1167
|
+
sMat = {}
|
|
773
1168
|
for ii in range(Nc):
|
|
774
|
-
v1=vVgene[cc[ii]]
|
|
775
|
-
for jj in range(ii,Nc):
|
|
776
|
-
if jj==ii:
|
|
1169
|
+
v1 = vVgene[cc[ii]]
|
|
1170
|
+
for jj in range(ii, Nc):
|
|
1171
|
+
if jj == ii:
|
|
777
1172
|
continue
|
|
778
|
-
v2=vVgene[cc[jj]]
|
|
1173
|
+
v2 = vVgene[cc[jj]]
|
|
779
1174
|
if (v1, v2) not in VDict:
|
|
780
1175
|
if v1 == v2:
|
|
781
1176
|
if ii not in sMat:
|
|
782
|
-
sMat[ii]=[jj]
|
|
1177
|
+
sMat[ii] = [jj]
|
|
783
1178
|
else:
|
|
784
1179
|
sMat[ii].append(jj)
|
|
785
1180
|
if jj not in sMat:
|
|
786
|
-
sMat[jj]=[ii]
|
|
1181
|
+
sMat[jj] = [ii]
|
|
787
1182
|
else:
|
|
788
1183
|
sMat[jj].append(ii)
|
|
789
1184
|
continue
|
|
790
|
-
if VDict[(v1,v2)] >= thr_v:
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
vCL=IdentifyMotifCluster(sMat)
|
|
800
|
-
vCL_List=list(chain(*vCL))
|
|
1185
|
+
if VDict[(v1, v2)] >= thr_v:
|
|
1186
|
+
if ii not in sMat:
|
|
1187
|
+
sMat[ii] = [jj]
|
|
1188
|
+
else:
|
|
1189
|
+
sMat[ii].append(jj)
|
|
1190
|
+
if jj not in sMat:
|
|
1191
|
+
sMat[jj] = [ii]
|
|
1192
|
+
else:
|
|
1193
|
+
sMat[jj].append(ii)
|
|
1194
|
+
vCL = IdentifyMotifCluster(sMat)
|
|
1195
|
+
vCL_List = list(chain(*vCL))
|
|
801
1196
|
for ii in range(Nc):
|
|
802
|
-
uu=flagL[cc[ii]]
|
|
803
|
-
if uu>0 and ii not in vCL_List:
|
|
1197
|
+
uu = flagL[cc[ii]]
|
|
1198
|
+
if uu > 0 and ii not in vCL_List:
|
|
804
1199
|
vCL.append([ii])
|
|
805
1200
|
for vcc in vCL:
|
|
806
1201
|
Cls_v.append(list(np.array(cc)[np.array(vcc)]))
|
|
807
|
-
Cls=[]
|
|
1202
|
+
Cls = []
|
|
808
1203
|
for ii in range(len(Cls_v)):
|
|
809
|
-
cc=Cls_v[ii]
|
|
1204
|
+
cc = Cls_v[ii]
|
|
810
1205
|
if len(cc) == 1:
|
|
811
1206
|
## Handle identical CDR3 groups first
|
|
812
1207
|
gr += 1
|
|
813
|
-
jj=cc[0]
|
|
1208
|
+
jj = cc[0]
|
|
814
1209
|
for v_info in vInfo[jj]:
|
|
815
|
-
line=vss[jj]+
|
|
816
|
-
_=g.write(line)
|
|
1210
|
+
line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
|
|
1211
|
+
_ = g.write(line)
|
|
817
1212
|
else:
|
|
818
1213
|
Cls.append(cc)
|
|
819
1214
|
if exact:
|
|
820
1215
|
if verbose:
|
|
821
|
-
print(
|
|
822
|
-
Cls_s=[]
|
|
1216
|
+
print(" Performing Smith-Waterman alignment")
|
|
1217
|
+
Cls_s = []
|
|
823
1218
|
for cc in Cls:
|
|
824
|
-
Nc=len(cc)
|
|
825
|
-
if len(cc)<=3:
|
|
826
|
-
sMat=np.zeros((Nc,Nc))
|
|
1219
|
+
Nc = len(cc)
|
|
1220
|
+
if len(cc) <= 3:
|
|
1221
|
+
sMat = np.zeros((Nc, Nc))
|
|
827
1222
|
for ii in range(Nc):
|
|
828
|
-
s1=vss[cc[ii]]
|
|
829
|
-
for jj in range(ii,Nc):
|
|
830
|
-
if jj==ii:
|
|
1223
|
+
s1 = vss[cc[ii]]
|
|
1224
|
+
for jj in range(ii, Nc):
|
|
1225
|
+
if jj == ii:
|
|
831
1226
|
continue
|
|
832
|
-
s2=vss[cc[jj]]
|
|
1227
|
+
s2 = vss[cc[jj]]
|
|
833
1228
|
if len(s1) != len(s2):
|
|
834
1229
|
continue
|
|
835
|
-
if len(s1)<=5:
|
|
1230
|
+
if len(s1) <= 5:
|
|
836
1231
|
continue
|
|
837
|
-
sw=SeqComparison(s1[ST:-2],s2[ST:-2],gap=gap)
|
|
838
|
-
sw=sw/(len(s1)-ST-2)
|
|
839
|
-
sMat[ii,jj]=sw
|
|
840
|
-
sMat[jj,ii]=sw
|
|
841
|
-
s_max=[]
|
|
1232
|
+
sw = SeqComparison(s1[ST:-2], s2[ST:-2], gap=gap)
|
|
1233
|
+
sw = sw / (len(s1) - ST - 2)
|
|
1234
|
+
sMat[ii, jj] = sw
|
|
1235
|
+
sMat[jj, ii] = sw
|
|
1236
|
+
s_max = []
|
|
842
1237
|
for ii in range(Nc):
|
|
843
|
-
s_max.append(np.max(sMat[:,ii]))
|
|
844
|
-
cc_new=[]
|
|
1238
|
+
s_max.append(np.max(sMat[:, ii]))
|
|
1239
|
+
cc_new = []
|
|
845
1240
|
for ii in range(Nc):
|
|
846
|
-
if s_max[ii]>=thr_s:
|
|
1241
|
+
if s_max[ii] >= thr_s:
|
|
847
1242
|
cc_new.append(cc[ii])
|
|
848
|
-
if len(cc_new)>1:
|
|
1243
|
+
if len(cc_new) > 1:
|
|
849
1244
|
Cls_s.append(cc_new)
|
|
850
1245
|
else:
|
|
851
1246
|
for ii in range(Nc):
|
|
852
|
-
uu=flagL[cc[ii]]
|
|
853
|
-
if uu>0:
|
|
1247
|
+
uu = flagL[cc[ii]]
|
|
1248
|
+
if uu > 0:
|
|
854
1249
|
Cls_s.append([cc[ii]])
|
|
855
|
-
# print(Cls_s)
|
|
856
|
-
Cls_sList=list(chain(*Cls_s))
|
|
1250
|
+
# print(Cls_s)
|
|
1251
|
+
Cls_sList = list(chain(*Cls_s))
|
|
857
1252
|
for ii in range(len(cc)):
|
|
858
|
-
uu=flagL[cc[ii]]
|
|
859
|
-
if uu>0 and cc[ii] not in Cls_sList:
|
|
1253
|
+
uu = flagL[cc[ii]]
|
|
1254
|
+
if uu > 0 and cc[ii] not in Cls_sList:
|
|
860
1255
|
Cls_s.append([cc[ii]])
|
|
861
1256
|
else:
|
|
862
|
-
CDR3s=[vss[x] for x in cc]
|
|
863
|
-
sIDs=np.array([vSD0[x] for x in cc])
|
|
864
|
-
sIDs0=[x for x in range(len(cc))]
|
|
865
|
-
Kset=KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
|
|
866
|
-
SSG=generateSSG(Kset, CDR3s, k_thr=1)
|
|
867
|
-
tmpVgenes=[
|
|
868
|
-
SSGnew=UpdateSSG(
|
|
869
|
-
|
|
870
|
-
|
|
1257
|
+
CDR3s = [vss[x] for x in cc]
|
|
1258
|
+
sIDs = np.array([vSD0[x] for x in cc])
|
|
1259
|
+
sIDs0 = [x for x in range(len(cc))]
|
|
1260
|
+
Kset = KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
|
|
1261
|
+
SSG = generateSSG(Kset, CDR3s, k_thr=1)
|
|
1262
|
+
tmpVgenes = ["TRBV2"] * len(CDR3s)
|
|
1263
|
+
SSGnew = UpdateSSG(
|
|
1264
|
+
SSG, CDR3s, tmpVgenes, Vscore=VDict, cutoff=thr_s + 4
|
|
1265
|
+
)
|
|
1266
|
+
CLall = IdentifyMotifCluster(SSGnew)
|
|
1267
|
+
CLall_list = list(chain(*CLall))
|
|
871
1268
|
for ii in range(len(cc)):
|
|
872
|
-
uu=flagL[cc[ii]]
|
|
873
|
-
if uu>0 and ii not in CLall_list:
|
|
1269
|
+
uu = flagL[cc[ii]]
|
|
1270
|
+
if uu > 0 and ii not in CLall_list:
|
|
874
1271
|
CLall.append([ii])
|
|
875
1272
|
for cl in CLall:
|
|
876
|
-
ccs=list(sIDs[np.array(cl)])
|
|
1273
|
+
ccs = list(sIDs[np.array(cl)])
|
|
877
1274
|
Cls_s.append(ccs)
|
|
878
|
-
Cls=Cls_s
|
|
1275
|
+
Cls = Cls_s
|
|
879
1276
|
if verbose:
|
|
880
|
-
print(
|
|
1277
|
+
print(" Writing results into file")
|
|
881
1278
|
for ii in range(len(Cls)):
|
|
882
|
-
# if ii % 100000 == 0 and ii>0:
|
|
883
|
-
|
|
884
|
-
cc=Cls[ii]
|
|
885
|
-
gr+=1
|
|
1279
|
+
# if ii % 100000 == 0 and ii>0:
|
|
1280
|
+
# print(' %d sequences written' %(ii))
|
|
1281
|
+
cc = Cls[ii]
|
|
1282
|
+
gr += 1
|
|
886
1283
|
for jj in cc:
|
|
887
1284
|
for v_info in vInfo[jj]:
|
|
888
|
-
line=vss[jj]+
|
|
889
|
-
_=g.write(line)
|
|
1285
|
+
line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
|
|
1286
|
+
_ = g.write(line)
|
|
890
1287
|
g.close()
|
|
891
1288
|
if Mat:
|
|
892
1289
|
h.close()
|
|
893
1290
|
|
|
1291
|
+
|
|
894
1292
|
def OrderUnique(Ig):
|
|
895
|
-
vv=list(Ig.values())
|
|
896
|
-
kk=list(Ig.keys())
|
|
897
|
-
LL=[len(x[1]) for x in vv]
|
|
898
|
-
v0=[x[0][0] for x in vv]
|
|
899
|
-
v1=[x[0][1] for x in vv]
|
|
900
|
-
zkk=zip(kk,v0,v1,LL)
|
|
901
|
-
zkks=sorted(zkk,key=lambda x: (x[1],x[3]))
|
|
902
|
-
nk=len(zkks)
|
|
903
|
-
keep_id=[0]
|
|
904
|
-
ii=1
|
|
905
|
-
n_pre=str(zkks[0][1])+
|
|
906
|
-
while ii<nk:
|
|
907
|
-
n_cur=str(zkks[ii][1])+
|
|
908
|
-
if n_cur==n_pre:
|
|
909
|
-
ii+=1
|
|
1293
|
+
vv = list(Ig.values())
|
|
1294
|
+
kk = list(Ig.keys())
|
|
1295
|
+
LL = [len(x[1]) for x in vv]
|
|
1296
|
+
v0 = [x[0][0] for x in vv]
|
|
1297
|
+
v1 = [x[0][1] for x in vv]
|
|
1298
|
+
zkk = zip(kk, v0, v1, LL)
|
|
1299
|
+
zkks = sorted(zkk, key=lambda x: (x[1], x[3]))
|
|
1300
|
+
nk = len(zkks)
|
|
1301
|
+
keep_id = [0]
|
|
1302
|
+
ii = 1
|
|
1303
|
+
n_pre = str(zkks[0][1]) + "_" + str(zkks[0][2])
|
|
1304
|
+
while ii < nk:
|
|
1305
|
+
n_cur = str(zkks[ii][1]) + "_" + str(zkks[ii][2])
|
|
1306
|
+
if n_cur == n_pre:
|
|
1307
|
+
ii += 1
|
|
910
1308
|
continue
|
|
911
1309
|
else:
|
|
912
1310
|
keep_id.append(ii)
|
|
913
|
-
n_pre=n_cur
|
|
914
|
-
ii+=1
|
|
1311
|
+
n_pre = n_cur
|
|
1312
|
+
ii += 1
|
|
915
1313
|
continue
|
|
916
|
-
nid=[x[0] for x in zkks]
|
|
917
|
-
filtered_id=np.array(nid)[np.array(keep_id)]
|
|
918
|
-
Igs={}
|
|
1314
|
+
nid = [x[0] for x in zkks]
|
|
1315
|
+
filtered_id = np.array(nid)[np.array(keep_id)]
|
|
1316
|
+
Igs = {}
|
|
919
1317
|
for ii in filtered_id:
|
|
920
|
-
Igs[kk[ii]]=vv[ii]
|
|
1318
|
+
Igs[kk[ii]] = vv[ii]
|
|
921
1319
|
return Igs, filtered_id
|
|
922
1320
|
|
|
1321
|
+
|
|
923
1322
|
def ClusterCDR3(dM, flagL, thr=10, GPU=False, verbose=False):
|
|
924
1323
|
## flagL: flag vector for identical CDR3 groups, >0 for grouped non-identical CDR3s
|
|
925
|
-
Cls=[]
|
|
926
|
-
flag=0
|
|
927
|
-
dM1=dM
|
|
928
|
-
flagL=np.array(flagL)
|
|
1324
|
+
Cls = []
|
|
1325
|
+
flag = 0
|
|
1326
|
+
dM1 = dM
|
|
1327
|
+
flagL = np.array(flagL)
|
|
929
1328
|
if GPU:
|
|
930
1329
|
res = faiss.StandardGpuResources()
|
|
931
1330
|
while 1:
|
|
932
|
-
# print(" %d number of clusters, with %d sequences" %(len(Cls),dM1.shape[0]))
|
|
1331
|
+
# print(" %d number of clusters, with %d sequences" %(len(Cls),dM1.shape[0]))
|
|
933
1332
|
if verbose:
|
|
934
|
-
print(
|
|
935
|
-
index = faiss.IndexFlatL2(Ndim*6)
|
|
1333
|
+
print("=", end="")
|
|
1334
|
+
index = faiss.IndexFlatL2(Ndim * 6)
|
|
936
1335
|
if GPU:
|
|
937
1336
|
index = faiss.index_cpu_to_gpu(res, 0, index)
|
|
938
1337
|
index.add(dM1)
|
|
939
|
-
if flag==0:
|
|
1338
|
+
if flag == 0:
|
|
940
1339
|
D, I = index.search(dM1, 2)
|
|
941
|
-
vv=np.where((D[:,1]<=thr))[0]
|
|
942
|
-
vv0=np.where((D[:,1]>thr) & (flagL>0))[0]
|
|
1340
|
+
vv = np.where((D[:, 1] <= thr))[0]
|
|
1341
|
+
vv0 = np.where((D[:, 1] > thr) & (flagL > 0))[0]
|
|
943
1342
|
for v in vv0:
|
|
944
1343
|
Cls.append([v])
|
|
945
|
-
tmp_dM=np.zeros((len(vv),Ndim*6))
|
|
946
|
-
Ig_new={}
|
|
1344
|
+
tmp_dM = np.zeros((len(vv), Ndim * 6))
|
|
1345
|
+
Ig_new = {}
|
|
947
1346
|
for ii in range(len(vv)):
|
|
948
|
-
v=vv[ii]
|
|
949
|
-
Idx=I[v,]
|
|
1347
|
+
v = vv[ii]
|
|
1348
|
+
Idx = I[v,]
|
|
950
1349
|
if v not in Idx:
|
|
951
|
-
Idx[0]=v
|
|
952
|
-
Ig_new[ii]=(sorted(list(set(Idx))),sorted(list(set(Idx))))
|
|
953
|
-
tmp_dM[ii,]=(dM1[Idx[0],]+dM1[Idx[1],])/2
|
|
954
|
-
if len(Ig_new)==0:
|
|
1350
|
+
Idx[0] = v
|
|
1351
|
+
Ig_new[ii] = (sorted(list(set(Idx))), sorted(list(set(Idx))))
|
|
1352
|
+
tmp_dM[ii,] = (dM1[Idx[0],] + dM1[Idx[1],]) / 2
|
|
1353
|
+
if len(Ig_new) == 0:
|
|
955
1354
|
if verbose:
|
|
956
|
-
print(
|
|
1355
|
+
print("type 0 break")
|
|
957
1356
|
break
|
|
958
|
-
# print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
|
|
959
|
-
Igs, fid=OrderUnique(Ig_new)
|
|
960
|
-
tmp_dM=tmp_dM[fid,]
|
|
961
|
-
Ig_new=Igs
|
|
1357
|
+
# print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
|
|
1358
|
+
Igs, fid = OrderUnique(Ig_new)
|
|
1359
|
+
tmp_dM = tmp_dM[fid,]
|
|
1360
|
+
Ig_new = Igs
|
|
962
1361
|
else:
|
|
963
|
-
D, I = index.search(dM1,2)
|
|
964
|
-
vv=np.where(D[:,1]<=thr)[0]
|
|
965
|
-
vv0=np.where(D[:,1]>thr)[0]
|
|
1362
|
+
D, I = index.search(dM1, 2)
|
|
1363
|
+
vv = np.where(D[:, 1] <= thr)[0]
|
|
1364
|
+
vv0 = np.where(D[:, 1] > thr)[0]
|
|
966
1365
|
## move groups in vv0 to Cls
|
|
967
|
-
kkg=list(Ig.keys())
|
|
1366
|
+
kkg = list(Ig.keys())
|
|
968
1367
|
for v in vv0:
|
|
969
|
-
ng=list(Ig[kkg[v]][1])
|
|
970
|
-
|
|
1368
|
+
ng = list(Ig[kkg[v]][1])
|
|
1369
|
+
# if ng not in Cls:
|
|
971
1370
|
Cls.append(ng)
|
|
972
|
-
tmp_dM=np.zeros((len(vv),Ndim*6))
|
|
973
|
-
Ig_new={}
|
|
1371
|
+
tmp_dM = np.zeros((len(vv), Ndim * 6))
|
|
1372
|
+
Ig_new = {}
|
|
974
1373
|
for ii in range(len(vv)):
|
|
975
|
-
v=vv[ii]
|
|
976
|
-
idx1=I[v,0]
|
|
977
|
-
idx2=I[v,1]
|
|
1374
|
+
v = vv[ii]
|
|
1375
|
+
idx1 = I[v, 0]
|
|
1376
|
+
idx2 = I[v, 1]
|
|
978
1377
|
if v not in I[v,]:
|
|
979
|
-
idx1=v
|
|
980
|
-
# Ig_new[ii]=sorted(list(set(list(Ig[kkg[idx1]])+list(Ig[kkg[idx2]]))))
|
|
981
|
-
Ig_new[ii]=(
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
1378
|
+
idx1 = v
|
|
1379
|
+
# Ig_new[ii]=sorted(list(set(list(Ig[kkg[idx1]])+list(Ig[kkg[idx2]]))))
|
|
1380
|
+
Ig_new[ii] = (
|
|
1381
|
+
sorted(
|
|
1382
|
+
list(set([idx1, idx2]))
|
|
1383
|
+
), ## First entry records the relative index of a sequence clique
|
|
1384
|
+
sorted(list(set(list(Ig[kkg[idx1]][1]) + list(Ig[kkg[idx2]][1])))),
|
|
1385
|
+
) ## Second entry records the absolute index of a sequence
|
|
1386
|
+
tmp_dM[ii,] = (dM1[idx1,] + dM1[idx2,]) / 2
|
|
1387
|
+
if len(Ig_new) == 0:
|
|
985
1388
|
if verbose:
|
|
986
1389
|
print("\ntype I break")
|
|
987
|
-
kkg=list(Ig.keys())
|
|
1390
|
+
kkg = list(Ig.keys())
|
|
988
1391
|
for kk in kkg:
|
|
989
|
-
ng=list(Ig[kk][1])
|
|
1392
|
+
ng = list(Ig[kk][1])
|
|
990
1393
|
if ng not in Cls:
|
|
991
1394
|
Cls.append(ng)
|
|
992
1395
|
break
|
|
993
|
-
# print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
|
|
994
|
-
Igs, fid=OrderUnique(Ig_new)
|
|
995
|
-
tmp_dM=tmp_dM[fid,]
|
|
996
|
-
Ig_new=Igs
|
|
997
|
-
if flag>0:
|
|
1396
|
+
# print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
|
|
1397
|
+
Igs, fid = OrderUnique(Ig_new)
|
|
1398
|
+
tmp_dM = tmp_dM[fid,]
|
|
1399
|
+
Ig_new = Igs
|
|
1400
|
+
if flag > 0:
|
|
998
1401
|
if Ig == Ig_new:
|
|
999
1402
|
if verbose:
|
|
1000
1403
|
print("\ntype II break")
|
|
1001
|
-
kkg=list(Ig.keys())
|
|
1404
|
+
kkg = list(Ig.keys())
|
|
1002
1405
|
for kk in kkg:
|
|
1003
|
-
ng=list(Ig[kk][1])
|
|
1406
|
+
ng = list(Ig[kk][1])
|
|
1004
1407
|
if ng in Cls:
|
|
1005
1408
|
continue
|
|
1006
1409
|
Cls.append(ng)
|
|
1007
1410
|
break
|
|
1008
|
-
Ig=Ig_new
|
|
1009
|
-
tmp_dM=tmp_dM.astype(
|
|
1010
|
-
dM1=tmp_dM
|
|
1011
|
-
flag+=1
|
|
1411
|
+
Ig = Ig_new
|
|
1412
|
+
tmp_dM = tmp_dM.astype("float32")
|
|
1413
|
+
dM1 = tmp_dM
|
|
1414
|
+
flag += 1
|
|
1012
1415
|
return Cls
|
|
1013
1416
|
|
|
1014
|
-
|
|
1015
|
-
|
|
1417
|
+
|
|
1418
|
+
def ClusterCDR3r(dM, flagL, thr=10, verbose=False):
|
|
1419
|
+
index = faiss.IndexFlatL2(Ndim * 6)
|
|
1016
1420
|
index.add(dM)
|
|
1017
1421
|
lims, D, I = index.range_search(dM, thr)
|
|
1018
1422
|
# with open('cdr3.npy', 'wb') as f:
|
|
@@ -1020,53 +1424,70 @@ def ClusterCDR3r(dM, flagL, thr = 10, verbose = False):
|
|
|
1020
1424
|
# np.save(f, D)
|
|
1021
1425
|
# np.save(f, I)
|
|
1022
1426
|
# np.save(f, dM)
|
|
1023
|
-
|
|
1427
|
+
|
|
1024
1428
|
# now clustering results
|
|
1025
1429
|
N = dM.shape[0]
|
|
1026
|
-
neighborSize = np.array(
|
|
1430
|
+
neighborSize = np.array(
|
|
1431
|
+
[lims[cur_idx_i + 1] - lims[cur_idx_i] for cur_idx_i in range(N)]
|
|
1432
|
+
)
|
|
1027
1433
|
# to_cluster = np.ones( (N,))
|
|
1028
1434
|
clusterNo = 0
|
|
1029
|
-
cluster = -
|
|
1435
|
+
cluster = -np.ones((N,), dtype=np.int32)
|
|
1030
1436
|
idx = np.where(cluster < 0)[0]
|
|
1031
1437
|
unclustered = [np.argmax(neighborSize[idx])]
|
|
1032
1438
|
depth = 0
|
|
1033
1439
|
while True:
|
|
1034
|
-
if len(unclustered) == 0:
|
|
1440
|
+
if len(unclustered) == 0:
|
|
1441
|
+
break
|
|
1035
1442
|
# cur_idx = unclustered[0] # first unclustered index
|
|
1036
1443
|
cur_idx = unclustered
|
|
1037
|
-
cluster[cur_idx] = clusterNo
|
|
1038
|
-
|
|
1039
|
-
neighbor = np.unique(
|
|
1444
|
+
cluster[cur_idx] = clusterNo # assign cluster
|
|
1445
|
+
|
|
1446
|
+
neighbor = np.unique(
|
|
1447
|
+
np.array(
|
|
1448
|
+
list(
|
|
1449
|
+
chain(
|
|
1450
|
+
*[
|
|
1451
|
+
I[(lims[cur_idx_i]) : lims[cur_idx_i + 1]]
|
|
1452
|
+
for cur_idx_i in cur_idx
|
|
1453
|
+
]
|
|
1454
|
+
)
|
|
1455
|
+
)
|
|
1456
|
+
)
|
|
1457
|
+
)
|
|
1040
1458
|
# find those unclusterred
|
|
1041
1459
|
idx = np.where(cluster[neighbor] < 0)[0]
|
|
1042
1460
|
if len(idx) == 0:
|
|
1043
1461
|
depth = 0
|
|
1044
1462
|
clusterNo += 1
|
|
1045
1463
|
idx = np.where(cluster < 0)[0]
|
|
1046
|
-
if len(idx) == 0:
|
|
1464
|
+
if len(idx) == 0:
|
|
1465
|
+
break
|
|
1047
1466
|
unclustered = [idx[np.argmax(neighborSize[idx])]]
|
|
1048
|
-
|
|
1467
|
+
|
|
1049
1468
|
else:
|
|
1050
1469
|
if depth > 3:
|
|
1051
1470
|
depth = 0
|
|
1052
1471
|
clusterNo += 1
|
|
1053
1472
|
unclustered = neighbor[idx]
|
|
1054
1473
|
depth += 1
|
|
1055
|
-
# print('clusterNo = ', clusterNo)
|
|
1056
|
-
Cls = [
|
|
1474
|
+
# print('clusterNo = ', clusterNo)
|
|
1475
|
+
Cls = [[] for i in range(clusterNo)]
|
|
1057
1476
|
for idx, i in enumerate(cluster):
|
|
1058
|
-
|
|
1059
|
-
# print("Cls[:5] = ", Cls[:5])
|
|
1060
|
-
# print("len(Cls) = ", len(Cls),
|
|
1061
|
-
# ', #elem=', sum([len(i) for i in Cls]),
|
|
1062
|
-
# ', #single=', sum([len(i) for i in Cls if len(i) == 1]),
|
|
1063
|
-
# ', #non_single=', sum([len(i) for i in Cls if len(i) != 1]),
|
|
1064
|
-
# ', #max=', max([len(i) for i in Cls]))
|
|
1477
|
+
Cls[i].append(idx)
|
|
1478
|
+
# print("Cls[:5] = ", Cls[:5])
|
|
1479
|
+
# print("len(Cls) = ", len(Cls),
|
|
1480
|
+
# ', #elem=', sum([len(i) for i in Cls]),
|
|
1481
|
+
# ', #single=', sum([len(i) for i in Cls if len(i) == 1]),
|
|
1482
|
+
# ', #non_single=', sum([len(i) for i in Cls if len(i) != 1]),
|
|
1483
|
+
# ', #max=', max([len(i) for i in Cls]))
|
|
1065
1484
|
return Cls
|
|
1066
1485
|
|
|
1486
|
+
|
|
1067
1487
|
def CommandLineParser():
|
|
1068
|
-
parser=OptionParser()
|
|
1069
|
-
print
|
|
1488
|
+
parser = OptionParser()
|
|
1489
|
+
print(
|
|
1490
|
+
"""
|
|
1070
1491
|
GIANA: Geometric Isometry based ANtigen-specific tcr Alignment
|
|
1071
1492
|
Ultrafast short peptide alignment exclusively designed for large-scale adaptome analysis
|
|
1072
1493
|
|
|
@@ -1079,130 +1500,282 @@ Input columns:
|
|
|
1079
1500
|
|
|
1080
1501
|
!!! ALL amino acid letters must be CAPITAL !!!
|
|
1081
1502
|
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
parser.add_option(
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
parser.add_option(
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
parser.add_option(
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1503
|
+
"""
|
|
1504
|
+
)
|
|
1505
|
+
parser.add_option(
|
|
1506
|
+
"-d",
|
|
1507
|
+
"--directory",
|
|
1508
|
+
dest="Directory",
|
|
1509
|
+
help="Input repertoire sequencing file directory. Please make sure that all the files in the directory are input files.",
|
|
1510
|
+
default="",
|
|
1511
|
+
)
|
|
1512
|
+
parser.add_option(
|
|
1513
|
+
"-f",
|
|
1514
|
+
"--file",
|
|
1515
|
+
dest="File",
|
|
1516
|
+
default="",
|
|
1517
|
+
help="Input single file of CDR3 sequences for grouping",
|
|
1518
|
+
)
|
|
1519
|
+
parser.add_option(
|
|
1520
|
+
"-F",
|
|
1521
|
+
"--fileList",
|
|
1522
|
+
dest="files",
|
|
1523
|
+
default="",
|
|
1524
|
+
help="Alternative input: a file containing the full path to all the files. If given, overwrite -d and -f option",
|
|
1525
|
+
)
|
|
1526
|
+
parser.add_option(
|
|
1527
|
+
"-t",
|
|
1528
|
+
"--threshold",
|
|
1529
|
+
dest="thr",
|
|
1530
|
+
default=7,
|
|
1531
|
+
help="Isometric distance threshold for calling similar CDR3 groups. Without -E, smaller value will increase speed. With -E, smaller value will increase specificity. Must be smaller than 12.",
|
|
1532
|
+
)
|
|
1533
|
+
parser.add_option(
|
|
1534
|
+
"-S",
|
|
1535
|
+
"--threshold_score",
|
|
1536
|
+
dest="thr_s",
|
|
1537
|
+
default=3.6,
|
|
1538
|
+
help="Threshold for Smith-Waterman alignment score (normalized by CDR3 length). Default 3.6",
|
|
1539
|
+
)
|
|
1540
|
+
parser.add_option(
|
|
1541
|
+
"-G",
|
|
1542
|
+
"--threshold_vgene",
|
|
1543
|
+
dest="thr_v",
|
|
1544
|
+
default=3.7,
|
|
1545
|
+
help="Threshold for variable gene comparison. Default 3.7.",
|
|
1546
|
+
)
|
|
1547
|
+
parser.add_option(
|
|
1548
|
+
"-o",
|
|
1549
|
+
"--output",
|
|
1550
|
+
dest="OutDir",
|
|
1551
|
+
default="./",
|
|
1552
|
+
help="Output directory for intermediate and final outputs.",
|
|
1553
|
+
)
|
|
1554
|
+
parser.add_option(
|
|
1555
|
+
"-O",
|
|
1556
|
+
"--outfile",
|
|
1557
|
+
dest="OutFile",
|
|
1558
|
+
default="",
|
|
1559
|
+
help="Output file name. If not given, a file with --RotationEncoding will be added to the input file as the output file name.",
|
|
1560
|
+
)
|
|
1561
|
+
parser.add_option(
|
|
1562
|
+
"-T",
|
|
1563
|
+
"--startPosition",
|
|
1564
|
+
dest="ST",
|
|
1565
|
+
default=3,
|
|
1566
|
+
help="Starting position of CDR3 sequence. The first ST letters are omitted. CDR3 sequence length L must be >= ST+7 ",
|
|
1567
|
+
)
|
|
1568
|
+
parser.add_option(
|
|
1569
|
+
"-g",
|
|
1570
|
+
"--GapPenalty",
|
|
1571
|
+
dest="Gap",
|
|
1572
|
+
default=-6,
|
|
1573
|
+
help="Gap penalty,default= -6. Not used.",
|
|
1574
|
+
)
|
|
1575
|
+
parser.add_option(
|
|
1576
|
+
"-n",
|
|
1577
|
+
"--GapNumber",
|
|
1578
|
+
dest="GapN",
|
|
1579
|
+
default=1,
|
|
1580
|
+
help="Maximum number of gaps allowed when performing alignment. Max=1, default=1. Not used.",
|
|
1581
|
+
)
|
|
1582
|
+
parser.add_option(
|
|
1583
|
+
"-V",
|
|
1584
|
+
"--VariableGeneFa",
|
|
1585
|
+
dest="VFa",
|
|
1586
|
+
default="Imgt_Human_TRBV.fasta",
|
|
1587
|
+
help="IMGT Human beta variable gene sequences",
|
|
1588
|
+
)
|
|
1589
|
+
parser.add_option(
|
|
1590
|
+
"-v",
|
|
1591
|
+
"--VariableGene",
|
|
1592
|
+
dest="V",
|
|
1593
|
+
default=True,
|
|
1594
|
+
action="store_false",
|
|
1595
|
+
help="If False, GIANA will omit variable gene information and use CDR3 sequences only. This will yield reduced specificity. The cut-off will automatically become the current value-4.0",
|
|
1596
|
+
)
|
|
1597
|
+
parser.add_option(
|
|
1598
|
+
"-e",
|
|
1599
|
+
"--Exact",
|
|
1600
|
+
dest="E",
|
|
1601
|
+
default=True,
|
|
1602
|
+
action="store_false",
|
|
1603
|
+
help="If False, GIANA will not perform Smith-Waterman alignment after isometric encoding.",
|
|
1604
|
+
)
|
|
1605
|
+
parser.add_option(
|
|
1606
|
+
"-N",
|
|
1607
|
+
"--NumberOfThreads",
|
|
1608
|
+
dest="NN",
|
|
1609
|
+
default=1,
|
|
1610
|
+
help="Number of threads for multiple processing. Not working so well.",
|
|
1611
|
+
)
|
|
1612
|
+
parser.add_option(
|
|
1613
|
+
"-M",
|
|
1614
|
+
"--EncodingMatrix",
|
|
1615
|
+
dest="Mat",
|
|
1616
|
+
default=False,
|
|
1617
|
+
action="store_true",
|
|
1618
|
+
help="If true, GIANA will export the isometric encoding matrix for each TCR. Default: False.",
|
|
1619
|
+
)
|
|
1620
|
+
parser.add_option(
|
|
1621
|
+
"-U",
|
|
1622
|
+
"--UseGPU",
|
|
1623
|
+
dest="GPU",
|
|
1624
|
+
default=False,
|
|
1625
|
+
action="store_true",
|
|
1626
|
+
help="Use GPU for Faiss indexing. Must be CUDA GPUs.",
|
|
1627
|
+
)
|
|
1628
|
+
parser.add_option(
|
|
1629
|
+
"-q",
|
|
1630
|
+
"--queryFile",
|
|
1631
|
+
dest="Query",
|
|
1632
|
+
default="",
|
|
1633
|
+
help="Input query file, if given, GIANA will run in query mode, also need to provide -r option.",
|
|
1634
|
+
)
|
|
1635
|
+
parser.add_option(
|
|
1636
|
+
"-r",
|
|
1637
|
+
"--refFile",
|
|
1638
|
+
dest="ref",
|
|
1639
|
+
default="",
|
|
1640
|
+
help="Input reference file. Query model required.",
|
|
1641
|
+
)
|
|
1642
|
+
parser.add_option(
|
|
1643
|
+
"-b",
|
|
1644
|
+
"--Verbose",
|
|
1645
|
+
dest="v",
|
|
1646
|
+
default=False,
|
|
1647
|
+
action="store_true",
|
|
1648
|
+
help="Verbose option: if given, GIANA will print intermediate messages.",
|
|
1649
|
+
)
|
|
1103
1650
|
return parser.parse_args()
|
|
1104
1651
|
|
|
1652
|
+
|
|
1105
1653
|
def main():
|
|
1106
|
-
(opt,_)=CommandLineParser()
|
|
1107
|
-
cutoff=float(opt.thr)
|
|
1108
|
-
OutDir=opt.OutDir
|
|
1109
|
-
thr_s=float(opt.thr_s)
|
|
1654
|
+
(opt, _) = CommandLineParser()
|
|
1655
|
+
cutoff = float(opt.thr)
|
|
1656
|
+
OutDir = opt.OutDir
|
|
1657
|
+
thr_s = float(opt.thr_s)
|
|
1110
1658
|
## Check if query mode first
|
|
1111
|
-
qFile=opt.Query
|
|
1112
|
-
if len(qFile)>0:
|
|
1659
|
+
qFile = opt.Query
|
|
1660
|
+
if len(qFile) > 0:
|
|
1113
1661
|
## query mode
|
|
1114
|
-
t1=time.time()
|
|
1115
|
-
if qFile.endswith(
|
|
1662
|
+
t1 = time.time()
|
|
1663
|
+
if qFile.endswith("/"):
|
|
1116
1664
|
## input query is a directory
|
|
1117
|
-
qFs=os.listdir(qFile)
|
|
1118
|
-
qFileList=[]
|
|
1665
|
+
qFs = os.listdir(qFile)
|
|
1666
|
+
qFileList = []
|
|
1119
1667
|
for ff in qFs:
|
|
1120
|
-
qFileList.append(qFile+ff)
|
|
1668
|
+
qFileList.append(qFile + ff)
|
|
1121
1669
|
else:
|
|
1122
|
-
qFileList=[qFile]
|
|
1123
|
-
rFile=opt.ref
|
|
1124
|
-
if len(rFile)==0:
|
|
1125
|
-
raise("Must provide reference file in query mode!")
|
|
1670
|
+
qFileList = [qFile]
|
|
1671
|
+
rFile = opt.ref
|
|
1672
|
+
if len(rFile) == 0:
|
|
1673
|
+
raise ("Must provide reference file in query mode!")
|
|
1126
1674
|
else:
|
|
1127
1675
|
## check if reference cluster file exists
|
|
1128
|
-
rFile0=re.sub(
|
|
1129
|
-
refClusterFile=rFile0+
|
|
1676
|
+
rFile0 = re.sub("\\.txt", "", rFile)
|
|
1677
|
+
refClusterFile = rFile0 + "--RotationEncodingBL62.txt"
|
|
1130
1678
|
if not os.path.exists(refClusterFile):
|
|
1131
|
-
raise(
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1679
|
+
raise (
|
|
1680
|
+
"Must run clustering on reference file first! Did you forget to put the clustering file in this directory?"
|
|
1681
|
+
)
|
|
1682
|
+
rData = CreateReference(rFile)
|
|
1683
|
+
t2 = time.time()
|
|
1684
|
+
print("Reference created. Elapsed %f" % (t2 - t1))
|
|
1135
1685
|
for qf in qFileList:
|
|
1136
|
-
t2_0=time.time()
|
|
1137
|
-
print("Querying "+qf)
|
|
1138
|
-
qf_s=qf.split(
|
|
1139
|
-
#outFile=re.sub('\\.txt','',qf_s)+'_query_'+rFile0+'.txt'
|
|
1140
|
-
outFile=
|
|
1141
|
-
|
|
1686
|
+
t2_0 = time.time()
|
|
1687
|
+
print("Querying " + qf)
|
|
1688
|
+
qf_s = qf.split("/")[-1]
|
|
1689
|
+
# outFile=re.sub('\\.txt','',qf_s)+'_query_'+rFile0+'.txt'
|
|
1690
|
+
outFile = (
|
|
1691
|
+
os.path.splitext(qf_s)[0]
|
|
1692
|
+
+ "_query_"
|
|
1693
|
+
+ os.path.basename(rFile0)
|
|
1694
|
+
+ ".txt"
|
|
1695
|
+
)
|
|
1696
|
+
of = OutDir + "/" + outFile
|
|
1142
1697
|
if path.exists(of):
|
|
1143
|
-
print(of+
|
|
1698
|
+
print(of + " already exits. Skipping.")
|
|
1144
1699
|
continue
|
|
1145
1700
|
MakeQuery(qf, rData, thr=cutoff, thr_s=thr_s)
|
|
1146
|
-
t2=time.time()
|
|
1147
|
-
print(" Build query clustering file. Elapsed %f" %(t2-t1))
|
|
1701
|
+
t2 = time.time()
|
|
1702
|
+
print(" Build query clustering file. Elapsed %f" % (t2 - t1))
|
|
1148
1703
|
print("Now mering with reference cluster")
|
|
1149
|
-
MergeExist(refClusterFile, OutDir+
|
|
1150
|
-
t2=time.time()
|
|
1151
|
-
print(" Time of elapsed for query %s: %f" %(qf, t2-t2_0))
|
|
1704
|
+
MergeExist(refClusterFile, OutDir + "/" + outFile)
|
|
1705
|
+
t2 = time.time()
|
|
1706
|
+
print(" Time of elapsed for query %s: %f" % (qf, t2 - t2_0))
|
|
1152
1707
|
else:
|
|
1153
1708
|
## regular clustering mode
|
|
1154
|
-
FileDir=opt.Directory
|
|
1155
|
-
if len(FileDir)>0:
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1709
|
+
FileDir = opt.Directory
|
|
1710
|
+
if len(FileDir) > 0:
|
|
1711
|
+
files = os.listdir(FileDir)
|
|
1712
|
+
files0 = []
|
|
1713
|
+
for ff in files:
|
|
1714
|
+
ff = FileDir + "/" + ff
|
|
1715
|
+
files0.append(ff)
|
|
1716
|
+
files = files0
|
|
1162
1717
|
else:
|
|
1163
|
-
|
|
1164
|
-
File=opt.File
|
|
1165
|
-
if len(File)>0:
|
|
1166
|
-
|
|
1167
|
-
FileList=opt.files
|
|
1168
|
-
if len(FileList)>0:
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
VFa=opt.VFa
|
|
1718
|
+
files = []
|
|
1719
|
+
File = opt.File
|
|
1720
|
+
if len(File) > 0:
|
|
1721
|
+
files = [File]
|
|
1722
|
+
FileList = opt.files
|
|
1723
|
+
if len(FileList) > 0:
|
|
1724
|
+
files = []
|
|
1725
|
+
fL = open(FileList)
|
|
1726
|
+
for ff in fL.readlines():
|
|
1727
|
+
files.append(ff.strip())
|
|
1728
|
+
VFa = opt.VFa
|
|
1174
1729
|
PreCalculateVgeneDist(VFa)
|
|
1175
|
-
vf=open(
|
|
1176
|
-
VScore={}
|
|
1177
|
-
VV=opt.V
|
|
1178
|
-
EE=opt.E
|
|
1179
|
-
Mat=opt.Mat
|
|
1180
|
-
ST=int(opt.ST)
|
|
1181
|
-
thr_v=float(opt.thr_v)
|
|
1182
|
-
verbose=opt.v
|
|
1730
|
+
vf = open("./VgeneScores.txt") ## Use tcrDist's Vgene 80-score calculation
|
|
1731
|
+
VScore = {}
|
|
1732
|
+
VV = opt.V
|
|
1733
|
+
EE = opt.E
|
|
1734
|
+
Mat = opt.Mat
|
|
1735
|
+
ST = int(opt.ST)
|
|
1736
|
+
thr_v = float(opt.thr_v)
|
|
1737
|
+
verbose = opt.v
|
|
1183
1738
|
if VV:
|
|
1184
1739
|
while 1:
|
|
1185
|
-
line=vf.readline()
|
|
1186
|
-
if len(line)==0:
|
|
1740
|
+
line = vf.readline()
|
|
1741
|
+
if len(line) == 0:
|
|
1187
1742
|
break
|
|
1188
|
-
ww=line.strip().split(
|
|
1189
|
-
VScore[(ww[0],ww[1])]=int(ww[2])/20
|
|
1190
|
-
VScore[(ww[1],ww[0])]=int(ww[2])/20
|
|
1191
|
-
Gap=int(opt.Gap)
|
|
1192
|
-
Gapn=int(opt.GapN)
|
|
1193
|
-
OutFile=opt.OutFile
|
|
1194
|
-
GPU=opt.GPU
|
|
1195
|
-
st=3
|
|
1196
|
-
ed=1
|
|
1197
|
-
NT=int(opt.NN)
|
|
1743
|
+
ww = line.strip().split("\t")
|
|
1744
|
+
VScore[(ww[0], ww[1])] = int(ww[2]) / 20
|
|
1745
|
+
VScore[(ww[1], ww[0])] = int(ww[2]) / 20
|
|
1746
|
+
Gap = int(opt.Gap)
|
|
1747
|
+
Gapn = int(opt.GapN)
|
|
1748
|
+
OutFile = opt.OutFile
|
|
1749
|
+
GPU = opt.GPU
|
|
1750
|
+
st = 3
|
|
1751
|
+
ed = 1
|
|
1752
|
+
NT = int(opt.NN)
|
|
1198
1753
|
faiss.omp_set_num_threads(NT)
|
|
1199
1754
|
for ff in files:
|
|
1200
|
-
print("Processing %s" %ff)
|
|
1201
|
-
EncodeRepertoire(
|
|
1202
|
-
|
|
1755
|
+
print("Processing %s" % ff)
|
|
1756
|
+
EncodeRepertoire(
|
|
1757
|
+
ff,
|
|
1758
|
+
OutDir,
|
|
1759
|
+
OutFile,
|
|
1760
|
+
ST=ST,
|
|
1761
|
+
thr_s=thr_s,
|
|
1762
|
+
thr_v=thr_v,
|
|
1763
|
+
exact=EE,
|
|
1764
|
+
VDict=VScore,
|
|
1765
|
+
Vgene=VV,
|
|
1766
|
+
thr_iso=cutoff,
|
|
1767
|
+
gap=Gap,
|
|
1768
|
+
GPU=GPU,
|
|
1769
|
+
Mat=Mat,
|
|
1770
|
+
verbose=verbose,
|
|
1771
|
+
)
|
|
1772
|
+
|
|
1773
|
+
|
|
1203
1774
|
if __name__ == "__main__":
|
|
1204
|
-
t0=time.time()
|
|
1775
|
+
t0 = time.time()
|
|
1205
1776
|
main()
|
|
1206
|
-
print
|
|
1207
|
-
print
|
|
1208
|
-
|
|
1777
|
+
print("Total time elapsed: %f" % (time.time() - t0))
|
|
1778
|
+
print(
|
|
1779
|
+
"Maximum memory usage: %f MB"
|
|
1780
|
+
% (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000000)
|
|
1781
|
+
)
|