biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +28 -0
- biopipen/core/filters.py +79 -4
- biopipen/core/proc.py +12 -3
- biopipen/core/testing.py +75 -3
- biopipen/ns/bam.py +148 -6
- biopipen/ns/bed.py +75 -0
- biopipen/ns/cellranger.py +186 -0
- biopipen/ns/cellranger_pipeline.py +126 -0
- biopipen/ns/cnv.py +19 -3
- biopipen/ns/cnvkit.py +1 -1
- biopipen/ns/cnvkit_pipeline.py +20 -12
- biopipen/ns/delim.py +34 -35
- biopipen/ns/gene.py +68 -23
- biopipen/ns/gsea.py +63 -37
- biopipen/ns/misc.py +39 -14
- biopipen/ns/plot.py +304 -1
- biopipen/ns/protein.py +183 -0
- biopipen/ns/regulatory.py +290 -0
- biopipen/ns/rnaseq.py +142 -5
- biopipen/ns/scrna.py +2053 -473
- biopipen/ns/scrna_metabolic_landscape.py +228 -382
- biopipen/ns/snp.py +659 -0
- biopipen/ns/stats.py +484 -0
- biopipen/ns/tcr.py +683 -98
- biopipen/ns/vcf.py +236 -2
- biopipen/ns/web.py +97 -6
- biopipen/reports/bam/CNVpytor.svelte +4 -9
- biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
- biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
- biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
- biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
- biopipen/reports/common.svelte +15 -0
- biopipen/reports/protein/ProdigySummary.svelte +16 -0
- biopipen/reports/scrna/CellsDistribution.svelte +4 -39
- biopipen/reports/scrna/DimPlots.svelte +1 -1
- biopipen/reports/scrna/MarkersFinder.svelte +6 -126
- biopipen/reports/scrna/MetaMarkers.svelte +3 -75
- biopipen/reports/scrna/RadarPlots.svelte +4 -20
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
- biopipen/reports/tcr/ClonalStats.svelte +16 -0
- biopipen/reports/tcr/CloneResidency.svelte +3 -93
- biopipen/reports/tcr/Immunarch.svelte +4 -155
- biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
- biopipen/reports/tcr/TESSA.svelte +11 -28
- biopipen/reports/utils/misc.liq +22 -7
- biopipen/scripts/bam/BamMerge.py +11 -15
- biopipen/scripts/bam/BamSampling.py +90 -0
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +38 -0
- biopipen/scripts/bam/CNAClinic.R +41 -5
- biopipen/scripts/bam/CNVpytor.py +153 -54
- biopipen/scripts/bam/ControlFREEC.py +13 -14
- biopipen/scripts/bam/SamtoolsView.py +33 -0
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +138 -0
- biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
- biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
- biopipen/scripts/cnv/AneuploidyScore.R +55 -20
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
- biopipen/scripts/cnv/TMADScore.R +25 -9
- biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
- biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
- biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/RowsBinder.R +1 -1
- biopipen/scripts/delim/SampleInfo.R +116 -118
- biopipen/scripts/gene/GeneNameConversion.R +67 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/gsea/Enrichr.R +5 -5
- biopipen/scripts/gsea/FGSEA.R +184 -50
- biopipen/scripts/gsea/GSEA.R +2 -2
- biopipen/scripts/gsea/PreRank.R +5 -5
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Plot.R +80 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/plot/Heatmap.R +3 -3
- biopipen/scripts/plot/Manhattan.R +147 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/plot/ROC.R +88 -0
- biopipen/scripts/plot/Scatter.R +112 -0
- biopipen/scripts/plot/VennDiagram.R +5 -9
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +119 -0
- biopipen/scripts/protein/ProdigySummary.R +140 -0
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
- biopipen/scripts/regulatory/motifs-common.R +324 -0
- biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
- biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
- biopipen/scripts/rnaseq/Simulation.R +21 -0
- biopipen/scripts/rnaseq/UnitConversion.R +325 -54
- biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
- biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
- biopipen/scripts/scrna/CellCellCommunication.py +150 -0
- biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
- biopipen/scripts/scrna/CellSNPLite.py +30 -0
- biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
- biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
- biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
- biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
- biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
- biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
- biopipen/scripts/scrna/CellsDistribution.R +456 -167
- biopipen/scripts/scrna/DimPlots.R +1 -1
- biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
- biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
- biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
- biopipen/scripts/scrna/ExprImputation.R +7 -0
- biopipen/scripts/scrna/LoomTo10X.R +51 -0
- biopipen/scripts/scrna/MQuad.py +25 -0
- biopipen/scripts/scrna/MarkersFinder.R +679 -400
- biopipen/scripts/scrna/MetaMarkers.R +265 -161
- biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
- biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
- biopipen/scripts/scrna/RadarPlots.R +355 -134
- biopipen/scripts/scrna/ScFGSEA.R +298 -100
- biopipen/scripts/scrna/ScSimulation.R +65 -0
- biopipen/scripts/scrna/ScVelo.py +617 -0
- biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
- biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
- biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
- biopipen/scripts/scrna/SeuratClustering.R +36 -233
- biopipen/scripts/scrna/SeuratLoading.R +2 -2
- biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
- biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
- biopipen/scripts/scrna/SeuratPreparing.R +223 -173
- biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
- biopipen/scripts/scrna/SeuratTo10X.R +27 -0
- biopipen/scripts/scrna/Slingshot.R +65 -0
- biopipen/scripts/scrna/Subset10X.R +2 -2
- biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
- biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
- biopipen/scripts/scrna/scvelo_paga.py +313 -0
- biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
- biopipen/scripts/snp/MatrixEQTL.R +217 -0
- biopipen/scripts/snp/Plink2GTMat.py +148 -0
- biopipen/scripts/snp/PlinkCallRate.R +199 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +291 -0
- biopipen/scripts/snp/PlinkFromVcf.py +81 -0
- biopipen/scripts/snp/PlinkHWE.R +85 -0
- biopipen/scripts/snp/PlinkHet.R +96 -0
- biopipen/scripts/snp/PlinkIBD.R +196 -0
- biopipen/scripts/snp/PlinkSimulation.py +124 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/ChowTest.R +146 -0
- biopipen/scripts/stats/DiffCoexpr.R +152 -0
- biopipen/scripts/stats/LiquidAssoc.R +135 -0
- biopipen/scripts/stats/Mediation.R +108 -0
- biopipen/scripts/stats/MetaPvalue.R +130 -0
- biopipen/scripts/stats/MetaPvalue1.R +74 -0
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/Attach2Seurat.R +3 -2
- biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
- biopipen/scripts/tcr/CDR3Clustering.R +343 -0
- biopipen/scripts/tcr/ClonalStats.R +526 -0
- biopipen/scripts/tcr/CloneResidency.R +255 -131
- biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
- biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
- biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
- biopipen/scripts/tcr/GIANA/query.py +164 -162
- biopipen/scripts/tcr/Immunarch-basic.R +31 -9
- biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
- biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
- biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
- biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
- biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
- biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
- biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
- biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
- biopipen/scripts/tcr/Immunarch.R +63 -11
- biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
- biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
- biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
- biopipen/scripts/tcr/SampleDiversity.R +1 -1
- biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
- biopipen/scripts/tcr/ScRepLoading.R +166 -0
- biopipen/scripts/tcr/TCRClusterStats.R +176 -22
- biopipen/scripts/tcr/TCRDock.py +110 -0
- biopipen/scripts/tcr/TESSA.R +102 -118
- biopipen/scripts/tcr/VJUsage.R +5 -5
- biopipen/scripts/tcr/immunarch-patched.R +142 -0
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/TruvariBench.sh +14 -7
- biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
- biopipen/scripts/vcf/TruvariConsistency.R +1 -1
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +13 -4
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
- biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
- biopipen/scripts/web/gcloud_common.py +49 -0
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.py +146 -20
- biopipen/utils/reference.py +64 -20
- biopipen/utils/reporter.py +177 -0
- biopipen/utils/vcf.py +1 -1
- biopipen-0.34.26.dist-info/METADATA +27 -0
- biopipen-0.34.26.dist-info/RECORD +292 -0
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
- biopipen/ns/bcftools.py +0 -111
- biopipen/ns/scrna_basic.py +0 -255
- biopipen/reports/delim/SampleInfo.svelte +0 -36
- biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
- biopipen/reports/scrna/ScFGSEA.svelte +0 -35
- biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
- biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
- biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
- biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
- biopipen/reports/utils/gsea.liq +0 -110
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
- biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
- biopipen/scripts/scrna/ExprImpution.R +0 -7
- biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
- biopipen/scripts/scrna/Write10X.R +0 -11
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
- biopipen/scripts/tcr/TCRClustering.R +0 -280
- biopipen/utils/common_docstrs.py +0 -61
- biopipen/utils/gene.R +0 -49
- biopipen/utils/gsea.R +0 -193
- biopipen/utils/io.R +0 -20
- biopipen/utils/misc.R +0 -114
- biopipen/utils/mutate_helpers.R +0 -433
- biopipen/utils/plot.R +0 -173
- biopipen/utils/rnaseq.R +0 -48
- biopipen/utils/single_cell.R +0 -115
- biopipen-0.21.0.dist-info/METADATA +0 -22
- biopipen-0.21.0.dist-info/RECORD +0 -218
|
@@ -25,7 +25,6 @@ import sys, os, re, resource
|
|
|
25
25
|
from os import path
|
|
26
26
|
import numpy as np
|
|
27
27
|
from copy import deepcopy
|
|
28
|
-
from Bio.SubsMat.MatrixInfo import blosum62
|
|
29
28
|
import time
|
|
30
29
|
from time import gmtime, strftime
|
|
31
30
|
from operator import itemgetter
|
|
@@ -38,254 +37,574 @@ from sklearn.manifold import MDS
|
|
|
38
37
|
import faiss
|
|
39
38
|
from query import *
|
|
40
39
|
|
|
41
|
-
AAstring=
|
|
42
|
-
AAstringList=list(AAstring)
|
|
43
|
-
cur_dir=os.path.dirname(os.path.realpath(__file__))+
|
|
40
|
+
AAstring = "ACDEFGHIKLMNPQRSTVWY"
|
|
41
|
+
AAstringList = list(AAstring)
|
|
42
|
+
cur_dir = os.path.dirname(os.path.realpath(__file__)) + "/"
|
|
44
43
|
|
|
45
|
-
blosum62n={}
|
|
44
|
+
blosum62n = {}
|
|
46
45
|
for kk in blosum62:
|
|
47
|
-
a1=kk[0]
|
|
48
|
-
a2=kk[1]
|
|
49
|
-
vv=blosum62[kk]
|
|
50
|
-
if vv>4:
|
|
51
|
-
vv=4
|
|
52
|
-
blosum62n[(a1,a2)]=vv
|
|
46
|
+
a1 = kk[0]
|
|
47
|
+
a2 = kk[1]
|
|
48
|
+
vv = blosum62[kk]
|
|
49
|
+
if vv > 4:
|
|
50
|
+
vv = 4
|
|
51
|
+
blosum62n[(a1, a2)] = vv
|
|
53
52
|
if a1 != a2:
|
|
54
|
-
blosum62n[(a2,a1)]=vv
|
|
55
|
-
|
|
56
|
-
bl62={
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
53
|
+
blosum62n[(a2, a1)] = vv
|
|
54
|
+
|
|
55
|
+
bl62 = {
|
|
56
|
+
"A": [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],
|
|
57
|
+
"R": [-1, 4, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],
|
|
58
|
+
"N": [-2, 0, 4, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],
|
|
59
|
+
"D": [-2, -2, 1, 4, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],
|
|
60
|
+
"C": [0, -3, -3, -3, 4, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
|
|
61
|
+
"Q": [-1, 1, 0, 0, -3, 4, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],
|
|
62
|
+
"E": [-1, 0, 0, 2, -4, 2, 4, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],
|
|
63
|
+
"G": [0, -2, 0, -1, -3, -2, -2, 4, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],
|
|
64
|
+
"H": [-2, 0, 1, -1, -3, 0, 0, -2, 4, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],
|
|
65
|
+
"I": [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],
|
|
66
|
+
"L": [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],
|
|
67
|
+
"K": [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 4, -1, -3, -1, 0, -1, -3, -2, -2],
|
|
68
|
+
"M": [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 4, 0, -2, -1, -1, -1, -1, 1],
|
|
69
|
+
"F": [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 4, -4, -2, -2, 1, 3, -1],
|
|
70
|
+
"P": [
|
|
71
|
+
-1,
|
|
72
|
+
-2,
|
|
73
|
+
-2,
|
|
74
|
+
-1,
|
|
75
|
+
-3,
|
|
76
|
+
-1,
|
|
77
|
+
-1,
|
|
78
|
+
-2,
|
|
79
|
+
-2,
|
|
80
|
+
-3,
|
|
81
|
+
-3,
|
|
82
|
+
-1,
|
|
83
|
+
-2,
|
|
84
|
+
-4,
|
|
85
|
+
4,
|
|
86
|
+
-1,
|
|
87
|
+
-1,
|
|
88
|
+
-4,
|
|
89
|
+
-3,
|
|
90
|
+
-2,
|
|
91
|
+
],
|
|
92
|
+
"S": [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],
|
|
93
|
+
"T": [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 4, -2, -2, 0],
|
|
94
|
+
"W": [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 4, 2, -3],
|
|
95
|
+
"Y": [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 4, -1],
|
|
96
|
+
"V": [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
bl62c = np.array([np.array(x) for x in list(bl62.values())])
|
|
100
|
+
bl62c = 4 - bl62c
|
|
101
|
+
|
|
102
|
+
embedding = MDS(
|
|
103
|
+
n_components=13, n_init=100, max_iter=1000, eps=0.00001, dissimilarity="precomputed"
|
|
104
|
+
)
|
|
105
|
+
X = embedding.fit_transform(bl62c)
|
|
106
|
+
|
|
107
|
+
bl62np = {}
|
|
108
|
+
vkk = list(bl62.keys())
|
|
85
109
|
for ii in range(20):
|
|
86
|
-
kk=vkk[ii]
|
|
87
|
-
bl62np[kk]=np.array(list(X[ii,])+[0]*17)
|
|
110
|
+
kk = vkk[ii]
|
|
111
|
+
bl62np[kk] = np.array(list(X[ii,]) + [0] * 17)
|
|
88
112
|
|
|
89
|
-
|
|
90
|
-
AAencodingDict={}
|
|
113
|
+
|
|
114
|
+
AAencodingDict = {}
|
|
91
115
|
for ii in range(len(AAstringList)):
|
|
92
|
-
aa=AAstringList[ii]
|
|
93
|
-
CODE=[0]*(ii)+[1]+[0]*(20-ii)
|
|
94
|
-
AAencodingDict[aa]=np.array(CODE)
|
|
95
|
-
|
|
96
|
-
Ndim=16 ## optimized for isometric embedding
|
|
97
|
-
n0=Ndim*6
|
|
98
|
-
#M0=np.concatenate((np.concatenate((ZERO,M1),axis=1),np.concatenate((M1, ZERO),axis=1)))
|
|
99
|
-
ZERO=np.zeros((Ndim,Ndim))
|
|
100
|
-
II=np.eye(Ndim)
|
|
101
|
-
M0
|
|
116
|
+
aa = AAstringList[ii]
|
|
117
|
+
CODE = [0] * (ii) + [1] + [0] * (20 - ii)
|
|
118
|
+
AAencodingDict[aa] = np.array(CODE)
|
|
119
|
+
|
|
120
|
+
Ndim = 16 ## optimized for isometric embedding
|
|
121
|
+
n0 = Ndim * 6
|
|
122
|
+
# M0=np.concatenate((np.concatenate((ZERO,M1),axis=1),np.concatenate((M1, ZERO),axis=1)))
|
|
123
|
+
ZERO = np.zeros((Ndim, Ndim))
|
|
124
|
+
II = np.eye(Ndim)
|
|
125
|
+
M0 = np.concatenate(
|
|
126
|
+
(
|
|
127
|
+
np.concatenate((ZERO, ZERO, II), axis=1),
|
|
128
|
+
np.concatenate((II, ZERO, ZERO), axis=1),
|
|
129
|
+
np.concatenate((ZERO, II, ZERO), axis=1),
|
|
130
|
+
)
|
|
131
|
+
)
|
|
102
132
|
## Construct 6-th order cyclic group
|
|
103
|
-
ZERO45=np.zeros((Ndim*3,Ndim*3))
|
|
104
|
-
M6=np.concatenate(
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
133
|
+
ZERO45 = np.zeros((Ndim * 3, Ndim * 3))
|
|
134
|
+
M6 = np.concatenate(
|
|
135
|
+
(np.concatenate((ZERO45, M0), axis=1), np.concatenate((M0, ZERO45), axis=1))
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
X = np.array(
|
|
139
|
+
[
|
|
140
|
+
[
|
|
141
|
+
-0.31230882,
|
|
142
|
+
-0.53572156,
|
|
143
|
+
-0.01949946,
|
|
144
|
+
-0.12211268,
|
|
145
|
+
-0.70947917,
|
|
146
|
+
-0.42211092,
|
|
147
|
+
0.02783931,
|
|
148
|
+
0.02637933,
|
|
149
|
+
-0.41760305,
|
|
150
|
+
0.21809875,
|
|
151
|
+
0.53532768,
|
|
152
|
+
0.04833016,
|
|
153
|
+
0.07877711,
|
|
154
|
+
0.50464914,
|
|
155
|
+
-0.26972087,
|
|
156
|
+
-0.52416842,
|
|
157
|
+
],
|
|
158
|
+
[
|
|
159
|
+
0.29672002,
|
|
160
|
+
0.29005364,
|
|
161
|
+
0.18176298,
|
|
162
|
+
-0.05103382,
|
|
163
|
+
-0.34686519,
|
|
164
|
+
0.58024228,
|
|
165
|
+
-0.49282931,
|
|
166
|
+
0.62304281,
|
|
167
|
+
-0.09575202,
|
|
168
|
+
0.30115555,
|
|
169
|
+
0.09913529,
|
|
170
|
+
0.1577466,
|
|
171
|
+
-0.94391939,
|
|
172
|
+
-0.10505925,
|
|
173
|
+
0.05482389,
|
|
174
|
+
0.38409897,
|
|
175
|
+
],
|
|
176
|
+
[
|
|
177
|
+
-0.42212537,
|
|
178
|
+
0.12225749,
|
|
179
|
+
0.16279646,
|
|
180
|
+
0.60099009,
|
|
181
|
+
0.19734216,
|
|
182
|
+
0.42819919,
|
|
183
|
+
-0.33562418,
|
|
184
|
+
0.17036334,
|
|
185
|
+
0.4234109,
|
|
186
|
+
0.46681561,
|
|
187
|
+
-0.50347222,
|
|
188
|
+
-0.37936876,
|
|
189
|
+
0.1494825,
|
|
190
|
+
0.32176759,
|
|
191
|
+
0.28584684,
|
|
192
|
+
0.68469861,
|
|
193
|
+
],
|
|
194
|
+
[
|
|
195
|
+
0.18599294,
|
|
196
|
+
-0.44017825,
|
|
197
|
+
-0.4476952,
|
|
198
|
+
0.34340976,
|
|
199
|
+
0.44603553,
|
|
200
|
+
0.40974629,
|
|
201
|
+
-0.60045935,
|
|
202
|
+
-0.09056728,
|
|
203
|
+
0.22147919,
|
|
204
|
+
-0.33029418,
|
|
205
|
+
0.55635594,
|
|
206
|
+
-0.54149972,
|
|
207
|
+
0.05459062,
|
|
208
|
+
0.57334159,
|
|
209
|
+
-0.06227118,
|
|
210
|
+
0.65299872,
|
|
211
|
+
],
|
|
212
|
+
[
|
|
213
|
+
-0.19010428,
|
|
214
|
+
0.64418792,
|
|
215
|
+
-0.85286762,
|
|
216
|
+
0.21380295,
|
|
217
|
+
0.37639516,
|
|
218
|
+
-0.67753593,
|
|
219
|
+
0.38751609,
|
|
220
|
+
0.55746524,
|
|
221
|
+
0.01443766,
|
|
222
|
+
0.1776535,
|
|
223
|
+
0.62853954,
|
|
224
|
+
-0.15048523,
|
|
225
|
+
0.55100206,
|
|
226
|
+
-0.21426656,
|
|
227
|
+
0.3644061,
|
|
228
|
+
-0.0018255,
|
|
229
|
+
],
|
|
230
|
+
[
|
|
231
|
+
0.7350723,
|
|
232
|
+
0.10111267,
|
|
233
|
+
0.55640019,
|
|
234
|
+
-0.18226966,
|
|
235
|
+
0.51658102,
|
|
236
|
+
-0.19321508,
|
|
237
|
+
-0.46599027,
|
|
238
|
+
-0.02989911,
|
|
239
|
+
0.4036196,
|
|
240
|
+
-0.11978213,
|
|
241
|
+
-0.29837524,
|
|
242
|
+
-0.30232765,
|
|
243
|
+
-0.36738065,
|
|
244
|
+
-0.1379793,
|
|
245
|
+
0.04362871,
|
|
246
|
+
0.33553714,
|
|
247
|
+
],
|
|
248
|
+
[
|
|
249
|
+
0.41134047,
|
|
250
|
+
0.13512443,
|
|
251
|
+
0.62492322,
|
|
252
|
+
-0.10120261,
|
|
253
|
+
-0.03093491,
|
|
254
|
+
0.23751917,
|
|
255
|
+
-0.68338694,
|
|
256
|
+
0.05124762,
|
|
257
|
+
0.41533821,
|
|
258
|
+
0.46669353,
|
|
259
|
+
0.31467277,
|
|
260
|
+
-0.02427587,
|
|
261
|
+
0.15361135,
|
|
262
|
+
0.70595112,
|
|
263
|
+
-0.27952632,
|
|
264
|
+
0.32408931,
|
|
265
|
+
],
|
|
266
|
+
[
|
|
267
|
+
-0.33041265,
|
|
268
|
+
-0.43860065,
|
|
269
|
+
-0.5509376,
|
|
270
|
+
-0.04380843,
|
|
271
|
+
-0.35160935,
|
|
272
|
+
0.25134855,
|
|
273
|
+
0.53409314,
|
|
274
|
+
0.54850824,
|
|
275
|
+
0.59490287,
|
|
276
|
+
0.32669345,
|
|
277
|
+
-0.45355268,
|
|
278
|
+
-0.56317041,
|
|
279
|
+
-0.55416297,
|
|
280
|
+
0.18117841,
|
|
281
|
+
-0.71600849,
|
|
282
|
+
-0.08989825,
|
|
283
|
+
],
|
|
284
|
+
[
|
|
285
|
+
-0.40366849,
|
|
286
|
+
0.10978974,
|
|
287
|
+
0.0280101,
|
|
288
|
+
-0.46667987,
|
|
289
|
+
-0.45607028,
|
|
290
|
+
0.54114052,
|
|
291
|
+
-0.77552923,
|
|
292
|
+
-0.10720425,
|
|
293
|
+
0.55252091,
|
|
294
|
+
-0.34397153,
|
|
295
|
+
-0.59813694,
|
|
296
|
+
0.15567728,
|
|
297
|
+
0.03071009,
|
|
298
|
+
-0.02176143,
|
|
299
|
+
0.34442719,
|
|
300
|
+
0.14681541,
|
|
301
|
+
],
|
|
302
|
+
[
|
|
303
|
+
0.19280422,
|
|
304
|
+
0.35777863,
|
|
305
|
+
0.06139255,
|
|
306
|
+
0.20081699,
|
|
307
|
+
-0.30546596,
|
|
308
|
+
-0.56901549,
|
|
309
|
+
-0.15290953,
|
|
310
|
+
-0.31181573,
|
|
311
|
+
-0.74523217,
|
|
312
|
+
0.22296016,
|
|
313
|
+
-0.39143832,
|
|
314
|
+
-0.16474685,
|
|
315
|
+
0.58064427,
|
|
316
|
+
-0.77386654,
|
|
317
|
+
0.19713107,
|
|
318
|
+
-0.49477418,
|
|
319
|
+
],
|
|
320
|
+
[
|
|
321
|
+
-0.16133903,
|
|
322
|
+
0.22112761,
|
|
323
|
+
-0.53162136,
|
|
324
|
+
0.34764073,
|
|
325
|
+
-0.08522381,
|
|
326
|
+
-0.2510216,
|
|
327
|
+
0.04699411,
|
|
328
|
+
-0.25702389,
|
|
329
|
+
-0.8739765,
|
|
330
|
+
-0.24171728,
|
|
331
|
+
-0.24370533,
|
|
332
|
+
0.42193635,
|
|
333
|
+
0.41056913,
|
|
334
|
+
-0.60378211,
|
|
335
|
+
-0.65756832,
|
|
336
|
+
0.0845203,
|
|
337
|
+
],
|
|
338
|
+
[
|
|
339
|
+
-0.34792144,
|
|
340
|
+
0.18450939,
|
|
341
|
+
0.77038332,
|
|
342
|
+
0.63868511,
|
|
343
|
+
-0.06221681,
|
|
344
|
+
0.11930421,
|
|
345
|
+
0.04895523,
|
|
346
|
+
-0.22463059,
|
|
347
|
+
-0.03268844,
|
|
348
|
+
-0.58941354,
|
|
349
|
+
0.11640045,
|
|
350
|
+
0.32384901,
|
|
351
|
+
-0.42952779,
|
|
352
|
+
0.58119471,
|
|
353
|
+
0.07288662,
|
|
354
|
+
0.26669673,
|
|
355
|
+
],
|
|
356
|
+
[
|
|
357
|
+
0.01834555,
|
|
358
|
+
-0.16367754,
|
|
359
|
+
0.34900298,
|
|
360
|
+
0.45087949,
|
|
361
|
+
0.47073855,
|
|
362
|
+
-0.37377404,
|
|
363
|
+
0.0606911,
|
|
364
|
+
0.2455703,
|
|
365
|
+
-0.55182937,
|
|
366
|
+
-0.20261009,
|
|
367
|
+
0.28325423,
|
|
368
|
+
-0.04741146,
|
|
369
|
+
0.30565238,
|
|
370
|
+
-0.62090653,
|
|
371
|
+
0.17528413,
|
|
372
|
+
-0.60434975,
|
|
373
|
+
],
|
|
374
|
+
[
|
|
375
|
+
-0.55464981,
|
|
376
|
+
0.50918784,
|
|
377
|
+
-0.21371646,
|
|
378
|
+
-0.63996967,
|
|
379
|
+
-0.37656862,
|
|
380
|
+
0.27852662,
|
|
381
|
+
0.3287838,
|
|
382
|
+
-0.56800869,
|
|
383
|
+
0.23260763,
|
|
384
|
+
-0.20653106,
|
|
385
|
+
0.63261439,
|
|
386
|
+
-0.22666691,
|
|
387
|
+
0.00726302,
|
|
388
|
+
-0.60125196,
|
|
389
|
+
0.07139961,
|
|
390
|
+
-0.35086639,
|
|
391
|
+
],
|
|
392
|
+
[
|
|
393
|
+
0.94039731,
|
|
394
|
+
-0.25999326,
|
|
395
|
+
0.43922549,
|
|
396
|
+
-0.485738,
|
|
397
|
+
-0.20492235,
|
|
398
|
+
-0.26005626,
|
|
399
|
+
0.68776626,
|
|
400
|
+
0.57826888,
|
|
401
|
+
-0.05973995,
|
|
402
|
+
-0.1193658,
|
|
403
|
+
-0.12102433,
|
|
404
|
+
-0.22091354,
|
|
405
|
+
0.43427913,
|
|
406
|
+
0.71447886,
|
|
407
|
+
0.32745991,
|
|
408
|
+
0.03466398,
|
|
409
|
+
],
|
|
410
|
+
[
|
|
411
|
+
-0.13194625,
|
|
412
|
+
-0.12262688,
|
|
413
|
+
0.18029209,
|
|
414
|
+
0.16555524,
|
|
415
|
+
0.39594125,
|
|
416
|
+
-0.58110665,
|
|
417
|
+
0.16161717,
|
|
418
|
+
0.0839783,
|
|
419
|
+
0.0911945,
|
|
420
|
+
0.34546976,
|
|
421
|
+
-0.29415349,
|
|
422
|
+
0.29891936,
|
|
423
|
+
-0.60834721,
|
|
424
|
+
0.5943593,
|
|
425
|
+
-0.29473819,
|
|
426
|
+
0.4864154,
|
|
427
|
+
],
|
|
428
|
+
[
|
|
429
|
+
0.40850093,
|
|
430
|
+
-0.4638894,
|
|
431
|
+
-0.39732987,
|
|
432
|
+
-0.01972861,
|
|
433
|
+
0.51189582,
|
|
434
|
+
0.10176704,
|
|
435
|
+
0.37528519,
|
|
436
|
+
-0.41479418,
|
|
437
|
+
-0.1932531,
|
|
438
|
+
0.54732221,
|
|
439
|
+
-0.11876511,
|
|
440
|
+
0.32843973,
|
|
441
|
+
-0.259283,
|
|
442
|
+
0.59500132,
|
|
443
|
+
0.35168375,
|
|
444
|
+
-0.21733727,
|
|
445
|
+
],
|
|
446
|
+
[
|
|
447
|
+
-0.50627723,
|
|
448
|
+
-0.1973602,
|
|
449
|
+
-0.02339884,
|
|
450
|
+
-0.66846048,
|
|
451
|
+
0.62696606,
|
|
452
|
+
0.60049717,
|
|
453
|
+
0.69143364,
|
|
454
|
+
-0.48053591,
|
|
455
|
+
0.17812208,
|
|
456
|
+
-0.58481821,
|
|
457
|
+
-0.23551415,
|
|
458
|
+
-0.06229112,
|
|
459
|
+
0.20993116,
|
|
460
|
+
-0.72485884,
|
|
461
|
+
0.34375662,
|
|
462
|
+
-0.23539168,
|
|
463
|
+
],
|
|
464
|
+
[
|
|
465
|
+
-0.51388312,
|
|
466
|
+
-0.2788953,
|
|
467
|
+
0.00859533,
|
|
468
|
+
-0.5247195,
|
|
469
|
+
-0.18021544,
|
|
470
|
+
0.28372911,
|
|
471
|
+
0.10791359,
|
|
472
|
+
0.13033494,
|
|
473
|
+
0.34294013,
|
|
474
|
+
-0.70310089,
|
|
475
|
+
-0.13245433,
|
|
476
|
+
0.48661081,
|
|
477
|
+
0.08451644,
|
|
478
|
+
-0.69990992,
|
|
479
|
+
0.0408274,
|
|
480
|
+
-0.47204888,
|
|
481
|
+
],
|
|
482
|
+
[
|
|
483
|
+
0.68546275,
|
|
484
|
+
0.22581365,
|
|
485
|
+
-0.32571833,
|
|
486
|
+
0.34394298,
|
|
487
|
+
-0.43232367,
|
|
488
|
+
-0.5041842,
|
|
489
|
+
0.04784017,
|
|
490
|
+
-0.53067936,
|
|
491
|
+
-0.50049908,
|
|
492
|
+
0.36874221,
|
|
493
|
+
0.22429186,
|
|
494
|
+
0.4616482,
|
|
495
|
+
0.11159174,
|
|
496
|
+
-0.26827959,
|
|
497
|
+
-0.39372848,
|
|
498
|
+
-0.40987423,
|
|
499
|
+
],
|
|
500
|
+
]
|
|
501
|
+
)
|
|
502
|
+
|
|
503
|
+
bl62np = {}
|
|
504
|
+
vkk = list(bl62.keys())
|
|
189
505
|
for ii in range(20):
|
|
190
|
-
kk=vkk[ii]
|
|
191
|
-
bl62np[kk]=np.array(list(X[ii,])+[0]*Ndim*5)
|
|
506
|
+
kk = vkk[ii]
|
|
507
|
+
bl62np[kk] = np.array(list(X[ii,]) + [0] * Ndim * 5)
|
|
508
|
+
|
|
192
509
|
|
|
193
510
|
def EncodingCDR3(s, M, n0):
|
|
194
|
-
sL=list(s)
|
|
195
|
-
x=np.array([0]*n0)
|
|
511
|
+
sL = list(s)
|
|
512
|
+
x = np.array([0] * n0)
|
|
196
513
|
for ii in range(len(sL)):
|
|
197
|
-
x = np.dot(M, (x+bl62np[sL[ii]]))
|
|
514
|
+
x = np.dot(M, (x + bl62np[sL[ii]]))
|
|
198
515
|
return x
|
|
199
516
|
|
|
517
|
+
|
|
200
518
|
def BuildLengthDict(seqs, sIDs, vGene=[], INFO=[]):
|
|
201
|
-
LLs=[10,11,12,13,14,15,16,17,18,19,20,21,22,23,24]
|
|
202
|
-
LengthD={}
|
|
203
|
-
SeqD={}
|
|
204
|
-
VgeneD={}
|
|
205
|
-
InfoD={}
|
|
206
|
-
AAs=set(list(AAencodingDict.keys()))
|
|
207
|
-
NAs=len(AAencodingDict)
|
|
208
|
-
cNAs=0
|
|
519
|
+
LLs = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
|
|
520
|
+
LengthD = {}
|
|
521
|
+
SeqD = {}
|
|
522
|
+
VgeneD = {}
|
|
523
|
+
InfoD = {}
|
|
524
|
+
AAs = set(list(AAencodingDict.keys()))
|
|
525
|
+
NAs = len(AAencodingDict)
|
|
526
|
+
cNAs = 0
|
|
209
527
|
for ii in range(len(seqs)):
|
|
210
|
-
ID=sIDs[ii]
|
|
211
|
-
ss=seqs[ii]
|
|
212
|
-
ssAA=set(list(ss))
|
|
213
|
-
TMP=list(ssAA | AAs)
|
|
528
|
+
ID = sIDs[ii]
|
|
529
|
+
ss = seqs[ii]
|
|
530
|
+
ssAA = set(list(ss))
|
|
531
|
+
TMP = list(ssAA | AAs)
|
|
214
532
|
if len(TMP) > NAs:
|
|
215
533
|
## CDR3 containing non amino acid letter
|
|
216
|
-
#print('Warning: CDR3: '+ss + ' contains non amino acid letter!')
|
|
217
|
-
cNAs+=1
|
|
534
|
+
# print('Warning: CDR3: '+ss + ' contains non amino acid letter!')
|
|
535
|
+
cNAs += 1
|
|
218
536
|
continue
|
|
219
|
-
if len(vGene)>0:
|
|
220
|
-
vv=vGene[ii]
|
|
221
|
-
if len(INFO)>0:
|
|
222
|
-
info=INFO[ii]
|
|
223
|
-
L=len(ss)
|
|
537
|
+
if len(vGene) > 0:
|
|
538
|
+
vv = vGene[ii]
|
|
539
|
+
if len(INFO) > 0:
|
|
540
|
+
info = INFO[ii]
|
|
541
|
+
L = len(ss)
|
|
224
542
|
if L not in LLs:
|
|
225
543
|
continue
|
|
226
544
|
if L not in LengthD:
|
|
227
|
-
LengthD[L]=[ID]
|
|
228
|
-
SeqD[L]=[ss]
|
|
229
|
-
if len(vGene)>0:
|
|
230
|
-
VgeneD[L]=[vv]
|
|
231
|
-
if len(INFO)>0:
|
|
232
|
-
InfoD[L]=[info]
|
|
545
|
+
LengthD[L] = [ID]
|
|
546
|
+
SeqD[L] = [ss]
|
|
547
|
+
if len(vGene) > 0:
|
|
548
|
+
VgeneD[L] = [vv]
|
|
549
|
+
if len(INFO) > 0:
|
|
550
|
+
InfoD[L] = [info]
|
|
233
551
|
else:
|
|
234
552
|
LengthD[L].append(ID)
|
|
235
553
|
SeqD[L].append(ss)
|
|
236
|
-
if len(vGene)>0:
|
|
554
|
+
if len(vGene) > 0:
|
|
237
555
|
VgeneD[L].append(vv)
|
|
238
|
-
if len(INFO)>0:
|
|
556
|
+
if len(INFO) > 0:
|
|
239
557
|
InfoD[L].append(info)
|
|
240
|
-
if cNAs>0:
|
|
241
|
-
print("Warning: Skipped %d sequences with non AA letter!" %(cNAs))
|
|
558
|
+
if cNAs > 0:
|
|
559
|
+
print("Warning: Skipped %d sequences with non AA letter!" % (cNAs))
|
|
242
560
|
return LengthD, VgeneD, InfoD, SeqD
|
|
243
561
|
|
|
562
|
+
|
|
244
563
|
def CollapseUnique(LD, VD, ID, SD):
|
|
245
|
-
kks=LD.keys()
|
|
246
|
-
LDu={}
|
|
247
|
-
VDu={}
|
|
248
|
-
IDu={}
|
|
249
|
-
SDu={}
|
|
564
|
+
kks = LD.keys()
|
|
565
|
+
LDu = {}
|
|
566
|
+
VDu = {}
|
|
567
|
+
IDu = {}
|
|
568
|
+
SDu = {}
|
|
250
569
|
for kk in kks:
|
|
251
|
-
vvL=list(LD[kk])
|
|
252
|
-
if len(VD)>0:
|
|
253
|
-
vvV=list(VD[kk])
|
|
570
|
+
vvL = list(LD[kk])
|
|
571
|
+
if len(VD) > 0:
|
|
572
|
+
vvV = list(VD[kk])
|
|
254
573
|
else:
|
|
255
|
-
vvV=[
|
|
256
|
-
vvI=list(ID[kk])
|
|
257
|
-
vvS=list(SD[kk])
|
|
258
|
-
zz=zip(vvL, vvS, vvV, vvI)
|
|
259
|
-
zzs=sorted(zz, key
|
|
260
|
-
nz=len(zzs)
|
|
261
|
-
pointer_pre=0
|
|
262
|
-
pointer_cur=1
|
|
263
|
-
s_pre=zzs[pointer_pre][1]
|
|
264
|
-
v_pre=zzs[pointer_pre][2]
|
|
265
|
-
uS=[s_pre]
|
|
266
|
-
uV=[v_pre]
|
|
267
|
-
uI=[[zzs[pointer_pre][3]]]
|
|
574
|
+
vvV = ["TRBV2-1*01"] * len(vvL)
|
|
575
|
+
vvI = list(ID[kk])
|
|
576
|
+
vvS = list(SD[kk])
|
|
577
|
+
zz = zip(vvL, vvS, vvV, vvI)
|
|
578
|
+
zzs = sorted(zz, key=lambda x: (x[1], x[2]))
|
|
579
|
+
nz = len(zzs)
|
|
580
|
+
pointer_pre = 0
|
|
581
|
+
pointer_cur = 1
|
|
582
|
+
s_pre = zzs[pointer_pre][1]
|
|
583
|
+
v_pre = zzs[pointer_pre][2]
|
|
584
|
+
uS = [s_pre]
|
|
585
|
+
uV = [v_pre]
|
|
586
|
+
uI = [[zzs[pointer_pre][3]]]
|
|
268
587
|
while pointer_cur < nz:
|
|
269
|
-
s_cur=zzs[pointer_cur][1]
|
|
270
|
-
v_cur=zzs[pointer_cur][2]
|
|
588
|
+
s_cur = zzs[pointer_cur][1]
|
|
589
|
+
v_cur = zzs[pointer_cur][2]
|
|
271
590
|
if s_cur == s_pre and v_cur == v_pre:
|
|
272
|
-
uI[len(uI)-1].append(zzs[pointer_cur][3])
|
|
591
|
+
uI[len(uI) - 1].append(zzs[pointer_cur][3])
|
|
273
592
|
pointer_cur += 1
|
|
274
593
|
continue
|
|
275
594
|
else:
|
|
276
595
|
uS.append(s_cur)
|
|
277
596
|
uV.append(v_cur)
|
|
278
597
|
uI.append([zzs[pointer_cur][3]])
|
|
279
|
-
s_pre=s_cur
|
|
280
|
-
v_pre=v_cur
|
|
281
|
-
pointer_pre=pointer_cur
|
|
598
|
+
s_pre = s_cur
|
|
599
|
+
v_pre = v_cur
|
|
600
|
+
pointer_pre = pointer_cur
|
|
282
601
|
pointer_cur += 1
|
|
283
|
-
uL=[x for x in range(len(uS))]
|
|
284
|
-
LDu[kk]=uL
|
|
285
|
-
SDu[kk]=uS
|
|
286
|
-
if len(VD)>0:
|
|
287
|
-
VDu[kk]=uV
|
|
288
|
-
IDu[kk]=uI
|
|
602
|
+
uL = [x for x in range(len(uS))]
|
|
603
|
+
LDu[kk] = uL
|
|
604
|
+
SDu[kk] = uS
|
|
605
|
+
if len(VD) > 0:
|
|
606
|
+
VDu[kk] = uV
|
|
607
|
+
IDu[kk] = uI
|
|
289
608
|
return LDu, VDu, IDu, SDu
|
|
290
609
|
|
|
291
610
|
|
|
@@ -297,14 +616,15 @@ class CDR3:
|
|
|
297
616
|
## KS: Kmer size
|
|
298
617
|
## st: the first 0:(st-1) amino acids will not be included in K-merization
|
|
299
618
|
## ed: the last L-ed amino acids will be skipped
|
|
300
|
-
self.s=s
|
|
301
|
-
self.ID=sID
|
|
302
|
-
L=len(s)
|
|
303
|
-
self.L=L
|
|
304
|
-
sub_s=s[st: (L-ed)]
|
|
305
|
-
Ls=len(sub_s)
|
|
306
|
-
Kmer=[sub_s[x:(x+KS)] for x in range(0,Ls-KS+1)]
|
|
307
|
-
self.Kmer=Kmer
|
|
619
|
+
self.s = s
|
|
620
|
+
self.ID = sID
|
|
621
|
+
L = len(s)
|
|
622
|
+
self.L = L
|
|
623
|
+
sub_s = s[st : (L - ed)]
|
|
624
|
+
Ls = len(sub_s)
|
|
625
|
+
Kmer = [sub_s[x : (x + KS)] for x in range(0, Ls - KS + 1)]
|
|
626
|
+
self.Kmer = Kmer
|
|
627
|
+
|
|
308
628
|
|
|
309
629
|
class KmerSet:
|
|
310
630
|
## Kmer set for fast read searching based on mismatch-allowed Kmer index
|
|
@@ -313,263 +633,277 @@ class KmerSet:
|
|
|
313
633
|
## Seqs and sIDs must have the same length
|
|
314
634
|
if len(Seqs) != len(sIDs):
|
|
315
635
|
raise "Sequence and ID lists have different length. Please check input."
|
|
316
|
-
KmerDict={}
|
|
317
|
-
N=len(Seqs)
|
|
318
|
-
self.N=N
|
|
319
|
-
CDR3Dict={}
|
|
320
|
-
LLs=[]
|
|
321
|
-
for ii in range(0,N):
|
|
322
|
-
s=Seqs[ii]
|
|
323
|
-
sID=sIDs[ii]
|
|
324
|
-
cc=CDR3(s,sID,KS,st,ed)
|
|
325
|
-
CDR3Dict[cc.ID]=cc.Kmer
|
|
326
|
-
KK=cc.Kmer
|
|
636
|
+
KmerDict = {}
|
|
637
|
+
N = len(Seqs)
|
|
638
|
+
self.N = N
|
|
639
|
+
CDR3Dict = {}
|
|
640
|
+
LLs = []
|
|
641
|
+
for ii in range(0, N):
|
|
642
|
+
s = Seqs[ii]
|
|
643
|
+
sID = sIDs[ii]
|
|
644
|
+
cc = CDR3(s, sID, KS, st, ed)
|
|
645
|
+
CDR3Dict[cc.ID] = cc.Kmer
|
|
646
|
+
KK = cc.Kmer
|
|
327
647
|
LLs.append(cc.L)
|
|
328
648
|
for kk in KK:
|
|
329
649
|
if kk not in KmerDict:
|
|
330
|
-
KmerDict[kk]=[sID]
|
|
650
|
+
KmerDict[kk] = [sID]
|
|
331
651
|
else:
|
|
332
652
|
KmerDict[kk].append(sID)
|
|
333
|
-
self.KD=KmerDict
|
|
334
|
-
self.KS=KS
|
|
335
|
-
self.CD=CDR3Dict
|
|
336
|
-
self.LL=LLs
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
653
|
+
self.KD = KmerDict
|
|
654
|
+
self.KS = KS
|
|
655
|
+
self.CD = CDR3Dict
|
|
656
|
+
self.LL = LLs
|
|
657
|
+
|
|
658
|
+
def FindKmerNeighbor(self, kk):
|
|
659
|
+
KS = self.KS
|
|
660
|
+
KS_n1 = []
|
|
340
661
|
for jj in range(KS):
|
|
341
|
-
kk_pre=[kk[0:jj]]*20
|
|
342
|
-
kk_suf=[kk[(jj+1):KS]]*20
|
|
343
|
-
kkn=list(zip(kk_pre,AAstringList,kk_suf))
|
|
344
|
-
KS_n1+=[
|
|
662
|
+
kk_pre = [kk[0:jj]] * 20
|
|
663
|
+
kk_suf = [kk[(jj + 1) : KS]] * 20
|
|
664
|
+
kkn = list(zip(kk_pre, AAstringList, kk_suf))
|
|
665
|
+
KS_n1 += ["".join(list(x)) for x in kkn]
|
|
345
666
|
return KS_n1
|
|
346
|
-
|
|
667
|
+
|
|
668
|
+
def FindKmerNeighbor2(self, kk):
|
|
347
669
|
## KS>=6, allowing 2 mismatches. CDR3 length must be >= 10
|
|
348
|
-
KS=self.KS
|
|
349
|
-
KS_n1=[]
|
|
670
|
+
KS = self.KS
|
|
671
|
+
KS_n1 = []
|
|
350
672
|
for jj in range(KS):
|
|
351
673
|
for ii in range(KS):
|
|
352
|
-
if ii<=jj:
|
|
674
|
+
if ii <= jj:
|
|
353
675
|
continue
|
|
354
|
-
kk_pre=[kk[0:jj]]*20
|
|
355
|
-
kk_mid=[kk[(jj+1):ii]]*20
|
|
356
|
-
kk_suf=[kk[(ii+1):KS]]*400
|
|
357
|
-
kkn=list(zip(kk_pre,AAstringList,kk_mid))
|
|
358
|
-
kkn=[
|
|
359
|
-
kkn=[[x]*20 for x in kkn]
|
|
360
|
-
kkn=list(chain(*kkn))
|
|
361
|
-
kkn2=list(zip(kkn, AAstringList*20, kk_suf))
|
|
362
|
-
kkn2=[
|
|
363
|
-
KS_n1+=kkn2
|
|
676
|
+
kk_pre = [kk[0:jj]] * 20
|
|
677
|
+
kk_mid = [kk[(jj + 1) : ii]] * 20
|
|
678
|
+
kk_suf = [kk[(ii + 1) : KS]] * 400
|
|
679
|
+
kkn = list(zip(kk_pre, AAstringList, kk_mid))
|
|
680
|
+
kkn = ["".join(list(x)) for x in kkn]
|
|
681
|
+
kkn = [[x] * 20 for x in kkn]
|
|
682
|
+
kkn = list(chain(*kkn))
|
|
683
|
+
kkn2 = list(zip(kkn, AAstringList * 20, kk_suf))
|
|
684
|
+
kkn2 = ["".join(list(x)) for x in kkn2]
|
|
685
|
+
KS_n1 += kkn2
|
|
364
686
|
return KS_n1
|
|
687
|
+
|
|
365
688
|
def KmerIndex(self):
|
|
366
689
|
## For each K-mer, find its nearest neighbor with 1 character mismatch
|
|
367
|
-
KKs=list(self.KD.keys())
|
|
368
|
-
KS=self.KS
|
|
369
|
-
KKs_set=set(KKs)
|
|
370
|
-
Skk=
|
|
371
|
-
KI_Dict={}
|
|
690
|
+
KKs = list(self.KD.keys())
|
|
691
|
+
KS = self.KS
|
|
692
|
+
KKs_set = set(KKs)
|
|
693
|
+
Skk = "_".join(KKs)
|
|
694
|
+
KI_Dict = {}
|
|
372
695
|
for kk in KKs:
|
|
373
|
-
## kk_neighbor=[]
|
|
374
|
-
## for jj in range(KS):
|
|
375
|
-
## kk_pre=kk[0:jj]
|
|
376
|
-
## kk_suf=kk[(jj+1):KS]
|
|
377
|
-
## pat=kk_pre+'['+AAstring+']{1}'+kk_suf
|
|
378
|
-
## p=re.compile(pat)
|
|
379
|
-
## mm=[m.group() for m in p.finditer(Skk)]
|
|
380
|
-
## kk_neighbor+=mm
|
|
381
|
-
KS_n=set(self.FindKmerNeighbor(kk))
|
|
696
|
+
## kk_neighbor=[]
|
|
697
|
+
## for jj in range(KS):
|
|
698
|
+
## kk_pre=kk[0:jj]
|
|
699
|
+
## kk_suf=kk[(jj+1):KS]
|
|
700
|
+
## pat=kk_pre+'['+AAstring+']{1}'+kk_suf
|
|
701
|
+
## p=re.compile(pat)
|
|
702
|
+
## mm=[m.group() for m in p.finditer(Skk)]
|
|
703
|
+
## kk_neighbor+=mm
|
|
704
|
+
KS_n = set(self.FindKmerNeighbor(kk))
|
|
382
705
|
kk_neighbor = KS_n & KKs_set
|
|
383
|
-
KI_Dict[kk]=list(kk_neighbor)
|
|
706
|
+
KI_Dict[kk] = list(kk_neighbor)
|
|
384
707
|
return KI_Dict
|
|
708
|
+
|
|
385
709
|
def updateKD(self, KI):
|
|
386
710
|
## group sequences sharing motifs with 1-2 mismatches
|
|
387
|
-
KD=self.KD
|
|
388
|
-
KDnew={}
|
|
711
|
+
KD = self.KD
|
|
712
|
+
KDnew = {}
|
|
389
713
|
for kk in KD:
|
|
390
|
-
kkm=KI[kk]
|
|
391
|
-
vvL=itemgetter(*kkm)(KD)
|
|
392
|
-
if isinstance(vvL[0],list):
|
|
393
|
-
vvL=list(chain(*vvL))
|
|
394
|
-
KDnew[kk]=vvL
|
|
714
|
+
kkm = KI[kk]
|
|
715
|
+
vvL = itemgetter(*kkm)(KD)
|
|
716
|
+
if isinstance(vvL[0], list):
|
|
717
|
+
vvL = list(chain(*vvL))
|
|
718
|
+
KDnew[kk] = vvL
|
|
395
719
|
return KDnew
|
|
396
720
|
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
721
|
+
|
|
722
|
+
def GenerateMotifGraph(mD, seqs, seqID):
|
|
723
|
+
SeqShareGraph = {}
|
|
724
|
+
mDL = {}
|
|
400
725
|
for kk in mD:
|
|
401
|
-
vv=mD[kk]
|
|
402
|
-
LL=[]
|
|
726
|
+
vv = mD[kk]
|
|
727
|
+
LL = []
|
|
403
728
|
for v in vv:
|
|
404
729
|
LL.append(len(seqs[v]))
|
|
405
|
-
mDL[kk]=LL
|
|
730
|
+
mDL[kk] = LL
|
|
406
731
|
for kk in mD:
|
|
407
|
-
vv=mD[kk]
|
|
408
|
-
LL=mDL[kk]
|
|
409
|
-
nv=len(vv)
|
|
410
|
-
for ii in range(0,nv):
|
|
411
|
-
id_1=vv[ii]
|
|
412
|
-
L1=LL[ii]
|
|
413
|
-
for jj in range(ii,nv):
|
|
414
|
-
if jj==ii:
|
|
732
|
+
vv = mD[kk]
|
|
733
|
+
LL = mDL[kk]
|
|
734
|
+
nv = len(vv)
|
|
735
|
+
for ii in range(0, nv):
|
|
736
|
+
id_1 = vv[ii]
|
|
737
|
+
L1 = LL[ii]
|
|
738
|
+
for jj in range(ii, nv):
|
|
739
|
+
if jj == ii:
|
|
415
740
|
continue
|
|
416
|
-
id_2=vv[jj]
|
|
417
|
-
L2=LL[jj]
|
|
741
|
+
id_2 = vv[jj]
|
|
742
|
+
L2 = LL[jj]
|
|
418
743
|
if L2 != L1:
|
|
419
744
|
continue
|
|
420
745
|
if id_1 not in SeqShareGraph:
|
|
421
|
-
SeqShareGraph[id_1]=[id_2]
|
|
746
|
+
SeqShareGraph[id_1] = [id_2]
|
|
422
747
|
elif id_2 not in SeqShareGraph[id_1]:
|
|
423
748
|
SeqShareGraph[id_1].append(id_2)
|
|
424
749
|
if id_2 not in SeqShareGraph:
|
|
425
|
-
SeqShareGraph[id_2]=[id_1]
|
|
750
|
+
SeqShareGraph[id_2] = [id_1]
|
|
426
751
|
elif id_1 not in SeqShareGraph[id_2]:
|
|
427
752
|
SeqShareGraph[id_2].append(id_1)
|
|
428
753
|
return SeqShareGraph
|
|
429
754
|
|
|
755
|
+
|
|
430
756
|
def generateSSG(Kset, CDR3s, k_thr=2):
|
|
431
|
-
KD=Kset.KD
|
|
432
|
-
KI=Kset.KmerIndex()
|
|
433
|
-
KDnew=Kset.updateKD(KI)
|
|
434
|
-
CD=Kset.CD
|
|
435
|
-
LL=np.array(Kset.LL)
|
|
436
|
-
SSG={}
|
|
757
|
+
KD = Kset.KD
|
|
758
|
+
KI = Kset.KmerIndex()
|
|
759
|
+
KDnew = Kset.updateKD(KI)
|
|
760
|
+
CD = Kset.CD
|
|
761
|
+
LL = np.array(Kset.LL)
|
|
762
|
+
SSG = {}
|
|
437
763
|
for kk in CD:
|
|
438
|
-
vv=itemgetter(*CD[kk])(KDnew)
|
|
439
|
-
if isinstance(vv[0],list):
|
|
440
|
-
vv=list(chain(*vv))
|
|
441
|
-
vv1=[]
|
|
442
|
-
c=Counter(vv)
|
|
764
|
+
vv = itemgetter(*CD[kk])(KDnew)
|
|
765
|
+
if isinstance(vv[0], list):
|
|
766
|
+
vv = list(chain(*vv))
|
|
767
|
+
vv1 = []
|
|
768
|
+
c = Counter(vv)
|
|
443
769
|
for k in c:
|
|
444
|
-
if c[k]>=k_thr:
|
|
770
|
+
if c[k] >= k_thr:
|
|
445
771
|
vv1.append(k)
|
|
446
|
-
vv1=np.array(vv1)
|
|
447
|
-
if len(vv1)==0:
|
|
772
|
+
vv1 = np.array(vv1)
|
|
773
|
+
if len(vv1) == 0:
|
|
448
774
|
continue
|
|
449
|
-
cdr3=CDR3s[kk]
|
|
450
|
-
L0=len(cdr3)
|
|
451
|
-
idx=np.where(LL[vv1]==L0)[0]
|
|
452
|
-
if len(idx)==0:
|
|
775
|
+
cdr3 = CDR3s[kk]
|
|
776
|
+
L0 = len(cdr3)
|
|
777
|
+
idx = np.where(LL[vv1] == L0)[0]
|
|
778
|
+
if len(idx) == 0:
|
|
453
779
|
continue
|
|
454
|
-
vvs=list(vv1[idx])
|
|
780
|
+
vvs = list(vv1[idx])
|
|
455
781
|
vvs.remove(kk)
|
|
456
|
-
if len(vvs)>0:
|
|
457
|
-
SSG[kk]=vvs
|
|
782
|
+
if len(vvs) > 0:
|
|
783
|
+
SSG[kk] = vvs
|
|
458
784
|
return SSG
|
|
459
785
|
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
786
|
+
|
|
787
|
+
def SeqComparison(s1, s2, gap=-6):
|
|
788
|
+
n = len(s1)
|
|
789
|
+
CorList = []
|
|
790
|
+
score = 0
|
|
791
|
+
for kk in range(0, n):
|
|
792
|
+
aa = s1[kk]
|
|
793
|
+
bb = s2[kk]
|
|
794
|
+
if aa in [".", "-", "*"] or bb in [".", "-", "*"]:
|
|
795
|
+
if aa != bb:
|
|
469
796
|
score += gap
|
|
470
797
|
continue
|
|
471
|
-
if aa==bb:
|
|
472
|
-
# score += min(4,blosum62[(aa,aa)])
|
|
473
|
-
score += blosum62n[(aa,aa)]
|
|
798
|
+
if aa == bb:
|
|
799
|
+
# score += min(4,blosum62[(aa,aa)])
|
|
800
|
+
score += blosum62n[(aa, aa)]
|
|
474
801
|
continue
|
|
475
|
-
KEY=(aa,bb)
|
|
476
|
-
# if KEY not in blosum62:
|
|
477
|
-
# KEY=(bb,aa)
|
|
478
|
-
# if KEY not in blosum62:
|
|
479
|
-
# raise "Non-standard amino acid coding!"
|
|
480
|
-
score+=blosum62n[KEY]
|
|
802
|
+
KEY = (aa, bb)
|
|
803
|
+
# if KEY not in blosum62:
|
|
804
|
+
# KEY=(bb,aa)
|
|
805
|
+
# if KEY not in blosum62:
|
|
806
|
+
# raise "Non-standard amino acid coding!"
|
|
807
|
+
score += blosum62n[KEY]
|
|
481
808
|
return score
|
|
482
809
|
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
Seq1
|
|
489
|
-
Seq2
|
|
490
|
-
|
|
810
|
+
|
|
811
|
+
def NHLocalAlignment(Seq1, Seq2, gap_thr=1, gap=-6):
|
|
812
|
+
n1 = len(Seq1)
|
|
813
|
+
n2 = len(Seq2)
|
|
814
|
+
if n1 < n2:
|
|
815
|
+
Seq = Seq1
|
|
816
|
+
Seq1 = Seq2
|
|
817
|
+
Seq2 = Seq
|
|
818
|
+
nn = n2 - n1
|
|
491
819
|
else:
|
|
492
|
-
nn=n1-n2
|
|
493
|
-
if nn>gap_thr:
|
|
820
|
+
nn = n1 - n2
|
|
821
|
+
if nn > gap_thr:
|
|
494
822
|
return -1
|
|
495
|
-
SeqList1=[Seq1]
|
|
496
|
-
SeqList2=InsertGap(Seq2,nn)
|
|
497
|
-
alns=[]
|
|
498
|
-
SCOREList=[]
|
|
823
|
+
SeqList1 = [Seq1]
|
|
824
|
+
SeqList2 = InsertGap(Seq2, nn)
|
|
825
|
+
alns = []
|
|
826
|
+
SCOREList = []
|
|
499
827
|
for s1 in SeqList1:
|
|
500
828
|
for s2 in SeqList2:
|
|
501
|
-
|
|
502
|
-
maxS=max(SCOREList)
|
|
829
|
+
SCOREList.append(SeqComparison(s1, s2, gap))
|
|
830
|
+
maxS = max(SCOREList)
|
|
503
831
|
return maxS
|
|
504
832
|
|
|
505
|
-
|
|
833
|
+
|
|
834
|
+
def InsertGap(Seq, n):
|
|
506
835
|
## Insert n gaps to Seq; n<=2
|
|
507
|
-
if n==0:
|
|
836
|
+
if n == 0:
|
|
508
837
|
return [Seq]
|
|
509
|
-
ns=len(Seq)
|
|
510
|
-
SeqList=[]
|
|
511
|
-
if
|
|
512
|
-
for kk in range(0,ns+1):
|
|
513
|
-
SeqNew=Seq[0:kk]+
|
|
838
|
+
ns = len(Seq)
|
|
839
|
+
SeqList = []
|
|
840
|
+
if n == 1:
|
|
841
|
+
for kk in range(0, ns + 1):
|
|
842
|
+
SeqNew = Seq[0:kk] + "-" + Seq[kk:]
|
|
514
843
|
SeqList.append(SeqNew)
|
|
515
|
-
if
|
|
516
|
-
for kk in range(0,ns+1):
|
|
517
|
-
SeqNew=Seq[0:kk]+
|
|
518
|
-
for jj in range(0,ns+2):
|
|
519
|
-
SeqNew0=SeqNew[0:jj]+
|
|
844
|
+
if n == 2:
|
|
845
|
+
for kk in range(0, ns + 1):
|
|
846
|
+
SeqNew = Seq[0:kk] + "-" + Seq[kk:]
|
|
847
|
+
for jj in range(0, ns + 2):
|
|
848
|
+
SeqNew0 = SeqNew[0:jj] + "-" + SeqNew[jj:]
|
|
520
849
|
SeqList.append(SeqNew0)
|
|
521
850
|
return SeqList
|
|
522
851
|
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
852
|
+
|
|
853
|
+
def falign(s1, s2, V1, V2, st, VScore={}, UseV=True, gapn=1, gap=-6):
|
|
854
|
+
mid1 = s1[st:-2]
|
|
855
|
+
mid2 = s2[st:-2]
|
|
526
856
|
if UseV:
|
|
527
|
-
if V2==V1:
|
|
528
|
-
V_score=4
|
|
857
|
+
if V2 == V1:
|
|
858
|
+
V_score = 4
|
|
529
859
|
else:
|
|
530
|
-
Vkey=(V1,V2)
|
|
860
|
+
Vkey = (V1, V2)
|
|
531
861
|
if Vkey not in VScore:
|
|
532
|
-
Vkey=(V2,V1)
|
|
862
|
+
Vkey = (V2, V1)
|
|
533
863
|
if Vkey not in VScore:
|
|
534
|
-
#print("V gene not found!")
|
|
864
|
+
# print("V gene not found!")
|
|
535
865
|
return 0
|
|
536
866
|
else:
|
|
537
|
-
V_score=VScore[Vkey]/20.0
|
|
867
|
+
V_score = VScore[Vkey] / 20.0
|
|
538
868
|
else:
|
|
539
|
-
V_score=4.0
|
|
540
|
-
aln=NHLocalAlignment(mid1,mid2,gapn,gap)
|
|
541
|
-
score=aln/float(max(len(mid1),len(mid2)))+V_score
|
|
869
|
+
V_score = 4.0
|
|
870
|
+
aln = NHLocalAlignment(mid1, mid2, gapn, gap)
|
|
871
|
+
score = aln / float(max(len(mid1), len(mid2))) + V_score
|
|
542
872
|
return score
|
|
543
873
|
|
|
874
|
+
|
|
544
875
|
def UpdateSSG(SSG, seqs, Vgenes, Vscore={}, UseV=True, gap=-6, gapn=1, cutoff=7.5):
|
|
545
|
-
SSGnew={}
|
|
546
|
-
count=0
|
|
547
|
-
t1=time.time()
|
|
548
|
-
N=len(list(chain(*list(SSG.values()))))
|
|
549
|
-
# print("Number of pairs to be processed: %d" %N)
|
|
876
|
+
SSGnew = {}
|
|
877
|
+
count = 0
|
|
878
|
+
t1 = time.time()
|
|
879
|
+
N = len(list(chain(*list(SSG.values()))))
|
|
880
|
+
# print("Number of pairs to be processed: %d" %N)
|
|
550
881
|
for kk in SSG:
|
|
551
|
-
s1=seqs[kk]
|
|
552
|
-
V1=Vgenes[kk]
|
|
553
|
-
VV=SSG[kk]
|
|
882
|
+
s1 = seqs[kk]
|
|
883
|
+
V1 = Vgenes[kk]
|
|
884
|
+
VV = SSG[kk]
|
|
554
885
|
for vv in VV:
|
|
555
|
-
s2=seqs[vv]
|
|
556
|
-
V2=Vgenes[vv]
|
|
557
|
-
score=falign(
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
886
|
+
s2 = seqs[vv]
|
|
887
|
+
V2 = Vgenes[vv]
|
|
888
|
+
score = falign(
|
|
889
|
+
s1, s2, V1, V2, st=3, VScore=Vscore, UseV=UseV, gap=-6, gapn=1
|
|
890
|
+
)
|
|
891
|
+
count += 1
|
|
892
|
+
if count % 1000000 == 0:
|
|
893
|
+
t2 = time.time()
|
|
894
|
+
# print("Processed %d pairs. Elapsed time %f" %(count, t2-t1))
|
|
895
|
+
if score >= cutoff:
|
|
563
896
|
if kk not in SSGnew:
|
|
564
|
-
SSGnew[kk]=[vv]
|
|
897
|
+
SSGnew[kk] = [vv]
|
|
565
898
|
else:
|
|
566
899
|
SSGnew[kk].append(vv)
|
|
567
900
|
return SSGnew
|
|
568
901
|
|
|
902
|
+
|
|
569
903
|
def dfs(graph, start):
|
|
570
|
-
|
|
904
|
+
"""
|
|
571
905
|
Non-resursive depth first search
|
|
572
|
-
|
|
906
|
+
"""
|
|
573
907
|
visited = set()
|
|
574
908
|
stack = [start]
|
|
575
909
|
while stack:
|
|
@@ -577,95 +911,100 @@ def dfs(graph, start):
|
|
|
577
911
|
if vertex not in visited:
|
|
578
912
|
visited.add(vertex)
|
|
579
913
|
stack.extend(set(graph[vertex]) - visited)
|
|
580
|
-
|
|
914
|
+
|
|
581
915
|
return visited
|
|
582
916
|
|
|
917
|
+
|
|
583
918
|
def IdentifyMotifCluster(SSG):
|
|
584
919
|
## Input SeqShareGraph dictionary representation of sparse matrix
|
|
585
|
-
POS=set(SSG.keys())
|
|
586
|
-
NP=len(POS)
|
|
587
|
-
ClusterList=[]
|
|
588
|
-
tmpL=set(chain(*ClusterList))
|
|
589
|
-
count=0
|
|
920
|
+
POS = set(SSG.keys())
|
|
921
|
+
NP = len(POS)
|
|
922
|
+
ClusterList = []
|
|
923
|
+
tmpL = set(chain(*ClusterList))
|
|
924
|
+
count = 0
|
|
590
925
|
while 1:
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
# STACK=LoadComm([],ii)
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
# tmpL=set(chain(*ClusterList))
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
926
|
+
xx = POS ^ tmpL
|
|
927
|
+
if len(xx) == 0:
|
|
928
|
+
break
|
|
929
|
+
for ii in xx:
|
|
930
|
+
# STACK=LoadComm([],ii)
|
|
931
|
+
STACK = dfs(SSG, ii)
|
|
932
|
+
tmpL = tmpL | STACK
|
|
933
|
+
ClusterList.append(list(STACK))
|
|
934
|
+
# tmpL=set(chain(*ClusterList))
|
|
935
|
+
count += 1
|
|
936
|
+
if count % 200 == 0:
|
|
937
|
+
print(" Solved %d clusters" % (count))
|
|
938
|
+
break
|
|
604
939
|
return ClusterList
|
|
605
940
|
|
|
941
|
+
|
|
606
942
|
def IdentifyVgeneCluster(sMat):
|
|
607
943
|
## Input Vgene score matrix
|
|
608
|
-
vG={}
|
|
609
|
-
n=len(sMat)
|
|
610
|
-
IDs=[x for x in range(n)]
|
|
944
|
+
vG = {}
|
|
945
|
+
n = len(sMat)
|
|
946
|
+
IDs = [x for x in range(n)]
|
|
611
947
|
for kk in IDs:
|
|
612
|
-
LL=sMat[:,kk]
|
|
613
|
-
vL=np.where(LL>=thr_v)[0]
|
|
614
|
-
if len(vL)>0:
|
|
615
|
-
vG[kk]=vL
|
|
616
|
-
CL=IdentifyMotifCluster(vG)
|
|
948
|
+
LL = sMat[:, kk]
|
|
949
|
+
vL = np.where(LL >= thr_v)[0]
|
|
950
|
+
if len(vL) > 0:
|
|
951
|
+
vG[kk] = vL
|
|
952
|
+
CL = IdentifyMotifCluster(vG)
|
|
617
953
|
return CL
|
|
618
|
-
|
|
954
|
+
|
|
955
|
+
|
|
619
956
|
def ParseFa(fname):
|
|
620
|
-
InputStr=open(fname).readlines()
|
|
621
|
-
FaDict={}
|
|
622
|
-
seq=
|
|
957
|
+
InputStr = open(fname).readlines()
|
|
958
|
+
FaDict = {}
|
|
959
|
+
seq = ""
|
|
623
960
|
for line in InputStr:
|
|
624
|
-
if line.startswith(
|
|
625
|
-
if len(seq)>0:
|
|
626
|
-
FaDict[seqHead]=seq
|
|
627
|
-
seq=
|
|
628
|
-
seqHead=line.strip()
|
|
961
|
+
if line.startswith(">"):
|
|
962
|
+
if len(seq) > 0:
|
|
963
|
+
FaDict[seqHead] = seq
|
|
964
|
+
seq = ""
|
|
965
|
+
seqHead = line.strip()
|
|
629
966
|
else:
|
|
630
|
-
seq+=line.strip()
|
|
967
|
+
seq += line.strip()
|
|
631
968
|
if seqHead not in FaDict:
|
|
632
|
-
FaDict[seqHead]=seq
|
|
969
|
+
FaDict[seqHead] = seq
|
|
633
970
|
return FaDict
|
|
634
971
|
|
|
972
|
+
|
|
635
973
|
def PreCalculateVgeneDist(VgeneFa="Imgt_Human_TRBV.fasta"):
|
|
636
974
|
## Only run one time if needed
|
|
637
|
-
FaDict=ParseFa(cur_dir+VgeneFa)
|
|
638
|
-
VScore={}
|
|
639
|
-
CDR1Dict={}
|
|
640
|
-
CDR2Dict={}
|
|
975
|
+
FaDict = ParseFa(cur_dir + VgeneFa)
|
|
976
|
+
VScore = {}
|
|
977
|
+
CDR1Dict = {}
|
|
978
|
+
CDR2Dict = {}
|
|
641
979
|
for kk in FaDict:
|
|
642
|
-
if
|
|
643
|
-
VV=kk.split(
|
|
980
|
+
if "|" in kk:
|
|
981
|
+
VV = kk.split("|")[1]
|
|
644
982
|
else:
|
|
645
|
-
VV=kk[1:]
|
|
646
|
-
CDR1Dict[VV]=FaDict[kk][26:37] ## Imgt CDR1: 27 - 38
|
|
647
|
-
CDR2Dict[VV]=FaDict[kk][55:64] ## Imgt CDR2: 56 - 65
|
|
648
|
-
Vkeys=list(CDR1Dict.keys())
|
|
649
|
-
nn=len(Vkeys)
|
|
650
|
-
for ii in range(0,nn):
|
|
651
|
-
V1=Vkeys[ii]
|
|
652
|
-
s1_CDR1=CDR1Dict[V1]
|
|
653
|
-
s1_CDR2=CDR2Dict[V1]
|
|
654
|
-
for jj in range(ii,nn):
|
|
655
|
-
V2=Vkeys[jj]
|
|
656
|
-
s2_CDR1=CDR1Dict[V2]
|
|
657
|
-
s2_CDR2=CDR2Dict[V2]
|
|
658
|
-
score1=SeqComparison(s1_CDR1,s2_CDR1)
|
|
659
|
-
score2=SeqComparison(s1_CDR2,s2_CDR2)
|
|
660
|
-
#print score1+score2
|
|
661
|
-
VScore[(V1,V2)]=score1+score2
|
|
662
|
-
gg=open(
|
|
983
|
+
VV = kk[1:]
|
|
984
|
+
CDR1Dict[VV] = FaDict[kk][26:37] ## Imgt CDR1: 27 - 38
|
|
985
|
+
CDR2Dict[VV] = FaDict[kk][55:64] ## Imgt CDR2: 56 - 65
|
|
986
|
+
Vkeys = list(CDR1Dict.keys())
|
|
987
|
+
nn = len(Vkeys)
|
|
988
|
+
for ii in range(0, nn):
|
|
989
|
+
V1 = Vkeys[ii]
|
|
990
|
+
s1_CDR1 = CDR1Dict[V1]
|
|
991
|
+
s1_CDR2 = CDR2Dict[V1]
|
|
992
|
+
for jj in range(ii, nn):
|
|
993
|
+
V2 = Vkeys[jj]
|
|
994
|
+
s2_CDR1 = CDR1Dict[V2]
|
|
995
|
+
s2_CDR2 = CDR2Dict[V2]
|
|
996
|
+
score1 = SeqComparison(s1_CDR1, s2_CDR1)
|
|
997
|
+
score2 = SeqComparison(s1_CDR2, s2_CDR2)
|
|
998
|
+
# print score1+score2
|
|
999
|
+
VScore[(V1, V2)] = score1 + score2
|
|
1000
|
+
gg = open("VgeneScores.txt", "w")
|
|
663
1001
|
for kk in VScore:
|
|
664
|
-
vv=VScore[kk]
|
|
665
|
-
line=kk[0]+
|
|
1002
|
+
vv = VScore[kk]
|
|
1003
|
+
line = kk[0] + "\t" + kk[1] + "\t" + str(vv) + "\n"
|
|
666
1004
|
gg.write(line)
|
|
667
1005
|
gg.close()
|
|
668
1006
|
|
|
1007
|
+
|
|
669
1008
|
def MergeCL(Cls):
|
|
670
1009
|
## merge pre-clusters according to shared sequences
|
|
671
1010
|
## shared sequences between pre-clusters are due to approximated centroid nearest neighbor search
|
|
@@ -673,16 +1012,16 @@ def MergeCL(Cls):
|
|
|
673
1012
|
for idx, cc in enumerate(Cls):
|
|
674
1013
|
for x in cc:
|
|
675
1014
|
if x not in vDict:
|
|
676
|
-
vDict[x]=[idx]
|
|
1015
|
+
vDict[x] = [idx]
|
|
677
1016
|
else:
|
|
678
1017
|
vDict[x].append(idx)
|
|
679
|
-
Cls_new=[]
|
|
1018
|
+
Cls_new = []
|
|
680
1019
|
cGraph = {}
|
|
681
1020
|
for kk in vDict:
|
|
682
|
-
vv=vDict[kk]
|
|
683
|
-
if len(vv)>1:
|
|
1021
|
+
vv = vDict[kk]
|
|
1022
|
+
if len(vv) > 1:
|
|
684
1023
|
for ii in vv:
|
|
685
|
-
vv1=deepcopy(vv)
|
|
1024
|
+
vv1 = deepcopy(vv)
|
|
686
1025
|
vv1.pop(vv1.index(ii))
|
|
687
1026
|
if ii not in cGraph:
|
|
688
1027
|
cGraph[ii] = vv1
|
|
@@ -690,21 +1029,21 @@ def MergeCL(Cls):
|
|
|
690
1029
|
cGraph[ii] += list(set(vv1 + cGraph[ii]))
|
|
691
1030
|
DupKeys = list(cGraph.keys())
|
|
692
1031
|
for kk in vDict:
|
|
693
|
-
vv=vDict[kk]
|
|
694
|
-
if len(vv)==1:
|
|
1032
|
+
vv = vDict[kk]
|
|
1033
|
+
if len(vv) == 1:
|
|
695
1034
|
if vv[0] in DupKeys:
|
|
696
1035
|
continue
|
|
697
1036
|
cc = Cls[vv[0]]
|
|
698
1037
|
if cc not in Cls_new:
|
|
699
1038
|
Cls_new.append(cc)
|
|
700
|
-
Cls_Dup=[]
|
|
1039
|
+
Cls_Dup = []
|
|
701
1040
|
for kk in cGraph:
|
|
702
1041
|
cc = dfs(cGraph, kk)
|
|
703
1042
|
cc = list(cc)
|
|
704
1043
|
cc = sorted(cc)
|
|
705
1044
|
if cc not in Cls_Dup:
|
|
706
1045
|
Cls_Dup.append(cc)
|
|
707
|
-
if len(Cls_Dup)>0:
|
|
1046
|
+
if len(Cls_Dup) > 0:
|
|
708
1047
|
for cdup in Cls_Dup:
|
|
709
1048
|
cc_merged = []
|
|
710
1049
|
for ii in cdup:
|
|
@@ -715,355 +1054,411 @@ def MergeCL(Cls):
|
|
|
715
1054
|
Cls_new.append(cc_merged)
|
|
716
1055
|
return Cls_new
|
|
717
1056
|
|
|
718
|
-
|
|
1057
|
+
|
|
1058
|
+
def EncodeRepertoire(
|
|
1059
|
+
inputfile,
|
|
1060
|
+
outdir,
|
|
1061
|
+
outfile="",
|
|
1062
|
+
exact=True,
|
|
1063
|
+
ST=3,
|
|
1064
|
+
thr_v=3.7,
|
|
1065
|
+
thr_s=3.5,
|
|
1066
|
+
VDict={},
|
|
1067
|
+
Vgene=True,
|
|
1068
|
+
thr_iso=10,
|
|
1069
|
+
gap=-6,
|
|
1070
|
+
GPU=False,
|
|
1071
|
+
Mat=False,
|
|
1072
|
+
verbose=False,
|
|
1073
|
+
):
|
|
719
1074
|
## No V gene version
|
|
720
1075
|
## Encode CDR3 sequences into 96 dimensional space and perform k-means clustering
|
|
721
1076
|
## If exact is True, SW alignment will be performed within each cluster after isometric encoding and clustering
|
|
722
|
-
h=open(inputfile)
|
|
723
|
-
t1=time.time()
|
|
724
|
-
alines=h.readlines()
|
|
725
|
-
ww=alines[0].strip().split(
|
|
726
|
-
if not ww[0].startswith(
|
|
1077
|
+
h = open(inputfile)
|
|
1078
|
+
t1 = time.time()
|
|
1079
|
+
alines = h.readlines()
|
|
1080
|
+
ww = alines[0].strip().split("\t")
|
|
1081
|
+
if not ww[0].startswith("C"):
|
|
727
1082
|
## header line
|
|
728
|
-
hline=alines[0]
|
|
729
|
-
alines=alines[1:]
|
|
730
|
-
elif
|
|
731
|
-
hline=alines[0]
|
|
732
|
-
alines=alines[1:]
|
|
1083
|
+
hline = alines[0]
|
|
1084
|
+
alines = alines[1:]
|
|
1085
|
+
elif "CDR3" in ww[0]:
|
|
1086
|
+
hline = alines[0]
|
|
1087
|
+
alines = alines[1:]
|
|
733
1088
|
else:
|
|
734
|
-
hline=
|
|
735
|
-
seqs=[]
|
|
736
|
-
vgs=[]
|
|
737
|
-
infoList=[]
|
|
738
|
-
count=0
|
|
1089
|
+
hline = "CDR3\t" + "\t".join(["Info" + str(x) for x in range(len(ww) - 1)])
|
|
1090
|
+
seqs = []
|
|
1091
|
+
vgs = []
|
|
1092
|
+
infoList = []
|
|
1093
|
+
count = 0
|
|
739
1094
|
if verbose:
|
|
740
|
-
print(
|
|
1095
|
+
print("Creating CDR3 list")
|
|
741
1096
|
for ll in alines:
|
|
742
|
-
ww=ll.strip().split(
|
|
743
|
-
cdr3=ww[0]
|
|
744
|
-
if
|
|
1097
|
+
ww = ll.strip().split("\t")
|
|
1098
|
+
cdr3 = ww[0]
|
|
1099
|
+
if "*" in cdr3:
|
|
745
1100
|
continue
|
|
746
|
-
if
|
|
1101
|
+
if "_" in cdr3:
|
|
747
1102
|
continue
|
|
748
1103
|
seqs.append(ww[0])
|
|
749
1104
|
if Vgene:
|
|
750
1105
|
vgs.append(ww[1])
|
|
751
|
-
infoList.append(
|
|
1106
|
+
infoList.append("\t".join(ww[1:]))
|
|
752
1107
|
else:
|
|
753
|
-
infoList.append(
|
|
754
|
-
count+=1
|
|
755
|
-
if len(outfile)==0:
|
|
756
|
-
outfile=inputfile.split(
|
|
757
|
-
outfile=outfile[len(outfile)-1]
|
|
758
|
-
outfile=
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
1108
|
+
infoList.append("\t".join(ww[1:]))
|
|
1109
|
+
count += 1
|
|
1110
|
+
if len(outfile) == 0:
|
|
1111
|
+
outfile = inputfile.split("/")
|
|
1112
|
+
outfile = outfile[len(outfile) - 1]
|
|
1113
|
+
outfile = (
|
|
1114
|
+
outdir
|
|
1115
|
+
+ "/"
|
|
1116
|
+
+ re.sub("\\.[txcsv]+", "", outfile)
|
|
1117
|
+
+ "-"
|
|
1118
|
+
+ "-RotationEncodingBL62.txt"
|
|
1119
|
+
)
|
|
1120
|
+
g = open(outfile, "w")
|
|
1121
|
+
tm = strftime("%Y-%m-%d %H:%M:%S", gmtime())
|
|
1122
|
+
InfoLine = (
|
|
1123
|
+
"##TIME:"
|
|
1124
|
+
+ tm
|
|
1125
|
+
+ "|cmd: "
|
|
1126
|
+
+ sys.argv[0]
|
|
1127
|
+
+ "|"
|
|
1128
|
+
+ inputfile
|
|
1129
|
+
+ "|IsometricDistance_Thr="
|
|
1130
|
+
+ str(thr_iso)
|
|
1131
|
+
+ "|thr_v="
|
|
1132
|
+
+ str(thr_v)
|
|
1133
|
+
+ "|thr_s="
|
|
1134
|
+
+ str(thr_s)
|
|
1135
|
+
+ "|exact="
|
|
1136
|
+
+ str(exact)
|
|
1137
|
+
+ "|Vgene="
|
|
1138
|
+
+ str(Vgene)
|
|
1139
|
+
+ "|ST="
|
|
1140
|
+
+ str(ST)
|
|
1141
|
+
)
|
|
1142
|
+
g.write(InfoLine + "\n")
|
|
1143
|
+
g.write(
|
|
1144
|
+
"##Column Info: CDR3 aa sequence, cluster id, other information in the input file\n"
|
|
1145
|
+
)
|
|
1146
|
+
gr = 0
|
|
765
1147
|
## Split into different lengths
|
|
766
|
-
LD,VD, ID,SD= BuildLengthDict(
|
|
1148
|
+
LD, VD, ID, SD = BuildLengthDict(
|
|
1149
|
+
seqs, vGene=vgs, INFO=infoList, sIDs=[x for x in range(len(seqs))]
|
|
1150
|
+
)
|
|
767
1151
|
LDu, VDu, IDu, SDu = CollapseUnique(LD, VD, ID, SD)
|
|
768
1152
|
if Mat:
|
|
769
|
-
Mfile=outfile+
|
|
770
|
-
h=open(Mfile,
|
|
1153
|
+
Mfile = outfile + "_EncodingMatrix.txt"
|
|
1154
|
+
h = open(Mfile, "w")
|
|
771
1155
|
for kk in LDu:
|
|
772
1156
|
if verbose:
|
|
773
|
-
print("---Process CDR3s with length %d ---" %(kk))
|
|
774
|
-
vSD=LDu[kk]
|
|
775
|
-
vSD0=[x for x in range(len(vSD))]
|
|
776
|
-
vss=SDu[kk]
|
|
777
|
-
vInfo=IDu[kk]
|
|
778
|
-
flagL=[len(x)-1 for x in vInfo]
|
|
1157
|
+
print("---Process CDR3s with length %d ---" % (kk))
|
|
1158
|
+
vSD = LDu[kk]
|
|
1159
|
+
vSD0 = [x for x in range(len(vSD))]
|
|
1160
|
+
vss = SDu[kk]
|
|
1161
|
+
vInfo = IDu[kk]
|
|
1162
|
+
flagL = [len(x) - 1 for x in vInfo]
|
|
779
1163
|
if verbose:
|
|
780
|
-
print(
|
|
781
|
-
dM=np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
|
|
782
|
-
dM=dM.astype("float32")
|
|
1164
|
+
print(" Performing CDR3 encoding")
|
|
1165
|
+
dM = np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
|
|
1166
|
+
dM = dM.astype("float32")
|
|
783
1167
|
if verbose:
|
|
784
|
-
print(" The number of sequences is %d" %(dM.shape[0]))
|
|
1168
|
+
print(" The number of sequences is %d" % (dM.shape[0]))
|
|
785
1169
|
if Mat:
|
|
786
1170
|
for ii in range(len(vss)):
|
|
787
|
-
line=vss[ii]+
|
|
788
|
-
NUMs=[str(xx) for xx in dM[ii
|
|
789
|
-
line +=
|
|
1171
|
+
line = vss[ii] + "\t" + vInfo[ii][0] + "\t"
|
|
1172
|
+
NUMs = [str(xx) for xx in dM[ii, :]]
|
|
1173
|
+
line += "\t".join(NUMs) + "\n"
|
|
790
1174
|
h.write(line)
|
|
791
|
-
sID=[x for x in range(dM.shape[0])]
|
|
792
|
-
t2=time.time()
|
|
1175
|
+
sID = [x for x in range(dM.shape[0])]
|
|
1176
|
+
t2 = time.time()
|
|
793
1177
|
if verbose:
|
|
794
|
-
print(
|
|
795
|
-
Cls = ClusterCDR3(
|
|
1178
|
+
print(" Done! Total time elapsed %f" % (t2 - t1))
|
|
1179
|
+
Cls = ClusterCDR3(
|
|
1180
|
+
dM, flagL, thr=thr_iso - 0.5 * (15 - kk), verbose=verbose
|
|
1181
|
+
) ## change cutoff with different lengths
|
|
796
1182
|
Cls = MergeCL(Cls)
|
|
797
1183
|
if verbose:
|
|
798
1184
|
print(" Handling identical CDR3 groups")
|
|
799
|
-
Cls_u=[]
|
|
1185
|
+
Cls_u = []
|
|
800
1186
|
for ii in range(len(Cls)):
|
|
801
|
-
cc=Cls[ii]
|
|
1187
|
+
cc = Cls[ii]
|
|
802
1188
|
if len(cc) == 1:
|
|
803
1189
|
## Handle identical CDR3 groups first
|
|
804
|
-
if flagL[cc[0]]>0:
|
|
1190
|
+
if flagL[cc[0]] > 0:
|
|
805
1191
|
gr += 1
|
|
806
|
-
jj=cc[0]
|
|
1192
|
+
jj = cc[0]
|
|
807
1193
|
for v_info in vInfo[jj]:
|
|
808
|
-
line=vss[jj]+
|
|
809
|
-
_=g.write(line)
|
|
1194
|
+
line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
|
|
1195
|
+
_ = g.write(line)
|
|
810
1196
|
else:
|
|
811
1197
|
Cls_u.append(cc)
|
|
812
|
-
Cls=Cls_u
|
|
813
|
-
t2=time.time()
|
|
1198
|
+
Cls = Cls_u
|
|
1199
|
+
t2 = time.time()
|
|
814
1200
|
if verbose:
|
|
815
|
-
print(
|
|
1201
|
+
print(" Done! Total time elapsed %f" % (t2 - t1))
|
|
816
1202
|
if Vgene:
|
|
817
|
-
vVgene=VDu[kk]
|
|
1203
|
+
vVgene = VDu[kk]
|
|
818
1204
|
if verbose:
|
|
819
|
-
print(
|
|
820
|
-
Cls_v=[]
|
|
1205
|
+
print(" Matching variable genes")
|
|
1206
|
+
Cls_v = []
|
|
821
1207
|
for cc in Cls:
|
|
822
|
-
Nc=len(cc)
|
|
823
|
-
sMat={}
|
|
1208
|
+
Nc = len(cc)
|
|
1209
|
+
sMat = {}
|
|
824
1210
|
for ii in range(Nc):
|
|
825
|
-
v1=vVgene[cc[ii]]
|
|
826
|
-
for jj in range(ii,Nc):
|
|
827
|
-
if jj==ii:
|
|
1211
|
+
v1 = vVgene[cc[ii]]
|
|
1212
|
+
for jj in range(ii, Nc):
|
|
1213
|
+
if jj == ii:
|
|
828
1214
|
continue
|
|
829
|
-
v2=vVgene[cc[jj]]
|
|
1215
|
+
v2 = vVgene[cc[jj]]
|
|
830
1216
|
if (v1, v2) not in VDict:
|
|
831
1217
|
if v1 == v2:
|
|
832
1218
|
if ii not in sMat:
|
|
833
|
-
sMat[ii]=[jj]
|
|
1219
|
+
sMat[ii] = [jj]
|
|
834
1220
|
else:
|
|
835
1221
|
sMat[ii].append(jj)
|
|
836
1222
|
if jj not in sMat:
|
|
837
|
-
sMat[jj]=[ii]
|
|
1223
|
+
sMat[jj] = [ii]
|
|
838
1224
|
else:
|
|
839
1225
|
sMat[jj].append(ii)
|
|
840
1226
|
continue
|
|
841
|
-
if VDict[(v1,v2)] >= thr_v:
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
vCL=IdentifyMotifCluster(sMat)
|
|
851
|
-
vCL_List=list(chain(*vCL))
|
|
1227
|
+
if VDict[(v1, v2)] >= thr_v:
|
|
1228
|
+
if ii not in sMat:
|
|
1229
|
+
sMat[ii] = [jj]
|
|
1230
|
+
else:
|
|
1231
|
+
sMat[ii].append(jj)
|
|
1232
|
+
if jj not in sMat:
|
|
1233
|
+
sMat[jj] = [ii]
|
|
1234
|
+
else:
|
|
1235
|
+
sMat[jj].append(ii)
|
|
1236
|
+
vCL = IdentifyMotifCluster(sMat)
|
|
1237
|
+
vCL_List = list(chain(*vCL))
|
|
852
1238
|
for ii in range(Nc):
|
|
853
|
-
uu=flagL[cc[ii]]
|
|
854
|
-
if uu>0 and ii not in vCL_List:
|
|
1239
|
+
uu = flagL[cc[ii]]
|
|
1240
|
+
if uu > 0 and ii not in vCL_List:
|
|
855
1241
|
vCL.append([ii])
|
|
856
1242
|
for vcc in vCL:
|
|
857
1243
|
Cls_v.append(list(np.array(cc)[np.array(vcc)]))
|
|
858
|
-
Cls=[]
|
|
1244
|
+
Cls = []
|
|
859
1245
|
for ii in range(len(Cls_v)):
|
|
860
|
-
cc=Cls_v[ii]
|
|
1246
|
+
cc = Cls_v[ii]
|
|
861
1247
|
if len(cc) == 1:
|
|
862
1248
|
## Handle identical CDR3 groups first
|
|
863
1249
|
gr += 1
|
|
864
|
-
jj=cc[0]
|
|
1250
|
+
jj = cc[0]
|
|
865
1251
|
for v_info in vInfo[jj]:
|
|
866
|
-
line=vss[jj]+
|
|
867
|
-
_=g.write(line)
|
|
1252
|
+
line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
|
|
1253
|
+
_ = g.write(line)
|
|
868
1254
|
else:
|
|
869
1255
|
Cls.append(cc)
|
|
870
1256
|
if exact:
|
|
871
1257
|
if verbose:
|
|
872
|
-
print(
|
|
873
|
-
Cls_s=[]
|
|
1258
|
+
print(" Performing Smith-Waterman alignment")
|
|
1259
|
+
Cls_s = []
|
|
874
1260
|
for cc in Cls:
|
|
875
|
-
Nc=len(cc)
|
|
876
|
-
if len(cc)<=3:
|
|
877
|
-
sMat=np.zeros((Nc,Nc))
|
|
1261
|
+
Nc = len(cc)
|
|
1262
|
+
if len(cc) <= 3:
|
|
1263
|
+
sMat = np.zeros((Nc, Nc))
|
|
878
1264
|
for ii in range(Nc):
|
|
879
|
-
s1=vss[cc[ii]]
|
|
880
|
-
for jj in range(ii,Nc):
|
|
881
|
-
if jj==ii:
|
|
1265
|
+
s1 = vss[cc[ii]]
|
|
1266
|
+
for jj in range(ii, Nc):
|
|
1267
|
+
if jj == ii:
|
|
882
1268
|
continue
|
|
883
|
-
s2=vss[cc[jj]]
|
|
1269
|
+
s2 = vss[cc[jj]]
|
|
884
1270
|
if len(s1) != len(s2):
|
|
885
1271
|
continue
|
|
886
|
-
if len(s1)<=5:
|
|
1272
|
+
if len(s1) <= 5:
|
|
887
1273
|
continue
|
|
888
|
-
sw=SeqComparison(s1[ST:-2],s2[ST:-2],gap=gap)
|
|
889
|
-
sw=sw/(len(s1)-ST-2)
|
|
890
|
-
sMat[ii,jj]=sw
|
|
891
|
-
sMat[jj,ii]=sw
|
|
892
|
-
s_max=[]
|
|
1274
|
+
sw = SeqComparison(s1[ST:-2], s2[ST:-2], gap=gap)
|
|
1275
|
+
sw = sw / (len(s1) - ST - 2)
|
|
1276
|
+
sMat[ii, jj] = sw
|
|
1277
|
+
sMat[jj, ii] = sw
|
|
1278
|
+
s_max = []
|
|
893
1279
|
for ii in range(Nc):
|
|
894
|
-
s_max.append(np.max(sMat[:,ii]))
|
|
895
|
-
cc_new=[]
|
|
1280
|
+
s_max.append(np.max(sMat[:, ii]))
|
|
1281
|
+
cc_new = []
|
|
896
1282
|
for ii in range(Nc):
|
|
897
|
-
if s_max[ii]>=thr_s:
|
|
1283
|
+
if s_max[ii] >= thr_s:
|
|
898
1284
|
cc_new.append(cc[ii])
|
|
899
|
-
if len(cc_new)>1:
|
|
1285
|
+
if len(cc_new) > 1:
|
|
900
1286
|
Cls_s.append(cc_new)
|
|
901
1287
|
else:
|
|
902
1288
|
for ii in range(Nc):
|
|
903
|
-
uu=flagL[cc[ii]]
|
|
904
|
-
if uu>0:
|
|
1289
|
+
uu = flagL[cc[ii]]
|
|
1290
|
+
if uu > 0:
|
|
905
1291
|
Cls_s.append([cc[ii]])
|
|
906
|
-
# print(Cls_s)
|
|
907
|
-
Cls_sList=list(chain(*Cls_s))
|
|
1292
|
+
# print(Cls_s)
|
|
1293
|
+
Cls_sList = list(chain(*Cls_s))
|
|
908
1294
|
for ii in range(len(cc)):
|
|
909
|
-
uu=flagL[cc[ii]]
|
|
910
|
-
if uu>0 and cc[ii] not in Cls_sList:
|
|
1295
|
+
uu = flagL[cc[ii]]
|
|
1296
|
+
if uu > 0 and cc[ii] not in Cls_sList:
|
|
911
1297
|
Cls_s.append([cc[ii]])
|
|
912
1298
|
else:
|
|
913
|
-
CDR3s=[vss[x] for x in cc]
|
|
914
|
-
sIDs=np.array([vSD0[x] for x in cc])
|
|
915
|
-
sIDs0=[x for x in range(len(cc))]
|
|
916
|
-
Kset=KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
|
|
917
|
-
SSG=generateSSG(Kset, CDR3s, k_thr=1)
|
|
918
|
-
tmpVgenes=[
|
|
919
|
-
SSGnew=UpdateSSG(
|
|
920
|
-
|
|
921
|
-
|
|
1299
|
+
CDR3s = [vss[x] for x in cc]
|
|
1300
|
+
sIDs = np.array([vSD0[x] for x in cc])
|
|
1301
|
+
sIDs0 = [x for x in range(len(cc))]
|
|
1302
|
+
Kset = KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
|
|
1303
|
+
SSG = generateSSG(Kset, CDR3s, k_thr=1)
|
|
1304
|
+
tmpVgenes = ["TRBV2"] * len(CDR3s)
|
|
1305
|
+
SSGnew = UpdateSSG(
|
|
1306
|
+
SSG, CDR3s, tmpVgenes, Vscore=VDict, cutoff=thr_s + 4
|
|
1307
|
+
)
|
|
1308
|
+
CLall = IdentifyMotifCluster(SSGnew)
|
|
1309
|
+
CLall_list = list(chain(*CLall))
|
|
922
1310
|
for ii in range(len(cc)):
|
|
923
|
-
uu=flagL[cc[ii]]
|
|
924
|
-
if uu>0 and ii not in CLall_list:
|
|
1311
|
+
uu = flagL[cc[ii]]
|
|
1312
|
+
if uu > 0 and ii not in CLall_list:
|
|
925
1313
|
CLall.append([ii])
|
|
926
1314
|
for cl in CLall:
|
|
927
|
-
ccs=list(sIDs[np.array(cl)])
|
|
1315
|
+
ccs = list(sIDs[np.array(cl)])
|
|
928
1316
|
Cls_s.append(ccs)
|
|
929
|
-
Cls=Cls_s
|
|
1317
|
+
Cls = Cls_s
|
|
930
1318
|
if verbose:
|
|
931
|
-
print(
|
|
1319
|
+
print(" Writing results into file")
|
|
932
1320
|
for ii in range(len(Cls)):
|
|
933
|
-
# if ii % 100000 == 0 and ii>0:
|
|
934
|
-
|
|
935
|
-
cc=Cls[ii]
|
|
936
|
-
gr+=1
|
|
1321
|
+
# if ii % 100000 == 0 and ii>0:
|
|
1322
|
+
# print(' %d sequences written' %(ii))
|
|
1323
|
+
cc = Cls[ii]
|
|
1324
|
+
gr += 1
|
|
937
1325
|
for jj in cc:
|
|
938
1326
|
for v_info in vInfo[jj]:
|
|
939
|
-
line=vss[jj]+
|
|
940
|
-
_=g.write(line)
|
|
1327
|
+
line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
|
|
1328
|
+
_ = g.write(line)
|
|
941
1329
|
g.close()
|
|
942
1330
|
if Mat:
|
|
943
1331
|
h.close()
|
|
944
1332
|
|
|
1333
|
+
|
|
945
1334
|
def OrderUnique(Ig):
|
|
946
|
-
vv=list(Ig.values())
|
|
947
|
-
kk=list(Ig.keys())
|
|
948
|
-
LL=[len(x[1]) for x in vv]
|
|
949
|
-
v0=[x[0][0] for x in vv]
|
|
950
|
-
v1=[x[0][1] for x in vv]
|
|
951
|
-
zkk=zip(kk,v0,v1,LL)
|
|
952
|
-
zkks=sorted(zkk,key=lambda x: (x[1],x[3]))
|
|
953
|
-
nk=len(zkks)
|
|
954
|
-
keep_id=[0]
|
|
955
|
-
ii=1
|
|
956
|
-
n_pre=str(zkks[0][1])+
|
|
957
|
-
while ii<nk:
|
|
958
|
-
n_cur=str(zkks[ii][1])+
|
|
959
|
-
if n_cur==n_pre:
|
|
960
|
-
ii+=1
|
|
1335
|
+
vv = list(Ig.values())
|
|
1336
|
+
kk = list(Ig.keys())
|
|
1337
|
+
LL = [len(x[1]) for x in vv]
|
|
1338
|
+
v0 = [x[0][0] for x in vv]
|
|
1339
|
+
v1 = [x[0][1] for x in vv]
|
|
1340
|
+
zkk = zip(kk, v0, v1, LL)
|
|
1341
|
+
zkks = sorted(zkk, key=lambda x: (x[1], x[3]))
|
|
1342
|
+
nk = len(zkks)
|
|
1343
|
+
keep_id = [0]
|
|
1344
|
+
ii = 1
|
|
1345
|
+
n_pre = str(zkks[0][1]) + "_" + str(zkks[0][2])
|
|
1346
|
+
while ii < nk:
|
|
1347
|
+
n_cur = str(zkks[ii][1]) + "_" + str(zkks[ii][2])
|
|
1348
|
+
if n_cur == n_pre:
|
|
1349
|
+
ii += 1
|
|
961
1350
|
continue
|
|
962
1351
|
else:
|
|
963
1352
|
keep_id.append(ii)
|
|
964
|
-
n_pre=n_cur
|
|
965
|
-
ii+=1
|
|
1353
|
+
n_pre = n_cur
|
|
1354
|
+
ii += 1
|
|
966
1355
|
continue
|
|
967
|
-
nid=[x[0] for x in zkks]
|
|
968
|
-
filtered_id=np.array(nid)[np.array(keep_id)]
|
|
969
|
-
Igs={}
|
|
1356
|
+
nid = [x[0] for x in zkks]
|
|
1357
|
+
filtered_id = np.array(nid)[np.array(keep_id)]
|
|
1358
|
+
Igs = {}
|
|
970
1359
|
for ii in filtered_id:
|
|
971
|
-
Igs[kk[ii]]=vv[ii]
|
|
1360
|
+
Igs[kk[ii]] = vv[ii]
|
|
972
1361
|
return Igs, filtered_id
|
|
973
1362
|
|
|
1363
|
+
|
|
974
1364
|
def ClusterCDR3(dM, flagL, thr=10, GPU=False, verbose=False):
|
|
975
1365
|
## flagL: flag vector for identical CDR3 groups, >0 for grouped non-identical CDR3s
|
|
976
|
-
Cls=[]
|
|
977
|
-
flag=0
|
|
978
|
-
dM1=dM
|
|
979
|
-
flagL=np.array(flagL)
|
|
1366
|
+
Cls = []
|
|
1367
|
+
flag = 0
|
|
1368
|
+
dM1 = dM
|
|
1369
|
+
flagL = np.array(flagL)
|
|
980
1370
|
if GPU:
|
|
981
1371
|
res = faiss.StandardGpuResources()
|
|
982
1372
|
while 1:
|
|
983
|
-
# print(" %d number of clusters, with %d sequences" %(len(Cls),dM1.shape[0]))
|
|
1373
|
+
# print(" %d number of clusters, with %d sequences" %(len(Cls),dM1.shape[0]))
|
|
984
1374
|
if verbose:
|
|
985
|
-
print(
|
|
986
|
-
index = faiss.IndexFlatL2(Ndim*6)
|
|
1375
|
+
print("=", end="")
|
|
1376
|
+
index = faiss.IndexFlatL2(Ndim * 6)
|
|
987
1377
|
if GPU:
|
|
988
1378
|
index = faiss.index_cpu_to_gpu(res, 0, index)
|
|
989
1379
|
index.add(dM1)
|
|
990
|
-
if flag==0:
|
|
1380
|
+
if flag == 0:
|
|
991
1381
|
D, I = index.search(dM1, 2)
|
|
992
|
-
vv=np.where((D[:,1]<=thr))[0]
|
|
993
|
-
vv0=np.where((D[:,1]>thr) & (flagL>0))[0]
|
|
1382
|
+
vv = np.where((D[:, 1] <= thr))[0]
|
|
1383
|
+
vv0 = np.where((D[:, 1] > thr) & (flagL > 0))[0]
|
|
994
1384
|
for v in vv0:
|
|
995
1385
|
Cls.append([v])
|
|
996
|
-
tmp_dM=np.zeros((len(vv),Ndim*6))
|
|
997
|
-
Ig_new={}
|
|
1386
|
+
tmp_dM = np.zeros((len(vv), Ndim * 6))
|
|
1387
|
+
Ig_new = {}
|
|
998
1388
|
for ii in range(len(vv)):
|
|
999
|
-
v=vv[ii]
|
|
1000
|
-
Idx=I[v,]
|
|
1389
|
+
v = vv[ii]
|
|
1390
|
+
Idx = I[v,]
|
|
1001
1391
|
if v not in Idx:
|
|
1002
|
-
Idx[0]=v
|
|
1003
|
-
Ig_new[ii]=(sorted(list(set(Idx))),sorted(list(set(Idx))))
|
|
1004
|
-
tmp_dM[ii,]=(dM1[Idx[0],]+dM1[Idx[1],])/2
|
|
1005
|
-
if len(Ig_new)==0:
|
|
1392
|
+
Idx[0] = v
|
|
1393
|
+
Ig_new[ii] = (sorted(list(set(Idx))), sorted(list(set(Idx))))
|
|
1394
|
+
tmp_dM[ii,] = (dM1[Idx[0],] + dM1[Idx[1],]) / 2
|
|
1395
|
+
if len(Ig_new) == 0:
|
|
1006
1396
|
if verbose:
|
|
1007
|
-
print(
|
|
1397
|
+
print("type 0 break")
|
|
1008
1398
|
break
|
|
1009
|
-
# print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
|
|
1010
|
-
Igs, fid=OrderUnique(Ig_new)
|
|
1011
|
-
tmp_dM=tmp_dM[fid,]
|
|
1012
|
-
Ig_new=Igs
|
|
1399
|
+
# print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
|
|
1400
|
+
Igs, fid = OrderUnique(Ig_new)
|
|
1401
|
+
tmp_dM = tmp_dM[fid,]
|
|
1402
|
+
Ig_new = Igs
|
|
1013
1403
|
else:
|
|
1014
|
-
D, I = index.search(dM1,2)
|
|
1015
|
-
vv=np.where(D[:,1]<=thr)[0]
|
|
1016
|
-
vv0=np.where(D[:,1]>thr)[0]
|
|
1404
|
+
D, I = index.search(dM1, 2)
|
|
1405
|
+
vv = np.where(D[:, 1] <= thr)[0]
|
|
1406
|
+
vv0 = np.where(D[:, 1] > thr)[0]
|
|
1017
1407
|
## move groups in vv0 to Cls
|
|
1018
|
-
kkg=list(Ig.keys())
|
|
1408
|
+
kkg = list(Ig.keys())
|
|
1019
1409
|
for v in vv0:
|
|
1020
|
-
ng=list(Ig[kkg[v]][1])
|
|
1021
|
-
|
|
1410
|
+
ng = list(Ig[kkg[v]][1])
|
|
1411
|
+
# if ng not in Cls:
|
|
1022
1412
|
Cls.append(ng)
|
|
1023
|
-
tmp_dM=np.zeros((len(vv),Ndim*6))
|
|
1024
|
-
Ig_new={}
|
|
1413
|
+
tmp_dM = np.zeros((len(vv), Ndim * 6))
|
|
1414
|
+
Ig_new = {}
|
|
1025
1415
|
for ii in range(len(vv)):
|
|
1026
|
-
v=vv[ii]
|
|
1027
|
-
idx1=I[v,0]
|
|
1028
|
-
idx2=I[v,1]
|
|
1416
|
+
v = vv[ii]
|
|
1417
|
+
idx1 = I[v, 0]
|
|
1418
|
+
idx2 = I[v, 1]
|
|
1029
1419
|
if v not in I[v,]:
|
|
1030
|
-
idx1=v
|
|
1031
|
-
# Ig_new[ii]=sorted(list(set(list(Ig[kkg[idx1]])+list(Ig[kkg[idx2]]))))
|
|
1032
|
-
Ig_new[ii]=(
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1420
|
+
idx1 = v
|
|
1421
|
+
# Ig_new[ii]=sorted(list(set(list(Ig[kkg[idx1]])+list(Ig[kkg[idx2]]))))
|
|
1422
|
+
Ig_new[ii] = (
|
|
1423
|
+
sorted(
|
|
1424
|
+
list(set([idx1, idx2]))
|
|
1425
|
+
), ## First entry records the relative index of a sequence clique
|
|
1426
|
+
sorted(list(set(list(Ig[kkg[idx1]][1]) + list(Ig[kkg[idx2]][1])))),
|
|
1427
|
+
) ## Second entry records the absolute index of a sequence
|
|
1428
|
+
tmp_dM[ii,] = (dM1[idx1,] + dM1[idx2,]) / 2
|
|
1429
|
+
if len(Ig_new) == 0:
|
|
1036
1430
|
if verbose:
|
|
1037
1431
|
print("\ntype I break")
|
|
1038
|
-
kkg=list(Ig.keys())
|
|
1432
|
+
kkg = list(Ig.keys())
|
|
1039
1433
|
for kk in kkg:
|
|
1040
|
-
ng=list(Ig[kk][1])
|
|
1434
|
+
ng = list(Ig[kk][1])
|
|
1041
1435
|
if ng not in Cls:
|
|
1042
1436
|
Cls.append(ng)
|
|
1043
1437
|
break
|
|
1044
|
-
# print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
|
|
1045
|
-
Igs, fid=OrderUnique(Ig_new)
|
|
1046
|
-
tmp_dM=tmp_dM[fid,]
|
|
1047
|
-
Ig_new=Igs
|
|
1048
|
-
if flag>0:
|
|
1438
|
+
# print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
|
|
1439
|
+
Igs, fid = OrderUnique(Ig_new)
|
|
1440
|
+
tmp_dM = tmp_dM[fid,]
|
|
1441
|
+
Ig_new = Igs
|
|
1442
|
+
if flag > 0:
|
|
1049
1443
|
if Ig == Ig_new:
|
|
1050
1444
|
if verbose:
|
|
1051
1445
|
print("\ntype II break")
|
|
1052
|
-
kkg=list(Ig.keys())
|
|
1446
|
+
kkg = list(Ig.keys())
|
|
1053
1447
|
for kk in kkg:
|
|
1054
|
-
ng=list(Ig[kk][1])
|
|
1448
|
+
ng = list(Ig[kk][1])
|
|
1055
1449
|
if ng in Cls:
|
|
1056
1450
|
continue
|
|
1057
1451
|
Cls.append(ng)
|
|
1058
1452
|
break
|
|
1059
|
-
Ig=Ig_new
|
|
1060
|
-
tmp_dM=tmp_dM.astype(
|
|
1061
|
-
dM1=tmp_dM
|
|
1062
|
-
flag+=1
|
|
1453
|
+
Ig = Ig_new
|
|
1454
|
+
tmp_dM = tmp_dM.astype("float32")
|
|
1455
|
+
dM1 = tmp_dM
|
|
1456
|
+
flag += 1
|
|
1063
1457
|
return Cls
|
|
1064
1458
|
|
|
1065
|
-
|
|
1066
|
-
|
|
1459
|
+
|
|
1460
|
+
def ClusterCDR3r(dM, flagL, thr=10, verbose=False):
|
|
1461
|
+
index = faiss.IndexFlatL2(Ndim * 6)
|
|
1067
1462
|
index.add(dM)
|
|
1068
1463
|
lims, D, I = index.range_search(dM, thr)
|
|
1069
1464
|
# with open('cdr3.npy', 'wb') as f:
|
|
@@ -1071,53 +1466,70 @@ def ClusterCDR3r(dM, flagL, thr = 10, verbose = False):
|
|
|
1071
1466
|
# np.save(f, D)
|
|
1072
1467
|
# np.save(f, I)
|
|
1073
1468
|
# np.save(f, dM)
|
|
1074
|
-
|
|
1469
|
+
|
|
1075
1470
|
# now clustering results
|
|
1076
1471
|
N = dM.shape[0]
|
|
1077
|
-
neighborSize = np.array(
|
|
1472
|
+
neighborSize = np.array(
|
|
1473
|
+
[lims[cur_idx_i + 1] - lims[cur_idx_i] for cur_idx_i in range(N)]
|
|
1474
|
+
)
|
|
1078
1475
|
# to_cluster = np.ones( (N,))
|
|
1079
1476
|
clusterNo = 0
|
|
1080
|
-
cluster = -
|
|
1477
|
+
cluster = -np.ones((N,), dtype=np.int32)
|
|
1081
1478
|
idx = np.where(cluster < 0)[0]
|
|
1082
1479
|
unclustered = [np.argmax(neighborSize[idx])]
|
|
1083
1480
|
depth = 0
|
|
1084
1481
|
while True:
|
|
1085
|
-
if len(unclustered) == 0:
|
|
1482
|
+
if len(unclustered) == 0:
|
|
1483
|
+
break
|
|
1086
1484
|
# cur_idx = unclustered[0] # first unclustered index
|
|
1087
1485
|
cur_idx = unclustered
|
|
1088
|
-
cluster[cur_idx] = clusterNo
|
|
1089
|
-
|
|
1090
|
-
neighbor = np.unique(
|
|
1486
|
+
cluster[cur_idx] = clusterNo # assign cluster
|
|
1487
|
+
|
|
1488
|
+
neighbor = np.unique(
|
|
1489
|
+
np.array(
|
|
1490
|
+
list(
|
|
1491
|
+
chain(
|
|
1492
|
+
*[
|
|
1493
|
+
I[(lims[cur_idx_i]) : lims[cur_idx_i + 1]]
|
|
1494
|
+
for cur_idx_i in cur_idx
|
|
1495
|
+
]
|
|
1496
|
+
)
|
|
1497
|
+
)
|
|
1498
|
+
)
|
|
1499
|
+
)
|
|
1091
1500
|
# find those unclusterred
|
|
1092
1501
|
idx = np.where(cluster[neighbor] < 0)[0]
|
|
1093
1502
|
if len(idx) == 0:
|
|
1094
1503
|
depth = 0
|
|
1095
1504
|
clusterNo += 1
|
|
1096
1505
|
idx = np.where(cluster < 0)[0]
|
|
1097
|
-
if len(idx) == 0:
|
|
1506
|
+
if len(idx) == 0:
|
|
1507
|
+
break
|
|
1098
1508
|
unclustered = [idx[np.argmax(neighborSize[idx])]]
|
|
1099
|
-
|
|
1509
|
+
|
|
1100
1510
|
else:
|
|
1101
1511
|
if depth > 3:
|
|
1102
1512
|
depth = 0
|
|
1103
1513
|
clusterNo += 1
|
|
1104
1514
|
unclustered = neighbor[idx]
|
|
1105
1515
|
depth += 1
|
|
1106
|
-
# print('clusterNo = ', clusterNo)
|
|
1107
|
-
Cls = [
|
|
1516
|
+
# print('clusterNo = ', clusterNo)
|
|
1517
|
+
Cls = [[] for i in range(clusterNo)]
|
|
1108
1518
|
for idx, i in enumerate(cluster):
|
|
1109
|
-
|
|
1110
|
-
# print("Cls[:5] = ", Cls[:5])
|
|
1111
|
-
# print("len(Cls) = ", len(Cls),
|
|
1112
|
-
# ', #elem=', sum([len(i) for i in Cls]),
|
|
1113
|
-
# ', #single=', sum([len(i) for i in Cls if len(i) == 1]),
|
|
1114
|
-
# ', #non_single=', sum([len(i) for i in Cls if len(i) != 1]),
|
|
1115
|
-
# ', #max=', max([len(i) for i in Cls]))
|
|
1519
|
+
Cls[i].append(idx)
|
|
1520
|
+
# print("Cls[:5] = ", Cls[:5])
|
|
1521
|
+
# print("len(Cls) = ", len(Cls),
|
|
1522
|
+
# ', #elem=', sum([len(i) for i in Cls]),
|
|
1523
|
+
# ', #single=', sum([len(i) for i in Cls if len(i) == 1]),
|
|
1524
|
+
# ', #non_single=', sum([len(i) for i in Cls if len(i) != 1]),
|
|
1525
|
+
# ', #max=', max([len(i) for i in Cls]))
|
|
1116
1526
|
return Cls
|
|
1117
1527
|
|
|
1528
|
+
|
|
1118
1529
|
def CommandLineParser():
|
|
1119
|
-
parser=OptionParser()
|
|
1120
|
-
print
|
|
1530
|
+
parser = OptionParser()
|
|
1531
|
+
print(
|
|
1532
|
+
"""
|
|
1121
1533
|
GIANA: Geometric Isometry based ANtigen-specific tcr Alignment
|
|
1122
1534
|
Ultrafast short peptide alignment exclusively designed for large-scale adaptome analysis
|
|
1123
1535
|
|
|
@@ -1130,129 +1542,276 @@ Input columns:
|
|
|
1130
1542
|
|
|
1131
1543
|
!!! ALL amino acid letters must be CAPITAL !!!
|
|
1132
1544
|
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
parser.add_option(
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
parser.add_option(
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
parser.add_option(
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1545
|
+
"""
|
|
1546
|
+
)
|
|
1547
|
+
parser.add_option(
|
|
1548
|
+
"-d",
|
|
1549
|
+
"--directory",
|
|
1550
|
+
dest="Directory",
|
|
1551
|
+
help="Input repertoire sequencing file directory. Please make sure that all the files in the directory are input files.",
|
|
1552
|
+
default="",
|
|
1553
|
+
)
|
|
1554
|
+
parser.add_option(
|
|
1555
|
+
"-f",
|
|
1556
|
+
"--file",
|
|
1557
|
+
dest="File",
|
|
1558
|
+
default="",
|
|
1559
|
+
help="Input single file of CDR3 sequences for grouping",
|
|
1560
|
+
)
|
|
1561
|
+
parser.add_option(
|
|
1562
|
+
"-F",
|
|
1563
|
+
"--fileList",
|
|
1564
|
+
dest="files",
|
|
1565
|
+
default="",
|
|
1566
|
+
help="Alternative input: a file containing the full path to all the files. If given, overwrite -d and -f option",
|
|
1567
|
+
)
|
|
1568
|
+
parser.add_option(
|
|
1569
|
+
"-t",
|
|
1570
|
+
"--threshold",
|
|
1571
|
+
dest="thr",
|
|
1572
|
+
default=7,
|
|
1573
|
+
help="Isometric distance threshold for calling similar CDR3 groups. Without -E, smaller value will increase speed. With -E, smaller value will increase specificity. Must be smaller than 12.",
|
|
1574
|
+
)
|
|
1575
|
+
parser.add_option(
|
|
1576
|
+
"-S",
|
|
1577
|
+
"--threshold_score",
|
|
1578
|
+
dest="thr_s",
|
|
1579
|
+
default=3.5,
|
|
1580
|
+
help="Threshold for Smith-Waterman alignment score (normalized by CDR3 length). Default 3.5",
|
|
1581
|
+
)
|
|
1582
|
+
parser.add_option(
|
|
1583
|
+
"-G",
|
|
1584
|
+
"--threshold_vgene",
|
|
1585
|
+
dest="thr_v",
|
|
1586
|
+
default=3.7,
|
|
1587
|
+
help="Threshold for variable gene comparison. Default 3.7.",
|
|
1588
|
+
)
|
|
1589
|
+
parser.add_option(
|
|
1590
|
+
"-o",
|
|
1591
|
+
"--output",
|
|
1592
|
+
dest="OutDir",
|
|
1593
|
+
default="./",
|
|
1594
|
+
help="Output directory for intermediate and final outputs.",
|
|
1595
|
+
)
|
|
1596
|
+
parser.add_option(
|
|
1597
|
+
"-O",
|
|
1598
|
+
"--outfile",
|
|
1599
|
+
dest="OutFile",
|
|
1600
|
+
default="",
|
|
1601
|
+
help="Output file name. If not given, a file with --RotationEncoding will be added to the input file as the output file name.",
|
|
1602
|
+
)
|
|
1603
|
+
parser.add_option(
|
|
1604
|
+
"-T",
|
|
1605
|
+
"--startPosition",
|
|
1606
|
+
dest="ST",
|
|
1607
|
+
default=3,
|
|
1608
|
+
help="Starting position of CDR3 sequence. The first ST letters are omitted. CDR3 sequence length L must be >= ST+7 ",
|
|
1609
|
+
)
|
|
1610
|
+
parser.add_option(
|
|
1611
|
+
"-g",
|
|
1612
|
+
"--GapPenalty",
|
|
1613
|
+
dest="Gap",
|
|
1614
|
+
default=-6,
|
|
1615
|
+
help="Gap penalty,default= -6. Not used.",
|
|
1616
|
+
)
|
|
1617
|
+
parser.add_option(
|
|
1618
|
+
"-n",
|
|
1619
|
+
"--GapNumber",
|
|
1620
|
+
dest="GapN",
|
|
1621
|
+
default=1,
|
|
1622
|
+
help="Maximum number of gaps allowed when performing alignment. Max=1, default=1. Not used.",
|
|
1623
|
+
)
|
|
1624
|
+
parser.add_option(
|
|
1625
|
+
"-V",
|
|
1626
|
+
"--VariableGeneFa",
|
|
1627
|
+
dest="VFa",
|
|
1628
|
+
default="Imgt_Human_TRBV.fasta",
|
|
1629
|
+
help="IMGT Human beta variable gene sequences",
|
|
1630
|
+
)
|
|
1631
|
+
parser.add_option(
|
|
1632
|
+
"-v",
|
|
1633
|
+
"--VariableGene",
|
|
1634
|
+
dest="V",
|
|
1635
|
+
default=True,
|
|
1636
|
+
action="store_false",
|
|
1637
|
+
help="If False, GIANA will omit variable gene information and use CDR3 sequences only. This will yield reduced specificity. The cut-off will automatically become the current value-4.0",
|
|
1638
|
+
)
|
|
1639
|
+
parser.add_option(
|
|
1640
|
+
"-e",
|
|
1641
|
+
"--Exact",
|
|
1642
|
+
dest="E",
|
|
1643
|
+
default=True,
|
|
1644
|
+
action="store_false",
|
|
1645
|
+
help="If False, GIANA will not perform Smith-Waterman alignment after isometric encoding.",
|
|
1646
|
+
)
|
|
1647
|
+
parser.add_option(
|
|
1648
|
+
"-N",
|
|
1649
|
+
"--NumberOfThreads",
|
|
1650
|
+
dest="NN",
|
|
1651
|
+
default=1,
|
|
1652
|
+
help="Number of threads for multiple processing. Not working so well.",
|
|
1653
|
+
)
|
|
1654
|
+
parser.add_option(
|
|
1655
|
+
"-M",
|
|
1656
|
+
"--EncodingMatrix",
|
|
1657
|
+
dest="Mat",
|
|
1658
|
+
default=False,
|
|
1659
|
+
action="store_true",
|
|
1660
|
+
help="If true, GIANA will export the isometric encoding matrix for each TCR. Default: False.",
|
|
1661
|
+
)
|
|
1662
|
+
parser.add_option(
|
|
1663
|
+
"-U",
|
|
1664
|
+
"--UseGPU",
|
|
1665
|
+
dest="GPU",
|
|
1666
|
+
default=False,
|
|
1667
|
+
action="store_true",
|
|
1668
|
+
help="Use GPU for Faiss indexing. Must be CUDA GPUs.",
|
|
1669
|
+
)
|
|
1670
|
+
parser.add_option(
|
|
1671
|
+
"-q",
|
|
1672
|
+
"--queryFile",
|
|
1673
|
+
dest="Query",
|
|
1674
|
+
default="",
|
|
1675
|
+
help="Input query file, if given, GIANA will run in query mode, also need to provide -r option.",
|
|
1676
|
+
)
|
|
1677
|
+
parser.add_option(
|
|
1678
|
+
"-r",
|
|
1679
|
+
"--refFile",
|
|
1680
|
+
dest="ref",
|
|
1681
|
+
default="",
|
|
1682
|
+
help="Input reference file. Query model required.",
|
|
1683
|
+
)
|
|
1684
|
+
parser.add_option(
|
|
1685
|
+
"-b",
|
|
1686
|
+
"--Verbose",
|
|
1687
|
+
dest="v",
|
|
1688
|
+
default=False,
|
|
1689
|
+
action="store_true",
|
|
1690
|
+
help="Verbose option: if given, GIANA will print intermediate messages.",
|
|
1691
|
+
)
|
|
1154
1692
|
return parser.parse_args()
|
|
1155
1693
|
|
|
1694
|
+
|
|
1156
1695
|
def main():
|
|
1157
|
-
(opt,_)=CommandLineParser()
|
|
1158
|
-
cutoff=float(opt.thr)
|
|
1159
|
-
OutDir=opt.OutDir
|
|
1160
|
-
thr_s=float(opt.thr_s)
|
|
1696
|
+
(opt, _) = CommandLineParser()
|
|
1697
|
+
cutoff = float(opt.thr)
|
|
1698
|
+
OutDir = opt.OutDir
|
|
1699
|
+
thr_s = float(opt.thr_s)
|
|
1161
1700
|
## Check if query mode first
|
|
1162
|
-
qFile=opt.Query
|
|
1163
|
-
if len(qFile)>0:
|
|
1701
|
+
qFile = opt.Query
|
|
1702
|
+
if len(qFile) > 0:
|
|
1164
1703
|
## query mode
|
|
1165
|
-
t1=time.time()
|
|
1166
|
-
if qFile.endswith(
|
|
1704
|
+
t1 = time.time()
|
|
1705
|
+
if qFile.endswith("/"):
|
|
1167
1706
|
## input query is a directory
|
|
1168
|
-
qFs=os.listdir(qFile)
|
|
1169
|
-
qFileList=[]
|
|
1707
|
+
qFs = os.listdir(qFile)
|
|
1708
|
+
qFileList = []
|
|
1170
1709
|
for ff in qFs:
|
|
1171
|
-
qFileList.append(qFile+ff)
|
|
1710
|
+
qFileList.append(qFile + ff)
|
|
1172
1711
|
else:
|
|
1173
|
-
qFileList=[qFile]
|
|
1174
|
-
rFile=opt.ref
|
|
1175
|
-
if len(rFile)==0:
|
|
1176
|
-
raise("Must provide reference file in query mode!")
|
|
1712
|
+
qFileList = [qFile]
|
|
1713
|
+
rFile = opt.ref
|
|
1714
|
+
if len(rFile) == 0:
|
|
1715
|
+
raise ("Must provide reference file in query mode!")
|
|
1177
1716
|
else:
|
|
1178
1717
|
## check if reference cluster file exists
|
|
1179
|
-
rFile0=re.sub(
|
|
1180
|
-
refClusterFile=rFile0+
|
|
1718
|
+
rFile0 = re.sub("\\.txt", "", rFile)
|
|
1719
|
+
refClusterFile = rFile0 + "--RotationEncodingBL62.txt"
|
|
1181
1720
|
if not os.path.exists(refClusterFile):
|
|
1182
|
-
raise(
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1721
|
+
raise (
|
|
1722
|
+
"Must run clustering on reference file first! Did you forget to put the clustering file in this directory?"
|
|
1723
|
+
)
|
|
1724
|
+
rData = CreateReference(rFile)
|
|
1725
|
+
t2 = time.time()
|
|
1726
|
+
print("Reference created. Elapsed %f" % (t2 - t1))
|
|
1186
1727
|
for qf in qFileList:
|
|
1187
|
-
t2_0=time.time()
|
|
1188
|
-
print("Querying "+qf)
|
|
1189
|
-
qf_s=qf.split(
|
|
1190
|
-
outFile=re.sub(
|
|
1191
|
-
of=OutDir+
|
|
1728
|
+
t2_0 = time.time()
|
|
1729
|
+
print("Querying " + qf)
|
|
1730
|
+
qf_s = qf.split("/")[-1]
|
|
1731
|
+
outFile = re.sub("\\.txt", "", qf_s) + "_query_" + rFile0 + ".txt"
|
|
1732
|
+
of = OutDir + "/" + outFile
|
|
1192
1733
|
if path.exists(of):
|
|
1193
|
-
print(of+
|
|
1734
|
+
print(of + " already exits. Skipping.")
|
|
1194
1735
|
continue
|
|
1195
1736
|
MakeQuery(qf, rData, thr=cutoff, thr_s=thr_s)
|
|
1196
|
-
t2=time.time()
|
|
1197
|
-
print(" Build query clustering file. Elapsed %f" %(t2-t1))
|
|
1737
|
+
t2 = time.time()
|
|
1738
|
+
print(" Build query clustering file. Elapsed %f" % (t2 - t1))
|
|
1198
1739
|
print("Now mering with reference cluster")
|
|
1199
|
-
MergeExist(refClusterFile, OutDir+
|
|
1200
|
-
t2=time.time()
|
|
1201
|
-
print(" Time of elapsed for query %s: %f" %(qf, t2-t2_0))
|
|
1740
|
+
MergeExist(refClusterFile, OutDir + "/" + outFile)
|
|
1741
|
+
t2 = time.time()
|
|
1742
|
+
print(" Time of elapsed for query %s: %f" % (qf, t2 - t2_0))
|
|
1202
1743
|
else:
|
|
1203
1744
|
## regular clustering mode
|
|
1204
|
-
FileDir=opt.Directory
|
|
1205
|
-
if len(FileDir)>0:
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1745
|
+
FileDir = opt.Directory
|
|
1746
|
+
if len(FileDir) > 0:
|
|
1747
|
+
files = os.listdir(FileDir)
|
|
1748
|
+
files0 = []
|
|
1749
|
+
for ff in files:
|
|
1750
|
+
ff = FileDir + "/" + ff
|
|
1751
|
+
files0.append(ff)
|
|
1752
|
+
files = files0
|
|
1212
1753
|
else:
|
|
1213
|
-
|
|
1214
|
-
File=opt.File
|
|
1215
|
-
if len(File)>0:
|
|
1216
|
-
|
|
1217
|
-
FileList=opt.files
|
|
1218
|
-
if len(FileList)>0:
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
VFa=opt.VFa
|
|
1754
|
+
files = []
|
|
1755
|
+
File = opt.File
|
|
1756
|
+
if len(File) > 0:
|
|
1757
|
+
files = [File]
|
|
1758
|
+
FileList = opt.files
|
|
1759
|
+
if len(FileList) > 0:
|
|
1760
|
+
files = []
|
|
1761
|
+
fL = open(FileList)
|
|
1762
|
+
for ff in fL.readlines():
|
|
1763
|
+
files.append(ff.strip())
|
|
1764
|
+
VFa = opt.VFa
|
|
1224
1765
|
PreCalculateVgeneDist(VFa)
|
|
1225
|
-
vf=open(
|
|
1226
|
-
VScore={}
|
|
1227
|
-
VV=opt.V
|
|
1228
|
-
EE=opt.E
|
|
1229
|
-
Mat=opt.Mat
|
|
1230
|
-
ST=int(opt.ST)
|
|
1231
|
-
thr_v=float(opt.thr_v)
|
|
1232
|
-
verbose=opt.v
|
|
1766
|
+
vf = open("./VgeneScores.txt") ## Use tcrDist's Vgene 80-score calculation
|
|
1767
|
+
VScore = {}
|
|
1768
|
+
VV = opt.V
|
|
1769
|
+
EE = opt.E
|
|
1770
|
+
Mat = opt.Mat
|
|
1771
|
+
ST = int(opt.ST)
|
|
1772
|
+
thr_v = float(opt.thr_v)
|
|
1773
|
+
verbose = opt.v
|
|
1233
1774
|
if VV:
|
|
1234
1775
|
while 1:
|
|
1235
|
-
line=vf.readline()
|
|
1236
|
-
if len(line)==0:
|
|
1776
|
+
line = vf.readline()
|
|
1777
|
+
if len(line) == 0:
|
|
1237
1778
|
break
|
|
1238
|
-
ww=line.strip().split(
|
|
1239
|
-
VScore[(ww[0],ww[1])]=int(ww[2])/20
|
|
1240
|
-
VScore[(ww[1],ww[0])]=int(ww[2])/20
|
|
1241
|
-
Gap=int(opt.Gap)
|
|
1242
|
-
Gapn=int(opt.GapN)
|
|
1243
|
-
OutFile=opt.OutFile
|
|
1244
|
-
GPU=opt.GPU
|
|
1245
|
-
st=3
|
|
1246
|
-
ed=1
|
|
1247
|
-
NT=int(opt.NN)
|
|
1779
|
+
ww = line.strip().split("\t")
|
|
1780
|
+
VScore[(ww[0], ww[1])] = int(ww[2]) / 20
|
|
1781
|
+
VScore[(ww[1], ww[0])] = int(ww[2]) / 20
|
|
1782
|
+
Gap = int(opt.Gap)
|
|
1783
|
+
Gapn = int(opt.GapN)
|
|
1784
|
+
OutFile = opt.OutFile
|
|
1785
|
+
GPU = opt.GPU
|
|
1786
|
+
st = 3
|
|
1787
|
+
ed = 1
|
|
1788
|
+
NT = int(opt.NN)
|
|
1248
1789
|
faiss.omp_set_num_threads(NT)
|
|
1249
1790
|
for ff in files:
|
|
1250
|
-
print("Processing %s" %ff)
|
|
1251
|
-
EncodeRepertoire(
|
|
1252
|
-
|
|
1791
|
+
print("Processing %s" % ff)
|
|
1792
|
+
EncodeRepertoire(
|
|
1793
|
+
ff,
|
|
1794
|
+
OutDir,
|
|
1795
|
+
OutFile,
|
|
1796
|
+
ST=ST,
|
|
1797
|
+
thr_s=thr_s,
|
|
1798
|
+
thr_v=thr_v,
|
|
1799
|
+
exact=EE,
|
|
1800
|
+
VDict=VScore,
|
|
1801
|
+
Vgene=VV,
|
|
1802
|
+
thr_iso=cutoff,
|
|
1803
|
+
gap=Gap,
|
|
1804
|
+
GPU=GPU,
|
|
1805
|
+
Mat=Mat,
|
|
1806
|
+
verbose=verbose,
|
|
1807
|
+
)
|
|
1808
|
+
|
|
1809
|
+
|
|
1253
1810
|
if __name__ == "__main__":
|
|
1254
|
-
t0=time.time()
|
|
1811
|
+
t0 = time.time()
|
|
1255
1812
|
main()
|
|
1256
|
-
print
|
|
1257
|
-
print
|
|
1258
|
-
|
|
1813
|
+
print("Total time elapsed: %f" % (time.time() - t0))
|
|
1814
|
+
print(
|
|
1815
|
+
"Maximum memory usage: %f MB"
|
|
1816
|
+
% (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000000)
|
|
1817
|
+
)
|