PyPI - biopipen - Versions diffs - 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl - Mend

biopipen 0.21.0-py3-none-any.whl → 0.34.26-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (290) [hide / show]

biopipen/__init__.py +1 -1
biopipen/core/config.toml +28 -0
biopipen/core/filters.py +79 -4
biopipen/core/proc.py +12 -3
biopipen/core/testing.py +75 -3
biopipen/ns/bam.py +148 -6
biopipen/ns/bed.py +75 -0
biopipen/ns/cellranger.py +186 -0
biopipen/ns/cellranger_pipeline.py +126 -0
biopipen/ns/cnv.py +19 -3
biopipen/ns/cnvkit.py +1 -1
biopipen/ns/cnvkit_pipeline.py +20 -12
biopipen/ns/delim.py +34 -35
biopipen/ns/gene.py +68 -23
biopipen/ns/gsea.py +63 -37
biopipen/ns/misc.py +39 -14
biopipen/ns/plot.py +304 -1
biopipen/ns/protein.py +183 -0
biopipen/ns/regulatory.py +290 -0
biopipen/ns/rnaseq.py +142 -5
biopipen/ns/scrna.py +2053 -473
biopipen/ns/scrna_metabolic_landscape.py +228 -382
biopipen/ns/snp.py +659 -0
biopipen/ns/stats.py +484 -0
biopipen/ns/tcr.py +683 -98
biopipen/ns/vcf.py +236 -2
biopipen/ns/web.py +97 -6
biopipen/reports/bam/CNVpytor.svelte +4 -9
biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
biopipen/reports/common.svelte +15 -0
biopipen/reports/protein/ProdigySummary.svelte +16 -0
biopipen/reports/scrna/CellsDistribution.svelte +4 -39
biopipen/reports/scrna/DimPlots.svelte +1 -1
biopipen/reports/scrna/MarkersFinder.svelte +6 -126
biopipen/reports/scrna/MetaMarkers.svelte +3 -75
biopipen/reports/scrna/RadarPlots.svelte +4 -20
biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
biopipen/reports/snp/PlinkCallRate.svelte +24 -0
biopipen/reports/snp/PlinkFreq.svelte +18 -0
biopipen/reports/snp/PlinkHWE.svelte +18 -0
biopipen/reports/snp/PlinkHet.svelte +18 -0
biopipen/reports/snp/PlinkIBD.svelte +18 -0
biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
biopipen/reports/tcr/ClonalStats.svelte +16 -0
biopipen/reports/tcr/CloneResidency.svelte +3 -93
biopipen/reports/tcr/Immunarch.svelte +4 -155
biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
biopipen/reports/tcr/TESSA.svelte +11 -28
biopipen/reports/utils/misc.liq +22 -7
biopipen/scripts/bam/BamMerge.py +11 -15
biopipen/scripts/bam/BamSampling.py +90 -0
biopipen/scripts/bam/BamSort.py +141 -0
biopipen/scripts/bam/BamSplitChroms.py +10 -10
biopipen/scripts/bam/BamSubsetByBed.py +38 -0
biopipen/scripts/bam/CNAClinic.R +41 -5
biopipen/scripts/bam/CNVpytor.py +153 -54
biopipen/scripts/bam/ControlFREEC.py +13 -14
biopipen/scripts/bam/SamtoolsView.py +33 -0
biopipen/scripts/bed/Bed2Vcf.py +5 -5
biopipen/scripts/bed/BedConsensus.py +5 -5
biopipen/scripts/bed/BedLiftOver.sh +6 -4
biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
biopipen/scripts/bed/BedtoolsMerge.py +4 -4
biopipen/scripts/cellranger/CellRangerCount.py +138 -0
biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
biopipen/scripts/cnv/AneuploidyScore.R +55 -20
biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
biopipen/scripts/cnv/TMADScore.R +25 -9
biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
biopipen/scripts/cnvkit/guess_baits.py +166 -93
biopipen/scripts/delim/RowsBinder.R +1 -1
biopipen/scripts/delim/SampleInfo.R +116 -118
biopipen/scripts/gene/GeneNameConversion.R +67 -0
biopipen/scripts/gene/GenePromoters.R +61 -0
biopipen/scripts/gsea/Enrichr.R +5 -5
biopipen/scripts/gsea/FGSEA.R +184 -50
biopipen/scripts/gsea/GSEA.R +2 -2
biopipen/scripts/gsea/PreRank.R +5 -5
biopipen/scripts/misc/Config2File.py +2 -2
biopipen/scripts/misc/Plot.R +80 -0
biopipen/scripts/misc/Shell.sh +15 -0
biopipen/scripts/misc/Str2File.py +2 -2
biopipen/scripts/plot/Heatmap.R +3 -3
biopipen/scripts/plot/Manhattan.R +147 -0
biopipen/scripts/plot/QQPlot.R +146 -0
biopipen/scripts/plot/ROC.R +88 -0
biopipen/scripts/plot/Scatter.R +112 -0
biopipen/scripts/plot/VennDiagram.R +5 -9
biopipen/scripts/protein/MMCIF2PDB.py +33 -0
biopipen/scripts/protein/PDB2Fasta.py +60 -0
biopipen/scripts/protein/Prodigy.py +119 -0
biopipen/scripts/protein/ProdigySummary.R +140 -0
biopipen/scripts/protein/RMSD.py +178 -0
biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
biopipen/scripts/regulatory/MotifScan.py +159 -0
biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
biopipen/scripts/regulatory/motifs-common.R +324 -0
biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
biopipen/scripts/rnaseq/Simulation.R +21 -0
biopipen/scripts/rnaseq/UnitConversion.R +325 -54
biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
biopipen/scripts/scrna/CellCellCommunication.py +150 -0
biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
biopipen/scripts/scrna/CellSNPLite.py +30 -0
biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
biopipen/scripts/scrna/CellsDistribution.R +456 -167
biopipen/scripts/scrna/DimPlots.R +1 -1
biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
biopipen/scripts/scrna/ExprImputation.R +7 -0
biopipen/scripts/scrna/LoomTo10X.R +51 -0
biopipen/scripts/scrna/MQuad.py +25 -0
biopipen/scripts/scrna/MarkersFinder.R +679 -400
biopipen/scripts/scrna/MetaMarkers.R +265 -161
biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
biopipen/scripts/scrna/RadarPlots.R +355 -134
biopipen/scripts/scrna/ScFGSEA.R +298 -100
biopipen/scripts/scrna/ScSimulation.R +65 -0
biopipen/scripts/scrna/ScVelo.py +617 -0
biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
biopipen/scripts/scrna/SeuratClustering.R +36 -233
biopipen/scripts/scrna/SeuratLoading.R +2 -2
biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
biopipen/scripts/scrna/SeuratPreparing.R +223 -173
biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
biopipen/scripts/scrna/SeuratTo10X.R +27 -0
biopipen/scripts/scrna/Slingshot.R +65 -0
biopipen/scripts/scrna/Subset10X.R +2 -2
biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
biopipen/scripts/scrna/scvelo_paga.py +313 -0
biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
biopipen/scripts/snp/MatrixEQTL.R +217 -0
biopipen/scripts/snp/Plink2GTMat.py +148 -0
biopipen/scripts/snp/PlinkCallRate.R +199 -0
biopipen/scripts/snp/PlinkFilter.py +100 -0
biopipen/scripts/snp/PlinkFreq.R +291 -0
biopipen/scripts/snp/PlinkFromVcf.py +81 -0
biopipen/scripts/snp/PlinkHWE.R +85 -0
biopipen/scripts/snp/PlinkHet.R +96 -0
biopipen/scripts/snp/PlinkIBD.R +196 -0
biopipen/scripts/snp/PlinkSimulation.py +124 -0
biopipen/scripts/snp/PlinkUpdateName.py +124 -0
biopipen/scripts/stats/ChowTest.R +146 -0
biopipen/scripts/stats/DiffCoexpr.R +152 -0
biopipen/scripts/stats/LiquidAssoc.R +135 -0
biopipen/scripts/stats/Mediation.R +108 -0
biopipen/scripts/stats/MetaPvalue.R +130 -0
biopipen/scripts/stats/MetaPvalue1.R +74 -0
biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
biopipen/scripts/tcr/Attach2Seurat.R +3 -2
biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
biopipen/scripts/tcr/CDR3Clustering.R +343 -0
biopipen/scripts/tcr/ClonalStats.R +526 -0
biopipen/scripts/tcr/CloneResidency.R +255 -131
biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
biopipen/scripts/tcr/GIANA/query.py +164 -162
biopipen/scripts/tcr/Immunarch-basic.R +31 -9
biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
biopipen/scripts/tcr/Immunarch.R +63 -11
biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
biopipen/scripts/tcr/SampleDiversity.R +1 -1
biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
biopipen/scripts/tcr/ScRepLoading.R +166 -0
biopipen/scripts/tcr/TCRClusterStats.R +176 -22
biopipen/scripts/tcr/TCRDock.py +110 -0
biopipen/scripts/tcr/TESSA.R +102 -118
biopipen/scripts/tcr/VJUsage.R +5 -5
biopipen/scripts/tcr/immunarch-patched.R +142 -0
biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
biopipen/scripts/vcf/BcftoolsSort.py +113 -0
biopipen/scripts/vcf/BcftoolsView.py +73 -0
biopipen/scripts/vcf/TruvariBench.sh +14 -7
biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
biopipen/scripts/vcf/TruvariConsistency.R +1 -1
biopipen/scripts/vcf/Vcf2Bed.py +2 -2
biopipen/scripts/vcf/VcfAnno.py +11 -11
biopipen/scripts/vcf/VcfDownSample.sh +22 -10
biopipen/scripts/vcf/VcfFilter.py +5 -5
biopipen/scripts/vcf/VcfFix.py +7 -7
biopipen/scripts/vcf/VcfFix_utils.py +13 -4
biopipen/scripts/vcf/VcfIndex.py +3 -3
biopipen/scripts/vcf/VcfIntersect.py +3 -3
biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
biopipen/scripts/vcf/bcftools_utils.py +52 -0
biopipen/scripts/web/Download.py +8 -4
biopipen/scripts/web/DownloadList.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
biopipen/scripts/web/gcloud_common.py +49 -0
biopipen/utils/gene.py +108 -60
biopipen/utils/misc.py +146 -20
biopipen/utils/reference.py +64 -20
biopipen/utils/reporter.py +177 -0
biopipen/utils/vcf.py +1 -1
biopipen-0.34.26.dist-info/METADATA +27 -0
biopipen-0.34.26.dist-info/RECORD +292 -0
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
biopipen/ns/bcftools.py +0 -111
biopipen/ns/scrna_basic.py +0 -255
biopipen/reports/delim/SampleInfo.svelte +0 -36
biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
biopipen/reports/scrna/ScFGSEA.svelte +0 -35
biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
biopipen/reports/utils/gsea.liq +0 -110
biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
biopipen/scripts/gene/GeneNameConversion.py +0 -66
biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
biopipen/scripts/scrna/ExprImpution.R +0 -7
biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
biopipen/scripts/scrna/Write10X.R +0 -11
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
biopipen/scripts/tcr/TCRClustering.R +0 -280
biopipen/utils/common_docstrs.py +0 -61
biopipen/utils/gene.R +0 -49
biopipen/utils/gsea.R +0 -193
biopipen/utils/io.R +0 -20
biopipen/utils/misc.R +0 -114
biopipen/utils/mutate_helpers.R +0 -433
biopipen/utils/plot.R +0 -173
biopipen/utils/rnaseq.R +0 -48
biopipen/utils/single_cell.R +0 -115
biopipen-0.21.0.dist-info/METADATA +0 -22
biopipen-0.21.0.dist-info/RECORD +0 -218

biopipen/scripts/tcr/GIANA/GIANA.py (changed; viewed)

@@ -25,7 +25,6 @@ import sys, os, re, resource
 from os import path
 import numpy as np
 from copy import deepcopy
-from Bio.SubsMat.MatrixInfo import blosum62
 import time
 from time import gmtime, strftime
 from operator import itemgetter
@@ -38,254 +37,574 @@ from sklearn.manifold import MDS
 import faiss
 from query import *
-AAstring='ACDEFGHIKLMNPQRSTVWY'
-AAstringList=list(AAstring)
-cur_dir=os.path.dirname(os.path.realpath(__file__))+'/'
+AAstring = "ACDEFGHIKLMNPQRSTVWY"
+AAstringList = list(AAstring)
+cur_dir = os.path.dirname(os.path.realpath(__file__)) + "/"
-blosum62n={}
+blosum62n = {}
 for kk in blosum62:
-    a1=kk[0]
-    a2=kk[1]
-    vv=blosum62[kk]
-    if vv>4:
-        vv=4
-    blosum62n[(a1,a2)]=vv
+    a1 = kk[0]
+    a2 = kk[1]
+    vv = blosum62[kk]
+    if vv > 4:
+        vv = 4
+    blosum62n[(a1, a2)] = vv
     if a1 != a2:
-        blosum62n[(a2,a1)]=vv
-bl62={'A':[4,-1,-2,-2,0,-1,-1,0,-2,-1,-1,-1,-1,-2,-1,1,0,-3,-2,0],
-      'R':[-1,4,0,-2,-3,1,0,-2,0,-3,-2,2,-1,-3,-2,-1,-1,-3,-2,-3],
-      'N':[-2,0,4,1,-3,0,0,0,1,-3,-3,0,-2,-3,-2,1,0,-4,-2,-3],
-      'D':[-2,-2,1,4,-3,0,2,-1,-1,-3,-4,-1,-3,-3,-1,0,-1,-4,-3,-3],
-      'C':[0,-3,-3,-3,4,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1],
-      'Q':[-1,1,0,0,-3,4,2,-2,0,-3,-2,1,0,-3,-1,0,-1,-2,-1,-2],
-      'E':[-1,0,0,2,-4,2,4,-2,0,-3,-3,1,-2,-3,-1,0,-1,-3,-2,-2],
-      'G':[0,-2,0,-1,-3,-2,-2,4,-2,-4,-4,-2,-3,-3,-2,0,-2,-2,-3,-3],
-      'H':[-2,0,1,-1,-3,0,0,-2,4,-3,-3,-1,-2,-1,-2,-1,-2,-2,2,-3],
-      'I':[-1,-3,-3,-3,-1,-3,-3,-4,-3,4,2,-3,1,0,-3,-2,-1,-3,-1,3],
-      'L':[-1,-2,-3,-4,-1,-2,-3,-4,-3,2,4,-2,2,0,-3,-2,-1,-2,-1,1],
-      'K':[-1,2,0,-1,-3,1,1,-2,-1,-3,-2,4,-1,-3,-1,0,-1,-3,-2,-2],
-      'M':[-1,-1,-2,-3,-1,0,-2,-3,-2,1,2,-1,4,0,-2,-1,-1,-1,-1,1],
-      'F':[-2,-3,-3,-3,-2,-3,-3,-3,-1,0,0,-3,0,4,-4,-2,-2,1,3,-1],
-      'P':[-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4,4,-1,-1,-4,-3,-2],
-      'S':[1,-1,1,0,-1,0,0,0,-1,-2,-2,0,-1,-2,-1,4,1,-3,-2,-2],
-      'T':[0,-1,0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1,1,4,-2,-2,0],
-      'W':[-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1,1,-4,-3,-2,4,2,-3],
-      'Y':[-2,-2,-2,-3,-2,-1,-2,-3,2,-1,-1,-2,-1,3,-3,-2,-2,2,4,-1],
-      'V':[0,-3,-3,-3,-1,-2,-2,-3,-3,3,1,-2,1,-1,-2,-2,0,-3,-1,4]}
-bl62c=np.array([np.array(x) for x in list(bl62.values())])
-bl62c=4-bl62c
-embedding=MDS(n_components=13, n_init=100, max_iter=1000, eps=0.00001, dissimilarity='precomputed')
-X=embedding.fit_transform(bl62c)
-bl62np={}
-vkk=list(bl62.keys())
+        blosum62n[(a2, a1)] = vv
+bl62 = {
+    "A": [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],
+    "R": [-1, 4, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],
+    "N": [-2, 0, 4, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],
+    "D": [-2, -2, 1, 4, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],
+    "C": [0, -3, -3, -3, 4, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
+    "Q": [-1, 1, 0, 0, -3, 4, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],
+    "E": [-1, 0, 0, 2, -4, 2, 4, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],
+    "G": [0, -2, 0, -1, -3, -2, -2, 4, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],
+    "H": [-2, 0, 1, -1, -3, 0, 0, -2, 4, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],
+    "I": [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],
+    "L": [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],
+    "K": [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 4, -1, -3, -1, 0, -1, -3, -2, -2],
+    "M": [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 4, 0, -2, -1, -1, -1, -1, 1],
+    "F": [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 4, -4, -2, -2, 1, 3, -1],
+    "P": [
+        -1,
+        -2,
+        -2,
+        -1,
+        -3,
+        -1,
+        -1,
+        -2,
+        -2,
+        -3,
+        -3,
+        -1,
+        -2,
+        -4,
+        4,
+        -1,
+        -1,
+        -4,
+        -3,
+        -2,
+    ],
+    "S": [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],
+    "T": [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 4, -2, -2, 0],
+    "W": [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 4, 2, -3],
+    "Y": [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 4, -1],
+    "V": [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],
+}
+bl62c = np.array([np.array(x) for x in list(bl62.values())])
+bl62c = 4 - bl62c
+embedding = MDS(
+    n_components=13, n_init=100, max_iter=1000, eps=0.00001, dissimilarity="precomputed"
+)
+X = embedding.fit_transform(bl62c)
+bl62np = {}
+vkk = list(bl62.keys())
 for ii in range(20):
-    kk=vkk[ii]
-    bl62np[kk]=np.array(list(X[ii,])+[0]*17)
+    kk = vkk[ii]
+    bl62np[kk] = np.array(list(X[ii,]) + [0] * 17)
-AAencodingDict={}
+AAencodingDict = {}
 for ii in range(len(AAstringList)):
-    aa=AAstringList[ii]
-    CODE=[0]*(ii)+[1]+[0]*(20-ii)
-    AAencodingDict[aa]=np.array(CODE)
-Ndim=16  ## optimized for isometric embedding
-n0=Ndim*6
-#M0=np.concatenate((np.concatenate((ZERO,M1),axis=1),np.concatenate((M1, ZERO),axis=1)))
-ZERO=np.zeros((Ndim,Ndim))
-II=np.eye(Ndim)
-M0=np.concatenate((np.concatenate((ZERO,ZERO, II),axis=1),np.concatenate((II, ZERO, ZERO),axis=1),np.concatenate((ZERO,II, ZERO),axis=1)))
+    aa = AAstringList[ii]
+    CODE = [0] * (ii) + [1] + [0] * (20 - ii)
+    AAencodingDict[aa] = np.array(CODE)
+Ndim = 16  ## optimized for isometric embedding
+n0 = Ndim * 6
+# M0=np.concatenate((np.concatenate((ZERO,M1),axis=1),np.concatenate((M1, ZERO),axis=1)))
+ZERO = np.zeros((Ndim, Ndim))
+II = np.eye(Ndim)
+M0 = np.concatenate(
+    (
+        np.concatenate((ZERO, ZERO, II), axis=1),
+        np.concatenate((II, ZERO, ZERO), axis=1),
+        np.concatenate((ZERO, II, ZERO), axis=1),
+    )
+)
 ## Construct 6-th order cyclic group
-ZERO45=np.zeros((Ndim*3,Ndim*3))
-M6=np.concatenate((np.concatenate((ZERO45,M0),axis=1),np.concatenate((M0, ZERO45),axis=1)))
-X=np.array([[-0.31230882, -0.53572156, -0.01949946, -0.12211268, -0.70947917,
-        -0.42211092,  0.02783931,  0.02637933, -0.41760305,  0.21809875,
-         0.53532768,  0.04833016,  0.07877711,  0.50464914, -0.26972087,
-        -0.52416842],
-       [ 0.29672002,  0.29005364,  0.18176298, -0.05103382, -0.34686519,
-         0.58024228, -0.49282931,  0.62304281, -0.09575202,  0.30115555,
-         0.09913529,  0.1577466 , -0.94391939, -0.10505925,  0.05482389,
-         0.38409897],
-       [-0.42212537,  0.12225749,  0.16279646,  0.60099009,  0.19734216,
-         0.42819919, -0.33562418,  0.17036334,  0.4234109 ,  0.46681561,
-        -0.50347222, -0.37936876,  0.1494825 ,  0.32176759,  0.28584684,
-         0.68469861],
-       [ 0.18599294, -0.44017825, -0.4476952 ,  0.34340976,  0.44603553,
-         0.40974629, -0.60045935, -0.09056728,  0.22147919, -0.33029418,
-         0.55635594, -0.54149972,  0.05459062,  0.57334159, -0.06227118,
-         0.65299872],
-       [-0.19010428,  0.64418792, -0.85286762,  0.21380295,  0.37639516,
-        -0.67753593,  0.38751609,  0.55746524,  0.01443766,  0.1776535 ,
-         0.62853954, -0.15048523,  0.55100206, -0.21426656,  0.3644061 ,
-        -0.0018255 ],
-       [ 0.7350723 ,  0.10111267,  0.55640019, -0.18226966,  0.51658102,
-        -0.19321508, -0.46599027, -0.02989911,  0.4036196 , -0.11978213,
-        -0.29837524, -0.30232765, -0.36738065, -0.1379793 ,  0.04362871,
-         0.33553714],
-       [ 0.41134047,  0.13512443,  0.62492322, -0.10120261, -0.03093491,
-         0.23751917, -0.68338694,  0.05124762,  0.41533821,  0.46669353,
-         0.31467277, -0.02427587,  0.15361135,  0.70595112, -0.27952632,
-         0.32408931],
-       [-0.33041265, -0.43860065, -0.5509376 , -0.04380843, -0.35160935,
-         0.25134855,  0.53409314,  0.54850824,  0.59490287,  0.32669345,
-        -0.45355268, -0.56317041, -0.55416297,  0.18117841, -0.71600849,
-        -0.08989825],
-       [-0.40366849,  0.10978974,  0.0280101 , -0.46667987, -0.45607028,
-         0.54114052, -0.77552923, -0.10720425,  0.55252091, -0.34397153,
-        -0.59813694,  0.15567728,  0.03071009, -0.02176143,  0.34442719,
-         0.14681541],
-       [ 0.19280422,  0.35777863,  0.06139255,  0.20081699, -0.30546596,
-        -0.56901549, -0.15290953, -0.31181573, -0.74523217,  0.22296016,
-        -0.39143832, -0.16474685,  0.58064427, -0.77386654,  0.19713107,
-        -0.49477418],
-       [-0.16133903,  0.22112761, -0.53162136,  0.34764073, -0.08522381,
-        -0.2510216 ,  0.04699411, -0.25702389, -0.8739765 , -0.24171728,
-        -0.24370533,  0.42193635,  0.41056913, -0.60378211, -0.65756832,
-         0.0845203 ],
-       [-0.34792144,  0.18450939,  0.77038332,  0.63868511, -0.06221681,
-         0.11930421,  0.04895523, -0.22463059, -0.03268844, -0.58941354,
-         0.11640045,  0.32384901, -0.42952779,  0.58119471,  0.07288662,
-         0.26669673],
-       [ 0.01834555, -0.16367754,  0.34900298,  0.45087949,  0.47073855,
-        -0.37377404,  0.0606911 ,  0.2455703 , -0.55182937, -0.20261009,
-         0.28325423, -0.04741146,  0.30565238, -0.62090653,  0.17528413,
-        -0.60434975],
-       [-0.55464981,  0.50918784, -0.21371646, -0.63996967, -0.37656862,
-         0.27852662,  0.3287838 , -0.56800869,  0.23260763, -0.20653106,
-         0.63261439, -0.22666691,  0.00726302, -0.60125196,  0.07139961,
-        -0.35086639],
-       [ 0.94039731, -0.25999326,  0.43922549, -0.485738  , -0.20492235,
-        -0.26005626,  0.68776626,  0.57826888, -0.05973995, -0.1193658 ,
-        -0.12102433, -0.22091354,  0.43427913,  0.71447886,  0.32745991,
-         0.03466398],
-       [-0.13194625, -0.12262688,  0.18029209,  0.16555524,  0.39594125,
-        -0.58110665,  0.16161717,  0.0839783 ,  0.0911945 ,  0.34546976,
-        -0.29415349,  0.29891936, -0.60834721,  0.5943593 , -0.29473819,
-         0.4864154 ],
-       [ 0.40850093, -0.4638894 , -0.39732987, -0.01972861,  0.51189582,
-         0.10176704,  0.37528519, -0.41479418, -0.1932531 ,  0.54732221,
-        -0.11876511,  0.32843973, -0.259283  ,  0.59500132,  0.35168375,
-        -0.21733727],
-       [-0.50627723, -0.1973602 , -0.02339884, -0.66846048,  0.62696606,
-         0.60049717,  0.69143364, -0.48053591,  0.17812208, -0.58481821,
-        -0.23551415, -0.06229112,  0.20993116, -0.72485884,  0.34375662,
-        -0.23539168],
-       [-0.51388312, -0.2788953 ,  0.00859533, -0.5247195 , -0.18021544,
-         0.28372911,  0.10791359,  0.13033494,  0.34294013, -0.70310089,
-        -0.13245433,  0.48661081,  0.08451644, -0.69990992,  0.0408274 ,
-        -0.47204888],
-       [ 0.68546275,  0.22581365, -0.32571833,  0.34394298, -0.43232367,
-        -0.5041842 ,  0.04784017, -0.53067936, -0.50049908,  0.36874221,
-         0.22429186,  0.4616482 ,  0.11159174, -0.26827959, -0.39372848,
-        -0.40987423]])
-bl62np={}
-vkk=list(bl62.keys())
+ZERO45 = np.zeros((Ndim * 3, Ndim * 3))
+M6 = np.concatenate(
+    (np.concatenate((ZERO45, M0), axis=1), np.concatenate((M0, ZERO45), axis=1))
+)
+X = np.array(
+    [
+        [
+            -0.31230882,
+            -0.53572156,
+            -0.01949946,
+            -0.12211268,
+            -0.70947917,
+            -0.42211092,
+            0.02783931,
+            0.02637933,
+            -0.41760305,
+            0.21809875,
+            0.53532768,
+            0.04833016,
+            0.07877711,
+            0.50464914,
+            -0.26972087,
+            -0.52416842,
+        ],
+        [
+            0.29672002,
+            0.29005364,
+            0.18176298,
+            -0.05103382,
+            -0.34686519,
+            0.58024228,
+            -0.49282931,
+            0.62304281,
+            -0.09575202,
+            0.30115555,
+            0.09913529,
+            0.1577466,
+            -0.94391939,
+            -0.10505925,
+            0.05482389,
+            0.38409897,
+        ],
+        [
+            -0.42212537,
+            0.12225749,
+            0.16279646,
+            0.60099009,
+            0.19734216,
+            0.42819919,
+            -0.33562418,
+            0.17036334,
+            0.4234109,
+            0.46681561,
+            -0.50347222,
+            -0.37936876,
+            0.1494825,
+            0.32176759,
+            0.28584684,
+            0.68469861,
+        ],
+        [
+            0.18599294,
+            -0.44017825,
+            -0.4476952,
+            0.34340976,
+            0.44603553,
+            0.40974629,
+            -0.60045935,
+            -0.09056728,
+            0.22147919,
+            -0.33029418,
+            0.55635594,
+            -0.54149972,
+            0.05459062,
+            0.57334159,
+            -0.06227118,
+            0.65299872,
+        ],
+        [
+            -0.19010428,
+            0.64418792,
+            -0.85286762,
+            0.21380295,
+            0.37639516,
+            -0.67753593,
+            0.38751609,
+            0.55746524,
+            0.01443766,
+            0.1776535,
+            0.62853954,
+            -0.15048523,
+            0.55100206,
+            -0.21426656,
+            0.3644061,
+            -0.0018255,
+        ],
+        [
+            0.7350723,
+            0.10111267,
+            0.55640019,
+            -0.18226966,
+            0.51658102,
+            -0.19321508,
+            -0.46599027,
+            -0.02989911,
+            0.4036196,
+            -0.11978213,
+            -0.29837524,
+            -0.30232765,
+            -0.36738065,
+            -0.1379793,
+            0.04362871,
+            0.33553714,
+        ],
+        [
+            0.41134047,
+            0.13512443,
+            0.62492322,
+            -0.10120261,
+            -0.03093491,
+            0.23751917,
+            -0.68338694,
+            0.05124762,
+            0.41533821,
+            0.46669353,
+            0.31467277,
+            -0.02427587,
+            0.15361135,
+            0.70595112,
+            -0.27952632,
+            0.32408931,
+        ],
+        [
+            -0.33041265,
+            -0.43860065,
+            -0.5509376,
+            -0.04380843,
+            -0.35160935,
+            0.25134855,
+            0.53409314,
+            0.54850824,
+            0.59490287,
+            0.32669345,
+            -0.45355268,
+            -0.56317041,
+            -0.55416297,
+            0.18117841,
+            -0.71600849,
+            -0.08989825,
+        ],
+        [
+            -0.40366849,
+            0.10978974,
+            0.0280101,
+            -0.46667987,
+            -0.45607028,
+            0.54114052,
+            -0.77552923,
+            -0.10720425,
+            0.55252091,
+            -0.34397153,
+            -0.59813694,
+            0.15567728,
+            0.03071009,
+            -0.02176143,
+            0.34442719,
+            0.14681541,
+        ],
+        [
+            0.19280422,
+            0.35777863,
+            0.06139255,
+            0.20081699,
+            -0.30546596,
+            -0.56901549,
+            -0.15290953,
+            -0.31181573,
+            -0.74523217,
+            0.22296016,
+            -0.39143832,
+            -0.16474685,
+            0.58064427,
+            -0.77386654,
+            0.19713107,
+            -0.49477418,
+        ],
+        [
+            -0.16133903,
+            0.22112761,
+            -0.53162136,
+            0.34764073,
+            -0.08522381,
+            -0.2510216,
+            0.04699411,
+            -0.25702389,
+            -0.8739765,
+            -0.24171728,
+            -0.24370533,
+            0.42193635,
+            0.41056913,
+            -0.60378211,
+            -0.65756832,
+            0.0845203,
+        ],
+        [
+            -0.34792144,
+            0.18450939,
+            0.77038332,
+            0.63868511,
+            -0.06221681,
+            0.11930421,
+            0.04895523,
+            -0.22463059,
+            -0.03268844,
+            -0.58941354,
+            0.11640045,
+            0.32384901,
+            -0.42952779,
+            0.58119471,
+            0.07288662,
+            0.26669673,
+        ],
+        [
+            0.01834555,
+            -0.16367754,
+            0.34900298,
+            0.45087949,
+            0.47073855,
+            -0.37377404,
+            0.0606911,
+            0.2455703,
+            -0.55182937,
+            -0.20261009,
+            0.28325423,
+            -0.04741146,
+            0.30565238,
+            -0.62090653,
+            0.17528413,
+            -0.60434975,
+        ],
+        [
+            -0.55464981,
+            0.50918784,
+            -0.21371646,
+            -0.63996967,
+            -0.37656862,
+            0.27852662,
+            0.3287838,
+            -0.56800869,
+            0.23260763,
+            -0.20653106,
+            0.63261439,
+            -0.22666691,
+            0.00726302,
+            -0.60125196,
+            0.07139961,
+            -0.35086639,
+        ],
+        [
+            0.94039731,
+            -0.25999326,
+            0.43922549,
+            -0.485738,
+            -0.20492235,
+            -0.26005626,
+            0.68776626,
+            0.57826888,
+            -0.05973995,
+            -0.1193658,
+            -0.12102433,
+            -0.22091354,
+            0.43427913,
+            0.71447886,
+            0.32745991,
+            0.03466398,
+        ],
+        [
+            -0.13194625,
+            -0.12262688,
+            0.18029209,
+            0.16555524,
+            0.39594125,
+            -0.58110665,
+            0.16161717,
+            0.0839783,
+            0.0911945,
+            0.34546976,
+            -0.29415349,
+            0.29891936,
+            -0.60834721,
+            0.5943593,
+            -0.29473819,
+            0.4864154,
+        ],
+        [
+            0.40850093,
+            -0.4638894,
+            -0.39732987,
+            -0.01972861,
+            0.51189582,
+            0.10176704,
+            0.37528519,
+            -0.41479418,
+            -0.1932531,
+            0.54732221,
+            -0.11876511,
+            0.32843973,
+            -0.259283,
+            0.59500132,
+            0.35168375,
+            -0.21733727,
+        ],
+        [
+            -0.50627723,
+            -0.1973602,
+            -0.02339884,
+            -0.66846048,
+            0.62696606,
+            0.60049717,
+            0.69143364,
+            -0.48053591,
+            0.17812208,
+            -0.58481821,
+            -0.23551415,
+            -0.06229112,
+            0.20993116,
+            -0.72485884,
+            0.34375662,
+            -0.23539168,
+        ],
+        [
+            -0.51388312,
+            -0.2788953,
+            0.00859533,
+            -0.5247195,
+            -0.18021544,
+            0.28372911,
+            0.10791359,
+            0.13033494,
+            0.34294013,
+            -0.70310089,
+            -0.13245433,
+            0.48661081,
+            0.08451644,
+            -0.69990992,
+            0.0408274,
+            -0.47204888,
+        ],
+        [
+            0.68546275,
+            0.22581365,
+            -0.32571833,
+            0.34394298,
+            -0.43232367,
+            -0.5041842,
+            0.04784017,
+            -0.53067936,
+            -0.50049908,
+            0.36874221,
+            0.22429186,
+            0.4616482,
+            0.11159174,
+            -0.26827959,
+            -0.39372848,
+            -0.40987423,
+        ],
+    ]
+)
+bl62np = {}
+vkk = list(bl62.keys())
 for ii in range(20):
-    kk=vkk[ii]
-    bl62np[kk]=np.array(list(X[ii,])+[0]*Ndim*5)
+    kk = vkk[ii]
+    bl62np[kk] = np.array(list(X[ii,]) + [0] * Ndim * 5)
 def EncodingCDR3(s, M, n0):
-    sL=list(s)
-    x=np.array([0]*n0)
+    sL = list(s)
+    x = np.array([0] * n0)
     for ii in range(len(sL)):
-        x = np.dot(M, (x+bl62np[sL[ii]]))
+        x = np.dot(M, (x + bl62np[sL[ii]]))
     return x
 def BuildLengthDict(seqs, sIDs, vGene=[], INFO=[]):
-    LLs=[10,11,12,13,14,15,16,17,18,19,20,21,22,23,24]
-    LengthD={}
-    SeqD={}
-    VgeneD={}
-    InfoD={}
-    AAs=set(list(AAencodingDict.keys()))
-    NAs=len(AAencodingDict)
-    cNAs=0
+    LLs = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+    LengthD = {}
+    SeqD = {}
+    VgeneD = {}
+    InfoD = {}
+    AAs = set(list(AAencodingDict.keys()))
+    NAs = len(AAencodingDict)
+    cNAs = 0
     for ii in range(len(seqs)):
-        ID=sIDs[ii]
-        ss=seqs[ii]
-        ssAA=set(list(ss))
-        TMP=list(ssAA | AAs)
+        ID = sIDs[ii]
+        ss = seqs[ii]
+        ssAA = set(list(ss))
+        TMP = list(ssAA | AAs)
         if len(TMP) > NAs:
             ## CDR3 containing non amino acid letter
-            #print('Warning: CDR3: '+ss + ' contains non amino acid letter!')
-            cNAs+=1
+            # print('Warning: CDR3: '+ss + ' contains non amino acid letter!')
+            cNAs += 1
             continue
-        if len(vGene)>0:
-            vv=vGene[ii]
-        if len(INFO)>0:
-            info=INFO[ii]
-        L=len(ss)
+        if len(vGene) > 0:
+            vv = vGene[ii]
+        if len(INFO) > 0:
+            info = INFO[ii]
+        L = len(ss)
         if L not in LLs:
             continue
         if L not in LengthD:
-            LengthD[L]=[ID]
-            SeqD[L]=[ss]
-            if len(vGene)>0:
-                VgeneD[L]=[vv]
-            if len(INFO)>0:
-                InfoD[L]=[info]
+            LengthD[L] = [ID]
+            SeqD[L] = [ss]
+            if len(vGene) > 0:
+                VgeneD[L] = [vv]
+            if len(INFO) > 0:
+                InfoD[L] = [info]
         else:
             LengthD[L].append(ID)
             SeqD[L].append(ss)
-            if len(vGene)>0:
+            if len(vGene) > 0:
                 VgeneD[L].append(vv)
-            if len(INFO)>0:
+            if len(INFO) > 0:
                 InfoD[L].append(info)
-    if cNAs>0:
-        print("Warning: Skipped %d sequences with non AA letter!" %(cNAs))
+    if cNAs > 0:
+        print("Warning: Skipped %d sequences with non AA letter!" % (cNAs))
     return LengthD, VgeneD, InfoD, SeqD
 def CollapseUnique(LD, VD, ID, SD):
-    kks=LD.keys()
-    LDu={}
-    VDu={}
-    IDu={}
-    SDu={}
+    kks = LD.keys()
+    LDu = {}
+    VDu = {}
+    IDu = {}
+    SDu = {}
     for kk in kks:
-        vvL=list(LD[kk])
-        if len(VD)>0:
-            vvV=list(VD[kk])
+        vvL = list(LD[kk])
+        if len(VD) > 0:
+            vvV = list(VD[kk])
         else:
-            vvV=['TRBV2-1*01']*len(vvL)
-        vvI=list(ID[kk])
-        vvS=list(SD[kk])
-        zz=zip(vvL, vvS, vvV, vvI)
-        zzs=sorted(zz, key = lambda x: (x[1], x[2]))
-        nz=len(zzs)
-        pointer_pre=0
-        pointer_cur=1
-        s_pre=zzs[pointer_pre][1]
-        v_pre=zzs[pointer_pre][2]
-        uS=[s_pre]
-        uV=[v_pre]
-        uI=[[zzs[pointer_pre][3]]]
+            vvV = ["TRBV2-1*01"] * len(vvL)
+        vvI = list(ID[kk])
+        vvS = list(SD[kk])
+        zz = zip(vvL, vvS, vvV, vvI)
+        zzs = sorted(zz, key=lambda x: (x[1], x[2]))
+        nz = len(zzs)
+        pointer_pre = 0
+        pointer_cur = 1
+        s_pre = zzs[pointer_pre][1]
+        v_pre = zzs[pointer_pre][2]
+        uS = [s_pre]
+        uV = [v_pre]
+        uI = [[zzs[pointer_pre][3]]]
         while pointer_cur < nz:
-            s_cur=zzs[pointer_cur][1]
-            v_cur=zzs[pointer_cur][2]
+            s_cur = zzs[pointer_cur][1]
+            v_cur = zzs[pointer_cur][2]
             if s_cur == s_pre and v_cur == v_pre:
-                uI[len(uI)-1].append(zzs[pointer_cur][3])
+                uI[len(uI) - 1].append(zzs[pointer_cur][3])
                 pointer_cur += 1
                 continue
             else:
                 uS.append(s_cur)
                 uV.append(v_cur)
                 uI.append([zzs[pointer_cur][3]])
-                s_pre=s_cur
-                v_pre=v_cur
-                pointer_pre=pointer_cur
+                s_pre = s_cur
+                v_pre = v_cur
+                pointer_pre = pointer_cur
                 pointer_cur += 1
-        uL=[x for x in range(len(uS))]
-        LDu[kk]=uL
-        SDu[kk]=uS
-        if len(VD)>0:
-            VDu[kk]=uV
-        IDu[kk]=uI
+        uL = [x for x in range(len(uS))]
+        LDu[kk] = uL
+        SDu[kk] = uS
+        if len(VD) > 0:
+            VDu[kk] = uV
+        IDu[kk] = uI
     return LDu, VDu, IDu, SDu
@@ -297,14 +616,15 @@ class CDR3:
         ## KS: Kmer size
         ## st: the first 0:(st-1) amino acids will not be included in K-merization
         ## ed: the last L-ed amino acids will be skipped
-        self.s=s
-        self.ID=sID
-        L=len(s)
-        self.L=L
-        sub_s=s[st: (L-ed)]
-        Ls=len(sub_s)
-        Kmer=[sub_s[x:(x+KS)] for x in range(0,Ls-KS+1)]
-        self.Kmer=Kmer
+        self.s = s
+        self.ID = sID
+        L = len(s)
+        self.L = L
+        sub_s = s[st : (L - ed)]
+        Ls = len(sub_s)
+        Kmer = [sub_s[x : (x + KS)] for x in range(0, Ls - KS + 1)]
+        self.Kmer = Kmer
 class KmerSet:
     ## Kmer set for fast read searching based on mismatch-allowed Kmer index
@@ -313,263 +633,277 @@ class KmerSet:
         ## Seqs and sIDs must have the same length
         if len(Seqs) != len(sIDs):
             raise "Sequence and ID lists have different length. Please check input."
-        KmerDict={}
-        N=len(Seqs)
-        self.N=N
-        CDR3Dict={}
-        LLs=[]
-        for ii in range(0,N):
-            s=Seqs[ii]
-            sID=sIDs[ii]
-            cc=CDR3(s,sID,KS,st,ed)
-            CDR3Dict[cc.ID]=cc.Kmer
-            KK=cc.Kmer
+        KmerDict = {}
+        N = len(Seqs)
+        self.N = N
+        CDR3Dict = {}
+        LLs = []
+        for ii in range(0, N):
+            s = Seqs[ii]
+            sID = sIDs[ii]
+            cc = CDR3(s, sID, KS, st, ed)
+            CDR3Dict[cc.ID] = cc.Kmer
+            KK = cc.Kmer
             LLs.append(cc.L)
             for kk in KK:
                 if kk not in KmerDict:
-                    KmerDict[kk]=[sID]
+                    KmerDict[kk] = [sID]
                 else:
                     KmerDict[kk].append(sID)
-        self.KD=KmerDict
-        self.KS=KS
-        self.CD=CDR3Dict
-        self.LL=LLs
-    def FindKmerNeighbor(self,kk):
-        KS=self.KS
-        KS_n1=[]
+        self.KD = KmerDict
+        self.KS = KS
+        self.CD = CDR3Dict
+        self.LL = LLs
+    def FindKmerNeighbor(self, kk):
+        KS = self.KS
+        KS_n1 = []
         for jj in range(KS):
-            kk_pre=[kk[0:jj]]*20
-            kk_suf=[kk[(jj+1):KS]]*20
-            kkn=list(zip(kk_pre,AAstringList,kk_suf))
-            KS_n1+=[''.join(list(x)) for x in kkn]
+            kk_pre = [kk[0:jj]] * 20
+            kk_suf = [kk[(jj + 1) : KS]] * 20
+            kkn = list(zip(kk_pre, AAstringList, kk_suf))
+            KS_n1 += ["".join(list(x)) for x in kkn]
         return KS_n1
-    def FindKmerNeighbor2(self,kk):
+    def FindKmerNeighbor2(self, kk):
         ## KS>=6, allowing 2 mismatches. CDR3 length must be >= 10
-        KS=self.KS
-        KS_n1=[]
+        KS = self.KS
+        KS_n1 = []
         for jj in range(KS):
             for ii in range(KS):
-                if ii<=jj:
+                if ii <= jj:
                     continue
-                kk_pre=[kk[0:jj]]*20
-                kk_mid=[kk[(jj+1):ii]]*20
-                kk_suf=[kk[(ii+1):KS]]*400
-                kkn=list(zip(kk_pre,AAstringList,kk_mid))
-                kkn=[''.join(list(x)) for x in kkn]
-                kkn=[[x]*20 for x in kkn]
-                kkn=list(chain(*kkn))
-                kkn2=list(zip(kkn, AAstringList*20, kk_suf))
-                kkn2=[''.join(list(x)) for x in kkn2]
-                KS_n1+=kkn2
+                kk_pre = [kk[0:jj]] * 20
+                kk_mid = [kk[(jj + 1) : ii]] * 20
+                kk_suf = [kk[(ii + 1) : KS]] * 400
+                kkn = list(zip(kk_pre, AAstringList, kk_mid))
+                kkn = ["".join(list(x)) for x in kkn]
+                kkn = [[x] * 20 for x in kkn]
+                kkn = list(chain(*kkn))
+                kkn2 = list(zip(kkn, AAstringList * 20, kk_suf))
+                kkn2 = ["".join(list(x)) for x in kkn2]
+                KS_n1 += kkn2
         return KS_n1
     def KmerIndex(self):
         ## For each K-mer, find its nearest neighbor with 1 character mismatch
-        KKs=list(self.KD.keys())
-        KS=self.KS
-        KKs_set=set(KKs)
-        Skk='_'.join(KKs)
-        KI_Dict={}
+        KKs = list(self.KD.keys())
+        KS = self.KS
+        KKs_set = set(KKs)
+        Skk = "_".join(KKs)
+        KI_Dict = {}
         for kk in KKs:
-##            kk_neighbor=[]
-##            for jj in range(KS):
-##                kk_pre=kk[0:jj]
-##                kk_suf=kk[(jj+1):KS]
-##                pat=kk_pre+'['+AAstring+']{1}'+kk_suf
-##                p=re.compile(pat)
-##                mm=[m.group() for m in p.finditer(Skk)]
-##                kk_neighbor+=mm
-            KS_n=set(self.FindKmerNeighbor(kk))
+            ##            kk_neighbor=[]
+            ##            for jj in range(KS):
+            ##                kk_pre=kk[0:jj]
+            ##                kk_suf=kk[(jj+1):KS]
+            ##                pat=kk_pre+'['+AAstring+']{1}'+kk_suf
+            ##                p=re.compile(pat)
+            ##                mm=[m.group() for m in p.finditer(Skk)]
+            ##                kk_neighbor+=mm
+            KS_n = set(self.FindKmerNeighbor(kk))
             kk_neighbor = KS_n & KKs_set
-            KI_Dict[kk]=list(kk_neighbor)
+            KI_Dict[kk] = list(kk_neighbor)
         return KI_Dict
     def updateKD(self, KI):
         ## group sequences sharing motifs with 1-2 mismatches
-        KD=self.KD
-        KDnew={}
+        KD = self.KD
+        KDnew = {}
         for kk in KD:
-            kkm=KI[kk]
-            vvL=itemgetter(*kkm)(KD)
-            if isinstance(vvL[0],list):
-                vvL=list(chain(*vvL))
-            KDnew[kk]=vvL
+            kkm = KI[kk]
+            vvL = itemgetter(*kkm)(KD)
+            if isinstance(vvL[0], list):
+                vvL = list(chain(*vvL))
+            KDnew[kk] = vvL
         return KDnew
-def GenerateMotifGraph(mD,seqs,seqID):
-    SeqShareGraph={}
-    mDL={}
+def GenerateMotifGraph(mD, seqs, seqID):
+    SeqShareGraph = {}
+    mDL = {}
     for kk in mD:
-        vv=mD[kk]
-        LL=[]
+        vv = mD[kk]
+        LL = []
         for v in vv:
             LL.append(len(seqs[v]))
-        mDL[kk]=LL
+        mDL[kk] = LL
     for kk in mD:
-        vv=mD[kk]
-        LL=mDL[kk]
-        nv=len(vv)
-        for ii in range(0,nv):
-            id_1=vv[ii]
-            L1=LL[ii]
-            for jj in range(ii,nv):
-                if jj==ii:
+        vv = mD[kk]
+        LL = mDL[kk]
+        nv = len(vv)
+        for ii in range(0, nv):
+            id_1 = vv[ii]
+            L1 = LL[ii]
+            for jj in range(ii, nv):
+                if jj == ii:
                     continue
-                id_2=vv[jj]
-                L2=LL[jj]
+                id_2 = vv[jj]
+                L2 = LL[jj]
                 if L2 != L1:
                     continue
                 if id_1 not in SeqShareGraph:
-                    SeqShareGraph[id_1]=[id_2]
+                    SeqShareGraph[id_1] = [id_2]
                 elif id_2 not in SeqShareGraph[id_1]:
                     SeqShareGraph[id_1].append(id_2)
                 if id_2 not in SeqShareGraph:
-                    SeqShareGraph[id_2]=[id_1]
+                    SeqShareGraph[id_2] = [id_1]
                 elif id_1 not in SeqShareGraph[id_2]:
                     SeqShareGraph[id_2].append(id_1)
     return SeqShareGraph
 def generateSSG(Kset, CDR3s, k_thr=2):
-    KD=Kset.KD
-    KI=Kset.KmerIndex()
-    KDnew=Kset.updateKD(KI)
-    CD=Kset.CD
-    LL=np.array(Kset.LL)
-    SSG={}
+    KD = Kset.KD
+    KI = Kset.KmerIndex()
+    KDnew = Kset.updateKD(KI)
+    CD = Kset.CD
+    LL = np.array(Kset.LL)
+    SSG = {}
     for kk in CD:
-        vv=itemgetter(*CD[kk])(KDnew)
-        if isinstance(vv[0],list):
-            vv=list(chain(*vv))
-        vv1=[]
-        c=Counter(vv)
+        vv = itemgetter(*CD[kk])(KDnew)
+        if isinstance(vv[0], list):
+            vv = list(chain(*vv))
+        vv1 = []
+        c = Counter(vv)
         for k in c:
-            if c[k]>=k_thr:
+            if c[k] >= k_thr:
                 vv1.append(k)
-        vv1=np.array(vv1)
-        if len(vv1)==0:
+        vv1 = np.array(vv1)
+        if len(vv1) == 0:
             continue
-        cdr3=CDR3s[kk]
-        L0=len(cdr3)
-        idx=np.where(LL[vv1]==L0)[0]
-        if len(idx)==0:
+        cdr3 = CDR3s[kk]
+        L0 = len(cdr3)
+        idx = np.where(LL[vv1] == L0)[0]
+        if len(idx) == 0:
             continue
-        vvs=list(vv1[idx])
+        vvs = list(vv1[idx])
         vvs.remove(kk)
-        if len(vvs)>0:
-            SSG[kk]=vvs
+        if len(vvs) > 0:
+            SSG[kk] = vvs
     return SSG
-def SeqComparison(s1,s2,gap=-6):
-    n=len(s1)
-    CorList=[]
-    score=0
-    for kk in range(0,n):
-        aa=s1[kk]
-        bb=s2[kk]
-        if aa in ['.','-','*'] or bb in ['.','-','*']:
-            if aa!=bb:
+def SeqComparison(s1, s2, gap=-6):
+    n = len(s1)
+    CorList = []
+    score = 0
+    for kk in range(0, n):
+        aa = s1[kk]
+        bb = s2[kk]
+        if aa in [".", "-", "*"] or bb in [".", "-", "*"]:
+            if aa != bb:
                 score += gap
             continue
-        if aa==bb:
-#            score += min(4,blosum62[(aa,aa)])
-            score += blosum62n[(aa,aa)]
+        if aa == bb:
+            #            score += min(4,blosum62[(aa,aa)])
+            score += blosum62n[(aa, aa)]
             continue
-        KEY=(aa,bb)
-#        if KEY not in blosum62:
-#            KEY=(bb,aa)
-#        if KEY not in blosum62:
-#            raise "Non-standard amino acid coding!"
-        score+=blosum62n[KEY]
+        KEY = (aa, bb)
+        #        if KEY not in blosum62:
+        #            KEY=(bb,aa)
+        #        if KEY not in blosum62:
+        #            raise "Non-standard amino acid coding!"
+        score += blosum62n[KEY]
     return score
-def NHLocalAlignment(Seq1,Seq2,gap_thr=1,gap=-6):
-    n1=len(Seq1)
-    n2=len(Seq2)
-    if n1<n2:
-        Seq=Seq1
-        Seq1=Seq2
-        Seq2=Seq
-        nn=n2-n1
+def NHLocalAlignment(Seq1, Seq2, gap_thr=1, gap=-6):
+    n1 = len(Seq1)
+    n2 = len(Seq2)
+    if n1 < n2:
+        Seq = Seq1
+        Seq1 = Seq2
+        Seq2 = Seq
+        nn = n2 - n1
     else:
-        nn=n1-n2
-    if nn>gap_thr:
+        nn = n1 - n2
+    if nn > gap_thr:
         return -1
-    SeqList1=[Seq1]
-    SeqList2=InsertGap(Seq2,nn)
-    alns=[]
-    SCOREList=[]
+    SeqList1 = [Seq1]
+    SeqList2 = InsertGap(Seq2, nn)
+    alns = []
+    SCOREList = []
     for s1 in SeqList1:
         for s2 in SeqList2:
-                SCOREList.append(SeqComparison(s1,s2,gap))
-    maxS=max(SCOREList)
+            SCOREList.append(SeqComparison(s1, s2, gap))
+    maxS = max(SCOREList)
     return maxS
-def InsertGap(Seq,n):
+def InsertGap(Seq, n):
     ## Insert n gaps to Seq; n<=2
-    if n==0:
+    if n == 0:
         return [Seq]
-    ns=len(Seq)
-    SeqList=[]
-    if(n==1):
-        for kk in range(0,ns+1):
-            SeqNew=Seq[0:kk]+'-'+Seq[kk:]
+    ns = len(Seq)
+    SeqList = []
+    if n == 1:
+        for kk in range(0, ns + 1):
+            SeqNew = Seq[0:kk] + "-" + Seq[kk:]
             SeqList.append(SeqNew)
-    if(n==2):
-        for kk in range(0,ns+1):
-            SeqNew=Seq[0:kk]+'-'+Seq[kk:]
-            for jj in range(0,ns+2):
-                SeqNew0=SeqNew[0:jj]+'-'+SeqNew[jj:]
+    if n == 2:
+        for kk in range(0, ns + 1):
+            SeqNew = Seq[0:kk] + "-" + Seq[kk:]
+            for jj in range(0, ns + 2):
+                SeqNew0 = SeqNew[0:jj] + "-" + SeqNew[jj:]
                 SeqList.append(SeqNew0)
     return SeqList
-def falign(s1, s2, V1, V2 ,st,VScore={}, UseV=True, gapn=1, gap=-6):
-    mid1=s1[st:-2]
-    mid2=s2[st:-2]
+def falign(s1, s2, V1, V2, st, VScore={}, UseV=True, gapn=1, gap=-6):
+    mid1 = s1[st:-2]
+    mid2 = s2[st:-2]
     if UseV:
-        if V2==V1:
-            V_score=4
+        if V2 == V1:
+            V_score = 4
         else:
-            Vkey=(V1,V2)
+            Vkey = (V1, V2)
             if Vkey not in VScore:
-                Vkey=(V2,V1)
+                Vkey = (V2, V1)
             if Vkey not in VScore:
-                #print("V gene not found!")
+                # print("V gene not found!")
                 return 0
             else:
-                V_score=VScore[Vkey]/20.0
+                V_score = VScore[Vkey] / 20.0
     else:
-        V_score=4.0
-    aln=NHLocalAlignment(mid1,mid2,gapn,gap)
-    score=aln/float(max(len(mid1),len(mid2)))+V_score
+        V_score = 4.0
+    aln = NHLocalAlignment(mid1, mid2, gapn, gap)
+    score = aln / float(max(len(mid1), len(mid2))) + V_score
     return score
 def UpdateSSG(SSG, seqs, Vgenes, Vscore={}, UseV=True, gap=-6, gapn=1, cutoff=7.5):
-    SSGnew={}
-    count=0
-    t1=time.time()
-    N=len(list(chain(*list(SSG.values()))))
-#    print("Number of pairs to be processed: %d" %N)
+    SSGnew = {}
+    count = 0
+    t1 = time.time()
+    N = len(list(chain(*list(SSG.values()))))
+    #    print("Number of pairs to be processed: %d" %N)
     for kk in SSG:
-        s1=seqs[kk]
-        V1=Vgenes[kk]
-        VV=SSG[kk]
+        s1 = seqs[kk]
+        V1 = Vgenes[kk]
+        VV = SSG[kk]
         for vv in VV:
-            s2=seqs[vv]
-            V2=Vgenes[vv]
-            score=falign(s1, s2, V1, V2, st=3, VScore=Vscore, UseV=UseV, gap=-6, gapn=1)
-            count+=1
-            if count % 1000000 ==0:
-                t2=time.time()
-#                print("Processed %d pairs. Elapsed time %f" %(count, t2-t1))
-            if score>=cutoff:
+            s2 = seqs[vv]
+            V2 = Vgenes[vv]
+            score = falign(
+                s1, s2, V1, V2, st=3, VScore=Vscore, UseV=UseV, gap=-6, gapn=1
+            )
+            count += 1
+            if count % 1000000 == 0:
+                t2 = time.time()
+            #                print("Processed %d pairs. Elapsed time %f" %(count, t2-t1))
+            if score >= cutoff:
                 if kk not in SSGnew:
-                    SSGnew[kk]=[vv]
+                    SSGnew[kk] = [vv]
                 else:
                     SSGnew[kk].append(vv)
     return SSGnew
 def dfs(graph, start):
-    '''
+    """
     Non-resursive depth first search
-    '''
+    """
     visited = set()
     stack = [start]
     while stack:
@@ -577,95 +911,100 @@ def dfs(graph, start):
         if vertex not in visited:
             visited.add(vertex)
             stack.extend(set(graph[vertex]) - visited)
     return visited
 def IdentifyMotifCluster(SSG):
     ## Input SeqShareGraph dictionary representation of sparse matrix
-    POS=set(SSG.keys())
-    NP=len(POS)
-    ClusterList=[]
-    tmpL=set(chain(*ClusterList))
-    count=0
+    POS = set(SSG.keys())
+    NP = len(POS)
+    ClusterList = []
+    tmpL = set(chain(*ClusterList))
+    count = 0
     while 1:
-            xx=POS ^ tmpL
-            if len(xx)==0:
-                break
-            for ii in xx:
-#            STACK=LoadComm([],ii)
-                STACK=dfs(SSG,ii)
-                tmpL = tmpL | STACK
-                ClusterList.append(list(STACK))
-#                tmpL=set(chain(*ClusterList))
-                count+=1
-                if count % 200 ==0:
-                    print ("    Solved %d clusters" %(count))
-                break
+        xx = POS ^ tmpL
+        if len(xx) == 0:
+            break
+        for ii in xx:
+            #            STACK=LoadComm([],ii)
+            STACK = dfs(SSG, ii)
+            tmpL = tmpL | STACK
+            ClusterList.append(list(STACK))
+            #                tmpL=set(chain(*ClusterList))
+            count += 1
+            if count % 200 == 0:
+                print("    Solved %d clusters" % (count))
+            break
     return ClusterList
 def IdentifyVgeneCluster(sMat):
     ## Input Vgene score matrix
-    vG={}
-    n=len(sMat)
-    IDs=[x for x in range(n)]
+    vG = {}
+    n = len(sMat)
+    IDs = [x for x in range(n)]
     for kk in IDs:
-        LL=sMat[:,kk]
-        vL=np.where(LL>=thr_v)[0]
-        if len(vL)>0:
-            vG[kk]=vL
-    CL=IdentifyMotifCluster(vG)
+        LL = sMat[:, kk]
+        vL = np.where(LL >= thr_v)[0]
+        if len(vL) > 0:
+            vG[kk] = vL
+    CL = IdentifyMotifCluster(vG)
     return CL
 def ParseFa(fname):
-    InputStr=open(fname).readlines()
-    FaDict={}
-    seq=''
+    InputStr = open(fname).readlines()
+    FaDict = {}
+    seq = ""
     for line in InputStr:
-        if line.startswith('>'):
-            if len(seq)>0:
-                FaDict[seqHead]=seq
-                seq=''
-            seqHead=line.strip()
+        if line.startswith(">"):
+            if len(seq) > 0:
+                FaDict[seqHead] = seq
+                seq = ""
+            seqHead = line.strip()
         else:
-            seq+=line.strip()
+            seq += line.strip()
     if seqHead not in FaDict:
-        FaDict[seqHead]=seq
+        FaDict[seqHead] = seq
     return FaDict
 def PreCalculateVgeneDist(VgeneFa="Imgt_Human_TRBV.fasta"):
     ## Only run one time if needed
-    FaDict=ParseFa(cur_dir+VgeneFa)
-    VScore={}
-    CDR1Dict={}
-    CDR2Dict={}
+    FaDict = ParseFa(cur_dir + VgeneFa)
+    VScore = {}
+    CDR1Dict = {}
+    CDR2Dict = {}
     for kk in FaDict:
-        if '|' in kk:
-            VV=kk.split('|')[1]
+        if "|" in kk:
+            VV = kk.split("|")[1]
         else:
-            VV=kk[1:]
-        CDR1Dict[VV]=FaDict[kk][26:37]  ## Imgt CDR1: 27 - 38
-        CDR2Dict[VV]=FaDict[kk][55:64]  ## Imgt CDR2: 56 - 65
-    Vkeys=list(CDR1Dict.keys())
-    nn=len(Vkeys)
-    for ii in range(0,nn):
-        V1=Vkeys[ii]
-        s1_CDR1=CDR1Dict[V1]
-        s1_CDR2=CDR2Dict[V1]
-        for jj in range(ii,nn):
-            V2=Vkeys[jj]
-            s2_CDR1=CDR1Dict[V2]
-            s2_CDR2=CDR2Dict[V2]
-            score1=SeqComparison(s1_CDR1,s2_CDR1)
-            score2=SeqComparison(s1_CDR2,s2_CDR2)
-            #print score1+score2
-            VScore[(V1,V2)]=score1+score2
-    gg=open('VgeneScores.txt','w')
+            VV = kk[1:]
+        CDR1Dict[VV] = FaDict[kk][26:37]  ## Imgt CDR1: 27 - 38
+        CDR2Dict[VV] = FaDict[kk][55:64]  ## Imgt CDR2: 56 - 65
+    Vkeys = list(CDR1Dict.keys())
+    nn = len(Vkeys)
+    for ii in range(0, nn):
+        V1 = Vkeys[ii]
+        s1_CDR1 = CDR1Dict[V1]
+        s1_CDR2 = CDR2Dict[V1]
+        for jj in range(ii, nn):
+            V2 = Vkeys[jj]
+            s2_CDR1 = CDR1Dict[V2]
+            s2_CDR2 = CDR2Dict[V2]
+            score1 = SeqComparison(s1_CDR1, s2_CDR1)
+            score2 = SeqComparison(s1_CDR2, s2_CDR2)
+            # print score1+score2
+            VScore[(V1, V2)] = score1 + score2
+    gg = open("VgeneScores.txt", "w")
     for kk in VScore:
-        vv=VScore[kk]
-        line=kk[0]+'\t'+kk[1]+'\t'+str(vv)+'\n'
+        vv = VScore[kk]
+        line = kk[0] + "\t" + kk[1] + "\t" + str(vv) + "\n"
         gg.write(line)
     gg.close()
 def MergeCL(Cls):
     ## merge pre-clusters according to shared sequences
     ## shared sequences between pre-clusters are due to approximated centroid nearest neighbor search
@@ -673,16 +1012,16 @@ def MergeCL(Cls):
     for idx, cc in enumerate(Cls):
         for x in cc:
             if x not in vDict:
-                vDict[x]=[idx]
+                vDict[x] = [idx]
             else:
                 vDict[x].append(idx)
-    Cls_new=[]
+    Cls_new = []
     cGraph = {}
     for kk in vDict:
-        vv=vDict[kk]
-        if len(vv)>1:
+        vv = vDict[kk]
+        if len(vv) > 1:
             for ii in vv:
-                vv1=deepcopy(vv)
+                vv1 = deepcopy(vv)
                 vv1.pop(vv1.index(ii))
                 if ii not in cGraph:
                     cGraph[ii] = vv1
@@ -690,21 +1029,21 @@ def MergeCL(Cls):
                     cGraph[ii] += list(set(vv1 + cGraph[ii]))
     DupKeys = list(cGraph.keys())
     for kk in vDict:
-        vv=vDict[kk]
-        if len(vv)==1:
+        vv = vDict[kk]
+        if len(vv) == 1:
             if vv[0] in DupKeys:
                 continue
             cc = Cls[vv[0]]
             if cc not in Cls_new:
                 Cls_new.append(cc)
-    Cls_Dup=[]
+    Cls_Dup = []
     for kk in cGraph:
         cc = dfs(cGraph, kk)
         cc = list(cc)
         cc = sorted(cc)
         if cc not in Cls_Dup:
             Cls_Dup.append(cc)
-    if len(Cls_Dup)>0:
+    if len(Cls_Dup) > 0:
         for cdup in Cls_Dup:
             cc_merged = []
             for ii in cdup:
@@ -715,355 +1054,411 @@ def MergeCL(Cls):
                 Cls_new.append(cc_merged)
     return Cls_new
-def EncodeRepertoire(inputfile, outdir, outfile='',exact=True, ST=3, thr_v=3.7, thr_s=3.5, VDict={},Vgene=True,thr_iso=10, gap=-6, GPU=False,Mat=False, verbose=False):
+def EncodeRepertoire(
+    inputfile,
+    outdir,
+    outfile="",
+    exact=True,
+    ST=3,
+    thr_v=3.7,
+    thr_s=3.5,
+    VDict={},
+    Vgene=True,
+    thr_iso=10,
+    gap=-6,
+    GPU=False,
+    Mat=False,
+    verbose=False,
+):
     ## No V gene version
     ## Encode CDR3 sequences into 96 dimensional space and perform k-means clustering
     ## If exact is True, SW alignment will be performed within each cluster after isometric encoding and clustering
-    h=open(inputfile)
-    t1=time.time()
-    alines=h.readlines()
-    ww=alines[0].strip().split('\t')
-    if not ww[0].startswith('C'):
+    h = open(inputfile)
+    t1 = time.time()
+    alines = h.readlines()
+    ww = alines[0].strip().split("\t")
+    if not ww[0].startswith("C"):
         ## header line
-        hline=alines[0]
-        alines=alines[1:]
-    elif 'CDR3' in ww[0]:
-        hline=alines[0]
-        alines=alines[1:]
+        hline = alines[0]
+        alines = alines[1:]
+    elif "CDR3" in ww[0]:
+        hline = alines[0]
+        alines = alines[1:]
     else:
-        hline='CDR3\t'+'\t'.join(['Info'+str(x) for x in range(len(ww)-1)])
-    seqs=[]
-    vgs=[]
-    infoList=[]
-    count=0
+        hline = "CDR3\t" + "\t".join(["Info" + str(x) for x in range(len(ww) - 1)])
+    seqs = []
+    vgs = []
+    infoList = []
+    count = 0
     if verbose:
-        print('Creating CDR3 list')
+        print("Creating CDR3 list")
     for ll in alines:
-        ww=ll.strip().split('\t')
-        cdr3=ww[0]
-        if '*' in cdr3:
+        ww = ll.strip().split("\t")
+        cdr3 = ww[0]
+        if "*" in cdr3:
             continue
-        if '_' in cdr3:
+        if "_" in cdr3:
             continue
         seqs.append(ww[0])
         if Vgene:
             vgs.append(ww[1])
-            infoList.append('\t'.join(ww[1:]))
+            infoList.append("\t".join(ww[1:]))
         else:
-            infoList.append('\t'.join(ww[1:]))
-        count+=1
-    if len(outfile)==0:
-        outfile=inputfile.split('/')
-        outfile=outfile[len(outfile)-1]
-        outfile=outdir+'/'+re.sub('\\.[txcsv]+','',outfile)+'-'+'-RotationEncodingBL62.txt'
-    g=open(outfile,'w')
-    tm=strftime("%Y-%m-%d %H:%M:%S", gmtime())
-    InfoLine='##TIME:'+tm+'|cmd: '+sys.argv[0]+'|'+inputfile+'|IsometricDistance_Thr='+str(thr_iso)+'|thr_v='+str(thr_v)+'|thr_s='+str(thr_s)+'|exact='+str(exact)+'|Vgene='+str(Vgene)+'|ST='+str(ST)
-    g.write(InfoLine+'\n')
-    g.write("##Column Info: CDR3 aa sequence, cluster id, other information in the input file\n")
-    gr=0
+            infoList.append("\t".join(ww[1:]))
+        count += 1
+    if len(outfile) == 0:
+        outfile = inputfile.split("/")
+        outfile = outfile[len(outfile) - 1]
+        outfile = (
+            outdir
+            + "/"
+            + re.sub("\\.[txcsv]+", "", outfile)
+            + "-"
+            + "-RotationEncodingBL62.txt"
+        )
+    g = open(outfile, "w")
+    tm = strftime("%Y-%m-%d %H:%M:%S", gmtime())
+    InfoLine = (
+        "##TIME:"
+        + tm
+        + "|cmd: "
+        + sys.argv[0]
+        + "|"
+        + inputfile
+        + "|IsometricDistance_Thr="
+        + str(thr_iso)
+        + "|thr_v="
+        + str(thr_v)
+        + "|thr_s="
+        + str(thr_s)
+        + "|exact="
+        + str(exact)
+        + "|Vgene="
+        + str(Vgene)
+        + "|ST="
+        + str(ST)
+    )
+    g.write(InfoLine + "\n")
+    g.write(
+        "##Column Info: CDR3 aa sequence, cluster id, other information in the input file\n"
+    )
+    gr = 0
     ## Split into different lengths
-    LD,VD, ID,SD= BuildLengthDict(seqs, vGene=vgs,INFO=infoList,sIDs=[x for x in range(len(seqs))])
+    LD, VD, ID, SD = BuildLengthDict(
+        seqs, vGene=vgs, INFO=infoList, sIDs=[x for x in range(len(seqs))]
+    )
     LDu, VDu, IDu, SDu = CollapseUnique(LD, VD, ID, SD)
     if Mat:
-        Mfile=outfile+'_EncodingMatrix.txt'
-        h=open(Mfile, 'w')
+        Mfile = outfile + "_EncodingMatrix.txt"
+        h = open(Mfile, "w")
     for kk in LDu:
         if verbose:
-            print("---Process CDR3s with length %d ---" %(kk))
-        vSD=LDu[kk]
-        vSD0=[x for x in range(len(vSD))]
-        vss=SDu[kk]
-        vInfo=IDu[kk]
-        flagL=[len(x)-1 for x in vInfo]
+            print("---Process CDR3s with length %d ---" % (kk))
+        vSD = LDu[kk]
+        vSD0 = [x for x in range(len(vSD))]
+        vss = SDu[kk]
+        vInfo = IDu[kk]
+        flagL = [len(x) - 1 for x in vInfo]
         if verbose:
-            print(' Performing CDR3 encoding')
-        dM=np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
-        dM=dM.astype("float32")
+            print(" Performing CDR3 encoding")
+        dM = np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
+        dM = dM.astype("float32")
         if verbose:
-            print(" The number of sequences is %d" %(dM.shape[0]))
+            print(" The number of sequences is %d" % (dM.shape[0]))
         if Mat:
             for ii in range(len(vss)):
-                line=vss[ii]+'\t'+vInfo[ii][0]+'\t'
-                NUMs=[str(xx) for xx in dM[ii,:]]
-                line += '\t'.join(NUMs) + '\n'
+                line = vss[ii] + "\t" + vInfo[ii][0] + "\t"
+                NUMs = [str(xx) for xx in dM[ii, :]]
+                line += "\t".join(NUMs) + "\n"
                 h.write(line)
-        sID=[x for x in range(dM.shape[0])]
-        t2=time.time()
+        sID = [x for x in range(dM.shape[0])]
+        t2 = time.time()
         if verbose:
-            print(' Done! Total time elapsed %f' %(t2-t1))
-        Cls = ClusterCDR3(dM, flagL, thr=thr_iso - 0.5*(15-kk), verbose=verbose)  ## change cutoff with different lengths
+            print(" Done! Total time elapsed %f" % (t2 - t1))
+        Cls = ClusterCDR3(
+            dM, flagL, thr=thr_iso - 0.5 * (15 - kk), verbose=verbose
+        )  ## change cutoff with different lengths
         Cls = MergeCL(Cls)
         if verbose:
             print("     Handling identical CDR3 groups")
-        Cls_u=[]
+        Cls_u = []
         for ii in range(len(Cls)):
-            cc=Cls[ii]
+            cc = Cls[ii]
             if len(cc) == 1:
                 ## Handle identical CDR3 groups first
-                if flagL[cc[0]]>0:
+                if flagL[cc[0]] > 0:
                     gr += 1
-                    jj=cc[0]
+                    jj = cc[0]
                     for v_info in vInfo[jj]:
-                        line=vss[jj]+'\t'+str(gr)+'\t'+v_info+'\n'
-                        _=g.write(line)
+                        line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
+                        _ = g.write(line)
             else:
                 Cls_u.append(cc)
-        Cls=Cls_u
-        t2=time.time()
+        Cls = Cls_u
+        t2 = time.time()
         if verbose:
-            print(' Done! Total time elapsed %f' %(t2-t1))
+            print(" Done! Total time elapsed %f" % (t2 - t1))
         if Vgene:
-            vVgene=VDu[kk]
+            vVgene = VDu[kk]
             if verbose:
-                print('     Matching variable genes')
-            Cls_v=[]
+                print("     Matching variable genes")
+            Cls_v = []
             for cc in Cls:
-                Nc=len(cc)
-                sMat={}
+                Nc = len(cc)
+                sMat = {}
                 for ii in range(Nc):
-                    v1=vVgene[cc[ii]]
-                    for jj in range(ii,Nc):
-                        if jj==ii:
+                    v1 = vVgene[cc[ii]]
+                    for jj in range(ii, Nc):
+                        if jj == ii:
                             continue
-                        v2=vVgene[cc[jj]]
+                        v2 = vVgene[cc[jj]]
                         if (v1, v2) not in VDict:
                             if v1 == v2:
                                 if ii not in sMat:
-                                    sMat[ii]=[jj]
+                                    sMat[ii] = [jj]
                                 else:
                                     sMat[ii].append(jj)
                                 if jj not in sMat:
-                                    sMat[jj]=[ii]
+                                    sMat[jj] = [ii]
                                 else:
                                     sMat[jj].append(ii)
                             continue
-                        if VDict[(v1,v2)] >= thr_v:
-                                if ii not in sMat:
-                                    sMat[ii]=[jj]
-                                else:
-                                    sMat[ii].append(jj)
-                                if jj not in sMat:
-                                    sMat[jj]=[ii]
-                                else:
-                                    sMat[jj].append(ii)
-                vCL=IdentifyMotifCluster(sMat)
-                vCL_List=list(chain(*vCL))
+                        if VDict[(v1, v2)] >= thr_v:
+                            if ii not in sMat:
+                                sMat[ii] = [jj]
+                            else:
+                                sMat[ii].append(jj)
+                            if jj not in sMat:
+                                sMat[jj] = [ii]
+                            else:
+                                sMat[jj].append(ii)
+                vCL = IdentifyMotifCluster(sMat)
+                vCL_List = list(chain(*vCL))
                 for ii in range(Nc):
-                    uu=flagL[cc[ii]]
-                    if uu>0 and ii not in vCL_List:
+                    uu = flagL[cc[ii]]
+                    if uu > 0 and ii not in vCL_List:
                         vCL.append([ii])
                 for vcc in vCL:
                     Cls_v.append(list(np.array(cc)[np.array(vcc)]))
-            Cls=[]
+            Cls = []
             for ii in range(len(Cls_v)):
-                cc=Cls_v[ii]
+                cc = Cls_v[ii]
                 if len(cc) == 1:
                     ## Handle identical CDR3 groups first
                     gr += 1
-                    jj=cc[0]
+                    jj = cc[0]
                     for v_info in vInfo[jj]:
-                        line=vss[jj]+'\t'+str(gr)+'\t'+v_info+'\n'
-                        _=g.write(line)
+                        line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
+                        _ = g.write(line)
                 else:
                     Cls.append(cc)
         if exact:
             if verbose:
-                print(' Performing Smith-Waterman alignment')
-            Cls_s=[]
+                print(" Performing Smith-Waterman alignment")
+            Cls_s = []
             for cc in Cls:
-                Nc=len(cc)
-                if len(cc)<=3:
-                    sMat=np.zeros((Nc,Nc))
+                Nc = len(cc)
+                if len(cc) <= 3:
+                    sMat = np.zeros((Nc, Nc))
                     for ii in range(Nc):
-                        s1=vss[cc[ii]]
-                        for jj in range(ii,Nc):
-                            if jj==ii:
+                        s1 = vss[cc[ii]]
+                        for jj in range(ii, Nc):
+                            if jj == ii:
                                 continue
-                            s2=vss[cc[jj]]
+                            s2 = vss[cc[jj]]
                             if len(s1) != len(s2):
                                 continue
-                            if len(s1)<=5:
+                            if len(s1) <= 5:
                                 continue
-                            sw=SeqComparison(s1[ST:-2],s2[ST:-2],gap=gap)
-                            sw=sw/(len(s1)-ST-2)
-                            sMat[ii,jj]=sw
-                            sMat[jj,ii]=sw
-                    s_max=[]
+                            sw = SeqComparison(s1[ST:-2], s2[ST:-2], gap=gap)
+                            sw = sw / (len(s1) - ST - 2)
+                            sMat[ii, jj] = sw
+                            sMat[jj, ii] = sw
+                    s_max = []
                     for ii in range(Nc):
-                        s_max.append(np.max(sMat[:,ii]))
-                    cc_new=[]
+                        s_max.append(np.max(sMat[:, ii]))
+                    cc_new = []
                     for ii in range(Nc):
-                        if s_max[ii]>=thr_s:
+                        if s_max[ii] >= thr_s:
                             cc_new.append(cc[ii])
-                    if len(cc_new)>1:
+                    if len(cc_new) > 1:
                         Cls_s.append(cc_new)
                     else:
                         for ii in range(Nc):
-                            uu=flagL[cc[ii]]
-                            if uu>0:
+                            uu = flagL[cc[ii]]
+                            if uu > 0:
                                 Cls_s.append([cc[ii]])
-#                    print(Cls_s)
-                    Cls_sList=list(chain(*Cls_s))
+                    #                    print(Cls_s)
+                    Cls_sList = list(chain(*Cls_s))
                     for ii in range(len(cc)):
-                        uu=flagL[cc[ii]]
-                        if uu>0 and cc[ii] not in Cls_sList:
+                        uu = flagL[cc[ii]]
+                        if uu > 0 and cc[ii] not in Cls_sList:
                             Cls_s.append([cc[ii]])
                 else:
-                    CDR3s=[vss[x] for x in cc]
-                    sIDs=np.array([vSD0[x] for x in cc])
-                    sIDs0=[x for x in range(len(cc))]
-                    Kset=KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
-                    SSG=generateSSG(Kset, CDR3s, k_thr=1)
-                    tmpVgenes=['TRBV2']*len(CDR3s)
-                    SSGnew=UpdateSSG(SSG, CDR3s, tmpVgenes, Vscore=VDict, cutoff=thr_s+4)
-                    CLall=IdentifyMotifCluster(SSGnew)
-                    CLall_list=list(chain(*CLall))
+                    CDR3s = [vss[x] for x in cc]
+                    sIDs = np.array([vSD0[x] for x in cc])
+                    sIDs0 = [x for x in range(len(cc))]
+                    Kset = KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
+                    SSG = generateSSG(Kset, CDR3s, k_thr=1)
+                    tmpVgenes = ["TRBV2"] * len(CDR3s)
+                    SSGnew = UpdateSSG(
+                        SSG, CDR3s, tmpVgenes, Vscore=VDict, cutoff=thr_s + 4
+                    )
+                    CLall = IdentifyMotifCluster(SSGnew)
+                    CLall_list = list(chain(*CLall))
                     for ii in range(len(cc)):
-                        uu=flagL[cc[ii]]
-                        if uu>0 and ii not in CLall_list:
+                        uu = flagL[cc[ii]]
+                        if uu > 0 and ii not in CLall_list:
                             CLall.append([ii])
                     for cl in CLall:
-                        ccs=list(sIDs[np.array(cl)])
+                        ccs = list(sIDs[np.array(cl)])
                         Cls_s.append(ccs)
-            Cls=Cls_s
+            Cls = Cls_s
         if verbose:
-            print(' Writing results into file')
+            print(" Writing results into file")
         for ii in range(len(Cls)):
-#            if ii % 100000 == 0 and ii>0:
-                #print('      %d sequences written' %(ii))
-            cc=Cls[ii]
-            gr+=1
+            #            if ii % 100000 == 0 and ii>0:
+            # print('      %d sequences written' %(ii))
+            cc = Cls[ii]
+            gr += 1
             for jj in cc:
                 for v_info in vInfo[jj]:
-                    line=vss[jj]+'\t'+str(gr)+'\t'+v_info+'\n'
-                    _=g.write(line)
+                    line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
+                    _ = g.write(line)
     g.close()
     if Mat:
         h.close()
 def OrderUnique(Ig):
-    vv=list(Ig.values())
-    kk=list(Ig.keys())
-    LL=[len(x[1]) for x in vv]
-    v0=[x[0][0] for x in vv]
-    v1=[x[0][1] for x in vv]
-    zkk=zip(kk,v0,v1,LL)
-    zkks=sorted(zkk,key=lambda x: (x[1],x[3]))
-    nk=len(zkks)
-    keep_id=[0]
-    ii=1
-    n_pre=str(zkks[0][1])+'_'+str(zkks[0][2])
-    while ii<nk:
-        n_cur=str(zkks[ii][1])+'_'+str(zkks[ii][2])
-        if n_cur==n_pre:
-            ii+=1
+    vv = list(Ig.values())
+    kk = list(Ig.keys())
+    LL = [len(x[1]) for x in vv]
+    v0 = [x[0][0] for x in vv]
+    v1 = [x[0][1] for x in vv]
+    zkk = zip(kk, v0, v1, LL)
+    zkks = sorted(zkk, key=lambda x: (x[1], x[3]))
+    nk = len(zkks)
+    keep_id = [0]
+    ii = 1
+    n_pre = str(zkks[0][1]) + "_" + str(zkks[0][2])
+    while ii < nk:
+        n_cur = str(zkks[ii][1]) + "_" + str(zkks[ii][2])
+        if n_cur == n_pre:
+            ii += 1
             continue
         else:
             keep_id.append(ii)
-            n_pre=n_cur
-            ii+=1
+            n_pre = n_cur
+            ii += 1
             continue
-    nid=[x[0] for x in zkks]
-    filtered_id=np.array(nid)[np.array(keep_id)]
-    Igs={}
+    nid = [x[0] for x in zkks]
+    filtered_id = np.array(nid)[np.array(keep_id)]
+    Igs = {}
     for ii in filtered_id:
-        Igs[kk[ii]]=vv[ii]
+        Igs[kk[ii]] = vv[ii]
     return Igs, filtered_id
 def ClusterCDR3(dM, flagL, thr=10, GPU=False, verbose=False):
     ## flagL: flag vector for identical CDR3 groups, >0 for grouped non-identical CDR3s
-    Cls=[]
-    flag=0
-    dM1=dM
-    flagL=np.array(flagL)
+    Cls = []
+    flag = 0
+    dM1 = dM
+    flagL = np.array(flagL)
     if GPU:
         res = faiss.StandardGpuResources()
     while 1:
-#        print("     %d number of clusters, with %d sequences" %(len(Cls),dM1.shape[0]))
+        #        print("     %d number of clusters, with %d sequences" %(len(Cls),dM1.shape[0]))
         if verbose:
-            print('=',end='')
-        index = faiss.IndexFlatL2(Ndim*6)
+            print("=", end="")
+        index = faiss.IndexFlatL2(Ndim * 6)
         if GPU:
             index = faiss.index_cpu_to_gpu(res, 0, index)
         index.add(dM1)
-        if flag==0:
+        if flag == 0:
             D, I = index.search(dM1, 2)
-            vv=np.where((D[:,1]<=thr))[0]
-            vv0=np.where((D[:,1]>thr) & (flagL>0))[0]
+            vv = np.where((D[:, 1] <= thr))[0]
+            vv0 = np.where((D[:, 1] > thr) & (flagL > 0))[0]
             for v in vv0:
                 Cls.append([v])
-            tmp_dM=np.zeros((len(vv),Ndim*6))
-            Ig_new={}
+            tmp_dM = np.zeros((len(vv), Ndim * 6))
+            Ig_new = {}
             for ii in range(len(vv)):
-                v=vv[ii]
-                Idx=I[v,]
+                v = vv[ii]
+                Idx = I[v,]
                 if v not in Idx:
-                    Idx[0]=v
-                Ig_new[ii]=(sorted(list(set(Idx))),sorted(list(set(Idx))))
-                tmp_dM[ii,]=(dM1[Idx[0],]+dM1[Idx[1],])/2
-            if len(Ig_new)==0:
+                    Idx[0] = v
+                Ig_new[ii] = (sorted(list(set(Idx))), sorted(list(set(Idx))))
+                tmp_dM[ii,] = (dM1[Idx[0],] + dM1[Idx[1],]) / 2
+            if len(Ig_new) == 0:
                 if verbose:
-                    print('type 0 break')
+                    print("type 0 break")
                 break
-#                print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
-            Igs, fid=OrderUnique(Ig_new)
-            tmp_dM=tmp_dM[fid,]
-            Ig_new=Igs
+            #                print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
+            Igs, fid = OrderUnique(Ig_new)
+            tmp_dM = tmp_dM[fid,]
+            Ig_new = Igs
         else:
-            D, I = index.search(dM1,2)
-            vv=np.where(D[:,1]<=thr)[0]
-            vv0=np.where(D[:,1]>thr)[0]
+            D, I = index.search(dM1, 2)
+            vv = np.where(D[:, 1] <= thr)[0]
+            vv0 = np.where(D[:, 1] > thr)[0]
             ## move groups in vv0 to Cls
-            kkg=list(Ig.keys())
+            kkg = list(Ig.keys())
             for v in vv0:
-                ng=list(Ig[kkg[v]][1])
-    #            if ng not in Cls:
+                ng = list(Ig[kkg[v]][1])
+                #            if ng not in Cls:
                 Cls.append(ng)
-            tmp_dM=np.zeros((len(vv),Ndim*6))
-            Ig_new={}
+            tmp_dM = np.zeros((len(vv), Ndim * 6))
+            Ig_new = {}
             for ii in range(len(vv)):
-                v=vv[ii]
-                idx1=I[v,0]
-                idx2=I[v,1]
+                v = vv[ii]
+                idx1 = I[v, 0]
+                idx2 = I[v, 1]
                 if v not in I[v,]:
-                    idx1=v
-#                Ig_new[ii]=sorted(list(set(list(Ig[kkg[idx1]])+list(Ig[kkg[idx2]]))))
-                Ig_new[ii]=(sorted(list(set([idx1,idx2]))),  ## First entry records the relative index of a sequence clique
-                            sorted(list(set(list(Ig[kkg[idx1]][1])+list(Ig[kkg[idx2]][1])))))  ## Second entry records the absolute index of a sequence
-                tmp_dM[ii,]=(dM1[idx1,]+dM1[idx2,])/2
-            if len(Ig_new)==0:
+                    idx1 = v
+                #                Ig_new[ii]=sorted(list(set(list(Ig[kkg[idx1]])+list(Ig[kkg[idx2]]))))
+                Ig_new[ii] = (
+                    sorted(
+                        list(set([idx1, idx2]))
+                    ),  ## First entry records the relative index of a sequence clique
+                    sorted(list(set(list(Ig[kkg[idx1]][1]) + list(Ig[kkg[idx2]][1])))),
+                )  ## Second entry records the absolute index of a sequence
+                tmp_dM[ii,] = (dM1[idx1,] + dM1[idx2,]) / 2
+            if len(Ig_new) == 0:
                 if verbose:
                     print("\ntype I break")
-                kkg=list(Ig.keys())
+                kkg = list(Ig.keys())
                 for kk in kkg:
-                    ng=list(Ig[kk][1])
+                    ng = list(Ig[kk][1])
                     if ng not in Cls:
                         Cls.append(ng)
                 break
-#            print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
-            Igs, fid=OrderUnique(Ig_new)
-            tmp_dM=tmp_dM[fid,]
-            Ig_new=Igs
-        if flag>0:
+            #            print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
+            Igs, fid = OrderUnique(Ig_new)
+            tmp_dM = tmp_dM[fid,]
+            Ig_new = Igs
+        if flag > 0:
             if Ig == Ig_new:
                 if verbose:
                     print("\ntype II break")
-                kkg=list(Ig.keys())
+                kkg = list(Ig.keys())
                 for kk in kkg:
-                    ng=list(Ig[kk][1])
+                    ng = list(Ig[kk][1])
                     if ng in Cls:
                         continue
                     Cls.append(ng)
                 break
-        Ig=Ig_new
-        tmp_dM=tmp_dM.astype('float32')
-        dM1=tmp_dM
-        flag+=1
+        Ig = Ig_new
+        tmp_dM = tmp_dM.astype("float32")
+        dM1 = tmp_dM
+        flag += 1
     return Cls
-def ClusterCDR3r(dM, flagL, thr = 10, verbose = False):
-    index = faiss.IndexFlatL2(Ndim*6)
+def ClusterCDR3r(dM, flagL, thr=10, verbose=False):
+    index = faiss.IndexFlatL2(Ndim * 6)
     index.add(dM)
     lims, D, I = index.range_search(dM, thr)
     # with open('cdr3.npy', 'wb') as f:
@@ -1071,53 +1466,70 @@ def ClusterCDR3r(dM, flagL, thr = 10, verbose = False):
     #     np.save(f, D)
     #     np.save(f, I)
     #     np.save(f, dM)
     # now clustering results
     N = dM.shape[0]
-    neighborSize = np.array([lims[cur_idx_i+1] - lims[cur_idx_i] for cur_idx_i in range(N)])
+    neighborSize = np.array(
+        [lims[cur_idx_i + 1] - lims[cur_idx_i] for cur_idx_i in range(N)]
+    )
     # to_cluster = np.ones( (N,))
     clusterNo = 0
-    cluster = - np.ones( (N, ),  dtype = np.int32)
+    cluster = -np.ones((N,), dtype=np.int32)
     idx = np.where(cluster < 0)[0]
     unclustered = [np.argmax(neighborSize[idx])]
     depth = 0
     while True:
-        if len(unclustered) == 0: break
+        if len(unclustered) == 0:
+            break
         # cur_idx = unclustered[0] # first unclustered index
         cur_idx = unclustered
-        cluster[cur_idx] = clusterNo # assign cluster
-        neighbor = np.unique(np.array(list(chain (* [I[(lims[cur_idx_i]): lims[cur_idx_i+1]] for cur_idx_i in cur_idx]))))
+        cluster[cur_idx] = clusterNo  # assign cluster
+        neighbor = np.unique(
+            np.array(
+                list(
+                    chain(
+                        *[
+                            I[(lims[cur_idx_i]) : lims[cur_idx_i + 1]]
+                            for cur_idx_i in cur_idx
+                        ]
+                    )
+                )
+            )
+        )
         # find those unclusterred
         idx = np.where(cluster[neighbor] < 0)[0]
         if len(idx) == 0:
             depth = 0
             clusterNo += 1
             idx = np.where(cluster < 0)[0]
-            if len(idx) == 0: break
+            if len(idx) == 0:
+                break
             unclustered = [idx[np.argmax(neighborSize[idx])]]
         else:
             if depth > 3:
                 depth = 0
                 clusterNo += 1
             unclustered = neighbor[idx]
             depth += 1
-#    print('clusterNo = ', clusterNo)
-    Cls = [ [] for i in range(clusterNo)]
+    #    print('clusterNo = ', clusterNo)
+    Cls = [[] for i in range(clusterNo)]
     for idx, i in enumerate(cluster):
-            Cls[i].append(idx)
-#    print("Cls[:5] = ", Cls[:5])
-#    print("len(Cls) = ", len(Cls),
-#          ', #elem=', sum([len(i) for i in Cls]),
-#          ', #single=', sum([len(i) for i in Cls if len(i) == 1]),
-#          ', #non_single=', sum([len(i) for i in Cls if len(i) != 1]),
-#          ', #max=', max([len(i) for i in Cls]))
+        Cls[i].append(idx)
+    #    print("Cls[:5] = ", Cls[:5])
+    #    print("len(Cls) = ", len(Cls),
+    #          ', #elem=', sum([len(i) for i in Cls]),
+    #          ', #single=', sum([len(i) for i in Cls if len(i) == 1]),
+    #          ', #non_single=', sum([len(i) for i in Cls if len(i) != 1]),
+    #          ', #max=', max([len(i) for i in Cls]))
     return Cls
 def CommandLineParser():
-    parser=OptionParser()
-    print ('''
+    parser = OptionParser()
+    print(
+        """
 GIANA: Geometric Isometry based ANtigen-specific tcr Alignment
 Ultrafast short peptide alignment exclusively designed for large-scale adaptome analysis
@@ -1130,129 +1542,276 @@ Input columns:
 !!! ALL amino acid letters must be CAPITAL !!!
-''')
-    parser.add_option("-d","--directory",dest="Directory",help="Input repertoire sequencing file directory. Please make sure that all the files in the directory are input files.",default="")
-    parser.add_option("-f","--file",dest="File",default='',help="Input single file of CDR3 sequences for grouping")
-    parser.add_option("-F","--fileList",dest="files",default='',help='Alternative input: a file containing the full path to all the files. If given, overwrite -d and -f option')
-    parser.add_option("-t","--threshold",dest="thr",default=7,help="Isometric distance threshold for calling similar CDR3 groups. Without -E, smaller value will increase speed. With -E, smaller value will increase specificity. Must be smaller than 12.")
-    parser.add_option("-S","--threshold_score",dest="thr_s",default=3.5, help="Threshold for Smith-Waterman alignment score (normalized by CDR3 length). Default 3.5")
-    parser.add_option("-G","--threshold_vgene",dest="thr_v",default=3.7,help="Threshold for variable gene comparison. Default 3.7.")
-    parser.add_option("-o","--output",dest="OutDir",default='./',help="Output directory for intermediate and final outputs.")
-    parser.add_option("-O","--outfile",dest="OutFile",default='',help="Output file name. If not given, a file with --RotationEncoding will be added to the input file as the output file name.")
-    parser.add_option("-T","--startPosition",dest='ST',default=3, help="Starting position of CDR3 sequence. The first ST letters are omitted. CDR3 sequence length L must be >= ST+7 ")
-    parser.add_option("-g","--GapPenalty",dest="Gap",default= -6,help="Gap penalty,default= -6. Not used.")
-    parser.add_option("-n","--GapNumber",dest="GapN",default=1,help="Maximum number of gaps allowed when performing alignment. Max=1, default=1. Not used.")
-    parser.add_option("-V","--VariableGeneFa",dest="VFa",default="Imgt_Human_TRBV.fasta",help="IMGT Human beta variable gene sequences")
-    parser.add_option("-v","--VariableGene",dest="V",default=True,action="store_false",help="If False, GIANA will omit variable gene information and use CDR3 sequences only. This will yield reduced specificity. The cut-off will automatically become the current value-4.0")
-    parser.add_option("-e","--Exact",dest="E",default=True,action="store_false",help="If False, GIANA will not perform Smith-Waterman alignment after isometric encoding.")
-    parser.add_option("-N","--NumberOfThreads",dest="NN",default=1,help="Number of threads for multiple processing. Not working so well.")
-    parser.add_option("-M","--EncodingMatrix", dest="Mat", default=False,action="store_true", help="If true, GIANA will export the isometric encoding matrix for each TCR. Default: False.")
-    parser.add_option("-U","--UseGPU",dest="GPU", default=False, action="store_true",help="Use GPU for Faiss indexing. Must be CUDA GPUs.")
-    parser.add_option("-q","--queryFile",dest="Query",default='',help="Input query file, if given, GIANA will run in query mode, also need to provide -r option.")
-    parser.add_option("-r","--refFile",dest="ref", default='',help="Input reference file. Query model required.")
-    parser.add_option("-b","--Verbose", dest='v', default=False, action="store_true", help="Verbose option: if given, GIANA will print intermediate messages.")
+"""
+    )
+    parser.add_option(
+        "-d",
+        "--directory",
+        dest="Directory",
+        help="Input repertoire sequencing file directory. Please make sure that all the files in the directory are input files.",
+        default="",
+    )
+    parser.add_option(
+        "-f",
+        "--file",
+        dest="File",
+        default="",
+        help="Input single file of CDR3 sequences for grouping",
+    )
+    parser.add_option(
+        "-F",
+        "--fileList",
+        dest="files",
+        default="",
+        help="Alternative input: a file containing the full path to all the files. If given, overwrite -d and -f option",
+    )
+    parser.add_option(
+        "-t",
+        "--threshold",
+        dest="thr",
+        default=7,
+        help="Isometric distance threshold for calling similar CDR3 groups. Without -E, smaller value will increase speed. With -E, smaller value will increase specificity. Must be smaller than 12.",
+    )
+    parser.add_option(
+        "-S",
+        "--threshold_score",
+        dest="thr_s",
+        default=3.5,
+        help="Threshold for Smith-Waterman alignment score (normalized by CDR3 length). Default 3.5",
+    )
+    parser.add_option(
+        "-G",
+        "--threshold_vgene",
+        dest="thr_v",
+        default=3.7,
+        help="Threshold for variable gene comparison. Default 3.7.",
+    )
+    parser.add_option(
+        "-o",
+        "--output",
+        dest="OutDir",
+        default="./",
+        help="Output directory for intermediate and final outputs.",
+    )
+    parser.add_option(
+        "-O",
+        "--outfile",
+        dest="OutFile",
+        default="",
+        help="Output file name. If not given, a file with --RotationEncoding will be added to the input file as the output file name.",
+    )
+    parser.add_option(
+        "-T",
+        "--startPosition",
+        dest="ST",
+        default=3,
+        help="Starting position of CDR3 sequence. The first ST letters are omitted. CDR3 sequence length L must be >= ST+7 ",
+    )
+    parser.add_option(
+        "-g",
+        "--GapPenalty",
+        dest="Gap",
+        default=-6,
+        help="Gap penalty,default= -6. Not used.",
+    )
+    parser.add_option(
+        "-n",
+        "--GapNumber",
+        dest="GapN",
+        default=1,
+        help="Maximum number of gaps allowed when performing alignment. Max=1, default=1. Not used.",
+    )
+    parser.add_option(
+        "-V",
+        "--VariableGeneFa",
+        dest="VFa",
+        default="Imgt_Human_TRBV.fasta",
+        help="IMGT Human beta variable gene sequences",
+    )
+    parser.add_option(
+        "-v",
+        "--VariableGene",
+        dest="V",
+        default=True,
+        action="store_false",
+        help="If False, GIANA will omit variable gene information and use CDR3 sequences only. This will yield reduced specificity. The cut-off will automatically become the current value-4.0",
+    )
+    parser.add_option(
+        "-e",
+        "--Exact",
+        dest="E",
+        default=True,
+        action="store_false",
+        help="If False, GIANA will not perform Smith-Waterman alignment after isometric encoding.",
+    )
+    parser.add_option(
+        "-N",
+        "--NumberOfThreads",
+        dest="NN",
+        default=1,
+        help="Number of threads for multiple processing. Not working so well.",
+    )
+    parser.add_option(
+        "-M",
+        "--EncodingMatrix",
+        dest="Mat",
+        default=False,
+        action="store_true",
+        help="If true, GIANA will export the isometric encoding matrix for each TCR. Default: False.",
+    )
+    parser.add_option(
+        "-U",
+        "--UseGPU",
+        dest="GPU",
+        default=False,
+        action="store_true",
+        help="Use GPU for Faiss indexing. Must be CUDA GPUs.",
+    )
+    parser.add_option(
+        "-q",
+        "--queryFile",
+        dest="Query",
+        default="",
+        help="Input query file, if given, GIANA will run in query mode, also need to provide -r option.",
+    )
+    parser.add_option(
+        "-r",
+        "--refFile",
+        dest="ref",
+        default="",
+        help="Input reference file. Query model required.",
+    )
+    parser.add_option(
+        "-b",
+        "--Verbose",
+        dest="v",
+        default=False,
+        action="store_true",
+        help="Verbose option: if given, GIANA will print intermediate messages.",
+    )
     return parser.parse_args()
 def main():
-    (opt,_)=CommandLineParser()
-    cutoff=float(opt.thr)
-    OutDir=opt.OutDir
-    thr_s=float(opt.thr_s)
+    (opt, _) = CommandLineParser()
+    cutoff = float(opt.thr)
+    OutDir = opt.OutDir
+    thr_s = float(opt.thr_s)
     ## Check if query mode first
-    qFile=opt.Query
-    if len(qFile)>0:
+    qFile = opt.Query
+    if len(qFile) > 0:
         ## query mode
-        t1=time.time()
-        if qFile.endswith('/'):
+        t1 = time.time()
+        if qFile.endswith("/"):
             ## input query is a directory
-            qFs=os.listdir(qFile)
-            qFileList=[]
+            qFs = os.listdir(qFile)
+            qFileList = []
             for ff in qFs:
-                qFileList.append(qFile+ff)
+                qFileList.append(qFile + ff)
         else:
-            qFileList=[qFile]
-        rFile=opt.ref
-        if len(rFile)==0:
-            raise("Must provide reference file in query mode!")
+            qFileList = [qFile]
+        rFile = opt.ref
+        if len(rFile) == 0:
+            raise ("Must provide reference file in query mode!")
         else:
             ## check if reference cluster file exists
-            rFile0=re.sub('\\.txt','',rFile)
-            refClusterFile=rFile0+'--RotationEncodingBL62.txt'
+            rFile0 = re.sub("\\.txt", "", rFile)
+            refClusterFile = rFile0 + "--RotationEncodingBL62.txt"
             if not os.path.exists(refClusterFile):
-                raise("Must run clustering on reference file first! Did you forget to put the clustering file in this directory?")
-            rData=CreateReference(rFile)
-            t2=time.time()
-            print("Reference created. Elapsed %f" %(t2-t1))
+                raise (
+                    "Must run clustering on reference file first! Did you forget to put the clustering file in this directory?"
+                )
+            rData = CreateReference(rFile)
+            t2 = time.time()
+            print("Reference created. Elapsed %f" % (t2 - t1))
             for qf in qFileList:
-                t2_0=time.time()
-                print("Querying "+qf)
-                qf_s=qf.split('/')[-1]
-                outFile=re.sub('\\.txt','',qf_s)+'_query_'+rFile0+'.txt'
-                of=OutDir+'/'+outFile
+                t2_0 = time.time()
+                print("Querying " + qf)
+                qf_s = qf.split("/")[-1]
+                outFile = re.sub("\\.txt", "", qf_s) + "_query_" + rFile0 + ".txt"
+                of = OutDir + "/" + outFile
                 if path.exists(of):
-                    print(of+' already exits. Skipping.')
+                    print(of + " already exits. Skipping.")
                     continue
                 MakeQuery(qf, rData, thr=cutoff, thr_s=thr_s)
-                t2=time.time()
-                print("     Build query clustering file. Elapsed %f" %(t2-t1))
+                t2 = time.time()
+                print("     Build query clustering file. Elapsed %f" % (t2 - t1))
                 print("Now mering with reference cluster")
-                MergeExist(refClusterFile, OutDir+'/'+outFile)
-                t2=time.time()
-                print("         Time of elapsed for query %s: %f" %(qf, t2-t2_0))
+                MergeExist(refClusterFile, OutDir + "/" + outFile)
+                t2 = time.time()
+                print("         Time of elapsed for query %s: %f" % (qf, t2 - t2_0))
     else:
         ## regular clustering mode
-        FileDir=opt.Directory
-        if len(FileDir)>0:
-                files=os.listdir(FileDir)
-                files0=[]
-                for ff in files:
-                        ff=FileDir+'/'+ff
-                        files0.append(ff)
-                files=files0
+        FileDir = opt.Directory
+        if len(FileDir) > 0:
+            files = os.listdir(FileDir)
+            files0 = []
+            for ff in files:
+                ff = FileDir + "/" + ff
+                files0.append(ff)
+            files = files0
         else:
-                files=[]
-        File=opt.File
-        if len(File)>0:
-                files=[File]
-        FileList=opt.files
-        if len(FileList)>0:
-                files=[]
-                fL=open(FileList)
-                for ff in fL.readlines():
-                        files.append(ff.strip())
-        VFa=opt.VFa
+            files = []
+        File = opt.File
+        if len(File) > 0:
+            files = [File]
+        FileList = opt.files
+        if len(FileList) > 0:
+            files = []
+            fL = open(FileList)
+            for ff in fL.readlines():
+                files.append(ff.strip())
+        VFa = opt.VFa
         PreCalculateVgeneDist(VFa)
-        vf=open('./VgeneScores.txt')  ## Use tcrDist's Vgene 80-score calculation
-        VScore={}
-        VV=opt.V
-        EE=opt.E
-        Mat=opt.Mat
-        ST=int(opt.ST)
-        thr_v=float(opt.thr_v)
-        verbose=opt.v
+        vf = open("./VgeneScores.txt")  ## Use tcrDist's Vgene 80-score calculation
+        VScore = {}
+        VV = opt.V
+        EE = opt.E
+        Mat = opt.Mat
+        ST = int(opt.ST)
+        thr_v = float(opt.thr_v)
+        verbose = opt.v
         if VV:
             while 1:
-                line=vf.readline()
-                if len(line)==0:
+                line = vf.readline()
+                if len(line) == 0:
                     break
-                ww=line.strip().split('\t')
-                VScore[(ww[0],ww[1])]=int(ww[2])/20
-                VScore[(ww[1],ww[0])]=int(ww[2])/20
-        Gap=int(opt.Gap)
-        Gapn=int(opt.GapN)
-        OutFile=opt.OutFile
-        GPU=opt.GPU
-        st=3
-        ed=1
-        NT=int(opt.NN)
+                ww = line.strip().split("\t")
+                VScore[(ww[0], ww[1])] = int(ww[2]) / 20
+                VScore[(ww[1], ww[0])] = int(ww[2]) / 20
+        Gap = int(opt.Gap)
+        Gapn = int(opt.GapN)
+        OutFile = opt.OutFile
+        GPU = opt.GPU
+        st = 3
+        ed = 1
+        NT = int(opt.NN)
         faiss.omp_set_num_threads(NT)
         for ff in files:
-            print("Processing %s" %ff)
-            EncodeRepertoire(ff, OutDir, OutFile, ST=ST, thr_s=thr_s, thr_v=thr_v, exact=EE,VDict=VScore, Vgene=VV, thr_iso=cutoff, gap=Gap, GPU=GPU, Mat=Mat, verbose=verbose)
+            print("Processing %s" % ff)
+            EncodeRepertoire(
+                ff,
+                OutDir,
+                OutFile,
+                ST=ST,
+                thr_s=thr_s,
+                thr_v=thr_v,
+                exact=EE,
+                VDict=VScore,
+                Vgene=VV,
+                thr_iso=cutoff,
+                gap=Gap,
+                GPU=GPU,
+                Mat=Mat,
+                verbose=verbose,
+            )
 if __name__ == "__main__":
-    t0=time.time()
+    t0 = time.time()
     main()
-    print ("Total time elapsed: %f" %(time.time()-t0))
-    print ("Maximum memory usage: %f MB" %(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1000000))
+    print("Total time elapsed: %f" % (time.time() - t0))
+    print(
+        "Maximum memory usage: %f MB"
+        % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000000)
+    )

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

biopipen 0.21.0py3-none-any.whl → 0.34.26py3-none-any.whl