biopipen 0.31.7__py3-none-any.whl → 0.32.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/ns/scrna.py +153 -0
- biopipen/reports/scrna/CellCellCommunicationPlots.svelte +14 -0
- biopipen/reports/scrna/SeuratMap2Ref.svelte +10 -6
- biopipen/reports/scrna/TopExpressingGenes.svelte +1 -1
- biopipen/scripts/scrna/AnnData2Seurat.R +22 -14
- biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
- biopipen/scripts/scrna/CellCellCommunication.py +101 -0
- biopipen/scripts/scrna/CellCellCommunicationPlots.R +191 -0
- biopipen/scripts/scrna/ScFGSEA.R +1 -1
- biopipen/scripts/scrna/Seurat2AnnData.R +2 -42
- biopipen/scripts/scrna/SeuratClusterStats-features.R +1 -1
- biopipen/scripts/scrna/SeuratMap2Ref.R +20 -1
- biopipen/scripts/scrna/SeuratPreparing-common.R +6 -6
- biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
- biopipen/scripts/tcr/GIANA/GIANA4.py +1364 -789
- biopipen/scripts/tcr/GIANA/query.py +164 -162
- biopipen/scripts/tcr/TCRClustering.R +25 -4
- biopipen/utils/single_cell.R +92 -1
- {biopipen-0.31.7.dist-info → biopipen-0.32.1.dist-info}/METADATA +2 -1
- {biopipen-0.31.7.dist-info → biopipen-0.32.1.dist-info}/RECORD +23 -19
- {biopipen-0.31.7.dist-info → biopipen-0.32.1.dist-info}/WHEEL +1 -1
- {biopipen-0.31.7.dist-info → biopipen-0.32.1.dist-info}/entry_points.txt +0 -0
|
@@ -24,7 +24,6 @@
|
|
|
24
24
|
import sys, os, re, resource
|
|
25
25
|
from os import path
|
|
26
26
|
import numpy as np
|
|
27
|
-
from Bio.SubsMat.MatrixInfo import blosum62
|
|
28
27
|
import time
|
|
29
28
|
from time import gmtime, strftime
|
|
30
29
|
from operator import itemgetter
|
|
@@ -36,255 +35,587 @@ from sklearn.decomposition import PCA
|
|
|
36
35
|
from sklearn.manifold import MDS
|
|
37
36
|
import faiss
|
|
38
37
|
from query import *
|
|
38
|
+
# Load BLOSUM62 as a plain {(residue1, residue2): int} dict.
# The legacy Bio.SubsMat.MatrixInfo module is tried first; when it cannot be
# imported, the matrix is loaded through Bio.Align.substitution_matrices and
# flattened into the same pair-keyed dict shape that the code below indexes.
# ImportError is caught (ModuleNotFoundError is a subclass of it) so that a
# present-but-incomplete Bio.SubsMat also takes the fallback path.
# The matrix is deliberately not printed: dumping it to stdout on every
# import was debugging output.
try:
    from Bio.SubsMat.MatrixInfo import blosum62
except ImportError:
    from Bio.Align import substitution_matrices

    _blosum62_matrix = substitution_matrices.load("BLOSUM62")
    blosum62 = {
        (ab1, ab2): int(_blosum62_matrix[(ab1, ab2)])
        for ab1 in _blosum62_matrix.alphabet
        for ab2 in _blosum62_matrix.alphabet
    }
|
|
39
50
|
|
|
40
|
-
AAstring=
|
|
41
|
-
AAstringList=list(AAstring)
|
|
42
|
-
cur_dir=os.path.dirname(os.path.realpath(__file__))+
|
|
51
|
+
# One-letter residue alphabet used throughout this file (20 letters).
AAstring = "ACDEFGHIKLMNPQRSTVWY"
AAstringList = list(AAstring)
# Directory that holds this script, always ending in "/".
cur_dir = os.path.dirname(os.path.realpath(__file__)) + "/"

# blosum62n: BLOSUM62 with every score capped at 4 and stored under both
# key orders, so lookups never have to try (a, b) and then (b, a).
blosum62n = {}
for kk in blosum62:
    a1, a2 = kk[0], kk[1]
    vv = min(blosum62[kk], 4)
    blosum62n[(a1, a2)] = vv
    # For a1 == a2 this rewrites the same key with the same value.
    blosum62n[(a2, a1)] = vv
|
|
65
|
+
|
|
66
|
+
bl62 = {
|
|
67
|
+
"A": [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],
|
|
68
|
+
"R": [-1, 4, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],
|
|
69
|
+
"N": [-2, 0, 4, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],
|
|
70
|
+
"D": [-2, -2, 1, 4, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],
|
|
71
|
+
"C": [0, -3, -3, -3, 4, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
|
|
72
|
+
"Q": [-1, 1, 0, 0, -3, 4, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],
|
|
73
|
+
"E": [-1, 0, 0, 2, -4, 2, 4, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],
|
|
74
|
+
"G": [0, -2, 0, -1, -3, -2, -2, 4, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],
|
|
75
|
+
"H": [-2, 0, 1, -1, -3, 0, 0, -2, 4, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],
|
|
76
|
+
"I": [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],
|
|
77
|
+
"L": [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],
|
|
78
|
+
"K": [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 4, -1, -3, -1, 0, -1, -3, -2, -2],
|
|
79
|
+
"M": [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 4, 0, -2, -1, -1, -1, -1, 1],
|
|
80
|
+
"F": [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 4, -4, -2, -2, 1, 3, -1],
|
|
81
|
+
"P": [
|
|
82
|
+
-1,
|
|
83
|
+
-2,
|
|
84
|
+
-2,
|
|
85
|
+
-1,
|
|
86
|
+
-3,
|
|
87
|
+
-1,
|
|
88
|
+
-1,
|
|
89
|
+
-2,
|
|
90
|
+
-2,
|
|
91
|
+
-3,
|
|
92
|
+
-3,
|
|
93
|
+
-1,
|
|
94
|
+
-2,
|
|
95
|
+
-4,
|
|
96
|
+
4,
|
|
97
|
+
-1,
|
|
98
|
+
-1,
|
|
99
|
+
-4,
|
|
100
|
+
-3,
|
|
101
|
+
-2,
|
|
102
|
+
],
|
|
103
|
+
"S": [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],
|
|
104
|
+
"T": [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 4, -2, -2, 0],
|
|
105
|
+
"W": [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 4, 2, -3],
|
|
106
|
+
"Y": [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 4, -1],
|
|
107
|
+
"V": [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
# Convert the bl62 score rows into dissimilarities (4 - score) and embed
# them in 13 dimensions with sklearn's MDS on the precomputed matrix.
bl62c = 4 - np.array([np.array(row) for row in bl62.values()])

embedding = MDS(
    n_components=13,
    n_init=100,
    max_iter=1000,
    eps=0.00001,
    dissimilarity="precomputed",
)
X = embedding.fit_transform(bl62c)

# Map each residue (in bl62 key order) to its embedding row, padded with
# 17 zeros.
# NOTE(review): X and bl62np are both assigned again further down this file
# (hard-coded matrix), so this MDS fit looks like dead work at import time —
# confirm before removing, since it also consumes random state.
bl62np = {}
vkk = list(bl62.keys())
for ii in range(20):
    kk = vkk[ii]
    bl62np[kk] = np.array(list(X[ii]) + [0] * 17)
|
|
87
123
|
|
|
88
|
-
|
|
89
|
-
AAencodingDict={}
|
|
124
|
+
|
|
125
|
+
# Indicator vector per residue: position ii of AAstringList is set to 1.
# NOTE(review): each vector has 21 entries (ii zeros, a one, 20 - ii zeros)
# for a 20-letter alphabet — confirm the extra slot is intended.
AAencodingDict = {}
for ii, aa in enumerate(AAstringList):
    CODE = [0] * ii + [1] + [0] * (20 - ii)
    AAencodingDict[aa] = np.array(CODE)
|
|
130
|
+
|
|
131
|
+
Ndim = 16  ## optimized for isometric embedding
n0 = Ndim * 6
ZERO = np.zeros((Ndim, Ndim))
II = np.eye(Ndim)
# M0 is a 3x3 block permutation matrix: it cycles three Ndim-sized blocks
# of a vector (block 3 -> 1, block 1 -> 2, block 2 -> 3).
M0 = np.block(
    [
        [ZERO, ZERO, II],
        [II, ZERO, ZERO],
        [ZERO, II, ZERO],
    ]
)
## Construct 6-th order cyclic group
# M6 places M0 on the anti-diagonal of a 2x2 block layout, giving an
# (n0 x n0) matrix whose 6th power is the identity.
ZERO45 = np.zeros((Ndim * 3, Ndim * 3))
M6 = np.block([[ZERO45, M0], [M0, ZERO45]])
|
|
148
|
+
|
|
149
|
+
X = np.array(
|
|
150
|
+
[
|
|
151
|
+
[
|
|
152
|
+
-0.31230882,
|
|
153
|
+
-0.53572156,
|
|
154
|
+
-0.01949946,
|
|
155
|
+
-0.12211268,
|
|
156
|
+
-0.70947917,
|
|
157
|
+
-0.42211092,
|
|
158
|
+
0.02783931,
|
|
159
|
+
0.02637933,
|
|
160
|
+
-0.41760305,
|
|
161
|
+
0.21809875,
|
|
162
|
+
0.53532768,
|
|
163
|
+
0.04833016,
|
|
164
|
+
0.07877711,
|
|
165
|
+
0.50464914,
|
|
166
|
+
-0.26972087,
|
|
167
|
+
-0.52416842,
|
|
168
|
+
],
|
|
169
|
+
[
|
|
170
|
+
0.29672002,
|
|
171
|
+
0.29005364,
|
|
172
|
+
0.18176298,
|
|
173
|
+
-0.05103382,
|
|
174
|
+
-0.34686519,
|
|
175
|
+
0.58024228,
|
|
176
|
+
-0.49282931,
|
|
177
|
+
0.62304281,
|
|
178
|
+
-0.09575202,
|
|
179
|
+
0.30115555,
|
|
180
|
+
0.09913529,
|
|
181
|
+
0.1577466,
|
|
182
|
+
-0.94391939,
|
|
183
|
+
-0.10505925,
|
|
184
|
+
0.05482389,
|
|
185
|
+
0.38409897,
|
|
186
|
+
],
|
|
187
|
+
[
|
|
188
|
+
-0.42212537,
|
|
189
|
+
0.12225749,
|
|
190
|
+
0.16279646,
|
|
191
|
+
0.60099009,
|
|
192
|
+
0.19734216,
|
|
193
|
+
0.42819919,
|
|
194
|
+
-0.33562418,
|
|
195
|
+
0.17036334,
|
|
196
|
+
0.4234109,
|
|
197
|
+
0.46681561,
|
|
198
|
+
-0.50347222,
|
|
199
|
+
-0.37936876,
|
|
200
|
+
0.1494825,
|
|
201
|
+
0.32176759,
|
|
202
|
+
0.28584684,
|
|
203
|
+
0.68469861,
|
|
204
|
+
],
|
|
205
|
+
[
|
|
206
|
+
0.18599294,
|
|
207
|
+
-0.44017825,
|
|
208
|
+
-0.4476952,
|
|
209
|
+
0.34340976,
|
|
210
|
+
0.44603553,
|
|
211
|
+
0.40974629,
|
|
212
|
+
-0.60045935,
|
|
213
|
+
-0.09056728,
|
|
214
|
+
0.22147919,
|
|
215
|
+
-0.33029418,
|
|
216
|
+
0.55635594,
|
|
217
|
+
-0.54149972,
|
|
218
|
+
0.05459062,
|
|
219
|
+
0.57334159,
|
|
220
|
+
-0.06227118,
|
|
221
|
+
0.65299872,
|
|
222
|
+
],
|
|
223
|
+
[
|
|
224
|
+
-0.19010428,
|
|
225
|
+
0.64418792,
|
|
226
|
+
-0.85286762,
|
|
227
|
+
0.21380295,
|
|
228
|
+
0.37639516,
|
|
229
|
+
-0.67753593,
|
|
230
|
+
0.38751609,
|
|
231
|
+
0.55746524,
|
|
232
|
+
0.01443766,
|
|
233
|
+
0.1776535,
|
|
234
|
+
0.62853954,
|
|
235
|
+
-0.15048523,
|
|
236
|
+
0.55100206,
|
|
237
|
+
-0.21426656,
|
|
238
|
+
0.3644061,
|
|
239
|
+
-0.0018255,
|
|
240
|
+
],
|
|
241
|
+
[
|
|
242
|
+
0.7350723,
|
|
243
|
+
0.10111267,
|
|
244
|
+
0.55640019,
|
|
245
|
+
-0.18226966,
|
|
246
|
+
0.51658102,
|
|
247
|
+
-0.19321508,
|
|
248
|
+
-0.46599027,
|
|
249
|
+
-0.02989911,
|
|
250
|
+
0.4036196,
|
|
251
|
+
-0.11978213,
|
|
252
|
+
-0.29837524,
|
|
253
|
+
-0.30232765,
|
|
254
|
+
-0.36738065,
|
|
255
|
+
-0.1379793,
|
|
256
|
+
0.04362871,
|
|
257
|
+
0.33553714,
|
|
258
|
+
],
|
|
259
|
+
[
|
|
260
|
+
0.41134047,
|
|
261
|
+
0.13512443,
|
|
262
|
+
0.62492322,
|
|
263
|
+
-0.10120261,
|
|
264
|
+
-0.03093491,
|
|
265
|
+
0.23751917,
|
|
266
|
+
-0.68338694,
|
|
267
|
+
0.05124762,
|
|
268
|
+
0.41533821,
|
|
269
|
+
0.46669353,
|
|
270
|
+
0.31467277,
|
|
271
|
+
-0.02427587,
|
|
272
|
+
0.15361135,
|
|
273
|
+
0.70595112,
|
|
274
|
+
-0.27952632,
|
|
275
|
+
0.32408931,
|
|
276
|
+
],
|
|
277
|
+
[
|
|
278
|
+
-0.33041265,
|
|
279
|
+
-0.43860065,
|
|
280
|
+
-0.5509376,
|
|
281
|
+
-0.04380843,
|
|
282
|
+
-0.35160935,
|
|
283
|
+
0.25134855,
|
|
284
|
+
0.53409314,
|
|
285
|
+
0.54850824,
|
|
286
|
+
0.59490287,
|
|
287
|
+
0.32669345,
|
|
288
|
+
-0.45355268,
|
|
289
|
+
-0.56317041,
|
|
290
|
+
-0.55416297,
|
|
291
|
+
0.18117841,
|
|
292
|
+
-0.71600849,
|
|
293
|
+
-0.08989825,
|
|
294
|
+
],
|
|
295
|
+
[
|
|
296
|
+
-0.40366849,
|
|
297
|
+
0.10978974,
|
|
298
|
+
0.0280101,
|
|
299
|
+
-0.46667987,
|
|
300
|
+
-0.45607028,
|
|
301
|
+
0.54114052,
|
|
302
|
+
-0.77552923,
|
|
303
|
+
-0.10720425,
|
|
304
|
+
0.55252091,
|
|
305
|
+
-0.34397153,
|
|
306
|
+
-0.59813694,
|
|
307
|
+
0.15567728,
|
|
308
|
+
0.03071009,
|
|
309
|
+
-0.02176143,
|
|
310
|
+
0.34442719,
|
|
311
|
+
0.14681541,
|
|
312
|
+
],
|
|
313
|
+
[
|
|
314
|
+
0.19280422,
|
|
315
|
+
0.35777863,
|
|
316
|
+
0.06139255,
|
|
317
|
+
0.20081699,
|
|
318
|
+
-0.30546596,
|
|
319
|
+
-0.56901549,
|
|
320
|
+
-0.15290953,
|
|
321
|
+
-0.31181573,
|
|
322
|
+
-0.74523217,
|
|
323
|
+
0.22296016,
|
|
324
|
+
-0.39143832,
|
|
325
|
+
-0.16474685,
|
|
326
|
+
0.58064427,
|
|
327
|
+
-0.77386654,
|
|
328
|
+
0.19713107,
|
|
329
|
+
-0.49477418,
|
|
330
|
+
],
|
|
331
|
+
[
|
|
332
|
+
-0.16133903,
|
|
333
|
+
0.22112761,
|
|
334
|
+
-0.53162136,
|
|
335
|
+
0.34764073,
|
|
336
|
+
-0.08522381,
|
|
337
|
+
-0.2510216,
|
|
338
|
+
0.04699411,
|
|
339
|
+
-0.25702389,
|
|
340
|
+
-0.8739765,
|
|
341
|
+
-0.24171728,
|
|
342
|
+
-0.24370533,
|
|
343
|
+
0.42193635,
|
|
344
|
+
0.41056913,
|
|
345
|
+
-0.60378211,
|
|
346
|
+
-0.65756832,
|
|
347
|
+
0.0845203,
|
|
348
|
+
],
|
|
349
|
+
[
|
|
350
|
+
-0.34792144,
|
|
351
|
+
0.18450939,
|
|
352
|
+
0.77038332,
|
|
353
|
+
0.63868511,
|
|
354
|
+
-0.06221681,
|
|
355
|
+
0.11930421,
|
|
356
|
+
0.04895523,
|
|
357
|
+
-0.22463059,
|
|
358
|
+
-0.03268844,
|
|
359
|
+
-0.58941354,
|
|
360
|
+
0.11640045,
|
|
361
|
+
0.32384901,
|
|
362
|
+
-0.42952779,
|
|
363
|
+
0.58119471,
|
|
364
|
+
0.07288662,
|
|
365
|
+
0.26669673,
|
|
366
|
+
],
|
|
367
|
+
[
|
|
368
|
+
0.01834555,
|
|
369
|
+
-0.16367754,
|
|
370
|
+
0.34900298,
|
|
371
|
+
0.45087949,
|
|
372
|
+
0.47073855,
|
|
373
|
+
-0.37377404,
|
|
374
|
+
0.0606911,
|
|
375
|
+
0.2455703,
|
|
376
|
+
-0.55182937,
|
|
377
|
+
-0.20261009,
|
|
378
|
+
0.28325423,
|
|
379
|
+
-0.04741146,
|
|
380
|
+
0.30565238,
|
|
381
|
+
-0.62090653,
|
|
382
|
+
0.17528413,
|
|
383
|
+
-0.60434975,
|
|
384
|
+
],
|
|
385
|
+
[
|
|
386
|
+
-0.55464981,
|
|
387
|
+
0.50918784,
|
|
388
|
+
-0.21371646,
|
|
389
|
+
-0.63996967,
|
|
390
|
+
-0.37656862,
|
|
391
|
+
0.27852662,
|
|
392
|
+
0.3287838,
|
|
393
|
+
-0.56800869,
|
|
394
|
+
0.23260763,
|
|
395
|
+
-0.20653106,
|
|
396
|
+
0.63261439,
|
|
397
|
+
-0.22666691,
|
|
398
|
+
0.00726302,
|
|
399
|
+
-0.60125196,
|
|
400
|
+
0.07139961,
|
|
401
|
+
-0.35086639,
|
|
402
|
+
],
|
|
403
|
+
[
|
|
404
|
+
0.94039731,
|
|
405
|
+
-0.25999326,
|
|
406
|
+
0.43922549,
|
|
407
|
+
-0.485738,
|
|
408
|
+
-0.20492235,
|
|
409
|
+
-0.26005626,
|
|
410
|
+
0.68776626,
|
|
411
|
+
0.57826888,
|
|
412
|
+
-0.05973995,
|
|
413
|
+
-0.1193658,
|
|
414
|
+
-0.12102433,
|
|
415
|
+
-0.22091354,
|
|
416
|
+
0.43427913,
|
|
417
|
+
0.71447886,
|
|
418
|
+
0.32745991,
|
|
419
|
+
0.03466398,
|
|
420
|
+
],
|
|
421
|
+
[
|
|
422
|
+
-0.13194625,
|
|
423
|
+
-0.12262688,
|
|
424
|
+
0.18029209,
|
|
425
|
+
0.16555524,
|
|
426
|
+
0.39594125,
|
|
427
|
+
-0.58110665,
|
|
428
|
+
0.16161717,
|
|
429
|
+
0.0839783,
|
|
430
|
+
0.0911945,
|
|
431
|
+
0.34546976,
|
|
432
|
+
-0.29415349,
|
|
433
|
+
0.29891936,
|
|
434
|
+
-0.60834721,
|
|
435
|
+
0.5943593,
|
|
436
|
+
-0.29473819,
|
|
437
|
+
0.4864154,
|
|
438
|
+
],
|
|
439
|
+
[
|
|
440
|
+
0.40850093,
|
|
441
|
+
-0.4638894,
|
|
442
|
+
-0.39732987,
|
|
443
|
+
-0.01972861,
|
|
444
|
+
0.51189582,
|
|
445
|
+
0.10176704,
|
|
446
|
+
0.37528519,
|
|
447
|
+
-0.41479418,
|
|
448
|
+
-0.1932531,
|
|
449
|
+
0.54732221,
|
|
450
|
+
-0.11876511,
|
|
451
|
+
0.32843973,
|
|
452
|
+
-0.259283,
|
|
453
|
+
0.59500132,
|
|
454
|
+
0.35168375,
|
|
455
|
+
-0.21733727,
|
|
456
|
+
],
|
|
457
|
+
[
|
|
458
|
+
-0.50627723,
|
|
459
|
+
-0.1973602,
|
|
460
|
+
-0.02339884,
|
|
461
|
+
-0.66846048,
|
|
462
|
+
0.62696606,
|
|
463
|
+
0.60049717,
|
|
464
|
+
0.69143364,
|
|
465
|
+
-0.48053591,
|
|
466
|
+
0.17812208,
|
|
467
|
+
-0.58481821,
|
|
468
|
+
-0.23551415,
|
|
469
|
+
-0.06229112,
|
|
470
|
+
0.20993116,
|
|
471
|
+
-0.72485884,
|
|
472
|
+
0.34375662,
|
|
473
|
+
-0.23539168,
|
|
474
|
+
],
|
|
475
|
+
[
|
|
476
|
+
-0.51388312,
|
|
477
|
+
-0.2788953,
|
|
478
|
+
0.00859533,
|
|
479
|
+
-0.5247195,
|
|
480
|
+
-0.18021544,
|
|
481
|
+
0.28372911,
|
|
482
|
+
0.10791359,
|
|
483
|
+
0.13033494,
|
|
484
|
+
0.34294013,
|
|
485
|
+
-0.70310089,
|
|
486
|
+
-0.13245433,
|
|
487
|
+
0.48661081,
|
|
488
|
+
0.08451644,
|
|
489
|
+
-0.69990992,
|
|
490
|
+
0.0408274,
|
|
491
|
+
-0.47204888,
|
|
492
|
+
],
|
|
493
|
+
[
|
|
494
|
+
0.68546275,
|
|
495
|
+
0.22581365,
|
|
496
|
+
-0.32571833,
|
|
497
|
+
0.34394298,
|
|
498
|
+
-0.43232367,
|
|
499
|
+
-0.5041842,
|
|
500
|
+
0.04784017,
|
|
501
|
+
-0.53067936,
|
|
502
|
+
-0.50049908,
|
|
503
|
+
0.36874221,
|
|
504
|
+
0.22429186,
|
|
505
|
+
0.4616482,
|
|
506
|
+
0.11159174,
|
|
507
|
+
-0.26827959,
|
|
508
|
+
-0.39372848,
|
|
509
|
+
-0.40987423,
|
|
510
|
+
],
|
|
511
|
+
]
|
|
512
|
+
)
|
|
513
|
+
|
|
514
|
+
# Per-residue vectors used by EncodingCDR3: row ii of X (in bl62 key
# order) followed by Ndim * 5 zeros.
bl62np = {}
vkk = list(bl62.keys())
for ii in range(20):
    kk = vkk[ii]
    zero_tail = [0] * Ndim * 5
    bl62np[kk] = np.array(list(X[ii]) + zero_tail)
|
|
519
|
+
|
|
191
520
|
|
|
192
521
|
def EncodingCDR3(s, M, n0):
    """Encode the sequence *s* as a vector by repeated multiplication with *M*.

    Starting from a zero vector of length *n0*, each residue's ``bl62np``
    vector is added to the running vector and the sum is multiplied by
    *M*; the vector left after the last residue is returned.  An empty *s*
    returns the initial zero vector.

    Raises KeyError when a character of *s* has no entry in ``bl62np``.
    """
    vec = np.array([0] * n0)
    for residue in s:
        vec = np.dot(M, vec + bl62np[residue])
    return vec
|
|
198
527
|
|
|
528
|
+
|
|
199
529
|
def BuildLengthDict(seqs, sIDs, vGene=[], INFO=[]):
    """Bucket sequences, IDs, V genes and info entries by sequence length.

    Sequences containing any character that is not a key of
    ``AAencodingDict`` are skipped and counted; a single warning with the
    count is printed at the end.  Sequences whose length is outside 10..24
    are skipped silently.

    *vGene* and *INFO* are optional parallel lists; when one is empty its
    result dict stays empty.  (The list defaults are only read, never
    mutated.)

    Returns ``(LengthD, VgeneD, InfoD, SeqD)``, each keyed by length, with
    entries in input order.
    """
    allowed_lengths = range(10, 25)
    LengthD, SeqD, VgeneD, InfoD = {}, {}, {}, {}
    AAs = set(AAencodingDict)
    with_vgene = len(vGene) > 0
    with_info = len(INFO) > 0
    cNAs = 0
    for ii, ss in enumerate(seqs):
        ID = sIDs[ii]
        if not set(ss) <= AAs:
            ## CDR3 containing non amino acid letter
            cNAs += 1
            continue
        if with_vgene:
            vv = vGene[ii]
        if with_info:
            info = INFO[ii]
        L = len(ss)
        if L not in allowed_lengths:
            continue
        LengthD.setdefault(L, []).append(ID)
        SeqD.setdefault(L, []).append(ss)
        if with_vgene:
            VgeneD.setdefault(L, []).append(vv)
        if with_info:
            InfoD.setdefault(L, []).append(info)
    if cNAs > 0:
        print("Warning: Skipped %d sequences with non AA letter!" % (cNAs))
    return LengthD, VgeneD, InfoD, SeqD
|
|
242
572
|
|
|
573
|
+
|
|
243
574
|
def CollapseUnique(LD, VD, ID, SD):
    """Collapse duplicate (sequence, V gene) records within each group.

    Parameters
    ----------
    LD, SD, ID : dict
        Parallel dicts sharing the keys of *LD*; ``SD[k]`` holds the
        sequences and ``ID[k]`` the per-record info entries.  *ID* and
        *SD* must contain every key of *LD* (KeyError otherwise).
    VD : dict
        V genes per key, or an empty dict.  When empty, every record is
        given the placeholder V gene ``"TRBV2-1*01"`` so that collapsing
        depends on the sequence alone.

    Returns
    -------
    (LDu, VDu, IDu, SDu)
        For each key: ``SDu[k]`` are the unique sequences in sorted
        (sequence, V gene) order, ``LDu[k]`` is ``0..len(SDu[k]) - 1``,
        ``IDu[k]`` is a list of lists holding the info entries merged into
        each unique record, and ``VDu[k]`` (only when *VD* is non-empty)
        the matching V genes.

    A key whose group is empty yields empty lists instead of raising
    IndexError.
    """
    LDu, VDu, IDu, SDu = {}, {}, {}, {}
    has_vgene = len(VD) > 0
    for kk in LD:
        vvL = list(LD[kk])
        vvV = list(VD[kk]) if has_vgene else ["TRBV2-1*01"] * len(vvL)
        # sorted() is stable, so records with equal (sequence, V gene) keep
        # their input order inside the merged info list.
        rows = sorted(
            zip(vvL, list(ID[kk]), vvV, list(SD[kk])),
            key=lambda row: (row[3], row[2]),
        )
        uS, uV, uI = [], [], []
        for _, info, vgene, seq in rows:
            if uS and seq == uS[-1] and vgene == uV[-1]:
                # Same record as the previous row: merge its info entry.
                uI[-1].append(info)
            else:
                uS.append(seq)
                uV.append(vgene)
                uI.append([info])
        LDu[kk] = list(range(len(uS)))
        SDu[kk] = uS
        if has_vgene:
            VDu[kk] = uV
        IDu[kk] = uI
    return LDu, VDu, IDu, SDu
|
|
289
620
|
|
|
290
621
|
|
|
@@ -296,14 +627,15 @@ class CDR3:
|
|
|
296
627
|
## KS: Kmer size
|
|
297
628
|
## st: the first 0:(st-1) amino acids will not be included in K-merization
|
|
298
629
|
## ed: the last L-ed amino acids will be skipped
|
|
299
|
-
self.s=s
|
|
300
|
-
self.ID=sID
|
|
301
|
-
L=len(s)
|
|
302
|
-
self.L=L
|
|
303
|
-
sub_s=s[st: (L-ed)]
|
|
304
|
-
Ls=len(sub_s)
|
|
305
|
-
Kmer=[sub_s[x:(x+KS)] for x in range(0,Ls-KS+1)]
|
|
306
|
-
self.Kmer=Kmer
|
|
630
|
+
self.s = s
|
|
631
|
+
self.ID = sID
|
|
632
|
+
L = len(s)
|
|
633
|
+
self.L = L
|
|
634
|
+
sub_s = s[st : (L - ed)]
|
|
635
|
+
Ls = len(sub_s)
|
|
636
|
+
Kmer = [sub_s[x : (x + KS)] for x in range(0, Ls - KS + 1)]
|
|
637
|
+
self.Kmer = Kmer
|
|
638
|
+
|
|
307
639
|
|
|
308
640
|
class KmerSet:
|
|
309
641
|
## Kmer set for fast read searching based on mismatch-allowed Kmer index
|
|
@@ -312,263 +644,277 @@ class KmerSet:
|
|
|
312
644
|
## Seqs and sIDs must have the same length
|
|
313
645
|
if len(Seqs) != len(sIDs):
|
|
314
646
|
raise "Sequence and ID lists have different length. Please check input."
|
|
315
|
-
KmerDict={}
|
|
316
|
-
N=len(Seqs)
|
|
317
|
-
self.N=N
|
|
318
|
-
CDR3Dict={}
|
|
319
|
-
LLs=[]
|
|
320
|
-
for ii in range(0,N):
|
|
321
|
-
s=Seqs[ii]
|
|
322
|
-
sID=sIDs[ii]
|
|
323
|
-
cc=CDR3(s,sID,KS,st,ed)
|
|
324
|
-
CDR3Dict[cc.ID]=cc.Kmer
|
|
325
|
-
KK=cc.Kmer
|
|
647
|
+
KmerDict = {}
|
|
648
|
+
N = len(Seqs)
|
|
649
|
+
self.N = N
|
|
650
|
+
CDR3Dict = {}
|
|
651
|
+
LLs = []
|
|
652
|
+
for ii in range(0, N):
|
|
653
|
+
s = Seqs[ii]
|
|
654
|
+
sID = sIDs[ii]
|
|
655
|
+
cc = CDR3(s, sID, KS, st, ed)
|
|
656
|
+
CDR3Dict[cc.ID] = cc.Kmer
|
|
657
|
+
KK = cc.Kmer
|
|
326
658
|
LLs.append(cc.L)
|
|
327
659
|
for kk in KK:
|
|
328
660
|
if kk not in KmerDict:
|
|
329
|
-
KmerDict[kk]=[sID]
|
|
661
|
+
KmerDict[kk] = [sID]
|
|
330
662
|
else:
|
|
331
663
|
KmerDict[kk].append(sID)
|
|
332
|
-
self.KD=KmerDict
|
|
333
|
-
self.KS=KS
|
|
334
|
-
self.CD=CDR3Dict
|
|
335
|
-
self.LL=LLs
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
664
|
+
self.KD = KmerDict
|
|
665
|
+
self.KS = KS
|
|
666
|
+
self.CD = CDR3Dict
|
|
667
|
+
self.LL = LLs
|
|
668
|
+
|
|
669
|
+
def FindKmerNeighbor(self, kk):
    """Return all single-substitution variants of the k-mer *kk*.

    KmerSet method.  For every position below ``self.KS`` the residue is
    replaced by each letter of ``AAstringList`` in turn, giving
    ``KS * 20`` strings ordered by position, then by letter.  Substituting
    a residue with itself is not filtered out, so *kk* is among the
    results whenever its own residues are in ``AAstringList``.
    """
    KS = self.KS
    return [
        kk[:pos] + letter + kk[pos + 1 : KS]
        for pos in range(KS)
        for letter in AAstringList
    ]
|
|
345
|
-
|
|
678
|
+
|
|
679
|
+
def FindKmerNeighbor2(self, kk):
    """Return all double-substitution variants of the k-mer *kk*.

    KmerSet method.  For every position pair ``jj < ii`` below
    ``self.KS`` both residues are replaced by every combination of two
    letters from ``AAstringList`` (400 strings per pair, first letter
    varying slowest).
    """
    ## KS>=6, allowing 2 mismatches. CDR3 length must be >= 10
    KS = self.KS
    variants = []
    for jj in range(KS):
        for ii in range(jj + 1, KS):
            head = kk[:jj]
            middle = kk[jj + 1 : ii]
            tail = kk[ii + 1 : KS]
            variants += [
                head + first + middle + second + tail
                for first in AAstringList
                for second in AAstringList
            ]
    return variants
|
|
698
|
+
|
|
364
699
|
def KmerIndex(self):
    """Map every indexed k-mer to the indexed k-mers one substitution away.

    KmerSet method.  For each key of ``self.KD`` the candidates produced
    by ``self.FindKmerNeighbor`` are intersected with the set of keys of
    ``self.KD``.  The neighbour lists come from a set, so their order is
    unspecified.

    Returns a dict ``{kmer: [neighbouring kmers present in self.KD]}``.
    """
    ## For each K-mer, find its nearest neighbor with 1 character mismatch
    KKs_set = set(self.KD)
    KI_Dict = {}
    for kk in self.KD:
        KS_n = set(self.FindKmerNeighbor(kk))
        KI_Dict[kk] = list(KS_n & KKs_set)
    return KI_Dict
|
|
719
|
+
|
|
384
720
|
def updateKD(self, KI):
    """Merge each k-mer's ID list with those of its neighbouring k-mers.

    KmerSet method.  *KI* maps every key of ``self.KD`` to a list of
    k-mers (as returned by ``KmerIndex``).  For each k-mer the ID lists of
    all listed neighbours are concatenated in neighbour order; duplicates
    are kept.

    Every value is a fresh list (never an alias of a list stored in
    ``self.KD``), and an empty neighbour list yields an empty ID list
    instead of raising TypeError.

    Raises KeyError when *KI* lacks a key of ``self.KD`` or names a k-mer
    that is not in ``self.KD``.
    """
    ## group sequences sharing motifs with 1-2 mismatches
    KD = self.KD
    return {
        kk: list(chain.from_iterable(KD[neighbor] for neighbor in KI[kk]))
        for kk in KD
    }
|
|
395
731
|
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
732
|
+
|
|
733
|
+
def GenerateMotifGraph(mD, seqs, seqID):
    """Build an undirected graph linking same-length sequences that share a motif.

    Parameters
    ----------
    mD : dict
        Maps each motif to a list of sequence ids; every id is used as an
        index/key into *seqs*.
    seqs : sequence or mapping
        ``seqs[id]`` is the sequence for an id; only its length is used.
    seqID :
        Unused; kept so existing callers keep working.

    Returns
    -------
    dict
        ``{id: [neighbour ids]}``.  Two ids are linked when they appear
        under the same motif and their sequences have equal length.
        Neighbours are listed once each, in first-seen order.
    """
    # Neighbours are collected in insertion-ordered dicts so the duplicate
    # check is O(1) instead of a list scan per pair.
    adjacency = {}
    for members in mD.values():
        lengths = [len(seqs[v]) for v in members]
        for ii, (id_1, L1) in enumerate(zip(members, lengths)):
            for id_2, L2 in zip(members[ii + 1 :], lengths[ii + 1 :]):
                if L2 != L1:
                    continue
                adjacency.setdefault(id_1, {})[id_2] = None
                adjacency.setdefault(id_2, {})[id_1] = None
    return {node: list(neighbors) for node, neighbors in adjacency.items()}
|
|
428
765
|
|
|
766
|
+
|
|
429
767
|
def generateSSG(Kset, CDR3s, k_thr=2):
    """Build the sequence-sharing graph from a k-mer index.

    Parameters
    ----------
    Kset :
        Object providing ``KmerIndex()``, ``updateKD(KI)``, ``CD``
        (sequence id -> list of its k-mers) and ``LL`` (sequence lengths,
        indexed by sequence id).
    CDR3s :
        ``CDR3s[id]`` is the sequence for an id; only its length is used.
    k_thr : int
        Minimum number of hits an id needs across the merged ID lists of a
        sequence's k-mers to count as a neighbour.

    Returns
    -------
    dict
        ``{id: [neighbour ids]}`` containing only ids that have at least
        one neighbour of the same length other than themselves.

    The sequence's own id is filtered out rather than removed with
    ``list.remove``, so a sequence whose own hit count is below *k_thr* no
    longer raises ValueError, and a sequence with no k-mers is skipped
    instead of raising TypeError.
    """
    KI = Kset.KmerIndex()
    KDnew = Kset.updateKD(KI)
    CD = Kset.CD
    LL = np.array(Kset.LL)
    SSG = {}
    for kk in CD:
        # Count how often each sequence id occurs over all k-mers of kk.
        hits = Counter(chain.from_iterable(KDnew[kmer] for kmer in CD[kk]))
        vv1 = np.array([sid for sid, count in hits.items() if count >= k_thr])
        if len(vv1) == 0:
            continue
        L0 = len(CDR3s[kk])
        # Keep only candidates whose sequence length equals that of kk.
        idx = np.where(LL[vv1] == L0)[0]
        if len(idx) == 0:
            continue
        vvs = [sid for sid in vv1[idx] if sid != kk]
        if len(vvs) > 0:
            SSG[kk] = vvs
    return SSG
|
|
458
796
|
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
797
|
+
|
|
798
|
+
def SeqComparison(s1, s2, gap=-6):
    """Score two position-aligned sequences.

    Walks over the positions of ``s1`` and reads the same position of ``s2``
    (so ``s2`` must be at least as long as ``s1``). A position where either
    side is ".", "-" or "*" adds ``gap`` when the two characters differ and
    nothing when they are equal; every other position adds the value stored
    in the module-level ``blosum62n`` table for that pair.
    """
    gap_symbols = (".", "-", "*")
    total = 0
    for pos in range(len(s1)):
        left = s1[pos]
        right = s2[pos]
        if left in gap_symbols or right in gap_symbols:
            if left != right:
                total += gap
        else:
            # covers both the identical-residue and the substitution case
            total += blosum62n[(left, right)]
    return total
|
|
481
820
|
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
Seq1
|
|
488
|
-
Seq2
|
|
489
|
-
|
|
821
|
+
|
|
822
|
+
def NHLocalAlignment(Seq1, Seq2, gap_thr=1, gap=-6):
    """Return the best gapped comparison score of two sequences.

    The shorter sequence is padded with gap characters (via ``InsertGap``) in
    every possible way and each padded candidate is scored against the longer
    one with ``SeqComparison``; the maximum score is returned.

    Returns -1 when the length difference exceeds ``gap_thr``.
    """
    # Order the pair so that `longer` is never shorter than `shorter`.
    if len(Seq1) < len(Seq2):
        longer, shorter = Seq2, Seq1
    else:
        longer, shorter = Seq1, Seq2
    length_gap = len(longer) - len(shorter)
    if length_gap > gap_thr:
        return -1
    candidate_scores = [
        SeqComparison(longer, padded, gap)
        for padded in InsertGap(shorter, length_gap)
    ]
    return max(candidate_scores)
|
|
503
843
|
|
|
504
|
-
|
|
844
|
+
|
|
845
|
+
def InsertGap(Seq, n):
    """Return every way of inserting ``n`` gap characters ("-") into ``Seq``.

    Gaps are inserted one at a time at every position of the current string,
    so results for n >= 2 contain duplicates (kept for compatibility with the
    previous n == 2 output). The list has (L+1)*(L+2)*...*(L+n) entries for a
    sequence of length L.

    Args:
        Seq: sequence string.
        n: integer number of gaps to insert. Previously only n <= 2 was
            supported and larger values silently produced an empty list.

    Returns:
        ``[Seq]`` for n == 0, an empty list for negative n, otherwise the
        list of gapped strings, each of length ``len(Seq) + n``.
    """
    if n == 0:
        return [Seq]
    if n < 0:
        return []
    SeqList = [Seq]
    for _ in range(n):
        # Expand each candidate by one more gap at every possible position.
        SeqList = [
            cur[0:kk] + "-" + cur[kk:]
            for cur in SeqList
            for kk in range(0, len(cur) + 1)
        ]
    return SeqList
|
|
521
862
|
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
863
|
+
|
|
864
|
+
def falign(s1, s2, V1, V2, st, VScore={}, UseV=True, gapn=1, gap=-6):
    """Combine a CDR3 alignment score with a V-gene similarity term.

    The sequences are trimmed to ``s[st:-2]`` and aligned with
    ``NHLocalAlignment``; the alignment score is divided by the longer
    trimmed length and the V-gene term is added.

    V-gene term: 4.0 when ``UseV`` is false, 4 when both genes are equal,
    otherwise ``VScore[(V1, V2)] / 20.0`` (looked up in either order).
    Returns 0 when the gene pair is missing from ``VScore``.

    ``VScore`` is only read here, never modified.
    """
    core1 = s1[st:-2]
    core2 = s2[st:-2]
    if not UseV:
        v_term = 4.0
    elif V2 == V1:
        v_term = 4
    else:
        pair = (V1, V2)
        if pair not in VScore:
            pair = (V2, V1)
        if pair not in VScore:
            # V gene pair not found in the score table
            return 0
        v_term = VScore[pair] / 20.0
    raw = NHLocalAlignment(core1, core2, gapn, gap)
    return raw / float(max(len(core1), len(core2))) + v_term
|
|
542
884
|
|
|
885
|
+
|
|
543
886
|
def UpdateSSG(SSG, seqs, Vgenes, Vscore={}, UseV=True, gap=-6, gapn=1, cutoff=7.5):
    """Keep only the graph edges whose alignment score reaches ``cutoff``.

    Every edge ``kk -> vv`` of ``SSG`` is scored with ``falign`` (sequences
    trimmed from position 3) and copied to the result when the score is
    greater than or equal to ``cutoff``.

    Args:
        SSG: dict mapping a sequence id to an iterable of neighbour ids.
        seqs: sequences indexable by those ids.
        Vgenes: V gene names indexable by those ids.
        Vscore: V-gene pair score table forwarded to ``falign`` (read only).
        UseV: forwarded to ``falign``.
        gap: gap penalty forwarded to ``falign``.
        gapn: maximum number of gaps forwarded to ``falign``.
        cutoff: minimum score for an edge to be kept.

    Returns:
        dict with the same layout as ``SSG`` containing the retained edges;
        ids without any retained edge are absent.
    """
    SSGnew = {}
    for kk in SSG:
        s1 = seqs[kk]
        V1 = Vgenes[kk]
        for vv in SSG[kk]:
            s2 = seqs[vv]
            V2 = Vgenes[vv]
            # gap/gapn used to be hard-coded to -6/1 here, which silently
            # ignored the caller's arguments; the defaults are unchanged.
            score = falign(
                s1, s2, V1, V2, st=3, VScore=Vscore, UseV=UseV, gap=gap, gapn=gapn
            )
            if score >= cutoff:
                SSGnew.setdefault(kk, []).append(vv)
    return SSGnew
|
|
567
912
|
|
|
913
|
+
|
|
568
914
|
def dfs(graph, start):
|
|
569
|
-
|
|
915
|
+
"""
|
|
570
916
|
Non-resursive depth first search
|
|
571
|
-
|
|
917
|
+
"""
|
|
572
918
|
visited = set()
|
|
573
919
|
stack = [start]
|
|
574
920
|
while stack:
|
|
@@ -576,443 +922,503 @@ def dfs(graph, start):
|
|
|
576
922
|
if vertex not in visited:
|
|
577
923
|
visited.add(vertex)
|
|
578
924
|
stack.extend(set(graph[vertex]) - visited)
|
|
579
|
-
|
|
925
|
+
|
|
580
926
|
return visited
|
|
581
927
|
|
|
928
|
+
|
|
582
929
|
def IdentifyMotifCluster(SSG):
    """Split a graph into clusters of mutually reachable nodes.

    ``SSG`` is a dictionary representation of a sparse adjacency matrix.
    Nodes not yet assigned are taken one at a time, expanded with ``dfs`` and
    the visited set is recorded as one cluster. A progress line is printed
    after every 200 clusters.

    Returns a list of clusters, each a list of node ids.
    """
    nodes = set(SSG.keys())
    assigned = set()
    clusters = []
    solved = 0
    while True:
        pending = nodes ^ assigned
        if not pending:
            break
        # take the first pending node as the seed of the next cluster
        seed = next(iter(pending))
        component = dfs(SSG, seed)
        assigned = assigned | component
        clusters.append(list(component))
        solved += 1
        if solved % 200 == 0:
            print(" Solved %d clusters" % (solved))
    return clusters
|
|
604
951
|
|
|
952
|
+
|
|
605
953
|
def IdentifyVgeneCluster(sMat):
    """Cluster V genes from a square score matrix.

    For every column of ``sMat`` the row indices whose score is at least the
    module-level ``thr_v`` become that column's neighbours; the resulting
    graph is clustered with ``IdentifyMotifCluster``.
    """
    ## Input Vgene score matrix
    neighbours = {}
    for col in range(len(sMat)):
        hits = np.where(sMat[:, col] >= thr_v)[0]
        if len(hits) > 0:
            neighbours[col] = hits
    return IdentifyMotifCluster(neighbours)
|
|
617
|
-
|
|
965
|
+
|
|
966
|
+
|
|
618
967
|
def ParseFa(fname):
    """Read a FASTA file into a dict of header line -> sequence.

    Header lines (starting with ">") are used as keys after stripping
    surrounding whitespace, so the leading ">" is kept. Sequence lines are
    stripped and concatenated.

    Args:
        fname: path of the FASTA file.

    Returns:
        dict mapping header to sequence; empty for a file without records.

    Raises:
        ValueError: if sequence text appears before the first header.
    """
    FaDict = {}
    seqHead = None
    seq = ""
    # the with-block guarantees the handle is closed (it used to leak)
    with open(fname) as handle:
        for line in handle:
            if line.startswith(">"):
                if len(seq) > 0:
                    FaDict[seqHead] = seq
                    seq = ""
                seqHead = line.strip()
            else:
                text = line.strip()
                if seqHead is None:
                    if text:
                        raise ValueError(
                            "sequence data found before the first FASTA header in %s"
                            % fname
                        )
                    continue
                seq += text
    # Store the last record, unless its header was already stored.
    if seqHead is not None and seqHead not in FaDict:
        FaDict[seqHead] = seq
    return FaDict
|
|
633
982
|
|
|
983
|
+
|
|
634
984
|
def PreCalculateVgeneDist(VgeneFa="Imgt_Human_TRBV.fasta"):
    """Pre-compute pairwise V-gene scores and write them to VgeneScores.txt.

    Reads the FASTA file ``cur_dir + VgeneFa``, slices a CDR1 and a CDR2
    region out of every sequence, scores every unordered gene pair (including
    a gene with itself) as the sum of the CDR1 and CDR2 ``SeqComparison``
    scores, and writes one tab-separated ``gene1, gene2, score`` line per pair
    to ``VgeneScores.txt`` in the current working directory.

    The gene name is the second "|"-separated field of the header, or the
    header without its first character when there is no "|".
    """
    ## Only run one time if needed
    FaDict = ParseFa(cur_dir + VgeneFa)
    CDR1Dict = {}
    CDR2Dict = {}
    for kk in FaDict:
        if "|" in kk:
            VV = kk.split("|")[1]
        else:
            VV = kk[1:]
        # NOTE(review): the slices below cover 1-based positions 27-37 and
        # 56-64, one short of the ranges quoted in the comments; confirm.
        CDR1Dict[VV] = FaDict[kk][26:37]  ## Imgt CDR1: 27 - 38
        CDR2Dict[VV] = FaDict[kk][55:64]  ## Imgt CDR2: 56 - 65
    Vkeys = list(CDR1Dict.keys())
    nn = len(Vkeys)
    VScore = {}
    for ii in range(0, nn):
        V1 = Vkeys[ii]
        s1_CDR1 = CDR1Dict[V1]
        s1_CDR2 = CDR2Dict[V1]
        for jj in range(ii, nn):
            V2 = Vkeys[jj]
            s2_CDR1 = CDR1Dict[V2]
            s2_CDR2 = CDR2Dict[V2]
            score1 = SeqComparison(s1_CDR1, s2_CDR1)
            # Compare V1's CDR2 with V2's; this used to compare V2's CDR2
            # with itself, so the CDR2 term ignored V1 entirely.
            score2 = SeqComparison(s1_CDR2, s2_CDR2)
            VScore[(V1, V2)] = score1 + score2
    with open("VgeneScores.txt", "w") as gg:
        for kk in VScore:
            gg.write(kk[0] + "\t" + kk[1] + "\t" + str(VScore[kk]) + "\n")
|
|
667
1017
|
|
|
668
|
-
|
|
1018
|
+
|
|
1019
|
+
def EncodeRepertoire(
|
|
1020
|
+
inputfile,
|
|
1021
|
+
outdir,
|
|
1022
|
+
outfile="",
|
|
1023
|
+
exact=True,
|
|
1024
|
+
ST=3,
|
|
1025
|
+
thr_v=3.7,
|
|
1026
|
+
thr_s=3.5,
|
|
1027
|
+
VDict={},
|
|
1028
|
+
Vgene=True,
|
|
1029
|
+
thr_iso=10,
|
|
1030
|
+
gap=-6,
|
|
1031
|
+
GPU=False,
|
|
1032
|
+
Mat=False,
|
|
1033
|
+
verbose=False,
|
|
1034
|
+
):
|
|
669
1035
|
## No V gene version
|
|
670
1036
|
## Encode CDR3 sequences into 96 dimensional space and perform k-means clustering
|
|
671
1037
|
## If exact is True, SW alignment will be performed within each cluster after isometric encoding and clustering
|
|
672
|
-
h=open(inputfile)
|
|
673
|
-
t1=time.time()
|
|
674
|
-
alines=h.readlines()
|
|
675
|
-
ww=alines[0].strip().split(
|
|
676
|
-
if not ww[0].startswith(
|
|
1038
|
+
h = open(inputfile)
|
|
1039
|
+
t1 = time.time()
|
|
1040
|
+
alines = h.readlines()
|
|
1041
|
+
ww = alines[0].strip().split("\t")
|
|
1042
|
+
if not ww[0].startswith("C"):
|
|
677
1043
|
## header line
|
|
678
|
-
hline=alines[0]
|
|
679
|
-
alines=alines[1:]
|
|
680
|
-
elif
|
|
681
|
-
hline=alines[0]
|
|
682
|
-
alines=alines[1:]
|
|
1044
|
+
hline = alines[0]
|
|
1045
|
+
alines = alines[1:]
|
|
1046
|
+
elif "CDR3" in ww[0]:
|
|
1047
|
+
hline = alines[0]
|
|
1048
|
+
alines = alines[1:]
|
|
683
1049
|
else:
|
|
684
|
-
hline=
|
|
685
|
-
seqs=[]
|
|
686
|
-
vgs=[]
|
|
687
|
-
infoList=[]
|
|
688
|
-
count=0
|
|
1050
|
+
hline = "CDR3\t" + "\t".join(["Info" + str(x) for x in range(len(ww) - 1)])
|
|
1051
|
+
seqs = []
|
|
1052
|
+
vgs = []
|
|
1053
|
+
infoList = []
|
|
1054
|
+
count = 0
|
|
689
1055
|
if verbose:
|
|
690
|
-
print(
|
|
1056
|
+
print("Creating CDR3 list")
|
|
691
1057
|
for ll in alines:
|
|
692
|
-
ww=ll.strip().split(
|
|
693
|
-
cdr3=ww[0]
|
|
694
|
-
if
|
|
1058
|
+
ww = ll.strip().split("\t")
|
|
1059
|
+
cdr3 = ww[0]
|
|
1060
|
+
if "*" in cdr3:
|
|
695
1061
|
continue
|
|
696
|
-
if
|
|
1062
|
+
if "_" in cdr3:
|
|
697
1063
|
continue
|
|
698
1064
|
seqs.append(ww[0])
|
|
699
1065
|
if Vgene:
|
|
700
1066
|
vgs.append(ww[1])
|
|
701
|
-
infoList.append(
|
|
1067
|
+
infoList.append("\t".join(ww[1:]))
|
|
702
1068
|
else:
|
|
703
|
-
infoList.append(
|
|
704
|
-
count+=1
|
|
705
|
-
if len(outfile)==0:
|
|
706
|
-
outfile=inputfile.split(
|
|
707
|
-
outfile=outfile[len(outfile)-1]
|
|
708
|
-
outfile=
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
1069
|
+
infoList.append("\t".join(ww[1:]))
|
|
1070
|
+
count += 1
|
|
1071
|
+
if len(outfile) == 0:
|
|
1072
|
+
outfile = inputfile.split("/")
|
|
1073
|
+
outfile = outfile[len(outfile) - 1]
|
|
1074
|
+
outfile = (
|
|
1075
|
+
outdir
|
|
1076
|
+
+ "/"
|
|
1077
|
+
+ re.sub("\\.[txcsv]+", "", outfile)
|
|
1078
|
+
+ "-"
|
|
1079
|
+
+ "-RotationEncodingBL62.txt"
|
|
1080
|
+
)
|
|
1081
|
+
g = open(outfile, "w")
|
|
1082
|
+
tm = strftime("%Y-%m-%d %H:%M:%S", gmtime())
|
|
1083
|
+
InfoLine = (
|
|
1084
|
+
"##TIME:"
|
|
1085
|
+
+ tm
|
|
1086
|
+
+ "|cmd: "
|
|
1087
|
+
+ sys.argv[0]
|
|
1088
|
+
+ "|"
|
|
1089
|
+
+ inputfile
|
|
1090
|
+
+ "|IsometricDistance_Thr="
|
|
1091
|
+
+ str(thr_iso)
|
|
1092
|
+
+ "|thr_v="
|
|
1093
|
+
+ str(thr_v)
|
|
1094
|
+
+ "|thr_s="
|
|
1095
|
+
+ str(thr_s)
|
|
1096
|
+
+ "|exact="
|
|
1097
|
+
+ str(exact)
|
|
1098
|
+
+ "|Vgene="
|
|
1099
|
+
+ str(Vgene)
|
|
1100
|
+
+ "|ST="
|
|
1101
|
+
+ str(ST)
|
|
1102
|
+
)
|
|
1103
|
+
g.write(InfoLine + "\n")
|
|
1104
|
+
g.write(
|
|
1105
|
+
"##Column Info: CDR3 aa sequence, cluster id, other information in the input file\n"
|
|
1106
|
+
)
|
|
1107
|
+
gr = 0
|
|
715
1108
|
## Split into different lengths
|
|
716
|
-
LD,VD, ID,SD= BuildLengthDict(
|
|
1109
|
+
LD, VD, ID, SD = BuildLengthDict(
|
|
1110
|
+
seqs, vGene=vgs, INFO=infoList, sIDs=[x for x in range(len(seqs))]
|
|
1111
|
+
)
|
|
717
1112
|
LDu, VDu, IDu, SDu = CollapseUnique(LD, VD, ID, SD)
|
|
718
1113
|
if Mat:
|
|
719
|
-
Mfile=outfile+
|
|
720
|
-
h=open(Mfile,
|
|
1114
|
+
Mfile = outfile + "_EncodingMatrix.txt"
|
|
1115
|
+
h = open(Mfile, "w")
|
|
721
1116
|
for kk in LDu:
|
|
722
1117
|
if verbose:
|
|
723
|
-
print("---Process CDR3s with length %d ---" %(kk))
|
|
724
|
-
vSD=LDu[kk]
|
|
725
|
-
vSD0=[x for x in range(len(vSD))]
|
|
726
|
-
vss=SDu[kk]
|
|
727
|
-
vInfo=IDu[kk]
|
|
728
|
-
flagL=[len(x)-1 for x in vInfo]
|
|
1118
|
+
print("---Process CDR3s with length %d ---" % (kk))
|
|
1119
|
+
vSD = LDu[kk]
|
|
1120
|
+
vSD0 = [x for x in range(len(vSD))]
|
|
1121
|
+
vss = SDu[kk]
|
|
1122
|
+
vInfo = IDu[kk]
|
|
1123
|
+
flagL = [len(x) - 1 for x in vInfo]
|
|
729
1124
|
if verbose:
|
|
730
|
-
print(
|
|
731
|
-
dM=np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
|
|
732
|
-
dM=dM.astype("float32")
|
|
1125
|
+
print(" Performing CDR3 encoding")
|
|
1126
|
+
dM = np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
|
|
1127
|
+
dM = dM.astype("float32")
|
|
733
1128
|
if verbose:
|
|
734
|
-
print(" The number of sequences is %d" %(dM.shape[0]))
|
|
1129
|
+
print(" The number of sequences is %d" % (dM.shape[0]))
|
|
735
1130
|
if Mat:
|
|
736
1131
|
for ii in range(len(vss)):
|
|
737
|
-
line=vss[ii]+
|
|
738
|
-
NUMs=[str(xx) for xx in dM[ii
|
|
739
|
-
line +=
|
|
1132
|
+
line = vss[ii] + "\t" + vInfo[ii][0] + "\t"
|
|
1133
|
+
NUMs = [str(xx) for xx in dM[ii, :]]
|
|
1134
|
+
line += "\t".join(NUMs) + "\n"
|
|
740
1135
|
h.write(line)
|
|
741
|
-
sID=[x for x in range(dM.shape[0])]
|
|
742
|
-
t2=time.time()
|
|
1136
|
+
sID = [x for x in range(dM.shape[0])]
|
|
1137
|
+
t2 = time.time()
|
|
743
1138
|
if verbose:
|
|
744
|
-
print(
|
|
745
|
-
Cls = ClusterCDR3(
|
|
1139
|
+
print(" Done! Total time elapsed %f" % (t2 - t1))
|
|
1140
|
+
Cls = ClusterCDR3(
|
|
1141
|
+
dM, flagL, thr=thr_iso - 0.5 * (15 - kk), verbose=verbose
|
|
1142
|
+
) ## change cutoff with different lengths
|
|
746
1143
|
if verbose:
|
|
747
1144
|
print(" Handling identical CDR3 groups")
|
|
748
|
-
Cls_u=[]
|
|
1145
|
+
Cls_u = []
|
|
749
1146
|
for ii in range(len(Cls)):
|
|
750
|
-
cc=Cls[ii]
|
|
1147
|
+
cc = Cls[ii]
|
|
751
1148
|
if len(cc) == 1:
|
|
752
1149
|
## Handle identical CDR3 groups first
|
|
753
|
-
if flagL[cc[0]]>0:
|
|
1150
|
+
if flagL[cc[0]] > 0:
|
|
754
1151
|
gr += 1
|
|
755
|
-
jj=cc[0]
|
|
1152
|
+
jj = cc[0]
|
|
756
1153
|
for v_info in vInfo[jj]:
|
|
757
|
-
line=vss[jj]+
|
|
758
|
-
_=g.write(line)
|
|
1154
|
+
line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
|
|
1155
|
+
_ = g.write(line)
|
|
759
1156
|
else:
|
|
760
1157
|
Cls_u.append(cc)
|
|
761
|
-
Cls=Cls_u
|
|
762
|
-
t2=time.time()
|
|
1158
|
+
Cls = Cls_u
|
|
1159
|
+
t2 = time.time()
|
|
763
1160
|
if verbose:
|
|
764
|
-
print(
|
|
1161
|
+
print(" Done! Total time elapsed %f" % (t2 - t1))
|
|
765
1162
|
if Vgene:
|
|
766
|
-
vVgene=VDu[kk]
|
|
1163
|
+
vVgene = VDu[kk]
|
|
767
1164
|
if verbose:
|
|
768
|
-
print(
|
|
769
|
-
Cls_v=[]
|
|
1165
|
+
print(" Matching variable genes")
|
|
1166
|
+
Cls_v = []
|
|
770
1167
|
for cc in Cls:
|
|
771
|
-
Nc=len(cc)
|
|
772
|
-
sMat={}
|
|
1168
|
+
Nc = len(cc)
|
|
1169
|
+
sMat = {}
|
|
773
1170
|
for ii in range(Nc):
|
|
774
|
-
v1=vVgene[cc[ii]]
|
|
775
|
-
for jj in range(ii,Nc):
|
|
776
|
-
if jj==ii:
|
|
1171
|
+
v1 = vVgene[cc[ii]]
|
|
1172
|
+
for jj in range(ii, Nc):
|
|
1173
|
+
if jj == ii:
|
|
777
1174
|
continue
|
|
778
|
-
v2=vVgene[cc[jj]]
|
|
1175
|
+
v2 = vVgene[cc[jj]]
|
|
779
1176
|
if (v1, v2) not in VDict:
|
|
780
1177
|
if v1 == v2:
|
|
781
1178
|
if ii not in sMat:
|
|
782
|
-
sMat[ii]=[jj]
|
|
1179
|
+
sMat[ii] = [jj]
|
|
783
1180
|
else:
|
|
784
1181
|
sMat[ii].append(jj)
|
|
785
1182
|
if jj not in sMat:
|
|
786
|
-
sMat[jj]=[ii]
|
|
1183
|
+
sMat[jj] = [ii]
|
|
787
1184
|
else:
|
|
788
1185
|
sMat[jj].append(ii)
|
|
789
1186
|
continue
|
|
790
|
-
if VDict[(v1,v2)] >= thr_v:
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
vCL=IdentifyMotifCluster(sMat)
|
|
800
|
-
vCL_List=list(chain(*vCL))
|
|
1187
|
+
if VDict[(v1, v2)] >= thr_v:
|
|
1188
|
+
if ii not in sMat:
|
|
1189
|
+
sMat[ii] = [jj]
|
|
1190
|
+
else:
|
|
1191
|
+
sMat[ii].append(jj)
|
|
1192
|
+
if jj not in sMat:
|
|
1193
|
+
sMat[jj] = [ii]
|
|
1194
|
+
else:
|
|
1195
|
+
sMat[jj].append(ii)
|
|
1196
|
+
vCL = IdentifyMotifCluster(sMat)
|
|
1197
|
+
vCL_List = list(chain(*vCL))
|
|
801
1198
|
for ii in range(Nc):
|
|
802
|
-
uu=flagL[cc[ii]]
|
|
803
|
-
if uu>0 and ii not in vCL_List:
|
|
1199
|
+
uu = flagL[cc[ii]]
|
|
1200
|
+
if uu > 0 and ii not in vCL_List:
|
|
804
1201
|
vCL.append([ii])
|
|
805
1202
|
for vcc in vCL:
|
|
806
1203
|
Cls_v.append(list(np.array(cc)[np.array(vcc)]))
|
|
807
|
-
Cls=[]
|
|
1204
|
+
Cls = []
|
|
808
1205
|
for ii in range(len(Cls_v)):
|
|
809
|
-
cc=Cls_v[ii]
|
|
1206
|
+
cc = Cls_v[ii]
|
|
810
1207
|
if len(cc) == 1:
|
|
811
1208
|
## Handle identical CDR3 groups first
|
|
812
1209
|
gr += 1
|
|
813
|
-
jj=cc[0]
|
|
1210
|
+
jj = cc[0]
|
|
814
1211
|
for v_info in vInfo[jj]:
|
|
815
|
-
line=vss[jj]+
|
|
816
|
-
_=g.write(line)
|
|
1212
|
+
line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
|
|
1213
|
+
_ = g.write(line)
|
|
817
1214
|
else:
|
|
818
1215
|
Cls.append(cc)
|
|
819
1216
|
if exact:
|
|
820
1217
|
if verbose:
|
|
821
|
-
print(
|
|
822
|
-
Cls_s=[]
|
|
1218
|
+
print(" Performing Smith-Waterman alignment")
|
|
1219
|
+
Cls_s = []
|
|
823
1220
|
for cc in Cls:
|
|
824
|
-
Nc=len(cc)
|
|
825
|
-
if len(cc)<=3:
|
|
826
|
-
sMat=np.zeros((Nc,Nc))
|
|
1221
|
+
Nc = len(cc)
|
|
1222
|
+
if len(cc) <= 3:
|
|
1223
|
+
sMat = np.zeros((Nc, Nc))
|
|
827
1224
|
for ii in range(Nc):
|
|
828
|
-
s1=vss[cc[ii]]
|
|
829
|
-
for jj in range(ii,Nc):
|
|
830
|
-
if jj==ii:
|
|
1225
|
+
s1 = vss[cc[ii]]
|
|
1226
|
+
for jj in range(ii, Nc):
|
|
1227
|
+
if jj == ii:
|
|
831
1228
|
continue
|
|
832
|
-
s2=vss[cc[jj]]
|
|
1229
|
+
s2 = vss[cc[jj]]
|
|
833
1230
|
if len(s1) != len(s2):
|
|
834
1231
|
continue
|
|
835
|
-
if len(s1)<=5:
|
|
1232
|
+
if len(s1) <= 5:
|
|
836
1233
|
continue
|
|
837
|
-
sw=SeqComparison(s1[ST:-2],s2[ST:-2],gap=gap)
|
|
838
|
-
sw=sw/(len(s1)-ST-2)
|
|
839
|
-
sMat[ii,jj]=sw
|
|
840
|
-
sMat[jj,ii]=sw
|
|
841
|
-
s_max=[]
|
|
1234
|
+
sw = SeqComparison(s1[ST:-2], s2[ST:-2], gap=gap)
|
|
1235
|
+
sw = sw / (len(s1) - ST - 2)
|
|
1236
|
+
sMat[ii, jj] = sw
|
|
1237
|
+
sMat[jj, ii] = sw
|
|
1238
|
+
s_max = []
|
|
842
1239
|
for ii in range(Nc):
|
|
843
|
-
s_max.append(np.max(sMat[:,ii]))
|
|
844
|
-
cc_new=[]
|
|
1240
|
+
s_max.append(np.max(sMat[:, ii]))
|
|
1241
|
+
cc_new = []
|
|
845
1242
|
for ii in range(Nc):
|
|
846
|
-
if s_max[ii]>=thr_s:
|
|
1243
|
+
if s_max[ii] >= thr_s:
|
|
847
1244
|
cc_new.append(cc[ii])
|
|
848
|
-
if len(cc_new)>1:
|
|
1245
|
+
if len(cc_new) > 1:
|
|
849
1246
|
Cls_s.append(cc_new)
|
|
850
1247
|
else:
|
|
851
1248
|
for ii in range(Nc):
|
|
852
|
-
uu=flagL[cc[ii]]
|
|
853
|
-
if uu>0:
|
|
1249
|
+
uu = flagL[cc[ii]]
|
|
1250
|
+
if uu > 0:
|
|
854
1251
|
Cls_s.append([cc[ii]])
|
|
855
|
-
# print(Cls_s)
|
|
856
|
-
Cls_sList=list(chain(*Cls_s))
|
|
1252
|
+
# print(Cls_s)
|
|
1253
|
+
Cls_sList = list(chain(*Cls_s))
|
|
857
1254
|
for ii in range(len(cc)):
|
|
858
|
-
uu=flagL[cc[ii]]
|
|
859
|
-
if uu>0 and cc[ii] not in Cls_sList:
|
|
1255
|
+
uu = flagL[cc[ii]]
|
|
1256
|
+
if uu > 0 and cc[ii] not in Cls_sList:
|
|
860
1257
|
Cls_s.append([cc[ii]])
|
|
861
1258
|
else:
|
|
862
|
-
CDR3s=[vss[x] for x in cc]
|
|
863
|
-
sIDs=np.array([vSD0[x] for x in cc])
|
|
864
|
-
sIDs0=[x for x in range(len(cc))]
|
|
865
|
-
Kset=KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
|
|
866
|
-
SSG=generateSSG(Kset, CDR3s, k_thr=1)
|
|
867
|
-
tmpVgenes=[
|
|
868
|
-
SSGnew=UpdateSSG(
|
|
869
|
-
|
|
870
|
-
|
|
1259
|
+
CDR3s = [vss[x] for x in cc]
|
|
1260
|
+
sIDs = np.array([vSD0[x] for x in cc])
|
|
1261
|
+
sIDs0 = [x for x in range(len(cc))]
|
|
1262
|
+
Kset = KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
|
|
1263
|
+
SSG = generateSSG(Kset, CDR3s, k_thr=1)
|
|
1264
|
+
tmpVgenes = ["TRBV2"] * len(CDR3s)
|
|
1265
|
+
SSGnew = UpdateSSG(
|
|
1266
|
+
SSG, CDR3s, tmpVgenes, Vscore=VDict, cutoff=thr_s + 4
|
|
1267
|
+
)
|
|
1268
|
+
CLall = IdentifyMotifCluster(SSGnew)
|
|
1269
|
+
CLall_list = list(chain(*CLall))
|
|
871
1270
|
for ii in range(len(cc)):
|
|
872
|
-
uu=flagL[cc[ii]]
|
|
873
|
-
if uu>0 and ii not in CLall_list:
|
|
1271
|
+
uu = flagL[cc[ii]]
|
|
1272
|
+
if uu > 0 and ii not in CLall_list:
|
|
874
1273
|
CLall.append([ii])
|
|
875
1274
|
for cl in CLall:
|
|
876
|
-
ccs=list(sIDs[np.array(cl)])
|
|
1275
|
+
ccs = list(sIDs[np.array(cl)])
|
|
877
1276
|
Cls_s.append(ccs)
|
|
878
|
-
Cls=Cls_s
|
|
1277
|
+
Cls = Cls_s
|
|
879
1278
|
if verbose:
|
|
880
|
-
print(
|
|
1279
|
+
print(" Writing results into file")
|
|
881
1280
|
for ii in range(len(Cls)):
|
|
882
|
-
# if ii % 100000 == 0 and ii>0:
|
|
883
|
-
|
|
884
|
-
cc=Cls[ii]
|
|
885
|
-
gr+=1
|
|
1281
|
+
# if ii % 100000 == 0 and ii>0:
|
|
1282
|
+
# print(' %d sequences written' %(ii))
|
|
1283
|
+
cc = Cls[ii]
|
|
1284
|
+
gr += 1
|
|
886
1285
|
for jj in cc:
|
|
887
1286
|
for v_info in vInfo[jj]:
|
|
888
|
-
line=vss[jj]+
|
|
889
|
-
_=g.write(line)
|
|
1287
|
+
line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
|
|
1288
|
+
_ = g.write(line)
|
|
890
1289
|
g.close()
|
|
891
1290
|
if Mat:
|
|
892
1291
|
h.close()
|
|
893
1292
|
|
|
1293
|
+
|
|
894
1294
|
def OrderUnique(Ig):
    """Drop consecutive duplicate groups after sorting.

    Each value of ``Ig`` is a pair ``(pair_ids, members)``. Entries are sorted
    by ``(pair_ids[0], len(members))`` and an entry is kept only when its
    ``pair_ids[0]_pair_ids[1]`` label differs from the previously kept one.

    Returns:
        ``(Igs, filtered_id)``: the retained entries, and a numpy array with
        the keys of the retained entries in sorted order. The retained keys
        are also used as positions into the key/value lists, so this assumes
        the keys of ``Ig`` are 0..len(Ig)-1 in insertion order.
    """
    entries = list(Ig.values())
    names = list(Ig.keys())
    sizes = [len(entry[1]) for entry in entries]
    firsts = [entry[0][0] for entry in entries]
    seconds = [entry[0][1] for entry in entries]
    rows = sorted(zip(names, firsts, seconds, sizes), key=lambda row: (row[1], row[3]))
    keep_id = [0]
    previous = str(rows[0][1]) + "_" + str(rows[0][2])
    for pos in range(1, len(rows)):
        current = str(rows[pos][1]) + "_" + str(rows[pos][2])
        if current != previous:
            keep_id.append(pos)
            previous = current
    ordered_names = [row[0] for row in rows]
    filtered_id = np.array(ordered_names)[np.array(keep_id)]
    Igs = {names[ii]: entries[ii] for ii in filtered_id}
    return Igs, filtered_id
|
|
922
1322
|
|
|
1323
|
+
|
|
923
1324
|
def ClusterCDR3(dM, flagL, thr=10, GPU=False, verbose=False):
|
|
924
1325
|
## flagL: flag vector for identical CDR3 groups, >0 for grouped non-identical CDR3s
|
|
925
|
-
Cls=[]
|
|
926
|
-
flag=0
|
|
927
|
-
dM1=dM
|
|
928
|
-
flagL=np.array(flagL)
|
|
1326
|
+
Cls = []
|
|
1327
|
+
flag = 0
|
|
1328
|
+
dM1 = dM
|
|
1329
|
+
flagL = np.array(flagL)
|
|
929
1330
|
if GPU:
|
|
930
1331
|
res = faiss.StandardGpuResources()
|
|
931
1332
|
while 1:
|
|
932
|
-
# print(" %d number of clusters, with %d sequences" %(len(Cls),dM1.shape[0]))
|
|
1333
|
+
# print(" %d number of clusters, with %d sequences" %(len(Cls),dM1.shape[0]))
|
|
933
1334
|
if verbose:
|
|
934
|
-
print(
|
|
935
|
-
index = faiss.IndexFlatL2(Ndim*6)
|
|
1335
|
+
print("=", end="")
|
|
1336
|
+
index = faiss.IndexFlatL2(Ndim * 6)
|
|
936
1337
|
if GPU:
|
|
937
1338
|
index = faiss.index_cpu_to_gpu(res, 0, index)
|
|
938
1339
|
index.add(dM1)
|
|
939
|
-
if flag==0:
|
|
1340
|
+
if flag == 0:
|
|
940
1341
|
D, I = index.search(dM1, 2)
|
|
941
|
-
vv=np.where((D[:,1]<=thr))[0]
|
|
942
|
-
vv0=np.where((D[:,1]>thr) & (flagL>0))[0]
|
|
1342
|
+
vv = np.where((D[:, 1] <= thr))[0]
|
|
1343
|
+
vv0 = np.where((D[:, 1] > thr) & (flagL > 0))[0]
|
|
943
1344
|
for v in vv0:
|
|
944
1345
|
Cls.append([v])
|
|
945
|
-
tmp_dM=np.zeros((len(vv),Ndim*6))
|
|
946
|
-
Ig_new={}
|
|
1346
|
+
tmp_dM = np.zeros((len(vv), Ndim * 6))
|
|
1347
|
+
Ig_new = {}
|
|
947
1348
|
for ii in range(len(vv)):
|
|
948
|
-
v=vv[ii]
|
|
949
|
-
Idx=I[v,]
|
|
1349
|
+
v = vv[ii]
|
|
1350
|
+
Idx = I[v,]
|
|
950
1351
|
if v not in Idx:
|
|
951
|
-
Idx[0]=v
|
|
952
|
-
Ig_new[ii]=(sorted(list(set(Idx))),sorted(list(set(Idx))))
|
|
953
|
-
tmp_dM[ii,]=(dM1[Idx[0],]+dM1[Idx[1],])/2
|
|
954
|
-
if len(Ig_new)==0:
|
|
1352
|
+
Idx[0] = v
|
|
1353
|
+
Ig_new[ii] = (sorted(list(set(Idx))), sorted(list(set(Idx))))
|
|
1354
|
+
tmp_dM[ii,] = (dM1[Idx[0],] + dM1[Idx[1],]) / 2
|
|
1355
|
+
if len(Ig_new) == 0:
|
|
955
1356
|
if verbose:
|
|
956
|
-
print(
|
|
1357
|
+
print("type 0 break")
|
|
957
1358
|
break
|
|
958
|
-
# print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
|
|
959
|
-
Igs, fid=OrderUnique(Ig_new)
|
|
960
|
-
tmp_dM=tmp_dM[fid,]
|
|
961
|
-
Ig_new=Igs
|
|
1359
|
+
# print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
|
|
1360
|
+
Igs, fid = OrderUnique(Ig_new)
|
|
1361
|
+
tmp_dM = tmp_dM[fid,]
|
|
1362
|
+
Ig_new = Igs
|
|
962
1363
|
else:
|
|
963
|
-
D, I = index.search(dM1,2)
|
|
964
|
-
vv=np.where(D[:,1]<=thr)[0]
|
|
965
|
-
vv0=np.where(D[:,1]>thr)[0]
|
|
1364
|
+
D, I = index.search(dM1, 2)
|
|
1365
|
+
vv = np.where(D[:, 1] <= thr)[0]
|
|
1366
|
+
vv0 = np.where(D[:, 1] > thr)[0]
|
|
966
1367
|
## move groups in vv0 to Cls
|
|
967
|
-
kkg=list(Ig.keys())
|
|
1368
|
+
kkg = list(Ig.keys())
|
|
968
1369
|
for v in vv0:
|
|
969
|
-
ng=list(Ig[kkg[v]][1])
|
|
970
|
-
|
|
1370
|
+
ng = list(Ig[kkg[v]][1])
|
|
1371
|
+
# if ng not in Cls:
|
|
971
1372
|
Cls.append(ng)
|
|
972
|
-
tmp_dM=np.zeros((len(vv),Ndim*6))
|
|
973
|
-
Ig_new={}
|
|
1373
|
+
tmp_dM = np.zeros((len(vv), Ndim * 6))
|
|
1374
|
+
Ig_new = {}
|
|
974
1375
|
for ii in range(len(vv)):
|
|
975
|
-
v=vv[ii]
|
|
976
|
-
idx1=I[v,0]
|
|
977
|
-
idx2=I[v,1]
|
|
1376
|
+
v = vv[ii]
|
|
1377
|
+
idx1 = I[v, 0]
|
|
1378
|
+
idx2 = I[v, 1]
|
|
978
1379
|
if v not in I[v,]:
|
|
979
|
-
idx1=v
|
|
980
|
-
# Ig_new[ii]=sorted(list(set(list(Ig[kkg[idx1]])+list(Ig[kkg[idx2]]))))
|
|
981
|
-
Ig_new[ii]=(
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
1380
|
+
idx1 = v
|
|
1381
|
+
# Ig_new[ii]=sorted(list(set(list(Ig[kkg[idx1]])+list(Ig[kkg[idx2]]))))
|
|
1382
|
+
Ig_new[ii] = (
|
|
1383
|
+
sorted(
|
|
1384
|
+
list(set([idx1, idx2]))
|
|
1385
|
+
), ## First entry records the relative index of a sequence clique
|
|
1386
|
+
sorted(list(set(list(Ig[kkg[idx1]][1]) + list(Ig[kkg[idx2]][1])))),
|
|
1387
|
+
) ## Second entry records the absolute index of a sequence
|
|
1388
|
+
tmp_dM[ii,] = (dM1[idx1,] + dM1[idx2,]) / 2
|
|
1389
|
+
if len(Ig_new) == 0:
|
|
985
1390
|
if verbose:
|
|
986
1391
|
print("\ntype I break")
|
|
987
|
-
kkg=list(Ig.keys())
|
|
1392
|
+
kkg = list(Ig.keys())
|
|
988
1393
|
for kk in kkg:
|
|
989
|
-
ng=list(Ig[kk][1])
|
|
1394
|
+
ng = list(Ig[kk][1])
|
|
990
1395
|
if ng not in Cls:
|
|
991
1396
|
Cls.append(ng)
|
|
992
1397
|
break
|
|
993
|
-
# print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
|
|
994
|
-
Igs, fid=OrderUnique(Ig_new)
|
|
995
|
-
tmp_dM=tmp_dM[fid,]
|
|
996
|
-
Ig_new=Igs
|
|
997
|
-
if flag>0:
|
|
1398
|
+
# print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
|
|
1399
|
+
Igs, fid = OrderUnique(Ig_new)
|
|
1400
|
+
tmp_dM = tmp_dM[fid,]
|
|
1401
|
+
Ig_new = Igs
|
|
1402
|
+
if flag > 0:
|
|
998
1403
|
if Ig == Ig_new:
|
|
999
1404
|
if verbose:
|
|
1000
1405
|
print("\ntype II break")
|
|
1001
|
-
kkg=list(Ig.keys())
|
|
1406
|
+
kkg = list(Ig.keys())
|
|
1002
1407
|
for kk in kkg:
|
|
1003
|
-
ng=list(Ig[kk][1])
|
|
1408
|
+
ng = list(Ig[kk][1])
|
|
1004
1409
|
if ng in Cls:
|
|
1005
1410
|
continue
|
|
1006
1411
|
Cls.append(ng)
|
|
1007
1412
|
break
|
|
1008
|
-
Ig=Ig_new
|
|
1009
|
-
tmp_dM=tmp_dM.astype(
|
|
1010
|
-
dM1=tmp_dM
|
|
1011
|
-
flag+=1
|
|
1413
|
+
Ig = Ig_new
|
|
1414
|
+
tmp_dM = tmp_dM.astype("float32")
|
|
1415
|
+
dM1 = tmp_dM
|
|
1416
|
+
flag += 1
|
|
1012
1417
|
return Cls
|
|
1013
1418
|
|
|
1014
|
-
|
|
1015
|
-
|
|
1419
|
+
|
|
1420
|
+
def ClusterCDR3r(dM, flagL, thr=10, verbose=False):
|
|
1421
|
+
index = faiss.IndexFlatL2(Ndim * 6)
|
|
1016
1422
|
index.add(dM)
|
|
1017
1423
|
lims, D, I = index.range_search(dM, thr)
|
|
1018
1424
|
# with open('cdr3.npy', 'wb') as f:
|
|
@@ -1020,53 +1426,70 @@ def ClusterCDR3r(dM, flagL, thr = 10, verbose = False):
|
|
|
1020
1426
|
# np.save(f, D)
|
|
1021
1427
|
# np.save(f, I)
|
|
1022
1428
|
# np.save(f, dM)
|
|
1023
|
-
|
|
1429
|
+
|
|
1024
1430
|
# now clustering results
|
|
1025
1431
|
N = dM.shape[0]
|
|
1026
|
-
neighborSize = np.array(
|
|
1432
|
+
neighborSize = np.array(
|
|
1433
|
+
[lims[cur_idx_i + 1] - lims[cur_idx_i] for cur_idx_i in range(N)]
|
|
1434
|
+
)
|
|
1027
1435
|
# to_cluster = np.ones( (N,))
|
|
1028
1436
|
clusterNo = 0
|
|
1029
|
-
cluster = -
|
|
1437
|
+
cluster = -np.ones((N,), dtype=np.int32)
|
|
1030
1438
|
idx = np.where(cluster < 0)[0]
|
|
1031
1439
|
unclustered = [np.argmax(neighborSize[idx])]
|
|
1032
1440
|
depth = 0
|
|
1033
1441
|
while True:
|
|
1034
|
-
if len(unclustered) == 0:
|
|
1442
|
+
if len(unclustered) == 0:
|
|
1443
|
+
break
|
|
1035
1444
|
# cur_idx = unclustered[0] # first unclustered index
|
|
1036
1445
|
cur_idx = unclustered
|
|
1037
|
-
cluster[cur_idx] = clusterNo
|
|
1038
|
-
|
|
1039
|
-
neighbor = np.unique(
|
|
1446
|
+
cluster[cur_idx] = clusterNo # assign cluster
|
|
1447
|
+
|
|
1448
|
+
neighbor = np.unique(
|
|
1449
|
+
np.array(
|
|
1450
|
+
list(
|
|
1451
|
+
chain(
|
|
1452
|
+
*[
|
|
1453
|
+
I[(lims[cur_idx_i]) : lims[cur_idx_i + 1]]
|
|
1454
|
+
for cur_idx_i in cur_idx
|
|
1455
|
+
]
|
|
1456
|
+
)
|
|
1457
|
+
)
|
|
1458
|
+
)
|
|
1459
|
+
)
|
|
1040
1460
|
# find those unclusterred
|
|
1041
1461
|
idx = np.where(cluster[neighbor] < 0)[0]
|
|
1042
1462
|
if len(idx) == 0:
|
|
1043
1463
|
depth = 0
|
|
1044
1464
|
clusterNo += 1
|
|
1045
1465
|
idx = np.where(cluster < 0)[0]
|
|
1046
|
-
if len(idx) == 0:
|
|
1466
|
+
if len(idx) == 0:
|
|
1467
|
+
break
|
|
1047
1468
|
unclustered = [idx[np.argmax(neighborSize[idx])]]
|
|
1048
|
-
|
|
1469
|
+
|
|
1049
1470
|
else:
|
|
1050
1471
|
if depth > 3:
|
|
1051
1472
|
depth = 0
|
|
1052
1473
|
clusterNo += 1
|
|
1053
1474
|
unclustered = neighbor[idx]
|
|
1054
1475
|
depth += 1
|
|
1055
|
-
# print('clusterNo = ', clusterNo)
|
|
1056
|
-
Cls = [
|
|
1476
|
+
# print('clusterNo = ', clusterNo)
|
|
1477
|
+
Cls = [[] for i in range(clusterNo)]
|
|
1057
1478
|
for idx, i in enumerate(cluster):
|
|
1058
|
-
|
|
1059
|
-
# print("Cls[:5] = ", Cls[:5])
|
|
1060
|
-
# print("len(Cls) = ", len(Cls),
|
|
1061
|
-
# ', #elem=', sum([len(i) for i in Cls]),
|
|
1062
|
-
# ', #single=', sum([len(i) for i in Cls if len(i) == 1]),
|
|
1063
|
-
# ', #non_single=', sum([len(i) for i in Cls if len(i) != 1]),
|
|
1064
|
-
# ', #max=', max([len(i) for i in Cls]))
|
|
1479
|
+
Cls[i].append(idx)
|
|
1480
|
+
# print("Cls[:5] = ", Cls[:5])
|
|
1481
|
+
# print("len(Cls) = ", len(Cls),
|
|
1482
|
+
# ', #elem=', sum([len(i) for i in Cls]),
|
|
1483
|
+
# ', #single=', sum([len(i) for i in Cls if len(i) == 1]),
|
|
1484
|
+
# ', #non_single=', sum([len(i) for i in Cls if len(i) != 1]),
|
|
1485
|
+
# ', #max=', max([len(i) for i in Cls]))
|
|
1065
1486
|
return Cls
|
|
1066
1487
|
|
|
1488
|
+
|
|
1067
1489
|
def CommandLineParser():
|
|
1068
|
-
parser=OptionParser()
|
|
1069
|
-
print
|
|
1490
|
+
parser = OptionParser()
|
|
1491
|
+
print(
|
|
1492
|
+
"""
|
|
1070
1493
|
GIANA: Geometric Isometry based ANtigen-specific tcr Alignment
|
|
1071
1494
|
Ultrafast short peptide alignment exclusively designed for large-scale adaptome analysis
|
|
1072
1495
|
|
|
@@ -1079,130 +1502,282 @@ Input columns:
|
|
|
1079
1502
|
|
|
1080
1503
|
!!! ALL amino acid letters must be CAPITAL !!!
|
|
1081
1504
|
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
parser.add_option(
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
parser.add_option(
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
parser.add_option(
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1505
|
+
"""
|
|
1506
|
+
)
|
|
1507
|
+
parser.add_option(
|
|
1508
|
+
"-d",
|
|
1509
|
+
"--directory",
|
|
1510
|
+
dest="Directory",
|
|
1511
|
+
help="Input repertoire sequencing file directory. Please make sure that all the files in the directory are input files.",
|
|
1512
|
+
default="",
|
|
1513
|
+
)
|
|
1514
|
+
parser.add_option(
|
|
1515
|
+
"-f",
|
|
1516
|
+
"--file",
|
|
1517
|
+
dest="File",
|
|
1518
|
+
default="",
|
|
1519
|
+
help="Input single file of CDR3 sequences for grouping",
|
|
1520
|
+
)
|
|
1521
|
+
parser.add_option(
|
|
1522
|
+
"-F",
|
|
1523
|
+
"--fileList",
|
|
1524
|
+
dest="files",
|
|
1525
|
+
default="",
|
|
1526
|
+
help="Alternative input: a file containing the full path to all the files. If given, overwrite -d and -f option",
|
|
1527
|
+
)
|
|
1528
|
+
parser.add_option(
|
|
1529
|
+
"-t",
|
|
1530
|
+
"--threshold",
|
|
1531
|
+
dest="thr",
|
|
1532
|
+
default=7,
|
|
1533
|
+
help="Isometric distance threshold for calling similar CDR3 groups. Without -E, smaller value will increase speed. With -E, smaller value will increase specificity. Must be smaller than 12.",
|
|
1534
|
+
)
|
|
1535
|
+
parser.add_option(
|
|
1536
|
+
"-S",
|
|
1537
|
+
"--threshold_score",
|
|
1538
|
+
dest="thr_s",
|
|
1539
|
+
default=3.6,
|
|
1540
|
+
help="Threshold for Smith-Waterman alignment score (normalized by CDR3 length). Default 3.6",
|
|
1541
|
+
)
|
|
1542
|
+
parser.add_option(
|
|
1543
|
+
"-G",
|
|
1544
|
+
"--threshold_vgene",
|
|
1545
|
+
dest="thr_v",
|
|
1546
|
+
default=3.7,
|
|
1547
|
+
help="Threshold for variable gene comparison. Default 3.7.",
|
|
1548
|
+
)
|
|
1549
|
+
parser.add_option(
|
|
1550
|
+
"-o",
|
|
1551
|
+
"--output",
|
|
1552
|
+
dest="OutDir",
|
|
1553
|
+
default="./",
|
|
1554
|
+
help="Output directory for intermediate and final outputs.",
|
|
1555
|
+
)
|
|
1556
|
+
parser.add_option(
|
|
1557
|
+
"-O",
|
|
1558
|
+
"--outfile",
|
|
1559
|
+
dest="OutFile",
|
|
1560
|
+
default="",
|
|
1561
|
+
help="Output file name. If not given, a file with --RotationEncoding will be added to the input file as the output file name.",
|
|
1562
|
+
)
|
|
1563
|
+
parser.add_option(
|
|
1564
|
+
"-T",
|
|
1565
|
+
"--startPosition",
|
|
1566
|
+
dest="ST",
|
|
1567
|
+
default=3,
|
|
1568
|
+
help="Starting position of CDR3 sequence. The first ST letters are omitted. CDR3 sequence length L must be >= ST+7 ",
|
|
1569
|
+
)
|
|
1570
|
+
parser.add_option(
|
|
1571
|
+
"-g",
|
|
1572
|
+
"--GapPenalty",
|
|
1573
|
+
dest="Gap",
|
|
1574
|
+
default=-6,
|
|
1575
|
+
help="Gap penalty,default= -6. Not used.",
|
|
1576
|
+
)
|
|
1577
|
+
parser.add_option(
|
|
1578
|
+
"-n",
|
|
1579
|
+
"--GapNumber",
|
|
1580
|
+
dest="GapN",
|
|
1581
|
+
default=1,
|
|
1582
|
+
help="Maximum number of gaps allowed when performing alignment. Max=1, default=1. Not used.",
|
|
1583
|
+
)
|
|
1584
|
+
parser.add_option(
|
|
1585
|
+
"-V",
|
|
1586
|
+
"--VariableGeneFa",
|
|
1587
|
+
dest="VFa",
|
|
1588
|
+
default="Imgt_Human_TRBV.fasta",
|
|
1589
|
+
help="IMGT Human beta variable gene sequences",
|
|
1590
|
+
)
|
|
1591
|
+
parser.add_option(
|
|
1592
|
+
"-v",
|
|
1593
|
+
"--VariableGene",
|
|
1594
|
+
dest="V",
|
|
1595
|
+
default=True,
|
|
1596
|
+
action="store_false",
|
|
1597
|
+
help="If False, GIANA will omit variable gene information and use CDR3 sequences only. This will yield reduced specificity. The cut-off will automatically become the current value-4.0",
|
|
1598
|
+
)
|
|
1599
|
+
parser.add_option(
|
|
1600
|
+
"-e",
|
|
1601
|
+
"--Exact",
|
|
1602
|
+
dest="E",
|
|
1603
|
+
default=True,
|
|
1604
|
+
action="store_false",
|
|
1605
|
+
help="If False, GIANA will not perform Smith-Waterman alignment after isometric encoding.",
|
|
1606
|
+
)
|
|
1607
|
+
parser.add_option(
|
|
1608
|
+
"-N",
|
|
1609
|
+
"--NumberOfThreads",
|
|
1610
|
+
dest="NN",
|
|
1611
|
+
default=1,
|
|
1612
|
+
help="Number of threads for multiple processing. Not working so well.",
|
|
1613
|
+
)
|
|
1614
|
+
parser.add_option(
|
|
1615
|
+
"-M",
|
|
1616
|
+
"--EncodingMatrix",
|
|
1617
|
+
dest="Mat",
|
|
1618
|
+
default=False,
|
|
1619
|
+
action="store_true",
|
|
1620
|
+
help="If true, GIANA will export the isometric encoding matrix for each TCR. Default: False.",
|
|
1621
|
+
)
|
|
1622
|
+
parser.add_option(
|
|
1623
|
+
"-U",
|
|
1624
|
+
"--UseGPU",
|
|
1625
|
+
dest="GPU",
|
|
1626
|
+
default=False,
|
|
1627
|
+
action="store_true",
|
|
1628
|
+
help="Use GPU for Faiss indexing. Must be CUDA GPUs.",
|
|
1629
|
+
)
|
|
1630
|
+
parser.add_option(
|
|
1631
|
+
"-q",
|
|
1632
|
+
"--queryFile",
|
|
1633
|
+
dest="Query",
|
|
1634
|
+
default="",
|
|
1635
|
+
help="Input query file, if given, GIANA will run in query mode, also need to provide -r option.",
|
|
1636
|
+
)
|
|
1637
|
+
parser.add_option(
|
|
1638
|
+
"-r",
|
|
1639
|
+
"--refFile",
|
|
1640
|
+
dest="ref",
|
|
1641
|
+
default="",
|
|
1642
|
+
help="Input reference file. Query model required.",
|
|
1643
|
+
)
|
|
1644
|
+
parser.add_option(
|
|
1645
|
+
"-b",
|
|
1646
|
+
"--Verbose",
|
|
1647
|
+
dest="v",
|
|
1648
|
+
default=False,
|
|
1649
|
+
action="store_true",
|
|
1650
|
+
help="Verbose option: if given, GIANA will print intermediate messages.",
|
|
1651
|
+
)
|
|
1103
1652
|
return parser.parse_args()
|
|
1104
1653
|
|
|
1654
|
+
|
|
1105
1655
|
def main():
|
|
1106
|
-
(opt,_)=CommandLineParser()
|
|
1107
|
-
cutoff=float(opt.thr)
|
|
1108
|
-
OutDir=opt.OutDir
|
|
1109
|
-
thr_s=float(opt.thr_s)
|
|
1656
|
+
(opt, _) = CommandLineParser()
|
|
1657
|
+
cutoff = float(opt.thr)
|
|
1658
|
+
OutDir = opt.OutDir
|
|
1659
|
+
thr_s = float(opt.thr_s)
|
|
1110
1660
|
## Check if query mode first
|
|
1111
|
-
qFile=opt.Query
|
|
1112
|
-
if len(qFile)>0:
|
|
1661
|
+
qFile = opt.Query
|
|
1662
|
+
if len(qFile) > 0:
|
|
1113
1663
|
## query mode
|
|
1114
|
-
t1=time.time()
|
|
1115
|
-
if qFile.endswith(
|
|
1664
|
+
t1 = time.time()
|
|
1665
|
+
if qFile.endswith("/"):
|
|
1116
1666
|
## input query is a directory
|
|
1117
|
-
qFs=os.listdir(qFile)
|
|
1118
|
-
qFileList=[]
|
|
1667
|
+
qFs = os.listdir(qFile)
|
|
1668
|
+
qFileList = []
|
|
1119
1669
|
for ff in qFs:
|
|
1120
|
-
qFileList.append(qFile+ff)
|
|
1670
|
+
qFileList.append(qFile + ff)
|
|
1121
1671
|
else:
|
|
1122
|
-
qFileList=[qFile]
|
|
1123
|
-
rFile=opt.ref
|
|
1124
|
-
if len(rFile)==0:
|
|
1125
|
-
raise("Must provide reference file in query mode!")
|
|
1672
|
+
qFileList = [qFile]
|
|
1673
|
+
rFile = opt.ref
|
|
1674
|
+
if len(rFile) == 0:
|
|
1675
|
+
raise ("Must provide reference file in query mode!")
|
|
1126
1676
|
else:
|
|
1127
1677
|
## check if reference cluster file exists
|
|
1128
|
-
rFile0=re.sub(
|
|
1129
|
-
refClusterFile=rFile0+
|
|
1678
|
+
rFile0 = re.sub("\\.txt", "", rFile)
|
|
1679
|
+
refClusterFile = rFile0 + "--RotationEncodingBL62.txt"
|
|
1130
1680
|
if not os.path.exists(refClusterFile):
|
|
1131
|
-
raise(
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1681
|
+
raise (
|
|
1682
|
+
"Must run clustering on reference file first! Did you forget to put the clustering file in this directory?"
|
|
1683
|
+
)
|
|
1684
|
+
rData = CreateReference(rFile)
|
|
1685
|
+
t2 = time.time()
|
|
1686
|
+
print("Reference created. Elapsed %f" % (t2 - t1))
|
|
1135
1687
|
for qf in qFileList:
|
|
1136
|
-
t2_0=time.time()
|
|
1137
|
-
print("Querying "+qf)
|
|
1138
|
-
qf_s=qf.split(
|
|
1139
|
-
#outFile=re.sub('\\.txt','',qf_s)+'_query_'+rFile0+'.txt'
|
|
1140
|
-
outFile=
|
|
1141
|
-
|
|
1688
|
+
t2_0 = time.time()
|
|
1689
|
+
print("Querying " + qf)
|
|
1690
|
+
qf_s = qf.split("/")[-1]
|
|
1691
|
+
# outFile=re.sub('\\.txt','',qf_s)+'_query_'+rFile0+'.txt'
|
|
1692
|
+
outFile = (
|
|
1693
|
+
os.path.splitext(qf_s)[0]
|
|
1694
|
+
+ "_query_"
|
|
1695
|
+
+ os.path.basename(rFile0)
|
|
1696
|
+
+ ".txt"
|
|
1697
|
+
)
|
|
1698
|
+
of = OutDir + "/" + outFile
|
|
1142
1699
|
if path.exists(of):
|
|
1143
|
-
print(of+
|
|
1700
|
+
print(of + " already exits. Skipping.")
|
|
1144
1701
|
continue
|
|
1145
1702
|
MakeQuery(qf, rData, thr=cutoff, thr_s=thr_s)
|
|
1146
|
-
t2=time.time()
|
|
1147
|
-
print(" Build query clustering file. Elapsed %f" %(t2-t1))
|
|
1703
|
+
t2 = time.time()
|
|
1704
|
+
print(" Build query clustering file. Elapsed %f" % (t2 - t1))
|
|
1148
1705
|
print("Now mering with reference cluster")
|
|
1149
|
-
MergeExist(refClusterFile, OutDir+
|
|
1150
|
-
t2=time.time()
|
|
1151
|
-
print(" Time of elapsed for query %s: %f" %(qf, t2-t2_0))
|
|
1706
|
+
MergeExist(refClusterFile, OutDir + "/" + outFile)
|
|
1707
|
+
t2 = time.time()
|
|
1708
|
+
print(" Time of elapsed for query %s: %f" % (qf, t2 - t2_0))
|
|
1152
1709
|
else:
|
|
1153
1710
|
## regular clustering mode
|
|
1154
|
-
FileDir=opt.Directory
|
|
1155
|
-
if len(FileDir)>0:
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1711
|
+
FileDir = opt.Directory
|
|
1712
|
+
if len(FileDir) > 0:
|
|
1713
|
+
files = os.listdir(FileDir)
|
|
1714
|
+
files0 = []
|
|
1715
|
+
for ff in files:
|
|
1716
|
+
ff = FileDir + "/" + ff
|
|
1717
|
+
files0.append(ff)
|
|
1718
|
+
files = files0
|
|
1162
1719
|
else:
|
|
1163
|
-
|
|
1164
|
-
File=opt.File
|
|
1165
|
-
if len(File)>0:
|
|
1166
|
-
|
|
1167
|
-
FileList=opt.files
|
|
1168
|
-
if len(FileList)>0:
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
VFa=opt.VFa
|
|
1720
|
+
files = []
|
|
1721
|
+
File = opt.File
|
|
1722
|
+
if len(File) > 0:
|
|
1723
|
+
files = [File]
|
|
1724
|
+
FileList = opt.files
|
|
1725
|
+
if len(FileList) > 0:
|
|
1726
|
+
files = []
|
|
1727
|
+
fL = open(FileList)
|
|
1728
|
+
for ff in fL.readlines():
|
|
1729
|
+
files.append(ff.strip())
|
|
1730
|
+
VFa = opt.VFa
|
|
1174
1731
|
PreCalculateVgeneDist(VFa)
|
|
1175
|
-
vf=open(
|
|
1176
|
-
VScore={}
|
|
1177
|
-
VV=opt.V
|
|
1178
|
-
EE=opt.E
|
|
1179
|
-
Mat=opt.Mat
|
|
1180
|
-
ST=int(opt.ST)
|
|
1181
|
-
thr_v=float(opt.thr_v)
|
|
1182
|
-
verbose=opt.v
|
|
1732
|
+
vf = open("./VgeneScores.txt") ## Use tcrDist's Vgene 80-score calculation
|
|
1733
|
+
VScore = {}
|
|
1734
|
+
VV = opt.V
|
|
1735
|
+
EE = opt.E
|
|
1736
|
+
Mat = opt.Mat
|
|
1737
|
+
ST = int(opt.ST)
|
|
1738
|
+
thr_v = float(opt.thr_v)
|
|
1739
|
+
verbose = opt.v
|
|
1183
1740
|
if VV:
|
|
1184
1741
|
while 1:
|
|
1185
|
-
line=vf.readline()
|
|
1186
|
-
if len(line)==0:
|
|
1742
|
+
line = vf.readline()
|
|
1743
|
+
if len(line) == 0:
|
|
1187
1744
|
break
|
|
1188
|
-
ww=line.strip().split(
|
|
1189
|
-
VScore[(ww[0],ww[1])]=int(ww[2])/20
|
|
1190
|
-
VScore[(ww[1],ww[0])]=int(ww[2])/20
|
|
1191
|
-
Gap=int(opt.Gap)
|
|
1192
|
-
Gapn=int(opt.GapN)
|
|
1193
|
-
OutFile=opt.OutFile
|
|
1194
|
-
GPU=opt.GPU
|
|
1195
|
-
st=3
|
|
1196
|
-
ed=1
|
|
1197
|
-
NT=int(opt.NN)
|
|
1745
|
+
ww = line.strip().split("\t")
|
|
1746
|
+
VScore[(ww[0], ww[1])] = int(ww[2]) / 20
|
|
1747
|
+
VScore[(ww[1], ww[0])] = int(ww[2]) / 20
|
|
1748
|
+
Gap = int(opt.Gap)
|
|
1749
|
+
Gapn = int(opt.GapN)
|
|
1750
|
+
OutFile = opt.OutFile
|
|
1751
|
+
GPU = opt.GPU
|
|
1752
|
+
st = 3
|
|
1753
|
+
ed = 1
|
|
1754
|
+
NT = int(opt.NN)
|
|
1198
1755
|
faiss.omp_set_num_threads(NT)
|
|
1199
1756
|
for ff in files:
|
|
1200
|
-
print("Processing %s" %ff)
|
|
1201
|
-
EncodeRepertoire(
|
|
1202
|
-
|
|
1757
|
+
print("Processing %s" % ff)
|
|
1758
|
+
EncodeRepertoire(
|
|
1759
|
+
ff,
|
|
1760
|
+
OutDir,
|
|
1761
|
+
OutFile,
|
|
1762
|
+
ST=ST,
|
|
1763
|
+
thr_s=thr_s,
|
|
1764
|
+
thr_v=thr_v,
|
|
1765
|
+
exact=EE,
|
|
1766
|
+
VDict=VScore,
|
|
1767
|
+
Vgene=VV,
|
|
1768
|
+
thr_iso=cutoff,
|
|
1769
|
+
gap=Gap,
|
|
1770
|
+
GPU=GPU,
|
|
1771
|
+
Mat=Mat,
|
|
1772
|
+
verbose=verbose,
|
|
1773
|
+
)
|
|
1774
|
+
|
|
1775
|
+
|
|
1203
1776
|
if __name__ == "__main__":
|
|
1204
|
-
t0=time.time()
|
|
1777
|
+
t0 = time.time()
|
|
1205
1778
|
main()
|
|
1206
|
-
print
|
|
1207
|
-
print
|
|
1208
|
-
|
|
1779
|
+
print("Total time elapsed: %f" % (time.time() - t0))
|
|
1780
|
+
print(
|
|
1781
|
+
"Maximum memory usage: %f MB"
|
|
1782
|
+
% (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000000)
|
|
1783
|
+
)
|