biopipen 0.31.7__py3-none-any.whl → 0.32.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

@@ -24,7 +24,6 @@
24
24
  import sys, os, re, resource
25
25
  from os import path
26
26
  import numpy as np
27
- from Bio.SubsMat.MatrixInfo import blosum62
28
27
  import time
29
28
  from time import gmtime, strftime
30
29
  from operator import itemgetter
@@ -36,255 +35,587 @@ from sklearn.decomposition import PCA
36
35
  from sklearn.manifold import MDS
37
36
  import faiss
38
37
  from query import *
38
+ try:
39
+ from Bio.SubsMat.MatrixInfo import blosum62
40
+ print(blosum62)
41
+ except ModuleNotFoundError:
42
+ from Bio.Align import substitution_matrices
43
+ blosum62 = substitution_matrices.load("BLOSUM62")
44
+ _tmp = {}
45
+ for ab1 in blosum62.alphabet:
46
+ for ab2 in blosum62.alphabet:
47
+ _tmp[(ab1, ab2)] = int(blosum62[(ab1, ab2)])
48
+ blosum62 = _tmp
49
+ print(blosum62)
39
50
 
40
- AAstring='ACDEFGHIKLMNPQRSTVWY'
41
- AAstringList=list(AAstring)
42
- cur_dir=os.path.dirname(os.path.realpath(__file__))+'/'
51
+ AAstring = "ACDEFGHIKLMNPQRSTVWY"
52
+ AAstringList = list(AAstring)
53
+ cur_dir = os.path.dirname(os.path.realpath(__file__)) + "/"
43
54
 
44
- blosum62n={}
55
+ blosum62n = {}
45
56
  for kk in blosum62:
46
- a1=kk[0]
47
- a2=kk[1]
48
- vv=blosum62[kk]
49
- if vv>4:
50
- vv=4
51
- blosum62n[(a1,a2)]=vv
57
+ a1 = kk[0]
58
+ a2 = kk[1]
59
+ vv = blosum62[kk]
60
+ if vv > 4:
61
+ vv = 4
62
+ blosum62n[(a1, a2)] = vv
52
63
  if a1 != a2:
53
- blosum62n[(a2,a1)]=vv
54
-
55
- bl62={'A':[4,-1,-2,-2,0,-1,-1,0,-2,-1,-1,-1,-1,-2,-1,1,0,-3,-2,0],
56
- 'R':[-1,4,0,-2,-3,1,0,-2,0,-3,-2,2,-1,-3,-2,-1,-1,-3,-2,-3],
57
- 'N':[-2,0,4,1,-3,0,0,0,1,-3,-3,0,-2,-3,-2,1,0,-4,-2,-3],
58
- 'D':[-2,-2,1,4,-3,0,2,-1,-1,-3,-4,-1,-3,-3,-1,0,-1,-4,-3,-3],
59
- 'C':[0,-3,-3,-3,4,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1],
60
- 'Q':[-1,1,0,0,-3,4,2,-2,0,-3,-2,1,0,-3,-1,0,-1,-2,-1,-2],
61
- 'E':[-1,0,0,2,-4,2,4,-2,0,-3,-3,1,-2,-3,-1,0,-1,-3,-2,-2],
62
- 'G':[0,-2,0,-1,-3,-2,-2,4,-2,-4,-4,-2,-3,-3,-2,0,-2,-2,-3,-3],
63
- 'H':[-2,0,1,-1,-3,0,0,-2,4,-3,-3,-1,-2,-1,-2,-1,-2,-2,2,-3],
64
- 'I':[-1,-3,-3,-3,-1,-3,-3,-4,-3,4,2,-3,1,0,-3,-2,-1,-3,-1,3],
65
- 'L':[-1,-2,-3,-4,-1,-2,-3,-4,-3,2,4,-2,2,0,-3,-2,-1,-2,-1,1],
66
- 'K':[-1,2,0,-1,-3,1,1,-2,-1,-3,-2,4,-1,-3,-1,0,-1,-3,-2,-2],
67
- 'M':[-1,-1,-2,-3,-1,0,-2,-3,-2,1,2,-1,4,0,-2,-1,-1,-1,-1,1],
68
- 'F':[-2,-3,-3,-3,-2,-3,-3,-3,-1,0,0,-3,0,4,-4,-2,-2,1,3,-1],
69
- 'P':[-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4,4,-1,-1,-4,-3,-2],
70
- 'S':[1,-1,1,0,-1,0,0,0,-1,-2,-2,0,-1,-2,-1,4,1,-3,-2,-2],
71
- 'T':[0,-1,0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1,1,4,-2,-2,0],
72
- 'W':[-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1,1,-4,-3,-2,4,2,-3],
73
- 'Y':[-2,-2,-2,-3,-2,-1,-2,-3,2,-1,-1,-2,-1,3,-3,-2,-2,2,4,-1],
74
- 'V':[0,-3,-3,-3,-1,-2,-2,-3,-3,3,1,-2,1,-1,-2,-2,0,-3,-1,4]}
75
-
76
- bl62c=np.array([np.array(x) for x in list(bl62.values())])
77
- bl62c=4-bl62c
78
-
79
- embedding=MDS(n_components=13, n_init=100, max_iter=1000, eps=0.00001, dissimilarity='precomputed')
80
- X=embedding.fit_transform(bl62c)
81
-
82
- bl62np={}
83
- vkk=list(bl62.keys())
64
+ blosum62n[(a2, a1)] = vv
65
+
66
+ bl62 = {
67
+ "A": [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],
68
+ "R": [-1, 4, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],
69
+ "N": [-2, 0, 4, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],
70
+ "D": [-2, -2, 1, 4, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],
71
+ "C": [0, -3, -3, -3, 4, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
72
+ "Q": [-1, 1, 0, 0, -3, 4, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],
73
+ "E": [-1, 0, 0, 2, -4, 2, 4, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],
74
+ "G": [0, -2, 0, -1, -3, -2, -2, 4, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],
75
+ "H": [-2, 0, 1, -1, -3, 0, 0, -2, 4, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],
76
+ "I": [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],
77
+ "L": [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],
78
+ "K": [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 4, -1, -3, -1, 0, -1, -3, -2, -2],
79
+ "M": [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 4, 0, -2, -1, -1, -1, -1, 1],
80
+ "F": [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 4, -4, -2, -2, 1, 3, -1],
81
+ "P": [
82
+ -1,
83
+ -2,
84
+ -2,
85
+ -1,
86
+ -3,
87
+ -1,
88
+ -1,
89
+ -2,
90
+ -2,
91
+ -3,
92
+ -3,
93
+ -1,
94
+ -2,
95
+ -4,
96
+ 4,
97
+ -1,
98
+ -1,
99
+ -4,
100
+ -3,
101
+ -2,
102
+ ],
103
+ "S": [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],
104
+ "T": [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 4, -2, -2, 0],
105
+ "W": [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 4, 2, -3],
106
+ "Y": [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 4, -1],
107
+ "V": [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],
108
+ }
109
+
110
+ bl62c = np.array([np.array(x) for x in list(bl62.values())])
111
+ bl62c = 4 - bl62c
112
+
113
+ embedding = MDS(
114
+ n_components=13, n_init=100, max_iter=1000, eps=0.00001, dissimilarity="precomputed"
115
+ )
116
+ X = embedding.fit_transform(bl62c)
117
+
118
+ bl62np = {}
119
+ vkk = list(bl62.keys())
84
120
  for ii in range(20):
85
- kk=vkk[ii]
86
- bl62np[kk]=np.array(list(X[ii,])+[0]*17)
121
+ kk = vkk[ii]
122
+ bl62np[kk] = np.array(list(X[ii,]) + [0] * 17)
87
123
 
88
-
89
- AAencodingDict={}
124
+
125
+ AAencodingDict = {}
90
126
  for ii in range(len(AAstringList)):
91
- aa=AAstringList[ii]
92
- CODE=[0]*(ii)+[1]+[0]*(20-ii)
93
- AAencodingDict[aa]=np.array(CODE)
94
-
95
- Ndim=16 ## optimized for isometric embedding
96
- n0=Ndim*6
97
- #M0=np.concatenate((np.concatenate((ZERO,M1),axis=1),np.concatenate((M1, ZERO),axis=1)))
98
- ZERO=np.zeros((Ndim,Ndim))
99
- II=np.eye(Ndim)
100
- M0=np.concatenate((np.concatenate((ZERO,ZERO, II),axis=1),np.concatenate((II, ZERO, ZERO),axis=1),np.concatenate((ZERO,II, ZERO),axis=1)))
127
+ aa = AAstringList[ii]
128
+ CODE = [0] * (ii) + [1] + [0] * (20 - ii)
129
+ AAencodingDict[aa] = np.array(CODE)
130
+
131
+ Ndim = 16 ## optimized for isometric embedding
132
+ n0 = Ndim * 6
133
+ # M0=np.concatenate((np.concatenate((ZERO,M1),axis=1),np.concatenate((M1, ZERO),axis=1)))
134
+ ZERO = np.zeros((Ndim, Ndim))
135
+ II = np.eye(Ndim)
136
+ M0 = np.concatenate(
137
+ (
138
+ np.concatenate((ZERO, ZERO, II), axis=1),
139
+ np.concatenate((II, ZERO, ZERO), axis=1),
140
+ np.concatenate((ZERO, II, ZERO), axis=1),
141
+ )
142
+ )
101
143
  ## Construct 6-th order cyclic group
102
- ZERO45=np.zeros((Ndim*3,Ndim*3))
103
- M6=np.concatenate((np.concatenate((ZERO45,M0),axis=1),np.concatenate((M0, ZERO45),axis=1)))
104
-
105
- X=np.array([[-0.31230882, -0.53572156, -0.01949946, -0.12211268, -0.70947917,
106
- -0.42211092, 0.02783931, 0.02637933, -0.41760305, 0.21809875,
107
- 0.53532768, 0.04833016, 0.07877711, 0.50464914, -0.26972087,
108
- -0.52416842],
109
- [ 0.29672002, 0.29005364, 0.18176298, -0.05103382, -0.34686519,
110
- 0.58024228, -0.49282931, 0.62304281, -0.09575202, 0.30115555,
111
- 0.09913529, 0.1577466 , -0.94391939, -0.10505925, 0.05482389,
112
- 0.38409897],
113
- [-0.42212537, 0.12225749, 0.16279646, 0.60099009, 0.19734216,
114
- 0.42819919, -0.33562418, 0.17036334, 0.4234109 , 0.46681561,
115
- -0.50347222, -0.37936876, 0.1494825 , 0.32176759, 0.28584684,
116
- 0.68469861],
117
- [ 0.18599294, -0.44017825, -0.4476952 , 0.34340976, 0.44603553,
118
- 0.40974629, -0.60045935, -0.09056728, 0.22147919, -0.33029418,
119
- 0.55635594, -0.54149972, 0.05459062, 0.57334159, -0.06227118,
120
- 0.65299872],
121
- [-0.19010428, 0.64418792, -0.85286762, 0.21380295, 0.37639516,
122
- -0.67753593, 0.38751609, 0.55746524, 0.01443766, 0.1776535 ,
123
- 0.62853954, -0.15048523, 0.55100206, -0.21426656, 0.3644061 ,
124
- -0.0018255 ],
125
- [ 0.7350723 , 0.10111267, 0.55640019, -0.18226966, 0.51658102,
126
- -0.19321508, -0.46599027, -0.02989911, 0.4036196 , -0.11978213,
127
- -0.29837524, -0.30232765, -0.36738065, -0.1379793 , 0.04362871,
128
- 0.33553714],
129
- [ 0.41134047, 0.13512443, 0.62492322, -0.10120261, -0.03093491,
130
- 0.23751917, -0.68338694, 0.05124762, 0.41533821, 0.46669353,
131
- 0.31467277, -0.02427587, 0.15361135, 0.70595112, -0.27952632,
132
- 0.32408931],
133
- [-0.33041265, -0.43860065, -0.5509376 , -0.04380843, -0.35160935,
134
- 0.25134855, 0.53409314, 0.54850824, 0.59490287, 0.32669345,
135
- -0.45355268, -0.56317041, -0.55416297, 0.18117841, -0.71600849,
136
- -0.08989825],
137
- [-0.40366849, 0.10978974, 0.0280101 , -0.46667987, -0.45607028,
138
- 0.54114052, -0.77552923, -0.10720425, 0.55252091, -0.34397153,
139
- -0.59813694, 0.15567728, 0.03071009, -0.02176143, 0.34442719,
140
- 0.14681541],
141
- [ 0.19280422, 0.35777863, 0.06139255, 0.20081699, -0.30546596,
142
- -0.56901549, -0.15290953, -0.31181573, -0.74523217, 0.22296016,
143
- -0.39143832, -0.16474685, 0.58064427, -0.77386654, 0.19713107,
144
- -0.49477418],
145
- [-0.16133903, 0.22112761, -0.53162136, 0.34764073, -0.08522381,
146
- -0.2510216 , 0.04699411, -0.25702389, -0.8739765 , -0.24171728,
147
- -0.24370533, 0.42193635, 0.41056913, -0.60378211, -0.65756832,
148
- 0.0845203 ],
149
- [-0.34792144, 0.18450939, 0.77038332, 0.63868511, -0.06221681,
150
- 0.11930421, 0.04895523, -0.22463059, -0.03268844, -0.58941354,
151
- 0.11640045, 0.32384901, -0.42952779, 0.58119471, 0.07288662,
152
- 0.26669673],
153
- [ 0.01834555, -0.16367754, 0.34900298, 0.45087949, 0.47073855,
154
- -0.37377404, 0.0606911 , 0.2455703 , -0.55182937, -0.20261009,
155
- 0.28325423, -0.04741146, 0.30565238, -0.62090653, 0.17528413,
156
- -0.60434975],
157
- [-0.55464981, 0.50918784, -0.21371646, -0.63996967, -0.37656862,
158
- 0.27852662, 0.3287838 , -0.56800869, 0.23260763, -0.20653106,
159
- 0.63261439, -0.22666691, 0.00726302, -0.60125196, 0.07139961,
160
- -0.35086639],
161
- [ 0.94039731, -0.25999326, 0.43922549, -0.485738 , -0.20492235,
162
- -0.26005626, 0.68776626, 0.57826888, -0.05973995, -0.1193658 ,
163
- -0.12102433, -0.22091354, 0.43427913, 0.71447886, 0.32745991,
164
- 0.03466398],
165
- [-0.13194625, -0.12262688, 0.18029209, 0.16555524, 0.39594125,
166
- -0.58110665, 0.16161717, 0.0839783 , 0.0911945 , 0.34546976,
167
- -0.29415349, 0.29891936, -0.60834721, 0.5943593 , -0.29473819,
168
- 0.4864154 ],
169
- [ 0.40850093, -0.4638894 , -0.39732987, -0.01972861, 0.51189582,
170
- 0.10176704, 0.37528519, -0.41479418, -0.1932531 , 0.54732221,
171
- -0.11876511, 0.32843973, -0.259283 , 0.59500132, 0.35168375,
172
- -0.21733727],
173
- [-0.50627723, -0.1973602 , -0.02339884, -0.66846048, 0.62696606,
174
- 0.60049717, 0.69143364, -0.48053591, 0.17812208, -0.58481821,
175
- -0.23551415, -0.06229112, 0.20993116, -0.72485884, 0.34375662,
176
- -0.23539168],
177
- [-0.51388312, -0.2788953 , 0.00859533, -0.5247195 , -0.18021544,
178
- 0.28372911, 0.10791359, 0.13033494, 0.34294013, -0.70310089,
179
- -0.13245433, 0.48661081, 0.08451644, -0.69990992, 0.0408274 ,
180
- -0.47204888],
181
- [ 0.68546275, 0.22581365, -0.32571833, 0.34394298, -0.43232367,
182
- -0.5041842 , 0.04784017, -0.53067936, -0.50049908, 0.36874221,
183
- 0.22429186, 0.4616482 , 0.11159174, -0.26827959, -0.39372848,
184
- -0.40987423]])
185
-
186
- bl62np={}
187
- vkk=list(bl62.keys())
144
+ ZERO45 = np.zeros((Ndim * 3, Ndim * 3))
145
+ M6 = np.concatenate(
146
+ (np.concatenate((ZERO45, M0), axis=1), np.concatenate((M0, ZERO45), axis=1))
147
+ )
148
+
149
+ X = np.array(
150
+ [
151
+ [
152
+ -0.31230882,
153
+ -0.53572156,
154
+ -0.01949946,
155
+ -0.12211268,
156
+ -0.70947917,
157
+ -0.42211092,
158
+ 0.02783931,
159
+ 0.02637933,
160
+ -0.41760305,
161
+ 0.21809875,
162
+ 0.53532768,
163
+ 0.04833016,
164
+ 0.07877711,
165
+ 0.50464914,
166
+ -0.26972087,
167
+ -0.52416842,
168
+ ],
169
+ [
170
+ 0.29672002,
171
+ 0.29005364,
172
+ 0.18176298,
173
+ -0.05103382,
174
+ -0.34686519,
175
+ 0.58024228,
176
+ -0.49282931,
177
+ 0.62304281,
178
+ -0.09575202,
179
+ 0.30115555,
180
+ 0.09913529,
181
+ 0.1577466,
182
+ -0.94391939,
183
+ -0.10505925,
184
+ 0.05482389,
185
+ 0.38409897,
186
+ ],
187
+ [
188
+ -0.42212537,
189
+ 0.12225749,
190
+ 0.16279646,
191
+ 0.60099009,
192
+ 0.19734216,
193
+ 0.42819919,
194
+ -0.33562418,
195
+ 0.17036334,
196
+ 0.4234109,
197
+ 0.46681561,
198
+ -0.50347222,
199
+ -0.37936876,
200
+ 0.1494825,
201
+ 0.32176759,
202
+ 0.28584684,
203
+ 0.68469861,
204
+ ],
205
+ [
206
+ 0.18599294,
207
+ -0.44017825,
208
+ -0.4476952,
209
+ 0.34340976,
210
+ 0.44603553,
211
+ 0.40974629,
212
+ -0.60045935,
213
+ -0.09056728,
214
+ 0.22147919,
215
+ -0.33029418,
216
+ 0.55635594,
217
+ -0.54149972,
218
+ 0.05459062,
219
+ 0.57334159,
220
+ -0.06227118,
221
+ 0.65299872,
222
+ ],
223
+ [
224
+ -0.19010428,
225
+ 0.64418792,
226
+ -0.85286762,
227
+ 0.21380295,
228
+ 0.37639516,
229
+ -0.67753593,
230
+ 0.38751609,
231
+ 0.55746524,
232
+ 0.01443766,
233
+ 0.1776535,
234
+ 0.62853954,
235
+ -0.15048523,
236
+ 0.55100206,
237
+ -0.21426656,
238
+ 0.3644061,
239
+ -0.0018255,
240
+ ],
241
+ [
242
+ 0.7350723,
243
+ 0.10111267,
244
+ 0.55640019,
245
+ -0.18226966,
246
+ 0.51658102,
247
+ -0.19321508,
248
+ -0.46599027,
249
+ -0.02989911,
250
+ 0.4036196,
251
+ -0.11978213,
252
+ -0.29837524,
253
+ -0.30232765,
254
+ -0.36738065,
255
+ -0.1379793,
256
+ 0.04362871,
257
+ 0.33553714,
258
+ ],
259
+ [
260
+ 0.41134047,
261
+ 0.13512443,
262
+ 0.62492322,
263
+ -0.10120261,
264
+ -0.03093491,
265
+ 0.23751917,
266
+ -0.68338694,
267
+ 0.05124762,
268
+ 0.41533821,
269
+ 0.46669353,
270
+ 0.31467277,
271
+ -0.02427587,
272
+ 0.15361135,
273
+ 0.70595112,
274
+ -0.27952632,
275
+ 0.32408931,
276
+ ],
277
+ [
278
+ -0.33041265,
279
+ -0.43860065,
280
+ -0.5509376,
281
+ -0.04380843,
282
+ -0.35160935,
283
+ 0.25134855,
284
+ 0.53409314,
285
+ 0.54850824,
286
+ 0.59490287,
287
+ 0.32669345,
288
+ -0.45355268,
289
+ -0.56317041,
290
+ -0.55416297,
291
+ 0.18117841,
292
+ -0.71600849,
293
+ -0.08989825,
294
+ ],
295
+ [
296
+ -0.40366849,
297
+ 0.10978974,
298
+ 0.0280101,
299
+ -0.46667987,
300
+ -0.45607028,
301
+ 0.54114052,
302
+ -0.77552923,
303
+ -0.10720425,
304
+ 0.55252091,
305
+ -0.34397153,
306
+ -0.59813694,
307
+ 0.15567728,
308
+ 0.03071009,
309
+ -0.02176143,
310
+ 0.34442719,
311
+ 0.14681541,
312
+ ],
313
+ [
314
+ 0.19280422,
315
+ 0.35777863,
316
+ 0.06139255,
317
+ 0.20081699,
318
+ -0.30546596,
319
+ -0.56901549,
320
+ -0.15290953,
321
+ -0.31181573,
322
+ -0.74523217,
323
+ 0.22296016,
324
+ -0.39143832,
325
+ -0.16474685,
326
+ 0.58064427,
327
+ -0.77386654,
328
+ 0.19713107,
329
+ -0.49477418,
330
+ ],
331
+ [
332
+ -0.16133903,
333
+ 0.22112761,
334
+ -0.53162136,
335
+ 0.34764073,
336
+ -0.08522381,
337
+ -0.2510216,
338
+ 0.04699411,
339
+ -0.25702389,
340
+ -0.8739765,
341
+ -0.24171728,
342
+ -0.24370533,
343
+ 0.42193635,
344
+ 0.41056913,
345
+ -0.60378211,
346
+ -0.65756832,
347
+ 0.0845203,
348
+ ],
349
+ [
350
+ -0.34792144,
351
+ 0.18450939,
352
+ 0.77038332,
353
+ 0.63868511,
354
+ -0.06221681,
355
+ 0.11930421,
356
+ 0.04895523,
357
+ -0.22463059,
358
+ -0.03268844,
359
+ -0.58941354,
360
+ 0.11640045,
361
+ 0.32384901,
362
+ -0.42952779,
363
+ 0.58119471,
364
+ 0.07288662,
365
+ 0.26669673,
366
+ ],
367
+ [
368
+ 0.01834555,
369
+ -0.16367754,
370
+ 0.34900298,
371
+ 0.45087949,
372
+ 0.47073855,
373
+ -0.37377404,
374
+ 0.0606911,
375
+ 0.2455703,
376
+ -0.55182937,
377
+ -0.20261009,
378
+ 0.28325423,
379
+ -0.04741146,
380
+ 0.30565238,
381
+ -0.62090653,
382
+ 0.17528413,
383
+ -0.60434975,
384
+ ],
385
+ [
386
+ -0.55464981,
387
+ 0.50918784,
388
+ -0.21371646,
389
+ -0.63996967,
390
+ -0.37656862,
391
+ 0.27852662,
392
+ 0.3287838,
393
+ -0.56800869,
394
+ 0.23260763,
395
+ -0.20653106,
396
+ 0.63261439,
397
+ -0.22666691,
398
+ 0.00726302,
399
+ -0.60125196,
400
+ 0.07139961,
401
+ -0.35086639,
402
+ ],
403
+ [
404
+ 0.94039731,
405
+ -0.25999326,
406
+ 0.43922549,
407
+ -0.485738,
408
+ -0.20492235,
409
+ -0.26005626,
410
+ 0.68776626,
411
+ 0.57826888,
412
+ -0.05973995,
413
+ -0.1193658,
414
+ -0.12102433,
415
+ -0.22091354,
416
+ 0.43427913,
417
+ 0.71447886,
418
+ 0.32745991,
419
+ 0.03466398,
420
+ ],
421
+ [
422
+ -0.13194625,
423
+ -0.12262688,
424
+ 0.18029209,
425
+ 0.16555524,
426
+ 0.39594125,
427
+ -0.58110665,
428
+ 0.16161717,
429
+ 0.0839783,
430
+ 0.0911945,
431
+ 0.34546976,
432
+ -0.29415349,
433
+ 0.29891936,
434
+ -0.60834721,
435
+ 0.5943593,
436
+ -0.29473819,
437
+ 0.4864154,
438
+ ],
439
+ [
440
+ 0.40850093,
441
+ -0.4638894,
442
+ -0.39732987,
443
+ -0.01972861,
444
+ 0.51189582,
445
+ 0.10176704,
446
+ 0.37528519,
447
+ -0.41479418,
448
+ -0.1932531,
449
+ 0.54732221,
450
+ -0.11876511,
451
+ 0.32843973,
452
+ -0.259283,
453
+ 0.59500132,
454
+ 0.35168375,
455
+ -0.21733727,
456
+ ],
457
+ [
458
+ -0.50627723,
459
+ -0.1973602,
460
+ -0.02339884,
461
+ -0.66846048,
462
+ 0.62696606,
463
+ 0.60049717,
464
+ 0.69143364,
465
+ -0.48053591,
466
+ 0.17812208,
467
+ -0.58481821,
468
+ -0.23551415,
469
+ -0.06229112,
470
+ 0.20993116,
471
+ -0.72485884,
472
+ 0.34375662,
473
+ -0.23539168,
474
+ ],
475
+ [
476
+ -0.51388312,
477
+ -0.2788953,
478
+ 0.00859533,
479
+ -0.5247195,
480
+ -0.18021544,
481
+ 0.28372911,
482
+ 0.10791359,
483
+ 0.13033494,
484
+ 0.34294013,
485
+ -0.70310089,
486
+ -0.13245433,
487
+ 0.48661081,
488
+ 0.08451644,
489
+ -0.69990992,
490
+ 0.0408274,
491
+ -0.47204888,
492
+ ],
493
+ [
494
+ 0.68546275,
495
+ 0.22581365,
496
+ -0.32571833,
497
+ 0.34394298,
498
+ -0.43232367,
499
+ -0.5041842,
500
+ 0.04784017,
501
+ -0.53067936,
502
+ -0.50049908,
503
+ 0.36874221,
504
+ 0.22429186,
505
+ 0.4616482,
506
+ 0.11159174,
507
+ -0.26827959,
508
+ -0.39372848,
509
+ -0.40987423,
510
+ ],
511
+ ]
512
+ )
513
+
514
+ bl62np = {}
515
+ vkk = list(bl62.keys())
188
516
  for ii in range(20):
189
- kk=vkk[ii]
190
- bl62np[kk]=np.array(list(X[ii,])+[0]*Ndim*5)
517
+ kk = vkk[ii]
518
+ bl62np[kk] = np.array(list(X[ii,]) + [0] * Ndim * 5)
519
+
191
520
 
192
521
  def EncodingCDR3(s, M, n0):
193
- sL=list(s)
194
- x=np.array([0]*n0)
522
+ sL = list(s)
523
+ x = np.array([0] * n0)
195
524
  for ii in range(len(sL)):
196
- x = np.dot(M, (x+bl62np[sL[ii]]))
525
+ x = np.dot(M, (x + bl62np[sL[ii]]))
197
526
  return x
198
527
 
528
+
199
529
  def BuildLengthDict(seqs, sIDs, vGene=[], INFO=[]):
200
- LLs=[10,11,12,13,14,15,16,17,18,19,20,21,22,23,24]
201
- LengthD={}
202
- SeqD={}
203
- VgeneD={}
204
- InfoD={}
205
- AAs=set(list(AAencodingDict.keys()))
206
- NAs=len(AAencodingDict)
207
- cNAs=0
530
+ LLs = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
531
+ LengthD = {}
532
+ SeqD = {}
533
+ VgeneD = {}
534
+ InfoD = {}
535
+ AAs = set(list(AAencodingDict.keys()))
536
+ NAs = len(AAencodingDict)
537
+ cNAs = 0
208
538
  for ii in range(len(seqs)):
209
- ID=sIDs[ii]
210
- ss=seqs[ii]
211
- ssAA=set(list(ss))
212
- TMP=list(ssAA | AAs)
539
+ ID = sIDs[ii]
540
+ ss = seqs[ii]
541
+ ssAA = set(list(ss))
542
+ TMP = list(ssAA | AAs)
213
543
  if len(TMP) > NAs:
214
544
  ## CDR3 containing non amino acid letter
215
- #print('Warning: CDR3: '+ss + ' contains non amino acid letter!')
216
- cNAs+=1
545
+ # print('Warning: CDR3: '+ss + ' contains non amino acid letter!')
546
+ cNAs += 1
217
547
  continue
218
- if len(vGene)>0:
219
- vv=vGene[ii]
220
- if len(INFO)>0:
221
- info=INFO[ii]
222
- L=len(ss)
548
+ if len(vGene) > 0:
549
+ vv = vGene[ii]
550
+ if len(INFO) > 0:
551
+ info = INFO[ii]
552
+ L = len(ss)
223
553
  if L not in LLs:
224
554
  continue
225
555
  if L not in LengthD:
226
- LengthD[L]=[ID]
227
- SeqD[L]=[ss]
228
- if len(vGene)>0:
229
- VgeneD[L]=[vv]
230
- if len(INFO)>0:
231
- InfoD[L]=[info]
556
+ LengthD[L] = [ID]
557
+ SeqD[L] = [ss]
558
+ if len(vGene) > 0:
559
+ VgeneD[L] = [vv]
560
+ if len(INFO) > 0:
561
+ InfoD[L] = [info]
232
562
  else:
233
563
  LengthD[L].append(ID)
234
564
  SeqD[L].append(ss)
235
- if len(vGene)>0:
565
+ if len(vGene) > 0:
236
566
  VgeneD[L].append(vv)
237
- if len(INFO)>0:
567
+ if len(INFO) > 0:
238
568
  InfoD[L].append(info)
239
- if cNAs>0:
240
- print("Warning: Skipped %d sequences with non AA letter!" %(cNAs))
569
+ if cNAs > 0:
570
+ print("Warning: Skipped %d sequences with non AA letter!" % (cNAs))
241
571
  return LengthD, VgeneD, InfoD, SeqD
242
572
 
573
+
243
574
  def CollapseUnique(LD, VD, ID, SD):
244
- kks=LD.keys()
245
- LDu={}
246
- VDu={}
247
- IDu={}
248
- SDu={}
575
+ kks = LD.keys()
576
+ LDu = {}
577
+ VDu = {}
578
+ IDu = {}
579
+ SDu = {}
249
580
  for kk in kks:
250
- vvL=list(LD[kk])
251
- if len(VD)>0:
252
- vvV=list(VD[kk])
581
+ vvL = list(LD[kk])
582
+ if len(VD) > 0:
583
+ vvV = list(VD[kk])
253
584
  else:
254
- vvV=['TRBV2-1*01']*len(vvL)
255
- vvI=list(ID[kk])
256
- vvS=list(SD[kk])
257
- zz=zip(vvL, vvS, vvV, vvI)
258
- zzs=sorted(zz, key = lambda x: (x[1], x[2]))
259
- nz=len(zzs)
260
- pointer_pre=0
261
- pointer_cur=1
262
- s_pre=zzs[pointer_pre][1]
263
- v_pre=zzs[pointer_pre][2]
264
- uS=[s_pre]
265
- uV=[v_pre]
266
- uI=[[zzs[pointer_pre][3]]]
585
+ vvV = ["TRBV2-1*01"] * len(vvL)
586
+ vvI = list(ID[kk])
587
+ vvS = list(SD[kk])
588
+ zz = zip(vvL, vvS, vvV, vvI)
589
+ zzs = sorted(zz, key=lambda x: (x[1], x[2]))
590
+ nz = len(zzs)
591
+ pointer_pre = 0
592
+ pointer_cur = 1
593
+ s_pre = zzs[pointer_pre][1]
594
+ v_pre = zzs[pointer_pre][2]
595
+ uS = [s_pre]
596
+ uV = [v_pre]
597
+ uI = [[zzs[pointer_pre][3]]]
267
598
  while pointer_cur < nz:
268
- s_cur=zzs[pointer_cur][1]
269
- v_cur=zzs[pointer_cur][2]
599
+ s_cur = zzs[pointer_cur][1]
600
+ v_cur = zzs[pointer_cur][2]
270
601
  if s_cur == s_pre and v_cur == v_pre:
271
- uI[len(uI)-1].append(zzs[pointer_cur][3])
602
+ uI[len(uI) - 1].append(zzs[pointer_cur][3])
272
603
  pointer_cur += 1
273
604
  continue
274
605
  else:
275
606
  uS.append(s_cur)
276
607
  uV.append(v_cur)
277
608
  uI.append([zzs[pointer_cur][3]])
278
- s_pre=s_cur
279
- v_pre=v_cur
280
- pointer_pre=pointer_cur
609
+ s_pre = s_cur
610
+ v_pre = v_cur
611
+ pointer_pre = pointer_cur
281
612
  pointer_cur += 1
282
- uL=[x for x in range(len(uS))]
283
- LDu[kk]=uL
284
- SDu[kk]=uS
285
- if len(VD)>0:
286
- VDu[kk]=uV
287
- IDu[kk]=uI
613
+ uL = [x for x in range(len(uS))]
614
+ LDu[kk] = uL
615
+ SDu[kk] = uS
616
+ if len(VD) > 0:
617
+ VDu[kk] = uV
618
+ IDu[kk] = uI
288
619
  return LDu, VDu, IDu, SDu
289
620
 
290
621
 
@@ -296,14 +627,15 @@ class CDR3:
296
627
  ## KS: Kmer size
297
628
  ## st: the first 0:(st-1) amino acids will not be included in K-merization
298
629
  ## ed: the last L-ed amino acids will be skipped
299
- self.s=s
300
- self.ID=sID
301
- L=len(s)
302
- self.L=L
303
- sub_s=s[st: (L-ed)]
304
- Ls=len(sub_s)
305
- Kmer=[sub_s[x:(x+KS)] for x in range(0,Ls-KS+1)]
306
- self.Kmer=Kmer
630
+ self.s = s
631
+ self.ID = sID
632
+ L = len(s)
633
+ self.L = L
634
+ sub_s = s[st : (L - ed)]
635
+ Ls = len(sub_s)
636
+ Kmer = [sub_s[x : (x + KS)] for x in range(0, Ls - KS + 1)]
637
+ self.Kmer = Kmer
638
+
307
639
 
308
640
  class KmerSet:
309
641
  ## Kmer set for fast read searching based on mismatch-allowed Kmer index
@@ -312,263 +644,277 @@ class KmerSet:
312
644
  ## Seqs and sIDs must have the same length
313
645
  if len(Seqs) != len(sIDs):
314
646
  raise "Sequence and ID lists have different length. Please check input."
315
- KmerDict={}
316
- N=len(Seqs)
317
- self.N=N
318
- CDR3Dict={}
319
- LLs=[]
320
- for ii in range(0,N):
321
- s=Seqs[ii]
322
- sID=sIDs[ii]
323
- cc=CDR3(s,sID,KS,st,ed)
324
- CDR3Dict[cc.ID]=cc.Kmer
325
- KK=cc.Kmer
647
+ KmerDict = {}
648
+ N = len(Seqs)
649
+ self.N = N
650
+ CDR3Dict = {}
651
+ LLs = []
652
+ for ii in range(0, N):
653
+ s = Seqs[ii]
654
+ sID = sIDs[ii]
655
+ cc = CDR3(s, sID, KS, st, ed)
656
+ CDR3Dict[cc.ID] = cc.Kmer
657
+ KK = cc.Kmer
326
658
  LLs.append(cc.L)
327
659
  for kk in KK:
328
660
  if kk not in KmerDict:
329
- KmerDict[kk]=[sID]
661
+ KmerDict[kk] = [sID]
330
662
  else:
331
663
  KmerDict[kk].append(sID)
332
- self.KD=KmerDict
333
- self.KS=KS
334
- self.CD=CDR3Dict
335
- self.LL=LLs
336
- def FindKmerNeighbor(self,kk):
337
- KS=self.KS
338
- KS_n1=[]
664
+ self.KD = KmerDict
665
+ self.KS = KS
666
+ self.CD = CDR3Dict
667
+ self.LL = LLs
668
+
669
+ def FindKmerNeighbor(self, kk):
670
+ KS = self.KS
671
+ KS_n1 = []
339
672
  for jj in range(KS):
340
- kk_pre=[kk[0:jj]]*20
341
- kk_suf=[kk[(jj+1):KS]]*20
342
- kkn=list(zip(kk_pre,AAstringList,kk_suf))
343
- KS_n1+=[''.join(list(x)) for x in kkn]
673
+ kk_pre = [kk[0:jj]] * 20
674
+ kk_suf = [kk[(jj + 1) : KS]] * 20
675
+ kkn = list(zip(kk_pre, AAstringList, kk_suf))
676
+ KS_n1 += ["".join(list(x)) for x in kkn]
344
677
  return KS_n1
345
- def FindKmerNeighbor2(self,kk):
678
+
679
+ def FindKmerNeighbor2(self, kk):
346
680
  ## KS>=6, allowing 2 mismatches. CDR3 length must be >= 10
347
- KS=self.KS
348
- KS_n1=[]
681
+ KS = self.KS
682
+ KS_n1 = []
349
683
  for jj in range(KS):
350
684
  for ii in range(KS):
351
- if ii<=jj:
685
+ if ii <= jj:
352
686
  continue
353
- kk_pre=[kk[0:jj]]*20
354
- kk_mid=[kk[(jj+1):ii]]*20
355
- kk_suf=[kk[(ii+1):KS]]*400
356
- kkn=list(zip(kk_pre,AAstringList,kk_mid))
357
- kkn=[''.join(list(x)) for x in kkn]
358
- kkn=[[x]*20 for x in kkn]
359
- kkn=list(chain(*kkn))
360
- kkn2=list(zip(kkn, AAstringList*20, kk_suf))
361
- kkn2=[''.join(list(x)) for x in kkn2]
362
- KS_n1+=kkn2
687
+ kk_pre = [kk[0:jj]] * 20
688
+ kk_mid = [kk[(jj + 1) : ii]] * 20
689
+ kk_suf = [kk[(ii + 1) : KS]] * 400
690
+ kkn = list(zip(kk_pre, AAstringList, kk_mid))
691
+ kkn = ["".join(list(x)) for x in kkn]
692
+ kkn = [[x] * 20 for x in kkn]
693
+ kkn = list(chain(*kkn))
694
+ kkn2 = list(zip(kkn, AAstringList * 20, kk_suf))
695
+ kkn2 = ["".join(list(x)) for x in kkn2]
696
+ KS_n1 += kkn2
363
697
  return KS_n1
698
+
364
699
  def KmerIndex(self):
365
700
  ## For each K-mer, find its nearest neighbor with 1 character mismatch
366
- KKs=list(self.KD.keys())
367
- KS=self.KS
368
- KKs_set=set(KKs)
369
- Skk='_'.join(KKs)
370
- KI_Dict={}
701
+ KKs = list(self.KD.keys())
702
+ KS = self.KS
703
+ KKs_set = set(KKs)
704
+ Skk = "_".join(KKs)
705
+ KI_Dict = {}
371
706
  for kk in KKs:
372
- ## kk_neighbor=[]
373
- ## for jj in range(KS):
374
- ## kk_pre=kk[0:jj]
375
- ## kk_suf=kk[(jj+1):KS]
376
- ## pat=kk_pre+'['+AAstring+']{1}'+kk_suf
377
- ## p=re.compile(pat)
378
- ## mm=[m.group() for m in p.finditer(Skk)]
379
- ## kk_neighbor+=mm
380
- KS_n=set(self.FindKmerNeighbor(kk))
707
+ ## kk_neighbor=[]
708
+ ## for jj in range(KS):
709
+ ## kk_pre=kk[0:jj]
710
+ ## kk_suf=kk[(jj+1):KS]
711
+ ## pat=kk_pre+'['+AAstring+']{1}'+kk_suf
712
+ ## p=re.compile(pat)
713
+ ## mm=[m.group() for m in p.finditer(Skk)]
714
+ ## kk_neighbor+=mm
715
+ KS_n = set(self.FindKmerNeighbor(kk))
381
716
  kk_neighbor = KS_n & KKs_set
382
- KI_Dict[kk]=list(kk_neighbor)
717
+ KI_Dict[kk] = list(kk_neighbor)
383
718
  return KI_Dict
719
+
384
720
  def updateKD(self, KI):
385
721
  ## group sequences sharing motifs with 1-2 mismatches
386
- KD=self.KD
387
- KDnew={}
722
+ KD = self.KD
723
+ KDnew = {}
388
724
  for kk in KD:
389
- kkm=KI[kk]
390
- vvL=itemgetter(*kkm)(KD)
391
- if isinstance(vvL[0],list):
392
- vvL=list(chain(*vvL))
393
- KDnew[kk]=vvL
725
+ kkm = KI[kk]
726
+ vvL = itemgetter(*kkm)(KD)
727
+ if isinstance(vvL[0], list):
728
+ vvL = list(chain(*vvL))
729
+ KDnew[kk] = vvL
394
730
  return KDnew
395
731
 
396
- def GenerateMotifGraph(mD,seqs,seqID):
397
- SeqShareGraph={}
398
- mDL={}
732
+
733
+ def GenerateMotifGraph(mD, seqs, seqID):
734
+ SeqShareGraph = {}
735
+ mDL = {}
399
736
  for kk in mD:
400
- vv=mD[kk]
401
- LL=[]
737
+ vv = mD[kk]
738
+ LL = []
402
739
  for v in vv:
403
740
  LL.append(len(seqs[v]))
404
- mDL[kk]=LL
741
+ mDL[kk] = LL
405
742
  for kk in mD:
406
- vv=mD[kk]
407
- LL=mDL[kk]
408
- nv=len(vv)
409
- for ii in range(0,nv):
410
- id_1=vv[ii]
411
- L1=LL[ii]
412
- for jj in range(ii,nv):
413
- if jj==ii:
743
+ vv = mD[kk]
744
+ LL = mDL[kk]
745
+ nv = len(vv)
746
+ for ii in range(0, nv):
747
+ id_1 = vv[ii]
748
+ L1 = LL[ii]
749
+ for jj in range(ii, nv):
750
+ if jj == ii:
414
751
  continue
415
- id_2=vv[jj]
416
- L2=LL[jj]
752
+ id_2 = vv[jj]
753
+ L2 = LL[jj]
417
754
  if L2 != L1:
418
755
  continue
419
756
  if id_1 not in SeqShareGraph:
420
- SeqShareGraph[id_1]=[id_2]
757
+ SeqShareGraph[id_1] = [id_2]
421
758
  elif id_2 not in SeqShareGraph[id_1]:
422
759
  SeqShareGraph[id_1].append(id_2)
423
760
  if id_2 not in SeqShareGraph:
424
- SeqShareGraph[id_2]=[id_1]
761
+ SeqShareGraph[id_2] = [id_1]
425
762
  elif id_1 not in SeqShareGraph[id_2]:
426
763
  SeqShareGraph[id_2].append(id_1)
427
764
  return SeqShareGraph
428
765
 
766
+
429
767
  def generateSSG(Kset, CDR3s, k_thr=2):
430
- KD=Kset.KD
431
- KI=Kset.KmerIndex()
432
- KDnew=Kset.updateKD(KI)
433
- CD=Kset.CD
434
- LL=np.array(Kset.LL)
435
- SSG={}
768
+ KD = Kset.KD
769
+ KI = Kset.KmerIndex()
770
+ KDnew = Kset.updateKD(KI)
771
+ CD = Kset.CD
772
+ LL = np.array(Kset.LL)
773
+ SSG = {}
436
774
  for kk in CD:
437
- vv=itemgetter(*CD[kk])(KDnew)
438
- if isinstance(vv[0],list):
439
- vv=list(chain(*vv))
440
- vv1=[]
441
- c=Counter(vv)
775
+ vv = itemgetter(*CD[kk])(KDnew)
776
+ if isinstance(vv[0], list):
777
+ vv = list(chain(*vv))
778
+ vv1 = []
779
+ c = Counter(vv)
442
780
  for k in c:
443
- if c[k]>=k_thr:
781
+ if c[k] >= k_thr:
444
782
  vv1.append(k)
445
- vv1=np.array(vv1)
446
- if len(vv1)==0:
783
+ vv1 = np.array(vv1)
784
+ if len(vv1) == 0:
447
785
  continue
448
- cdr3=CDR3s[kk]
449
- L0=len(cdr3)
450
- idx=np.where(LL[vv1]==L0)[0]
451
- if len(idx)==0:
786
+ cdr3 = CDR3s[kk]
787
+ L0 = len(cdr3)
788
+ idx = np.where(LL[vv1] == L0)[0]
789
+ if len(idx) == 0:
452
790
  continue
453
- vvs=list(vv1[idx])
791
+ vvs = list(vv1[idx])
454
792
  vvs.remove(kk)
455
- if len(vvs)>0:
456
- SSG[kk]=vvs
793
+ if len(vvs) > 0:
794
+ SSG[kk] = vvs
457
795
  return SSG
458
796
 
459
- def SeqComparison(s1,s2,gap=-6):
460
- n=len(s1)
461
- CorList=[]
462
- score=0
463
- for kk in range(0,n):
464
- aa=s1[kk]
465
- bb=s2[kk]
466
- if aa in ['.','-','*'] or bb in ['.','-','*']:
467
- if aa!=bb:
797
+
798
+ def SeqComparison(s1, s2, gap=-6):
799
+ n = len(s1)
800
+ CorList = []
801
+ score = 0
802
+ for kk in range(0, n):
803
+ aa = s1[kk]
804
+ bb = s2[kk]
805
+ if aa in [".", "-", "*"] or bb in [".", "-", "*"]:
806
+ if aa != bb:
468
807
  score += gap
469
808
  continue
470
- if aa==bb:
471
- # score += min(4,blosum62[(aa,aa)])
472
- score += blosum62n[(aa,aa)]
809
+ if aa == bb:
810
+ # score += min(4,blosum62[(aa,aa)])
811
+ score += blosum62n[(aa, aa)]
473
812
  continue
474
- KEY=(aa,bb)
475
- # if KEY not in blosum62:
476
- # KEY=(bb,aa)
477
- # if KEY not in blosum62:
478
- # raise "Non-standard amino acid coding!"
479
- score+=blosum62n[KEY]
813
+ KEY = (aa, bb)
814
+ # if KEY not in blosum62:
815
+ # KEY=(bb,aa)
816
+ # if KEY not in blosum62:
817
+ # raise "Non-standard amino acid coding!"
818
+ score += blosum62n[KEY]
480
819
  return score
481
820
 
482
- def NHLocalAlignment(Seq1,Seq2,gap_thr=1,gap=-6):
483
- n1=len(Seq1)
484
- n2=len(Seq2)
485
- if n1<n2:
486
- Seq=Seq1
487
- Seq1=Seq2
488
- Seq2=Seq
489
- nn=n2-n1
821
+
822
+ def NHLocalAlignment(Seq1, Seq2, gap_thr=1, gap=-6):
823
+ n1 = len(Seq1)
824
+ n2 = len(Seq2)
825
+ if n1 < n2:
826
+ Seq = Seq1
827
+ Seq1 = Seq2
828
+ Seq2 = Seq
829
+ nn = n2 - n1
490
830
  else:
491
- nn=n1-n2
492
- if nn>gap_thr:
831
+ nn = n1 - n2
832
+ if nn > gap_thr:
493
833
  return -1
494
- SeqList1=[Seq1]
495
- SeqList2=InsertGap(Seq2,nn)
496
- alns=[]
497
- SCOREList=[]
834
+ SeqList1 = [Seq1]
835
+ SeqList2 = InsertGap(Seq2, nn)
836
+ alns = []
837
+ SCOREList = []
498
838
  for s1 in SeqList1:
499
839
  for s2 in SeqList2:
500
- SCOREList.append(SeqComparison(s1,s2,gap))
501
- maxS=max(SCOREList)
840
+ SCOREList.append(SeqComparison(s1, s2, gap))
841
+ maxS = max(SCOREList)
502
842
  return maxS
503
843
 
504
- def InsertGap(Seq,n):
844
+
845
+ def InsertGap(Seq, n):
505
846
  ## Insert n gaps to Seq; n<=2
506
- if n==0:
847
+ if n == 0:
507
848
  return [Seq]
508
- ns=len(Seq)
509
- SeqList=[]
510
- if(n==1):
511
- for kk in range(0,ns+1):
512
- SeqNew=Seq[0:kk]+'-'+Seq[kk:]
849
+ ns = len(Seq)
850
+ SeqList = []
851
+ if n == 1:
852
+ for kk in range(0, ns + 1):
853
+ SeqNew = Seq[0:kk] + "-" + Seq[kk:]
513
854
  SeqList.append(SeqNew)
514
- if(n==2):
515
- for kk in range(0,ns+1):
516
- SeqNew=Seq[0:kk]+'-'+Seq[kk:]
517
- for jj in range(0,ns+2):
518
- SeqNew0=SeqNew[0:jj]+'-'+SeqNew[jj:]
855
+ if n == 2:
856
+ for kk in range(0, ns + 1):
857
+ SeqNew = Seq[0:kk] + "-" + Seq[kk:]
858
+ for jj in range(0, ns + 2):
859
+ SeqNew0 = SeqNew[0:jj] + "-" + SeqNew[jj:]
519
860
  SeqList.append(SeqNew0)
520
861
  return SeqList
521
862
 
522
- def falign(s1, s2, V1, V2 ,st,VScore={}, UseV=True, gapn=1, gap=-6):
523
- mid1=s1[st:-2]
524
- mid2=s2[st:-2]
863
+
864
+ def falign(s1, s2, V1, V2, st, VScore={}, UseV=True, gapn=1, gap=-6):
865
+ mid1 = s1[st:-2]
866
+ mid2 = s2[st:-2]
525
867
  if UseV:
526
- if V2==V1:
527
- V_score=4
868
+ if V2 == V1:
869
+ V_score = 4
528
870
  else:
529
- Vkey=(V1,V2)
871
+ Vkey = (V1, V2)
530
872
  if Vkey not in VScore:
531
- Vkey=(V2,V1)
873
+ Vkey = (V2, V1)
532
874
  if Vkey not in VScore:
533
- #print("V gene not found!")
875
+ # print("V gene not found!")
534
876
  return 0
535
877
  else:
536
- V_score=VScore[Vkey]/20.0
878
+ V_score = VScore[Vkey] / 20.0
537
879
  else:
538
- V_score=4.0
539
- aln=NHLocalAlignment(mid1,mid2,gapn,gap)
540
- score=aln/float(max(len(mid1),len(mid2)))+V_score
880
+ V_score = 4.0
881
+ aln = NHLocalAlignment(mid1, mid2, gapn, gap)
882
+ score = aln / float(max(len(mid1), len(mid2))) + V_score
541
883
  return score
542
884
 
885
+
543
886
  def UpdateSSG(SSG, seqs, Vgenes, Vscore={}, UseV=True, gap=-6, gapn=1, cutoff=7.5):
544
- SSGnew={}
545
- count=0
546
- t1=time.time()
547
- N=len(list(chain(*list(SSG.values()))))
548
- # print("Number of pairs to be processed: %d" %N)
887
+ SSGnew = {}
888
+ count = 0
889
+ t1 = time.time()
890
+ N = len(list(chain(*list(SSG.values()))))
891
+ # print("Number of pairs to be processed: %d" %N)
549
892
  for kk in SSG:
550
- s1=seqs[kk]
551
- V1=Vgenes[kk]
552
- VV=SSG[kk]
893
+ s1 = seqs[kk]
894
+ V1 = Vgenes[kk]
895
+ VV = SSG[kk]
553
896
  for vv in VV:
554
- s2=seqs[vv]
555
- V2=Vgenes[vv]
556
- score=falign(s1, s2, V1, V2, st=3, VScore=Vscore, UseV=UseV, gap=-6, gapn=1)
557
- count+=1
558
- if count % 1000000 ==0:
559
- t2=time.time()
560
- # print("Processed %d pairs. Elapsed time %f" %(count, t2-t1))
561
- if score>=cutoff:
897
+ s2 = seqs[vv]
898
+ V2 = Vgenes[vv]
899
+ score = falign(
900
+ s1, s2, V1, V2, st=3, VScore=Vscore, UseV=UseV, gap=-6, gapn=1
901
+ )
902
+ count += 1
903
+ if count % 1000000 == 0:
904
+ t2 = time.time()
905
+ # print("Processed %d pairs. Elapsed time %f" %(count, t2-t1))
906
+ if score >= cutoff:
562
907
  if kk not in SSGnew:
563
- SSGnew[kk]=[vv]
908
+ SSGnew[kk] = [vv]
564
909
  else:
565
910
  SSGnew[kk].append(vv)
566
911
  return SSGnew
567
912
 
913
+
568
914
  def dfs(graph, start):
569
- '''
915
+ """
570
916
  Non-recursive depth first search
571
- '''
917
+ """
572
918
  visited = set()
573
919
  stack = [start]
574
920
  while stack:
@@ -576,443 +922,503 @@ def dfs(graph, start):
576
922
  if vertex not in visited:
577
923
  visited.add(vertex)
578
924
  stack.extend(set(graph[vertex]) - visited)
579
-
925
+
580
926
  return visited
581
927
 
928
+
582
929
  def IdentifyMotifCluster(SSG):
583
930
  ## Input SeqShareGraph dictionary representation of sparse matrix
584
- POS=set(SSG.keys())
585
- NP=len(POS)
586
- ClusterList=[]
587
- tmpL=set(chain(*ClusterList))
588
- count=0
931
+ POS = set(SSG.keys())
932
+ NP = len(POS)
933
+ ClusterList = []
934
+ tmpL = set(chain(*ClusterList))
935
+ count = 0
589
936
  while 1:
590
- xx=POS ^ tmpL
591
- if len(xx)==0:
592
- break
593
- for ii in xx:
594
- # STACK=LoadComm([],ii)
595
- STACK=dfs(SSG,ii)
596
- tmpL = tmpL | STACK
597
- ClusterList.append(list(STACK))
598
- # tmpL=set(chain(*ClusterList))
599
- count+=1
600
- if count % 200 ==0:
601
- print (" Solved %d clusters" %(count))
602
- break
937
+ xx = POS ^ tmpL
938
+ if len(xx) == 0:
939
+ break
940
+ for ii in xx:
941
+ # STACK=LoadComm([],ii)
942
+ STACK = dfs(SSG, ii)
943
+ tmpL = tmpL | STACK
944
+ ClusterList.append(list(STACK))
945
+ # tmpL=set(chain(*ClusterList))
946
+ count += 1
947
+ if count % 200 == 0:
948
+ print(" Solved %d clusters" % (count))
949
+ break
603
950
  return ClusterList
604
951
 
952
+
605
953
  def IdentifyVgeneCluster(sMat):
606
954
  ## Input Vgene score matrix
607
- vG={}
608
- n=len(sMat)
609
- IDs=[x for x in range(n)]
955
+ vG = {}
956
+ n = len(sMat)
957
+ IDs = [x for x in range(n)]
610
958
  for kk in IDs:
611
- LL=sMat[:,kk]
612
- vL=np.where(LL>=thr_v)[0]
613
- if len(vL)>0:
614
- vG[kk]=vL
615
- CL=IdentifyMotifCluster(vG)
959
+ LL = sMat[:, kk]
960
+ vL = np.where(LL >= thr_v)[0]
961
+ if len(vL) > 0:
962
+ vG[kk] = vL
963
+ CL = IdentifyMotifCluster(vG)
616
964
  return CL
617
-
965
+
966
+
618
967
  def ParseFa(fname):
619
- InputStr=open(fname).readlines()
620
- FaDict={}
621
- seq=''
968
+ InputStr = open(fname).readlines()
969
+ FaDict = {}
970
+ seq = ""
622
971
  for line in InputStr:
623
- if line.startswith('>'):
624
- if len(seq)>0:
625
- FaDict[seqHead]=seq
626
- seq=''
627
- seqHead=line.strip()
972
+ if line.startswith(">"):
973
+ if len(seq) > 0:
974
+ FaDict[seqHead] = seq
975
+ seq = ""
976
+ seqHead = line.strip()
628
977
  else:
629
- seq+=line.strip()
978
+ seq += line.strip()
630
979
  if seqHead not in FaDict:
631
- FaDict[seqHead]=seq
980
+ FaDict[seqHead] = seq
632
981
  return FaDict
633
982
 
983
+
634
984
  def PreCalculateVgeneDist(VgeneFa="Imgt_Human_TRBV.fasta"):
635
985
  ## Only run one time if needed
636
- FaDict=ParseFa(cur_dir+VgeneFa)
637
- VScore={}
638
- CDR1Dict={}
639
- CDR2Dict={}
986
+ FaDict = ParseFa(cur_dir + VgeneFa)
987
+ VScore = {}
988
+ CDR1Dict = {}
989
+ CDR2Dict = {}
640
990
  for kk in FaDict:
641
- if '|' in kk:
642
- VV=kk.split('|')[1]
991
+ if "|" in kk:
992
+ VV = kk.split("|")[1]
643
993
  else:
644
- VV=kk[1:]
645
- CDR1Dict[VV]=FaDict[kk][26:37] ## Imgt CDR1: 27 - 38
646
- CDR2Dict[VV]=FaDict[kk][55:64] ## Imgt CDR2: 56 - 65
647
- Vkeys=list(CDR1Dict.keys())
648
- nn=len(Vkeys)
649
- for ii in range(0,nn):
650
- V1=Vkeys[ii]
651
- s1_CDR1=CDR1Dict[V1]
652
- s1_CDR2=CDR2Dict[V1]
653
- for jj in range(ii,nn):
654
- V2=Vkeys[jj]
655
- s2_CDR1=CDR1Dict[V2]
656
- s2_CDR2=CDR2Dict[V2]
657
- score1=SeqComparison(s1_CDR1,s2_CDR1)
658
- score2=SeqComparison(s2_CDR2,s2_CDR2)
659
- #print score1+score2
660
- VScore[(V1,V2)]=score1+score2
661
- gg=open('VgeneScores.txt','w')
994
+ VV = kk[1:]
995
+ CDR1Dict[VV] = FaDict[kk][26:37] ## Imgt CDR1: 27 - 38
996
+ CDR2Dict[VV] = FaDict[kk][55:64] ## Imgt CDR2: 56 - 65
997
+ Vkeys = list(CDR1Dict.keys())
998
+ nn = len(Vkeys)
999
+ for ii in range(0, nn):
1000
+ V1 = Vkeys[ii]
1001
+ s1_CDR1 = CDR1Dict[V1]
1002
+ s1_CDR2 = CDR2Dict[V1]
1003
+ for jj in range(ii, nn):
1004
+ V2 = Vkeys[jj]
1005
+ s2_CDR1 = CDR1Dict[V2]
1006
+ s2_CDR2 = CDR2Dict[V2]
1007
+ score1 = SeqComparison(s1_CDR1, s2_CDR1)
1008
+ score2 = SeqComparison(s2_CDR2, s2_CDR2)
1009
+ # print score1+score2
1010
+ VScore[(V1, V2)] = score1 + score2
1011
+ gg = open("VgeneScores.txt", "w")
662
1012
  for kk in VScore:
663
- vv=VScore[kk]
664
- line=kk[0]+'\t'+kk[1]+'\t'+str(vv)+'\n'
1013
+ vv = VScore[kk]
1014
+ line = kk[0] + "\t" + kk[1] + "\t" + str(vv) + "\n"
665
1015
  gg.write(line)
666
1016
  gg.close()
667
1017
 
668
- def EncodeRepertoire(inputfile, outdir, outfile='',exact=True, ST=3, thr_v=3.7, thr_s=3.5, VDict={},Vgene=True,thr_iso=10, gap=-6, GPU=False,Mat=False, verbose=False):
1018
+
1019
+ def EncodeRepertoire(
1020
+ inputfile,
1021
+ outdir,
1022
+ outfile="",
1023
+ exact=True,
1024
+ ST=3,
1025
+ thr_v=3.7,
1026
+ thr_s=3.5,
1027
+ VDict={},
1028
+ Vgene=True,
1029
+ thr_iso=10,
1030
+ gap=-6,
1031
+ GPU=False,
1032
+ Mat=False,
1033
+ verbose=False,
1034
+ ):
669
1035
  ## No V gene version
670
1036
  ## Encode CDR3 sequences into 96 dimensional space and perform k-means clustering
671
1037
  ## If exact is True, SW alignment will be performed within each cluster after isometric encoding and clustering
672
- h=open(inputfile)
673
- t1=time.time()
674
- alines=h.readlines()
675
- ww=alines[0].strip().split('\t')
676
- if not ww[0].startswith('C'):
1038
+ h = open(inputfile)
1039
+ t1 = time.time()
1040
+ alines = h.readlines()
1041
+ ww = alines[0].strip().split("\t")
1042
+ if not ww[0].startswith("C"):
677
1043
  ## header line
678
- hline=alines[0]
679
- alines=alines[1:]
680
- elif 'CDR3' in ww[0]:
681
- hline=alines[0]
682
- alines=alines[1:]
1044
+ hline = alines[0]
1045
+ alines = alines[1:]
1046
+ elif "CDR3" in ww[0]:
1047
+ hline = alines[0]
1048
+ alines = alines[1:]
683
1049
  else:
684
- hline='CDR3\t'+'\t'.join(['Info'+str(x) for x in range(len(ww)-1)])
685
- seqs=[]
686
- vgs=[]
687
- infoList=[]
688
- count=0
1050
+ hline = "CDR3\t" + "\t".join(["Info" + str(x) for x in range(len(ww) - 1)])
1051
+ seqs = []
1052
+ vgs = []
1053
+ infoList = []
1054
+ count = 0
689
1055
  if verbose:
690
- print('Creating CDR3 list')
1056
+ print("Creating CDR3 list")
691
1057
  for ll in alines:
692
- ww=ll.strip().split('\t')
693
- cdr3=ww[0]
694
- if '*' in cdr3:
1058
+ ww = ll.strip().split("\t")
1059
+ cdr3 = ww[0]
1060
+ if "*" in cdr3:
695
1061
  continue
696
- if '_' in cdr3:
1062
+ if "_" in cdr3:
697
1063
  continue
698
1064
  seqs.append(ww[0])
699
1065
  if Vgene:
700
1066
  vgs.append(ww[1])
701
- infoList.append('\t'.join(ww[1:]))
1067
+ infoList.append("\t".join(ww[1:]))
702
1068
  else:
703
- infoList.append('\t'.join(ww[1:]))
704
- count+=1
705
- if len(outfile)==0:
706
- outfile=inputfile.split('/')
707
- outfile=outfile[len(outfile)-1]
708
- outfile=outdir+'/'+re.sub('\\.[txcsv]+','',outfile)+'-'+'-RotationEncodingBL62.txt'
709
- g=open(outfile,'w')
710
- tm=strftime("%Y-%m-%d %H:%M:%S", gmtime())
711
- InfoLine='##TIME:'+tm+'|cmd: '+sys.argv[0]+'|'+inputfile+'|IsometricDistance_Thr='+str(thr_iso)+'|thr_v='+str(thr_v)+'|thr_s='+str(thr_s)+'|exact='+str(exact)+'|Vgene='+str(Vgene)+'|ST='+str(ST)
712
- g.write(InfoLine+'\n')
713
- g.write("##Column Info: CDR3 aa sequence, cluster id, other information in the input file\n")
714
- gr=0
1069
+ infoList.append("\t".join(ww[1:]))
1070
+ count += 1
1071
+ if len(outfile) == 0:
1072
+ outfile = inputfile.split("/")
1073
+ outfile = outfile[len(outfile) - 1]
1074
+ outfile = (
1075
+ outdir
1076
+ + "/"
1077
+ + re.sub("\\.[txcsv]+", "", outfile)
1078
+ + "-"
1079
+ + "-RotationEncodingBL62.txt"
1080
+ )
1081
+ g = open(outfile, "w")
1082
+ tm = strftime("%Y-%m-%d %H:%M:%S", gmtime())
1083
+ InfoLine = (
1084
+ "##TIME:"
1085
+ + tm
1086
+ + "|cmd: "
1087
+ + sys.argv[0]
1088
+ + "|"
1089
+ + inputfile
1090
+ + "|IsometricDistance_Thr="
1091
+ + str(thr_iso)
1092
+ + "|thr_v="
1093
+ + str(thr_v)
1094
+ + "|thr_s="
1095
+ + str(thr_s)
1096
+ + "|exact="
1097
+ + str(exact)
1098
+ + "|Vgene="
1099
+ + str(Vgene)
1100
+ + "|ST="
1101
+ + str(ST)
1102
+ )
1103
+ g.write(InfoLine + "\n")
1104
+ g.write(
1105
+ "##Column Info: CDR3 aa sequence, cluster id, other information in the input file\n"
1106
+ )
1107
+ gr = 0
715
1108
  ## Split into different lengths
716
- LD,VD, ID,SD= BuildLengthDict(seqs, vGene=vgs,INFO=infoList,sIDs=[x for x in range(len(seqs))])
1109
+ LD, VD, ID, SD = BuildLengthDict(
1110
+ seqs, vGene=vgs, INFO=infoList, sIDs=[x for x in range(len(seqs))]
1111
+ )
717
1112
  LDu, VDu, IDu, SDu = CollapseUnique(LD, VD, ID, SD)
718
1113
  if Mat:
719
- Mfile=outfile+'_EncodingMatrix.txt'
720
- h=open(Mfile, 'w')
1114
+ Mfile = outfile + "_EncodingMatrix.txt"
1115
+ h = open(Mfile, "w")
721
1116
  for kk in LDu:
722
1117
  if verbose:
723
- print("---Process CDR3s with length %d ---" %(kk))
724
- vSD=LDu[kk]
725
- vSD0=[x for x in range(len(vSD))]
726
- vss=SDu[kk]
727
- vInfo=IDu[kk]
728
- flagL=[len(x)-1 for x in vInfo]
1118
+ print("---Process CDR3s with length %d ---" % (kk))
1119
+ vSD = LDu[kk]
1120
+ vSD0 = [x for x in range(len(vSD))]
1121
+ vss = SDu[kk]
1122
+ vInfo = IDu[kk]
1123
+ flagL = [len(x) - 1 for x in vInfo]
729
1124
  if verbose:
730
- print(' Performing CDR3 encoding')
731
- dM=np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
732
- dM=dM.astype("float32")
1125
+ print(" Performing CDR3 encoding")
1126
+ dM = np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
1127
+ dM = dM.astype("float32")
733
1128
  if verbose:
734
- print(" The number of sequences is %d" %(dM.shape[0]))
1129
+ print(" The number of sequences is %d" % (dM.shape[0]))
735
1130
  if Mat:
736
1131
  for ii in range(len(vss)):
737
- line=vss[ii]+'\t'+vInfo[ii][0]+'\t'
738
- NUMs=[str(xx) for xx in dM[ii,:]]
739
- line += '\t'.join(NUMs) + '\n'
1132
+ line = vss[ii] + "\t" + vInfo[ii][0] + "\t"
1133
+ NUMs = [str(xx) for xx in dM[ii, :]]
1134
+ line += "\t".join(NUMs) + "\n"
740
1135
  h.write(line)
741
- sID=[x for x in range(dM.shape[0])]
742
- t2=time.time()
1136
+ sID = [x for x in range(dM.shape[0])]
1137
+ t2 = time.time()
743
1138
  if verbose:
744
- print(' Done! Total time elapsed %f' %(t2-t1))
745
- Cls = ClusterCDR3(dM, flagL, thr=thr_iso - 0.5*(15-kk), verbose=verbose) ## change cutoff with different lengths
1139
+ print(" Done! Total time elapsed %f" % (t2 - t1))
1140
+ Cls = ClusterCDR3(
1141
+ dM, flagL, thr=thr_iso - 0.5 * (15 - kk), verbose=verbose
1142
+ ) ## change cutoff with different lengths
746
1143
  if verbose:
747
1144
  print(" Handling identical CDR3 groups")
748
- Cls_u=[]
1145
+ Cls_u = []
749
1146
  for ii in range(len(Cls)):
750
- cc=Cls[ii]
1147
+ cc = Cls[ii]
751
1148
  if len(cc) == 1:
752
1149
  ## Handle identical CDR3 groups first
753
- if flagL[cc[0]]>0:
1150
+ if flagL[cc[0]] > 0:
754
1151
  gr += 1
755
- jj=cc[0]
1152
+ jj = cc[0]
756
1153
  for v_info in vInfo[jj]:
757
- line=vss[jj]+'\t'+str(gr)+'\t'+v_info+'\n'
758
- _=g.write(line)
1154
+ line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
1155
+ _ = g.write(line)
759
1156
  else:
760
1157
  Cls_u.append(cc)
761
- Cls=Cls_u
762
- t2=time.time()
1158
+ Cls = Cls_u
1159
+ t2 = time.time()
763
1160
  if verbose:
764
- print(' Done! Total time elapsed %f' %(t2-t1))
1161
+ print(" Done! Total time elapsed %f" % (t2 - t1))
765
1162
  if Vgene:
766
- vVgene=VDu[kk]
1163
+ vVgene = VDu[kk]
767
1164
  if verbose:
768
- print(' Matching variable genes')
769
- Cls_v=[]
1165
+ print(" Matching variable genes")
1166
+ Cls_v = []
770
1167
  for cc in Cls:
771
- Nc=len(cc)
772
- sMat={}
1168
+ Nc = len(cc)
1169
+ sMat = {}
773
1170
  for ii in range(Nc):
774
- v1=vVgene[cc[ii]]
775
- for jj in range(ii,Nc):
776
- if jj==ii:
1171
+ v1 = vVgene[cc[ii]]
1172
+ for jj in range(ii, Nc):
1173
+ if jj == ii:
777
1174
  continue
778
- v2=vVgene[cc[jj]]
1175
+ v2 = vVgene[cc[jj]]
779
1176
  if (v1, v2) not in VDict:
780
1177
  if v1 == v2:
781
1178
  if ii not in sMat:
782
- sMat[ii]=[jj]
1179
+ sMat[ii] = [jj]
783
1180
  else:
784
1181
  sMat[ii].append(jj)
785
1182
  if jj not in sMat:
786
- sMat[jj]=[ii]
1183
+ sMat[jj] = [ii]
787
1184
  else:
788
1185
  sMat[jj].append(ii)
789
1186
  continue
790
- if VDict[(v1,v2)] >= thr_v:
791
- if ii not in sMat:
792
- sMat[ii]=[jj]
793
- else:
794
- sMat[ii].append(jj)
795
- if jj not in sMat:
796
- sMat[jj]=[ii]
797
- else:
798
- sMat[jj].append(ii)
799
- vCL=IdentifyMotifCluster(sMat)
800
- vCL_List=list(chain(*vCL))
1187
+ if VDict[(v1, v2)] >= thr_v:
1188
+ if ii not in sMat:
1189
+ sMat[ii] = [jj]
1190
+ else:
1191
+ sMat[ii].append(jj)
1192
+ if jj not in sMat:
1193
+ sMat[jj] = [ii]
1194
+ else:
1195
+ sMat[jj].append(ii)
1196
+ vCL = IdentifyMotifCluster(sMat)
1197
+ vCL_List = list(chain(*vCL))
801
1198
  for ii in range(Nc):
802
- uu=flagL[cc[ii]]
803
- if uu>0 and ii not in vCL_List:
1199
+ uu = flagL[cc[ii]]
1200
+ if uu > 0 and ii not in vCL_List:
804
1201
  vCL.append([ii])
805
1202
  for vcc in vCL:
806
1203
  Cls_v.append(list(np.array(cc)[np.array(vcc)]))
807
- Cls=[]
1204
+ Cls = []
808
1205
  for ii in range(len(Cls_v)):
809
- cc=Cls_v[ii]
1206
+ cc = Cls_v[ii]
810
1207
  if len(cc) == 1:
811
1208
  ## Handle identical CDR3 groups first
812
1209
  gr += 1
813
- jj=cc[0]
1210
+ jj = cc[0]
814
1211
  for v_info in vInfo[jj]:
815
- line=vss[jj]+'\t'+str(gr)+'\t'+v_info+'\n'
816
- _=g.write(line)
1212
+ line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
1213
+ _ = g.write(line)
817
1214
  else:
818
1215
  Cls.append(cc)
819
1216
  if exact:
820
1217
  if verbose:
821
- print(' Performing Smith-Waterman alignment')
822
- Cls_s=[]
1218
+ print(" Performing Smith-Waterman alignment")
1219
+ Cls_s = []
823
1220
  for cc in Cls:
824
- Nc=len(cc)
825
- if len(cc)<=3:
826
- sMat=np.zeros((Nc,Nc))
1221
+ Nc = len(cc)
1222
+ if len(cc) <= 3:
1223
+ sMat = np.zeros((Nc, Nc))
827
1224
  for ii in range(Nc):
828
- s1=vss[cc[ii]]
829
- for jj in range(ii,Nc):
830
- if jj==ii:
1225
+ s1 = vss[cc[ii]]
1226
+ for jj in range(ii, Nc):
1227
+ if jj == ii:
831
1228
  continue
832
- s2=vss[cc[jj]]
1229
+ s2 = vss[cc[jj]]
833
1230
  if len(s1) != len(s2):
834
1231
  continue
835
- if len(s1)<=5:
1232
+ if len(s1) <= 5:
836
1233
  continue
837
- sw=SeqComparison(s1[ST:-2],s2[ST:-2],gap=gap)
838
- sw=sw/(len(s1)-ST-2)
839
- sMat[ii,jj]=sw
840
- sMat[jj,ii]=sw
841
- s_max=[]
1234
+ sw = SeqComparison(s1[ST:-2], s2[ST:-2], gap=gap)
1235
+ sw = sw / (len(s1) - ST - 2)
1236
+ sMat[ii, jj] = sw
1237
+ sMat[jj, ii] = sw
1238
+ s_max = []
842
1239
  for ii in range(Nc):
843
- s_max.append(np.max(sMat[:,ii]))
844
- cc_new=[]
1240
+ s_max.append(np.max(sMat[:, ii]))
1241
+ cc_new = []
845
1242
  for ii in range(Nc):
846
- if s_max[ii]>=thr_s:
1243
+ if s_max[ii] >= thr_s:
847
1244
  cc_new.append(cc[ii])
848
- if len(cc_new)>1:
1245
+ if len(cc_new) > 1:
849
1246
  Cls_s.append(cc_new)
850
1247
  else:
851
1248
  for ii in range(Nc):
852
- uu=flagL[cc[ii]]
853
- if uu>0:
1249
+ uu = flagL[cc[ii]]
1250
+ if uu > 0:
854
1251
  Cls_s.append([cc[ii]])
855
- # print(Cls_s)
856
- Cls_sList=list(chain(*Cls_s))
1252
+ # print(Cls_s)
1253
+ Cls_sList = list(chain(*Cls_s))
857
1254
  for ii in range(len(cc)):
858
- uu=flagL[cc[ii]]
859
- if uu>0 and cc[ii] not in Cls_sList:
1255
+ uu = flagL[cc[ii]]
1256
+ if uu > 0 and cc[ii] not in Cls_sList:
860
1257
  Cls_s.append([cc[ii]])
861
1258
  else:
862
- CDR3s=[vss[x] for x in cc]
863
- sIDs=np.array([vSD0[x] for x in cc])
864
- sIDs0=[x for x in range(len(cc))]
865
- Kset=KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
866
- SSG=generateSSG(Kset, CDR3s, k_thr=1)
867
- tmpVgenes=['TRBV2']*len(CDR3s)
868
- SSGnew=UpdateSSG(SSG, CDR3s, tmpVgenes, Vscore=VDict, cutoff=thr_s+4)
869
- CLall=IdentifyMotifCluster(SSGnew)
870
- CLall_list=list(chain(*CLall))
1259
+ CDR3s = [vss[x] for x in cc]
1260
+ sIDs = np.array([vSD0[x] for x in cc])
1261
+ sIDs0 = [x for x in range(len(cc))]
1262
+ Kset = KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
1263
+ SSG = generateSSG(Kset, CDR3s, k_thr=1)
1264
+ tmpVgenes = ["TRBV2"] * len(CDR3s)
1265
+ SSGnew = UpdateSSG(
1266
+ SSG, CDR3s, tmpVgenes, Vscore=VDict, cutoff=thr_s + 4
1267
+ )
1268
+ CLall = IdentifyMotifCluster(SSGnew)
1269
+ CLall_list = list(chain(*CLall))
871
1270
  for ii in range(len(cc)):
872
- uu=flagL[cc[ii]]
873
- if uu>0 and ii not in CLall_list:
1271
+ uu = flagL[cc[ii]]
1272
+ if uu > 0 and ii not in CLall_list:
874
1273
  CLall.append([ii])
875
1274
  for cl in CLall:
876
- ccs=list(sIDs[np.array(cl)])
1275
+ ccs = list(sIDs[np.array(cl)])
877
1276
  Cls_s.append(ccs)
878
- Cls=Cls_s
1277
+ Cls = Cls_s
879
1278
  if verbose:
880
- print(' Writing results into file')
1279
+ print(" Writing results into file")
881
1280
  for ii in range(len(Cls)):
882
- # if ii % 100000 == 0 and ii>0:
883
- #print(' %d sequences written' %(ii))
884
- cc=Cls[ii]
885
- gr+=1
1281
+ # if ii % 100000 == 0 and ii>0:
1282
+ # print(' %d sequences written' %(ii))
1283
+ cc = Cls[ii]
1284
+ gr += 1
886
1285
  for jj in cc:
887
1286
  for v_info in vInfo[jj]:
888
- line=vss[jj]+'\t'+str(gr)+'\t'+v_info+'\n'
889
- _=g.write(line)
1287
+ line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
1288
+ _ = g.write(line)
890
1289
  g.close()
891
1290
  if Mat:
892
1291
  h.close()
893
1292
 
1293
+
894
1294
  def OrderUnique(Ig):
895
- vv=list(Ig.values())
896
- kk=list(Ig.keys())
897
- LL=[len(x[1]) for x in vv]
898
- v0=[x[0][0] for x in vv]
899
- v1=[x[0][1] for x in vv]
900
- zkk=zip(kk,v0,v1,LL)
901
- zkks=sorted(zkk,key=lambda x: (x[1],x[3]))
902
- nk=len(zkks)
903
- keep_id=[0]
904
- ii=1
905
- n_pre=str(zkks[0][1])+'_'+str(zkks[0][2])
906
- while ii<nk:
907
- n_cur=str(zkks[ii][1])+'_'+str(zkks[ii][2])
908
- if n_cur==n_pre:
909
- ii+=1
1295
+ vv = list(Ig.values())
1296
+ kk = list(Ig.keys())
1297
+ LL = [len(x[1]) for x in vv]
1298
+ v0 = [x[0][0] for x in vv]
1299
+ v1 = [x[0][1] for x in vv]
1300
+ zkk = zip(kk, v0, v1, LL)
1301
+ zkks = sorted(zkk, key=lambda x: (x[1], x[3]))
1302
+ nk = len(zkks)
1303
+ keep_id = [0]
1304
+ ii = 1
1305
+ n_pre = str(zkks[0][1]) + "_" + str(zkks[0][2])
1306
+ while ii < nk:
1307
+ n_cur = str(zkks[ii][1]) + "_" + str(zkks[ii][2])
1308
+ if n_cur == n_pre:
1309
+ ii += 1
910
1310
  continue
911
1311
  else:
912
1312
  keep_id.append(ii)
913
- n_pre=n_cur
914
- ii+=1
1313
+ n_pre = n_cur
1314
+ ii += 1
915
1315
  continue
916
- nid=[x[0] for x in zkks]
917
- filtered_id=np.array(nid)[np.array(keep_id)]
918
- Igs={}
1316
+ nid = [x[0] for x in zkks]
1317
+ filtered_id = np.array(nid)[np.array(keep_id)]
1318
+ Igs = {}
919
1319
  for ii in filtered_id:
920
- Igs[kk[ii]]=vv[ii]
1320
+ Igs[kk[ii]] = vv[ii]
921
1321
  return Igs, filtered_id
922
1322
 
1323
+
923
1324
  def ClusterCDR3(dM, flagL, thr=10, GPU=False, verbose=False):
924
1325
  ## flagL: flag vector for identical CDR3 groups, >0 for grouped non-identical CDR3s
925
- Cls=[]
926
- flag=0
927
- dM1=dM
928
- flagL=np.array(flagL)
1326
+ Cls = []
1327
+ flag = 0
1328
+ dM1 = dM
1329
+ flagL = np.array(flagL)
929
1330
  if GPU:
930
1331
  res = faiss.StandardGpuResources()
931
1332
  while 1:
932
- # print(" %d number of clusters, with %d sequences" %(len(Cls),dM1.shape[0]))
1333
+ # print(" %d number of clusters, with %d sequences" %(len(Cls),dM1.shape[0]))
933
1334
  if verbose:
934
- print('=',end='')
935
- index = faiss.IndexFlatL2(Ndim*6)
1335
+ print("=", end="")
1336
+ index = faiss.IndexFlatL2(Ndim * 6)
936
1337
  if GPU:
937
1338
  index = faiss.index_cpu_to_gpu(res, 0, index)
938
1339
  index.add(dM1)
939
- if flag==0:
1340
+ if flag == 0:
940
1341
  D, I = index.search(dM1, 2)
941
- vv=np.where((D[:,1]<=thr))[0]
942
- vv0=np.where((D[:,1]>thr) & (flagL>0))[0]
1342
+ vv = np.where((D[:, 1] <= thr))[0]
1343
+ vv0 = np.where((D[:, 1] > thr) & (flagL > 0))[0]
943
1344
  for v in vv0:
944
1345
  Cls.append([v])
945
- tmp_dM=np.zeros((len(vv),Ndim*6))
946
- Ig_new={}
1346
+ tmp_dM = np.zeros((len(vv), Ndim * 6))
1347
+ Ig_new = {}
947
1348
  for ii in range(len(vv)):
948
- v=vv[ii]
949
- Idx=I[v,]
1349
+ v = vv[ii]
1350
+ Idx = I[v,]
950
1351
  if v not in Idx:
951
- Idx[0]=v
952
- Ig_new[ii]=(sorted(list(set(Idx))),sorted(list(set(Idx))))
953
- tmp_dM[ii,]=(dM1[Idx[0],]+dM1[Idx[1],])/2
954
- if len(Ig_new)==0:
1352
+ Idx[0] = v
1353
+ Ig_new[ii] = (sorted(list(set(Idx))), sorted(list(set(Idx))))
1354
+ tmp_dM[ii,] = (dM1[Idx[0],] + dM1[Idx[1],]) / 2
1355
+ if len(Ig_new) == 0:
955
1356
  if verbose:
956
- print('type 0 break')
1357
+ print("type 0 break")
957
1358
  break
958
- # print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
959
- Igs, fid=OrderUnique(Ig_new)
960
- tmp_dM=tmp_dM[fid,]
961
- Ig_new=Igs
1359
+ # print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
1360
+ Igs, fid = OrderUnique(Ig_new)
1361
+ tmp_dM = tmp_dM[fid,]
1362
+ Ig_new = Igs
962
1363
  else:
963
- D, I = index.search(dM1,2)
964
- vv=np.where(D[:,1]<=thr)[0]
965
- vv0=np.where(D[:,1]>thr)[0]
1364
+ D, I = index.search(dM1, 2)
1365
+ vv = np.where(D[:, 1] <= thr)[0]
1366
+ vv0 = np.where(D[:, 1] > thr)[0]
966
1367
  ## move groups in vv0 to Cls
967
- kkg=list(Ig.keys())
1368
+ kkg = list(Ig.keys())
968
1369
  for v in vv0:
969
- ng=list(Ig[kkg[v]][1])
970
- # if ng not in Cls:
1370
+ ng = list(Ig[kkg[v]][1])
1371
+ # if ng not in Cls:
971
1372
  Cls.append(ng)
972
- tmp_dM=np.zeros((len(vv),Ndim*6))
973
- Ig_new={}
1373
+ tmp_dM = np.zeros((len(vv), Ndim * 6))
1374
+ Ig_new = {}
974
1375
  for ii in range(len(vv)):
975
- v=vv[ii]
976
- idx1=I[v,0]
977
- idx2=I[v,1]
1376
+ v = vv[ii]
1377
+ idx1 = I[v, 0]
1378
+ idx2 = I[v, 1]
978
1379
  if v not in I[v,]:
979
- idx1=v
980
- # Ig_new[ii]=sorted(list(set(list(Ig[kkg[idx1]])+list(Ig[kkg[idx2]]))))
981
- Ig_new[ii]=(sorted(list(set([idx1,idx2]))), ## First entry records the relative index of a sequence clique
982
- sorted(list(set(list(Ig[kkg[idx1]][1])+list(Ig[kkg[idx2]][1]))))) ## Second entry records the absolute index of a sequence
983
- tmp_dM[ii,]=(dM1[idx1,]+dM1[idx2,])/2
984
- if len(Ig_new)==0:
1380
+ idx1 = v
1381
+ # Ig_new[ii]=sorted(list(set(list(Ig[kkg[idx1]])+list(Ig[kkg[idx2]]))))
1382
+ Ig_new[ii] = (
1383
+ sorted(
1384
+ list(set([idx1, idx2]))
1385
+ ), ## First entry records the relative index of a sequence clique
1386
+ sorted(list(set(list(Ig[kkg[idx1]][1]) + list(Ig[kkg[idx2]][1])))),
1387
+ ) ## Second entry records the absolute index of a sequence
1388
+ tmp_dM[ii,] = (dM1[idx1,] + dM1[idx2,]) / 2
1389
+ if len(Ig_new) == 0:
985
1390
  if verbose:
986
1391
  print("\ntype I break")
987
- kkg=list(Ig.keys())
1392
+ kkg = list(Ig.keys())
988
1393
  for kk in kkg:
989
- ng=list(Ig[kk][1])
1394
+ ng = list(Ig[kk][1])
990
1395
  if ng not in Cls:
991
1396
  Cls.append(ng)
992
1397
  break
993
- # print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
994
- Igs, fid=OrderUnique(Ig_new)
995
- tmp_dM=tmp_dM[fid,]
996
- Ig_new=Igs
997
- if flag>0:
1398
+ # print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
1399
+ Igs, fid = OrderUnique(Ig_new)
1400
+ tmp_dM = tmp_dM[fid,]
1401
+ Ig_new = Igs
1402
+ if flag > 0:
998
1403
  if Ig == Ig_new:
999
1404
  if verbose:
1000
1405
  print("\ntype II break")
1001
- kkg=list(Ig.keys())
1406
+ kkg = list(Ig.keys())
1002
1407
  for kk in kkg:
1003
- ng=list(Ig[kk][1])
1408
+ ng = list(Ig[kk][1])
1004
1409
  if ng in Cls:
1005
1410
  continue
1006
1411
  Cls.append(ng)
1007
1412
  break
1008
- Ig=Ig_new
1009
- tmp_dM=tmp_dM.astype('float32')
1010
- dM1=tmp_dM
1011
- flag+=1
1413
+ Ig = Ig_new
1414
+ tmp_dM = tmp_dM.astype("float32")
1415
+ dM1 = tmp_dM
1416
+ flag += 1
1012
1417
  return Cls
1013
1418
 
1014
- def ClusterCDR3r(dM, flagL, thr = 10, verbose = False):
1015
- index = faiss.IndexFlatL2(Ndim*6)
1419
+
1420
+ def ClusterCDR3r(dM, flagL, thr=10, verbose=False):
1421
+ index = faiss.IndexFlatL2(Ndim * 6)
1016
1422
  index.add(dM)
1017
1423
  lims, D, I = index.range_search(dM, thr)
1018
1424
  # with open('cdr3.npy', 'wb') as f:
@@ -1020,53 +1426,70 @@ def ClusterCDR3r(dM, flagL, thr = 10, verbose = False):
1020
1426
  # np.save(f, D)
1021
1427
  # np.save(f, I)
1022
1428
  # np.save(f, dM)
1023
-
1429
+
1024
1430
  # now clustering results
1025
1431
  N = dM.shape[0]
1026
- neighborSize = np.array([lims[cur_idx_i+1] - lims[cur_idx_i] for cur_idx_i in range(N)])
1432
+ neighborSize = np.array(
1433
+ [lims[cur_idx_i + 1] - lims[cur_idx_i] for cur_idx_i in range(N)]
1434
+ )
1027
1435
  # to_cluster = np.ones( (N,))
1028
1436
  clusterNo = 0
1029
- cluster = - np.ones( (N, ), dtype = np.int32)
1437
+ cluster = -np.ones((N,), dtype=np.int32)
1030
1438
  idx = np.where(cluster < 0)[0]
1031
1439
  unclustered = [np.argmax(neighborSize[idx])]
1032
1440
  depth = 0
1033
1441
  while True:
1034
- if len(unclustered) == 0: break
1442
+ if len(unclustered) == 0:
1443
+ break
1035
1444
  # cur_idx = unclustered[0] # first unclustered index
1036
1445
  cur_idx = unclustered
1037
- cluster[cur_idx] = clusterNo # assign cluster
1038
-
1039
- neighbor = np.unique(np.array(list(chain (* [I[(lims[cur_idx_i]): lims[cur_idx_i+1]] for cur_idx_i in cur_idx]))))
1446
+ cluster[cur_idx] = clusterNo # assign cluster
1447
+
1448
+ neighbor = np.unique(
1449
+ np.array(
1450
+ list(
1451
+ chain(
1452
+ *[
1453
+ I[(lims[cur_idx_i]) : lims[cur_idx_i + 1]]
1454
+ for cur_idx_i in cur_idx
1455
+ ]
1456
+ )
1457
+ )
1458
+ )
1459
+ )
1040
1460
  # find those unclusterred
1041
1461
  idx = np.where(cluster[neighbor] < 0)[0]
1042
1462
  if len(idx) == 0:
1043
1463
  depth = 0
1044
1464
  clusterNo += 1
1045
1465
  idx = np.where(cluster < 0)[0]
1046
- if len(idx) == 0: break
1466
+ if len(idx) == 0:
1467
+ break
1047
1468
  unclustered = [idx[np.argmax(neighborSize[idx])]]
1048
-
1469
+
1049
1470
  else:
1050
1471
  if depth > 3:
1051
1472
  depth = 0
1052
1473
  clusterNo += 1
1053
1474
  unclustered = neighbor[idx]
1054
1475
  depth += 1
1055
- # print('clusterNo = ', clusterNo)
1056
- Cls = [ [] for i in range(clusterNo)]
1476
+ # print('clusterNo = ', clusterNo)
1477
+ Cls = [[] for i in range(clusterNo)]
1057
1478
  for idx, i in enumerate(cluster):
1058
- Cls[i].append(idx)
1059
- # print("Cls[:5] = ", Cls[:5])
1060
- # print("len(Cls) = ", len(Cls),
1061
- # ', #elem=', sum([len(i) for i in Cls]),
1062
- # ', #single=', sum([len(i) for i in Cls if len(i) == 1]),
1063
- # ', #non_single=', sum([len(i) for i in Cls if len(i) != 1]),
1064
- # ', #max=', max([len(i) for i in Cls]))
1479
+ Cls[i].append(idx)
1480
+ # print("Cls[:5] = ", Cls[:5])
1481
+ # print("len(Cls) = ", len(Cls),
1482
+ # ', #elem=', sum([len(i) for i in Cls]),
1483
+ # ', #single=', sum([len(i) for i in Cls if len(i) == 1]),
1484
+ # ', #non_single=', sum([len(i) for i in Cls if len(i) != 1]),
1485
+ # ', #max=', max([len(i) for i in Cls]))
1065
1486
  return Cls
1066
1487
 
1488
+
1067
1489
  def CommandLineParser():
1068
- parser=OptionParser()
1069
- print ('''
1490
+ parser = OptionParser()
1491
+ print(
1492
+ """
1070
1493
  GIANA: Geometric Isometry based ANtigen-specific tcr Alignment
1071
1494
  Ultrafast short peptide alignment exclusively designed for large-scale adaptome analysis
1072
1495
 
@@ -1079,130 +1502,282 @@ Input columns:
1079
1502
 
1080
1503
  !!! ALL amino acid letters must be CAPITAL !!!
1081
1504
 
1082
- ''')
1083
- parser.add_option("-d","--directory",dest="Directory",help="Input repertoire sequencing file directory. Please make sure that all the files in the directory are input files.",default="")
1084
- parser.add_option("-f","--file",dest="File",default='',help="Input single file of CDR3 sequences for grouping")
1085
- parser.add_option("-F","--fileList",dest="files",default='',help='Alternative input: a file containing the full path to all the files. If given, overwrite -d and -f option')
1086
- parser.add_option("-t","--threshold",dest="thr",default=7,help="Isometric distance threshold for calling similar CDR3 groups. Without -E, smaller value will increase speed. With -E, smaller value will increase specificity. Must be smaller than 12.")
1087
- parser.add_option("-S","--threshold_score",dest="thr_s",default=3.6, help="Threshold for Smith-Waterman alignment score (normalized by CDR3 length). Default 3.6")
1088
- parser.add_option("-G","--threshold_vgene",dest="thr_v",default=3.7,help="Threshold for variable gene comparison. Default 3.7.")
1089
- parser.add_option("-o","--output",dest="OutDir",default='./',help="Output directory for intermediate and final outputs.")
1090
- parser.add_option("-O","--outfile",dest="OutFile",default='',help="Output file name. If not given, a file with --RotationEncoding will be added to the input file as the output file name.")
1091
- parser.add_option("-T","--startPosition",dest='ST',default=3, help="Starting position of CDR3 sequence. The first ST letters are omitted. CDR3 sequence length L must be >= ST+7 ")
1092
- parser.add_option("-g","--GapPenalty",dest="Gap",default= -6,help="Gap penalty,default= -6. Not used.")
1093
- parser.add_option("-n","--GapNumber",dest="GapN",default=1,help="Maximum number of gaps allowed when performing alignment. Max=1, default=1. Not used.")
1094
- parser.add_option("-V","--VariableGeneFa",dest="VFa",default="Imgt_Human_TRBV.fasta",help="IMGT Human beta variable gene sequences")
1095
- parser.add_option("-v","--VariableGene",dest="V",default=True,action="store_false",help="If False, GIANA will omit variable gene information and use CDR3 sequences only. This will yield reduced specificity. The cut-off will automatically become the current value-4.0")
1096
- parser.add_option("-e","--Exact",dest="E",default=True,action="store_false",help="If False, GIANA will not perform Smith-Waterman alignment after isometric encoding.")
1097
- parser.add_option("-N","--NumberOfThreads",dest="NN",default=1,help="Number of threads for multiple processing. Not working so well.")
1098
- parser.add_option("-M","--EncodingMatrix", dest="Mat", default=False,action="store_true", help="If true, GIANA will export the isometric encoding matrix for each TCR. Default: False.")
1099
- parser.add_option("-U","--UseGPU",dest="GPU", default=False, action="store_true",help="Use GPU for Faiss indexing. Must be CUDA GPUs.")
1100
- parser.add_option("-q","--queryFile",dest="Query",default='',help="Input query file, if given, GIANA will run in query mode, also need to provide -r option.")
1101
- parser.add_option("-r","--refFile",dest="ref", default='',help="Input reference file. Query model required.")
1102
- parser.add_option("-b","--Verbose", dest='v', default=False, action="store_true", help="Verbose option: if given, GIANA will print intermediate messages.")
1505
+ """
1506
+ )
1507
+ parser.add_option(
1508
+ "-d",
1509
+ "--directory",
1510
+ dest="Directory",
1511
+ help="Input repertoire sequencing file directory. Please make sure that all the files in the directory are input files.",
1512
+ default="",
1513
+ )
1514
+ parser.add_option(
1515
+ "-f",
1516
+ "--file",
1517
+ dest="File",
1518
+ default="",
1519
+ help="Input single file of CDR3 sequences for grouping",
1520
+ )
1521
+ parser.add_option(
1522
+ "-F",
1523
+ "--fileList",
1524
+ dest="files",
1525
+ default="",
1526
+ help="Alternative input: a file containing the full path to all the files. If given, overwrite -d and -f option",
1527
+ )
1528
+ parser.add_option(
1529
+ "-t",
1530
+ "--threshold",
1531
+ dest="thr",
1532
+ default=7,
1533
+ help="Isometric distance threshold for calling similar CDR3 groups. Without -E, smaller value will increase speed. With -E, smaller value will increase specificity. Must be smaller than 12.",
1534
+ )
1535
+ parser.add_option(
1536
+ "-S",
1537
+ "--threshold_score",
1538
+ dest="thr_s",
1539
+ default=3.6,
1540
+ help="Threshold for Smith-Waterman alignment score (normalized by CDR3 length). Default 3.6",
1541
+ )
1542
+ parser.add_option(
1543
+ "-G",
1544
+ "--threshold_vgene",
1545
+ dest="thr_v",
1546
+ default=3.7,
1547
+ help="Threshold for variable gene comparison. Default 3.7.",
1548
+ )
1549
+ parser.add_option(
1550
+ "-o",
1551
+ "--output",
1552
+ dest="OutDir",
1553
+ default="./",
1554
+ help="Output directory for intermediate and final outputs.",
1555
+ )
1556
+ parser.add_option(
1557
+ "-O",
1558
+ "--outfile",
1559
+ dest="OutFile",
1560
+ default="",
1561
+ help="Output file name. If not given, a file with --RotationEncoding will be added to the input file as the output file name.",
1562
+ )
1563
+ parser.add_option(
1564
+ "-T",
1565
+ "--startPosition",
1566
+ dest="ST",
1567
+ default=3,
1568
+ help="Starting position of CDR3 sequence. The first ST letters are omitted. CDR3 sequence length L must be >= ST+7 ",
1569
+ )
1570
+ parser.add_option(
1571
+ "-g",
1572
+ "--GapPenalty",
1573
+ dest="Gap",
1574
+ default=-6,
1575
+ help="Gap penalty,default= -6. Not used.",
1576
+ )
1577
+ parser.add_option(
1578
+ "-n",
1579
+ "--GapNumber",
1580
+ dest="GapN",
1581
+ default=1,
1582
+ help="Maximum number of gaps allowed when performing alignment. Max=1, default=1. Not used.",
1583
+ )
1584
+ parser.add_option(
1585
+ "-V",
1586
+ "--VariableGeneFa",
1587
+ dest="VFa",
1588
+ default="Imgt_Human_TRBV.fasta",
1589
+ help="IMGT Human beta variable gene sequences",
1590
+ )
1591
+ parser.add_option(
1592
+ "-v",
1593
+ "--VariableGene",
1594
+ dest="V",
1595
+ default=True,
1596
+ action="store_false",
1597
+ help="If False, GIANA will omit variable gene information and use CDR3 sequences only. This will yield reduced specificity. The cut-off will automatically become the current value-4.0",
1598
+ )
1599
+ parser.add_option(
1600
+ "-e",
1601
+ "--Exact",
1602
+ dest="E",
1603
+ default=True,
1604
+ action="store_false",
1605
+ help="If False, GIANA will not perform Smith-Waterman alignment after isometric encoding.",
1606
+ )
1607
+ parser.add_option(
1608
+ "-N",
1609
+ "--NumberOfThreads",
1610
+ dest="NN",
1611
+ default=1,
1612
+ help="Number of threads for multiple processing. Not working so well.",
1613
+ )
1614
+ parser.add_option(
1615
+ "-M",
1616
+ "--EncodingMatrix",
1617
+ dest="Mat",
1618
+ default=False,
1619
+ action="store_true",
1620
+ help="If true, GIANA will export the isometric encoding matrix for each TCR. Default: False.",
1621
+ )
1622
+ parser.add_option(
1623
+ "-U",
1624
+ "--UseGPU",
1625
+ dest="GPU",
1626
+ default=False,
1627
+ action="store_true",
1628
+ help="Use GPU for Faiss indexing. Must be CUDA GPUs.",
1629
+ )
1630
+ parser.add_option(
1631
+ "-q",
1632
+ "--queryFile",
1633
+ dest="Query",
1634
+ default="",
1635
+ help="Input query file, if given, GIANA will run in query mode, also need to provide -r option.",
1636
+ )
1637
+ parser.add_option(
1638
+ "-r",
1639
+ "--refFile",
1640
+ dest="ref",
1641
+ default="",
1642
+ help="Input reference file. Query model required.",
1643
+ )
1644
+ parser.add_option(
1645
+ "-b",
1646
+ "--Verbose",
1647
+ dest="v",
1648
+ default=False,
1649
+ action="store_true",
1650
+ help="Verbose option: if given, GIANA will print intermediate messages.",
1651
+ )
1103
1652
  return parser.parse_args()
1104
1653
 
1654
+
1105
1655
def main():
    """Command-line entry point: run GIANA in query mode or clustering mode.

    Options are read through ``CommandLineParser()``.

    * Query mode (``-q`` non-empty): every query file (or every entry of the
      query directory when the value ends with ``/``) is passed to
      ``MakeQuery`` against the reference built by ``CreateReference`` and
      then merged with the existing reference clustering via ``MergeExist``.
      Queries whose output file already exists are skipped.
    * Clustering mode (otherwise): input files are collected from ``-d``,
      ``-f`` and ``-F`` (later options override earlier ones) and each one is
      passed to ``EncodeRepertoire``.

    Raises:
        ValueError: query mode was requested without a reference file.
        FileNotFoundError: the reference clustering file
            ``<ref>--RotationEncodingBL62.txt`` does not exist.
    """
    (opt, _) = CommandLineParser()
    cutoff = float(opt.thr)
    OutDir = opt.OutDir
    thr_s = float(opt.thr_s)
    ## Check if query mode first
    qFile = opt.Query
    if len(qFile) > 0:
        ## query mode
        t1 = time.time()
        if qFile.endswith("/"):
            ## input query is a directory
            qFileList = [qFile + ff for ff in os.listdir(qFile)]
        else:
            qFileList = [qFile]
        rFile = opt.ref
        if len(rFile) == 0:
            # Raising a bare string is itself a TypeError in Python 3, which
            # hid the intended message; raise a real exception instead.
            raise ValueError("Must provide reference file in query mode!")
        ## check if reference cluster file exists
        rFile0 = re.sub("\\.txt", "", rFile)
        refClusterFile = rFile0 + "--RotationEncodingBL62.txt"
        if not os.path.exists(refClusterFile):
            raise FileNotFoundError(
                "Must run clustering on reference file first! Did you forget to put the clustering file in this directory?"
            )
        rData = CreateReference(rFile)
        t2 = time.time()
        print("Reference created. Elapsed %f" % (t2 - t1))
        for qf in qFileList:
            t2_0 = time.time()
            print("Querying " + qf)
            qf_s = qf.split("/")[-1]
            outFile = (
                os.path.splitext(qf_s)[0]
                + "_query_"
                + os.path.basename(rFile0)
                + ".txt"
            )
            of = OutDir + "/" + outFile
            if path.exists(of):
                print(of + " already exits. Skipping.")
                continue
            # NOTE(review): MakeQuery is not given OutDir here; presumably it
            # writes the file that `of` names -- confirm against MakeQuery.
            MakeQuery(qf, rData, thr=cutoff, thr_s=thr_s)
            t2 = time.time()
            # Elapsed time here is measured from the start of query mode (t1).
            print("    Build query clustering file. Elapsed %f" % (t2 - t1))
            print("Now mering with reference cluster")
            MergeExist(refClusterFile, of)
            t2 = time.time()
            print("    Time of elapsed for query %s: %f" % (qf, t2 - t2_0))
    else:
        ## regular clustering mode
        FileDir = opt.Directory
        if len(FileDir) > 0:
            files = [FileDir + "/" + ff for ff in os.listdir(FileDir)]
        else:
            files = []
        # -f overrides -d, and -F overrides both (see the -F help text).
        File = opt.File
        if len(File) > 0:
            files = [File]
        FileList = opt.files
        if len(FileList) > 0:
            # Close the list file deterministically and ignore blank lines,
            # which would otherwise become empty file names.
            with open(FileList) as fL:
                files = [ff.strip() for ff in fL if ff.strip()]
        VFa = opt.VFa
        PreCalculateVgeneDist(VFa)
        VScore = {}
        VV = opt.V
        EE = opt.E
        Mat = opt.Mat
        ST = int(opt.ST)
        thr_v = float(opt.thr_v)
        verbose = opt.v
        if VV:
            ## Use tcrDist's Vgene 80-score calculation
            # The score table is only needed (and only opened) when variable
            # gene information is in use; scores are stored symmetrically.
            with open("./VgeneScores.txt") as vf:
                for line in vf:
                    ww = line.strip().split("\t")
                    VScore[(ww[0], ww[1])] = int(ww[2]) / 20
                    VScore[(ww[1], ww[0])] = int(ww[2]) / 20
        Gap = int(opt.Gap)
        OutFile = opt.OutFile
        GPU = opt.GPU
        NT = int(opt.NN)
        faiss.omp_set_num_threads(NT)
        for ff in files:
            print("Processing %s" % ff)
            EncodeRepertoire(
                ff,
                OutDir,
                OutFile,
                ST=ST,
                thr_s=thr_s,
                thr_v=thr_v,
                exact=EE,
                VDict=VScore,
                Vgene=VV,
                thr_iso=cutoff,
                gap=Gap,
                GPU=GPU,
                Mat=Mat,
                verbose=verbose,
            )
1774
+
1775
+
1203
1776
if __name__ == "__main__":
    t0 = time.time()
    main()
    print("Total time elapsed: %f" % (time.time() - t0))
    # ru_maxrss is reported in bytes on macOS but in kilobytes on Linux, so
    # the divisor that yields megabytes depends on the platform.
    peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    rss_per_mb = 1000000 if sys.platform == "darwin" else 1000
    print("Maximum memory usage: %f MB" % (peak_rss / rss_per_mb))