biopipen 0.31.7__py3-none-any.whl → 0.32.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

@@ -25,7 +25,6 @@ import sys, os, re, resource
25
25
  from os import path
26
26
  import numpy as np
27
27
  from copy import deepcopy
28
- from Bio.SubsMat.MatrixInfo import blosum62
29
28
  import time
30
29
  from time import gmtime, strftime
31
30
  from operator import itemgetter
@@ -38,254 +37,574 @@ from sklearn.manifold import MDS
38
37
  import faiss
39
38
  from query import *
40
39
 
41
- AAstring='ACDEFGHIKLMNPQRSTVWY'
42
- AAstringList=list(AAstring)
43
- cur_dir=os.path.dirname(os.path.realpath(__file__))+'/'
40
# One-letter codes of the 20 standard amino acids, as a string and as a list.
AAstring = "ACDEFGHIKLMNPQRSTVWY"
AAstringList = list(AAstring)
# Directory that contains this file, with a trailing slash.
cur_dir = os.path.dirname(os.path.realpath(__file__)) + "/"

# Copy of the blosum62 pair scores with every score capped at 4 and with both
# orderings of each residue pair present.
# NOTE(review): `blosum62` is not imported explicitly in the visible part of
# this module; presumably it is supplied by `from query import *` -- confirm.
blosum62n = {}
for kk in blosum62:
    a1 = kk[0]
    a2 = kk[1]
    vv = blosum62[kk]
    if vv > 4:
        vv = 4
    blosum62n[(a1, a2)] = vv
    if a1 != a2:
        # Store the mirrored key as well so lookups work in either order.
        blosum62n[(a2, a1)] = vv
54
+
55
+ bl62 = {
56
+ "A": [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],
57
+ "R": [-1, 4, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],
58
+ "N": [-2, 0, 4, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],
59
+ "D": [-2, -2, 1, 4, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],
60
+ "C": [0, -3, -3, -3, 4, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
61
+ "Q": [-1, 1, 0, 0, -3, 4, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],
62
+ "E": [-1, 0, 0, 2, -4, 2, 4, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],
63
+ "G": [0, -2, 0, -1, -3, -2, -2, 4, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],
64
+ "H": [-2, 0, 1, -1, -3, 0, 0, -2, 4, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],
65
+ "I": [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],
66
+ "L": [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],
67
+ "K": [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 4, -1, -3, -1, 0, -1, -3, -2, -2],
68
+ "M": [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 4, 0, -2, -1, -1, -1, -1, 1],
69
+ "F": [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 4, -4, -2, -2, 1, 3, -1],
70
+ "P": [
71
+ -1,
72
+ -2,
73
+ -2,
74
+ -1,
75
+ -3,
76
+ -1,
77
+ -1,
78
+ -2,
79
+ -2,
80
+ -3,
81
+ -3,
82
+ -1,
83
+ -2,
84
+ -4,
85
+ 4,
86
+ -1,
87
+ -1,
88
+ -4,
89
+ -3,
90
+ -2,
91
+ ],
92
+ "S": [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],
93
+ "T": [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 4, -2, -2, 0],
94
+ "W": [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 4, 2, -3],
95
+ "Y": [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 4, -1],
96
+ "V": [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],
97
+ }
98
+
99
# Turn the bl62 score rows into dissimilarities: the self score in bl62 is 4,
# so identical residues end up at distance 0.
bl62c = np.array([np.array(x) for x in list(bl62.values())])
bl62c = 4 - bl62c

# Embed the 20 residues into 13 dimensions with MDS on that precomputed matrix.
# NOTE(review): both `X` and `bl62np` are reassigned further down in this
# module from a hard-coded table, so this fit (n_init=100, max_iter=1000)
# looks like import-time work whose result is discarded -- confirm before
# removing it.
embedding = MDS(
    n_components=13, n_init=100, max_iter=1000, eps=0.00001, dissimilarity="precomputed"
)
X = embedding.fit_transform(bl62c)

# Map each residue (in bl62 key order) to its embedding row plus 17 zeros.
bl62np = {}
vkk = list(bl62.keys())
for ii in range(20):
    kk = vkk[ii]
    bl62np[kk] = np.array(list(X[ii,]) + [0] * 17)
88
112
 
89
-
90
- AAencodingDict={}
113
+
114
# Indicator vector per amino acid: a single 1 at the residue's index in
# AAstringList and zeros elsewhere.
# NOTE(review): each vector has ii + 1 + (20 - ii) = 21 entries for a
# 20-letter alphabet; confirm whether the extra trailing slot is intended.
AAencodingDict = {}
for ii in range(len(AAstringList)):
    aa = AAstringList[ii]
    CODE = [0] * (ii) + [1] + [0] * (20 - ii)
    AAencodingDict[aa] = np.array(CODE)
119
+
120
## Width of one embedding block (optimized for isometric embedding)
Ndim = 16
## Full encoding length: six blocks of Ndim
n0 = Ndim * 6

ZERO = np.zeros((Ndim, Ndim))
II = np.eye(Ndim)
## M0 cyclically permutes three Ndim-wide blocks (block permutation matrix)
M0 = np.block(
    [
        [ZERO, ZERO, II],
        [II, ZERO, ZERO],
        [ZERO, II, ZERO],
    ]
)

## Construct 6-th order cyclic group: M0 on the anti-diagonal of a 2x2 block
## layout, so M6 has shape (n0, n0)
ZERO45 = np.zeros((Ndim * 3, Ndim * 3))
M6 = np.block(
    [
        [ZERO45, M0],
        [M0, ZERO45],
    ]
)
137
+
138
+ X = np.array(
139
+ [
140
+ [
141
+ -0.31230882,
142
+ -0.53572156,
143
+ -0.01949946,
144
+ -0.12211268,
145
+ -0.70947917,
146
+ -0.42211092,
147
+ 0.02783931,
148
+ 0.02637933,
149
+ -0.41760305,
150
+ 0.21809875,
151
+ 0.53532768,
152
+ 0.04833016,
153
+ 0.07877711,
154
+ 0.50464914,
155
+ -0.26972087,
156
+ -0.52416842,
157
+ ],
158
+ [
159
+ 0.29672002,
160
+ 0.29005364,
161
+ 0.18176298,
162
+ -0.05103382,
163
+ -0.34686519,
164
+ 0.58024228,
165
+ -0.49282931,
166
+ 0.62304281,
167
+ -0.09575202,
168
+ 0.30115555,
169
+ 0.09913529,
170
+ 0.1577466,
171
+ -0.94391939,
172
+ -0.10505925,
173
+ 0.05482389,
174
+ 0.38409897,
175
+ ],
176
+ [
177
+ -0.42212537,
178
+ 0.12225749,
179
+ 0.16279646,
180
+ 0.60099009,
181
+ 0.19734216,
182
+ 0.42819919,
183
+ -0.33562418,
184
+ 0.17036334,
185
+ 0.4234109,
186
+ 0.46681561,
187
+ -0.50347222,
188
+ -0.37936876,
189
+ 0.1494825,
190
+ 0.32176759,
191
+ 0.28584684,
192
+ 0.68469861,
193
+ ],
194
+ [
195
+ 0.18599294,
196
+ -0.44017825,
197
+ -0.4476952,
198
+ 0.34340976,
199
+ 0.44603553,
200
+ 0.40974629,
201
+ -0.60045935,
202
+ -0.09056728,
203
+ 0.22147919,
204
+ -0.33029418,
205
+ 0.55635594,
206
+ -0.54149972,
207
+ 0.05459062,
208
+ 0.57334159,
209
+ -0.06227118,
210
+ 0.65299872,
211
+ ],
212
+ [
213
+ -0.19010428,
214
+ 0.64418792,
215
+ -0.85286762,
216
+ 0.21380295,
217
+ 0.37639516,
218
+ -0.67753593,
219
+ 0.38751609,
220
+ 0.55746524,
221
+ 0.01443766,
222
+ 0.1776535,
223
+ 0.62853954,
224
+ -0.15048523,
225
+ 0.55100206,
226
+ -0.21426656,
227
+ 0.3644061,
228
+ -0.0018255,
229
+ ],
230
+ [
231
+ 0.7350723,
232
+ 0.10111267,
233
+ 0.55640019,
234
+ -0.18226966,
235
+ 0.51658102,
236
+ -0.19321508,
237
+ -0.46599027,
238
+ -0.02989911,
239
+ 0.4036196,
240
+ -0.11978213,
241
+ -0.29837524,
242
+ -0.30232765,
243
+ -0.36738065,
244
+ -0.1379793,
245
+ 0.04362871,
246
+ 0.33553714,
247
+ ],
248
+ [
249
+ 0.41134047,
250
+ 0.13512443,
251
+ 0.62492322,
252
+ -0.10120261,
253
+ -0.03093491,
254
+ 0.23751917,
255
+ -0.68338694,
256
+ 0.05124762,
257
+ 0.41533821,
258
+ 0.46669353,
259
+ 0.31467277,
260
+ -0.02427587,
261
+ 0.15361135,
262
+ 0.70595112,
263
+ -0.27952632,
264
+ 0.32408931,
265
+ ],
266
+ [
267
+ -0.33041265,
268
+ -0.43860065,
269
+ -0.5509376,
270
+ -0.04380843,
271
+ -0.35160935,
272
+ 0.25134855,
273
+ 0.53409314,
274
+ 0.54850824,
275
+ 0.59490287,
276
+ 0.32669345,
277
+ -0.45355268,
278
+ -0.56317041,
279
+ -0.55416297,
280
+ 0.18117841,
281
+ -0.71600849,
282
+ -0.08989825,
283
+ ],
284
+ [
285
+ -0.40366849,
286
+ 0.10978974,
287
+ 0.0280101,
288
+ -0.46667987,
289
+ -0.45607028,
290
+ 0.54114052,
291
+ -0.77552923,
292
+ -0.10720425,
293
+ 0.55252091,
294
+ -0.34397153,
295
+ -0.59813694,
296
+ 0.15567728,
297
+ 0.03071009,
298
+ -0.02176143,
299
+ 0.34442719,
300
+ 0.14681541,
301
+ ],
302
+ [
303
+ 0.19280422,
304
+ 0.35777863,
305
+ 0.06139255,
306
+ 0.20081699,
307
+ -0.30546596,
308
+ -0.56901549,
309
+ -0.15290953,
310
+ -0.31181573,
311
+ -0.74523217,
312
+ 0.22296016,
313
+ -0.39143832,
314
+ -0.16474685,
315
+ 0.58064427,
316
+ -0.77386654,
317
+ 0.19713107,
318
+ -0.49477418,
319
+ ],
320
+ [
321
+ -0.16133903,
322
+ 0.22112761,
323
+ -0.53162136,
324
+ 0.34764073,
325
+ -0.08522381,
326
+ -0.2510216,
327
+ 0.04699411,
328
+ -0.25702389,
329
+ -0.8739765,
330
+ -0.24171728,
331
+ -0.24370533,
332
+ 0.42193635,
333
+ 0.41056913,
334
+ -0.60378211,
335
+ -0.65756832,
336
+ 0.0845203,
337
+ ],
338
+ [
339
+ -0.34792144,
340
+ 0.18450939,
341
+ 0.77038332,
342
+ 0.63868511,
343
+ -0.06221681,
344
+ 0.11930421,
345
+ 0.04895523,
346
+ -0.22463059,
347
+ -0.03268844,
348
+ -0.58941354,
349
+ 0.11640045,
350
+ 0.32384901,
351
+ -0.42952779,
352
+ 0.58119471,
353
+ 0.07288662,
354
+ 0.26669673,
355
+ ],
356
+ [
357
+ 0.01834555,
358
+ -0.16367754,
359
+ 0.34900298,
360
+ 0.45087949,
361
+ 0.47073855,
362
+ -0.37377404,
363
+ 0.0606911,
364
+ 0.2455703,
365
+ -0.55182937,
366
+ -0.20261009,
367
+ 0.28325423,
368
+ -0.04741146,
369
+ 0.30565238,
370
+ -0.62090653,
371
+ 0.17528413,
372
+ -0.60434975,
373
+ ],
374
+ [
375
+ -0.55464981,
376
+ 0.50918784,
377
+ -0.21371646,
378
+ -0.63996967,
379
+ -0.37656862,
380
+ 0.27852662,
381
+ 0.3287838,
382
+ -0.56800869,
383
+ 0.23260763,
384
+ -0.20653106,
385
+ 0.63261439,
386
+ -0.22666691,
387
+ 0.00726302,
388
+ -0.60125196,
389
+ 0.07139961,
390
+ -0.35086639,
391
+ ],
392
+ [
393
+ 0.94039731,
394
+ -0.25999326,
395
+ 0.43922549,
396
+ -0.485738,
397
+ -0.20492235,
398
+ -0.26005626,
399
+ 0.68776626,
400
+ 0.57826888,
401
+ -0.05973995,
402
+ -0.1193658,
403
+ -0.12102433,
404
+ -0.22091354,
405
+ 0.43427913,
406
+ 0.71447886,
407
+ 0.32745991,
408
+ 0.03466398,
409
+ ],
410
+ [
411
+ -0.13194625,
412
+ -0.12262688,
413
+ 0.18029209,
414
+ 0.16555524,
415
+ 0.39594125,
416
+ -0.58110665,
417
+ 0.16161717,
418
+ 0.0839783,
419
+ 0.0911945,
420
+ 0.34546976,
421
+ -0.29415349,
422
+ 0.29891936,
423
+ -0.60834721,
424
+ 0.5943593,
425
+ -0.29473819,
426
+ 0.4864154,
427
+ ],
428
+ [
429
+ 0.40850093,
430
+ -0.4638894,
431
+ -0.39732987,
432
+ -0.01972861,
433
+ 0.51189582,
434
+ 0.10176704,
435
+ 0.37528519,
436
+ -0.41479418,
437
+ -0.1932531,
438
+ 0.54732221,
439
+ -0.11876511,
440
+ 0.32843973,
441
+ -0.259283,
442
+ 0.59500132,
443
+ 0.35168375,
444
+ -0.21733727,
445
+ ],
446
+ [
447
+ -0.50627723,
448
+ -0.1973602,
449
+ -0.02339884,
450
+ -0.66846048,
451
+ 0.62696606,
452
+ 0.60049717,
453
+ 0.69143364,
454
+ -0.48053591,
455
+ 0.17812208,
456
+ -0.58481821,
457
+ -0.23551415,
458
+ -0.06229112,
459
+ 0.20993116,
460
+ -0.72485884,
461
+ 0.34375662,
462
+ -0.23539168,
463
+ ],
464
+ [
465
+ -0.51388312,
466
+ -0.2788953,
467
+ 0.00859533,
468
+ -0.5247195,
469
+ -0.18021544,
470
+ 0.28372911,
471
+ 0.10791359,
472
+ 0.13033494,
473
+ 0.34294013,
474
+ -0.70310089,
475
+ -0.13245433,
476
+ 0.48661081,
477
+ 0.08451644,
478
+ -0.69990992,
479
+ 0.0408274,
480
+ -0.47204888,
481
+ ],
482
+ [
483
+ 0.68546275,
484
+ 0.22581365,
485
+ -0.32571833,
486
+ 0.34394298,
487
+ -0.43232367,
488
+ -0.5041842,
489
+ 0.04784017,
490
+ -0.53067936,
491
+ -0.50049908,
492
+ 0.36874221,
493
+ 0.22429186,
494
+ 0.4616482,
495
+ 0.11159174,
496
+ -0.26827959,
497
+ -0.39372848,
498
+ -0.40987423,
499
+ ],
500
+ ]
501
+ )
502
+
503
# Rebuild bl62np from the hard-coded X table: row ii of X is assigned to the
# ii-th key of bl62, and each row is followed by Ndim * 5 zeros so the vector
# can be added to an n0-long state.
bl62np = {}
vkk = list(bl62.keys())
for ii in range(20):
    kk = vkk[ii]
    bl62np[kk] = np.array(list(X[ii,]) + [0] * Ndim * 5)
508
+
192
509
 
193
510
def EncodingCDR3(s, M, n0):
    """Fold a CDR3 sequence into a single vector of length ``n0``.

    Starting from an all-zero vector, each residue's ``bl62np`` vector is
    added to the running state and the sum is multiplied by ``M``; the state
    after the last residue is returned.

    Args:
        s: CDR3 sequence; every element must be a key of ``bl62np``.
        M: matrix applied after each residue.
        n0: length of the state vector.

    Returns:
        The final state (the zero vector when ``s`` is empty).
    """
    state = np.array([0] * n0)
    for residue in s:
        state = np.dot(M, state + bl62np[residue])
    return state
199
516
 
517
+
200
518
def BuildLengthDict(seqs, sIDs, vGene=[], INFO=[]):
    """Bucket CDR3 sequences by length, keeping lengths 10 to 24 only.

    Sequences containing a letter that is not a key of ``AAencodingDict`` are
    skipped and counted; one warning with the count is printed at the end.

    Args:
        seqs: CDR3 sequences.
        sIDs: identifiers, parallel to ``seqs``.
        vGene: optional V-gene labels, parallel to ``seqs``.
        INFO: optional extra records, parallel to ``seqs``.

    Returns:
        ``(LengthD, VgeneD, InfoD, SeqD)``: dicts keyed by sequence length
        holding the ids, V genes, info records and sequences of that length.
        ``VgeneD`` / ``InfoD`` stay empty when the matching input is empty.
    """
    alphabet = set(AAencodingDict.keys())
    alphabet_size = len(AAencodingDict)
    with_v = len(vGene) > 0
    with_info = len(INFO) > 0
    LengthD, SeqD, VgeneD, InfoD = {}, {}, {}, {}
    skipped = 0
    for pos, cdr3 in enumerate(seqs):
        seq_id = sIDs[pos]
        # A union larger than the alphabet means a non amino acid letter.
        if len(set(cdr3) | alphabet) > alphabet_size:
            skipped += 1
            continue
        if with_v:
            v_label = vGene[pos]
        if with_info:
            record = INFO[pos]
        size = len(cdr3)
        if not 10 <= size <= 24:
            continue
        LengthD.setdefault(size, []).append(seq_id)
        SeqD.setdefault(size, []).append(cdr3)
        if with_v:
            VgeneD.setdefault(size, []).append(v_label)
        if with_info:
            InfoD.setdefault(size, []).append(record)
    if skipped > 0:
        print("Warning: Skipped %d sequences with non AA letter!" % (skipped))
    return LengthD, VgeneD, InfoD, SeqD
243
561
 
562
+
244
563
def CollapseUnique(LD, VD, ID, SD):
    """Collapse duplicate (sequence, V gene) pairs within each length bucket.

    For every key of ``LD`` the records are sorted by (sequence, V gene) and
    consecutive equal pairs are merged; the info records of merged entries
    are gathered into one list, in their original relative order.

    Args:
        LD: length -> ids (only its keys and bucket sizes are used).
        VD: length -> V genes; may be empty, in which case every record is
            given the placeholder gene ``"TRBV2-1*01"``.
        ID: length -> info records.
        SD: length -> sequences.

    Returns:
        ``(LDu, VDu, IDu, SDu)`` where, per length, ``LDu`` holds
        ``0..n_unique-1``, ``SDu`` the unique sequences, ``VDu`` the matching
        V genes (only filled when ``VD`` is non-empty) and ``IDu`` a list of
        info-record lists. An empty bucket yields empty lists instead of
        raising ``IndexError``.
    """
    LDu, VDu, IDu, SDu = {}, {}, {}, {}
    has_v = len(VD) > 0
    for kk in LD.keys():
        ids = list(LD[kk])
        vgenes = list(VD[kk]) if has_v else ["TRBV2-1*01"] * len(ids)
        records = sorted(
            zip(ids, list(SD[kk]), vgenes, list(ID[kk])),
            key=lambda rec: (rec[1], rec[2]),
        )
        uS, uV, uI = [], [], []
        for _, seq, vgene, info in records:
            # Sorted input: duplicates are adjacent, so compare with the last
            # unique entry only.
            if uS and seq == uS[-1] and vgene == uV[-1]:
                uI[-1].append(info)
            else:
                uS.append(seq)
                uV.append(vgene)
                uI.append([info])
        LDu[kk] = list(range(len(uS)))
        SDu[kk] = uS
        if has_v:
            VDu[kk] = uV
        IDu[kk] = uI
    return LDu, VDu, IDu, SDu
290
609
 
291
610
 
@@ -297,14 +616,15 @@ class CDR3:
297
616
  ## KS: Kmer size
298
617
  ## st: the first 0:(st-1) amino acids will not be included in K-merization
299
618
  ## ed: the last L-ed amino acids will be skipped
300
- self.s=s
301
- self.ID=sID
302
- L=len(s)
303
- self.L=L
304
- sub_s=s[st: (L-ed)]
305
- Ls=len(sub_s)
306
- Kmer=[sub_s[x:(x+KS)] for x in range(0,Ls-KS+1)]
307
- self.Kmer=Kmer
619
+ self.s = s
620
+ self.ID = sID
621
+ L = len(s)
622
+ self.L = L
623
+ sub_s = s[st : (L - ed)]
624
+ Ls = len(sub_s)
625
+ Kmer = [sub_s[x : (x + KS)] for x in range(0, Ls - KS + 1)]
626
+ self.Kmer = Kmer
627
+
308
628
 
309
629
  class KmerSet:
310
630
  ## Kmer set for fast read searching based on mismatch-allowed Kmer index
@@ -313,263 +633,277 @@ class KmerSet:
313
633
  ## Seqs and sIDs must have the same length
314
634
  if len(Seqs) != len(sIDs):
315
635
  raise "Sequence and ID lists have different length. Please check input."
316
- KmerDict={}
317
- N=len(Seqs)
318
- self.N=N
319
- CDR3Dict={}
320
- LLs=[]
321
- for ii in range(0,N):
322
- s=Seqs[ii]
323
- sID=sIDs[ii]
324
- cc=CDR3(s,sID,KS,st,ed)
325
- CDR3Dict[cc.ID]=cc.Kmer
326
- KK=cc.Kmer
636
+ KmerDict = {}
637
+ N = len(Seqs)
638
+ self.N = N
639
+ CDR3Dict = {}
640
+ LLs = []
641
+ for ii in range(0, N):
642
+ s = Seqs[ii]
643
+ sID = sIDs[ii]
644
+ cc = CDR3(s, sID, KS, st, ed)
645
+ CDR3Dict[cc.ID] = cc.Kmer
646
+ KK = cc.Kmer
327
647
  LLs.append(cc.L)
328
648
  for kk in KK:
329
649
  if kk not in KmerDict:
330
- KmerDict[kk]=[sID]
650
+ KmerDict[kk] = [sID]
331
651
  else:
332
652
  KmerDict[kk].append(sID)
333
- self.KD=KmerDict
334
- self.KS=KS
335
- self.CD=CDR3Dict
336
- self.LL=LLs
337
- def FindKmerNeighbor(self,kk):
338
- KS=self.KS
339
- KS_n1=[]
653
+ self.KD = KmerDict
654
+ self.KS = KS
655
+ self.CD = CDR3Dict
656
+ self.LL = LLs
657
+
658
+ def FindKmerNeighbor(self, kk):
659
+ KS = self.KS
660
+ KS_n1 = []
340
661
  for jj in range(KS):
341
- kk_pre=[kk[0:jj]]*20
342
- kk_suf=[kk[(jj+1):KS]]*20
343
- kkn=list(zip(kk_pre,AAstringList,kk_suf))
344
- KS_n1+=[''.join(list(x)) for x in kkn]
662
+ kk_pre = [kk[0:jj]] * 20
663
+ kk_suf = [kk[(jj + 1) : KS]] * 20
664
+ kkn = list(zip(kk_pre, AAstringList, kk_suf))
665
+ KS_n1 += ["".join(list(x)) for x in kkn]
345
666
  return KS_n1
346
- def FindKmerNeighbor2(self,kk):
667
+
668
+ def FindKmerNeighbor2(self, kk):
347
669
  ## KS>=6, allowing 2 mismatches. CDR3 length must be >= 10
348
- KS=self.KS
349
- KS_n1=[]
670
+ KS = self.KS
671
+ KS_n1 = []
350
672
  for jj in range(KS):
351
673
  for ii in range(KS):
352
- if ii<=jj:
674
+ if ii <= jj:
353
675
  continue
354
- kk_pre=[kk[0:jj]]*20
355
- kk_mid=[kk[(jj+1):ii]]*20
356
- kk_suf=[kk[(ii+1):KS]]*400
357
- kkn=list(zip(kk_pre,AAstringList,kk_mid))
358
- kkn=[''.join(list(x)) for x in kkn]
359
- kkn=[[x]*20 for x in kkn]
360
- kkn=list(chain(*kkn))
361
- kkn2=list(zip(kkn, AAstringList*20, kk_suf))
362
- kkn2=[''.join(list(x)) for x in kkn2]
363
- KS_n1+=kkn2
676
+ kk_pre = [kk[0:jj]] * 20
677
+ kk_mid = [kk[(jj + 1) : ii]] * 20
678
+ kk_suf = [kk[(ii + 1) : KS]] * 400
679
+ kkn = list(zip(kk_pre, AAstringList, kk_mid))
680
+ kkn = ["".join(list(x)) for x in kkn]
681
+ kkn = [[x] * 20 for x in kkn]
682
+ kkn = list(chain(*kkn))
683
+ kkn2 = list(zip(kkn, AAstringList * 20, kk_suf))
684
+ kkn2 = ["".join(list(x)) for x in kkn2]
685
+ KS_n1 += kkn2
364
686
  return KS_n1
687
+
365
688
  def KmerIndex(self):
366
689
  ## For each K-mer, find its nearest neighbor with 1 character mismatch
367
- KKs=list(self.KD.keys())
368
- KS=self.KS
369
- KKs_set=set(KKs)
370
- Skk='_'.join(KKs)
371
- KI_Dict={}
690
+ KKs = list(self.KD.keys())
691
+ KS = self.KS
692
+ KKs_set = set(KKs)
693
+ Skk = "_".join(KKs)
694
+ KI_Dict = {}
372
695
  for kk in KKs:
373
- ## kk_neighbor=[]
374
- ## for jj in range(KS):
375
- ## kk_pre=kk[0:jj]
376
- ## kk_suf=kk[(jj+1):KS]
377
- ## pat=kk_pre+'['+AAstring+']{1}'+kk_suf
378
- ## p=re.compile(pat)
379
- ## mm=[m.group() for m in p.finditer(Skk)]
380
- ## kk_neighbor+=mm
381
- KS_n=set(self.FindKmerNeighbor(kk))
696
+ ## kk_neighbor=[]
697
+ ## for jj in range(KS):
698
+ ## kk_pre=kk[0:jj]
699
+ ## kk_suf=kk[(jj+1):KS]
700
+ ## pat=kk_pre+'['+AAstring+']{1}'+kk_suf
701
+ ## p=re.compile(pat)
702
+ ## mm=[m.group() for m in p.finditer(Skk)]
703
+ ## kk_neighbor+=mm
704
+ KS_n = set(self.FindKmerNeighbor(kk))
382
705
  kk_neighbor = KS_n & KKs_set
383
- KI_Dict[kk]=list(kk_neighbor)
706
+ KI_Dict[kk] = list(kk_neighbor)
384
707
  return KI_Dict
708
+
385
709
  def updateKD(self, KI):
386
710
  ## group sequences sharing motifs with 1-2 mismatches
387
- KD=self.KD
388
- KDnew={}
711
+ KD = self.KD
712
+ KDnew = {}
389
713
  for kk in KD:
390
- kkm=KI[kk]
391
- vvL=itemgetter(*kkm)(KD)
392
- if isinstance(vvL[0],list):
393
- vvL=list(chain(*vvL))
394
- KDnew[kk]=vvL
714
+ kkm = KI[kk]
715
+ vvL = itemgetter(*kkm)(KD)
716
+ if isinstance(vvL[0], list):
717
+ vvL = list(chain(*vvL))
718
+ KDnew[kk] = vvL
395
719
  return KDnew
396
720
 
397
- def GenerateMotifGraph(mD,seqs,seqID):
398
- SeqShareGraph={}
399
- mDL={}
721
+
722
def GenerateMotifGraph(mD, seqs, seqID):
    """Build an undirected graph linking same-length sequences sharing a motif.

    Args:
        mD: motif -> list of sequence ids (ids index into ``seqs``).
        seqs: sequences, indexable by id.
        seqID: unused; kept for call compatibility.

    Returns:
        dict: id -> list of linked ids, without duplicates, in the order the
        links were first seen.
    """
    SeqShareGraph = {}
    # Mirror of SeqShareGraph holding sets, so the duplicate check is O(1)
    # instead of a scan of the adjacency list.
    linked = {}

    def _link(src, dst):
        """Record dst as a neighbour of src unless already recorded."""
        if src not in SeqShareGraph:
            SeqShareGraph[src] = [dst]
            linked[src] = {dst}
        elif dst not in linked[src]:
            SeqShareGraph[src].append(dst)
            linked[src].add(dst)

    # Lengths are resolved for every motif first, so a bad id fails before
    # any edge is added.
    lengths_by_motif = {kk: [len(seqs[v]) for v in vv] for kk, vv in mD.items()}
    for kk, members in mD.items():
        lengths = lengths_by_motif[kk]
        for ii, id_1 in enumerate(members):
            for jj in range(ii + 1, len(members)):
                if lengths[jj] != lengths[ii]:
                    continue
                id_2 = members[jj]
                _link(id_1, id_2)
                _link(id_2, id_1)
    return SeqShareGraph
429
754
 
755
+
430
756
def generateSSG(Kset, CDR3s, k_thr=2):
    """Build the sequence-sharing graph from a K-mer set.

    For each sequence id in ``Kset.CD`` the ids reachable through its K-mers
    (after neighbour merging via ``Kset.updateKD(Kset.KmerIndex())``) are
    counted; ids seen at least ``k_thr`` times and whose length in
    ``Kset.LL`` equals ``len(CDR3s[id])`` become its partners.

    Args:
        Kset: object exposing ``KmerIndex()``, ``updateKD(KI)``, ``CD``
            (id -> K-mers) and ``LL`` (length per id; ids are used as
            integer indices into it).
        CDR3s: sequences, indexable by id.
        k_thr: minimum number of shared K-mer hits.

    Returns:
        dict: id -> list of partner ids (the id itself excluded); ids without
        partners are omitted.
    """
    KDnew = Kset.updateKD(Kset.KmerIndex())
    CD = Kset.CD
    LL = np.array(Kset.LL)
    SSG = {}
    for kk in CD:
        # Plain concatenation works for zero, one or many K-mers alike.
        hits = Counter(seq_id for kmer in CD[kk] for seq_id in KDnew[kmer])
        frequent = [seq_id for seq_id, cnt in hits.items() if cnt >= k_thr]
        if len(frequent) == 0:
            continue
        frequent = np.array(frequent)
        same_length = frequent[np.where(LL[frequent] == len(CDR3s[kk]))[0]]
        # Filter instead of list.remove(): kk itself may be absent when it
        # did not reach k_thr, which used to raise ValueError.
        partners = [seq_id for seq_id in same_length if seq_id != kk]
        if len(partners) > 0:
            SSG[kk] = partners
    return SSG
459
785
 
460
- def SeqComparison(s1,s2,gap=-6):
461
- n=len(s1)
462
- CorList=[]
463
- score=0
464
- for kk in range(0,n):
465
- aa=s1[kk]
466
- bb=s2[kk]
467
- if aa in ['.','-','*'] or bb in ['.','-','*']:
468
- if aa!=bb:
786
+
787
def SeqComparison(s1, s2, gap=-6):
    """Score two pre-aligned sequences position by position.

    A position where either side is one of ``.``, ``-`` or ``*`` adds ``gap``
    when the two symbols differ and nothing when they are equal; every other
    position adds the capped substitution score from ``blosum62n``.

    Args:
        s1: first sequence; its length sets the number of positions compared.
        s2: second sequence, at least as long as ``s1``.
        gap: score added for a gap against a non-identical symbol.

    Returns:
        The summed score.
    """
    gap_symbols = [".", "-", "*"]
    total = 0
    for pos, left in enumerate(s1):
        right = s2[pos]
        if left in gap_symbols or right in gap_symbols:
            if left != right:
                total += gap
        else:
            # Covers both the identical and the mismatching residue case.
            total += blosum62n[(left, right)]
    return total
482
809
 
483
- def NHLocalAlignment(Seq1,Seq2,gap_thr=1,gap=-6):
484
- n1=len(Seq1)
485
- n2=len(Seq2)
486
- if n1<n2:
487
- Seq=Seq1
488
- Seq1=Seq2
489
- Seq2=Seq
490
- nn=n2-n1
810
+
811
def NHLocalAlignment(Seq1, Seq2, gap_thr=1, gap=-6):
    """Best score of the shorter sequence gap-padded against the longer one.

    The shorter sequence receives as many gaps as the length difference, in
    every placement ``InsertGap`` produces, and each padded variant is scored
    against the longer sequence with ``SeqComparison``.

    Args:
        Seq1, Seq2: sequences to compare (order does not matter).
        gap_thr: largest length difference that is still aligned.
        gap: gap score forwarded to ``SeqComparison``.

    Returns:
        The maximum score, or ``-1`` when the length difference exceeds
        ``gap_thr``.
    """
    if len(Seq1) < len(Seq2):
        longer, shorter = Seq2, Seq1
    else:
        longer, shorter = Seq1, Seq2
    deficit = len(longer) - len(shorter)
    if deficit > gap_thr:
        return -1
    scores = [SeqComparison(longer, padded, gap) for padded in InsertGap(shorter, deficit)]
    return max(scores)
504
832
 
505
- def InsertGap(Seq,n):
833
+
834
def InsertGap(Seq, n):
    """Return every way of inserting ``n`` gap characters (``-``) into ``Seq``.

    Gaps are inserted one at a time at every possible offset, so placements
    that lead to the same string are listed more than once. The output for
    ``n`` = 0, 1 and 2 is element-for-element what the earlier
    special-cased implementation produced; larger ``n`` now yields real
    variants instead of an empty list.

    Args:
        Seq: sequence string.
        n: number of gaps to insert.

    Returns:
        List of gapped strings, each ``len(Seq) + n`` long; ``[Seq]`` for
        ``n == 0`` and ``[]`` for negative ``n``.
    """
    if n < 0:
        return []
    variants = [Seq]
    for _ in range(n):
        variants = [
            partial[0:cut] + "-" + partial[cut:]
            for partial in variants
            for cut in range(0, len(partial) + 1)
        ]
    return variants
522
851
 
523
- def falign(s1, s2, V1, V2 ,st,VScore={}, UseV=True, gapn=1, gap=-6):
524
- mid1=s1[st:-2]
525
- mid2=s2[st:-2]
852
+
853
def falign(s1, s2, V1, V2, st, VScore={}, UseV=True, gapn=1, gap=-6):
    """Score two CDR3s: length-normalised alignment score plus a V-gene term.

    Only the middle part ``s[st:-2]`` of each sequence is aligned (with
    ``NHLocalAlignment``); the alignment score is divided by the longer
    middle length and the V-gene score is added.

    Args:
        s1, s2: CDR3 sequences.
        V1, V2: V-gene labels of the two sequences.
        st: number of leading residues excluded from the alignment.
        VScore: (V, V) -> score; looked up in either key order.
        UseV: when False the V-gene term is fixed at 4.0.
        gapn: maximum length difference passed to the aligner.
        gap: gap score passed to the aligner.

    Returns:
        The combined score, or ``0`` when ``UseV`` is set, the genes differ
        and the pair is missing from ``VScore``.
    """
    core1 = s1[st:-2]
    core2 = s2[st:-2]
    if not UseV:
        V_score = 4.0
    elif V2 == V1:
        V_score = 4
    else:
        pair = (V1, V2)
        if pair not in VScore:
            pair = (V2, V1)
        if pair not in VScore:
            # V gene not found
            return 0
        V_score = VScore[pair] / 20.0
    aln = NHLocalAlignment(core1, core2, gapn, gap)
    longest = max(len(core1), len(core2))
    return aln / float(longest) + V_score
543
873
 
874
+
544
875
def UpdateSSG(SSG, seqs, Vgenes, Vscore={}, UseV=True, gap=-6, gapn=1, cutoff=7.5):
    """Keep only the graph edges whose ``falign`` score reaches ``cutoff``.

    Args:
        SSG: id -> list of candidate partner ids.
        seqs: sequences, indexable by id.
        Vgenes: V-gene labels, indexable by id.
        Vscore: V-gene pair scores forwarded to ``falign``.
        UseV: whether ``falign`` uses the V-gene term.
        gap: gap score forwarded to ``falign`` (previously ignored in favour
            of a hard-coded -6, which is also the default).
        gapn: maximum length difference forwarded to ``falign`` (previously
            ignored in favour of a hard-coded 1, which is also the default).
        cutoff: minimum score for an edge to be kept.

    Returns:
        dict: id -> list of partners passing the cutoff; ids with no passing
        partner are omitted.
    """
    SSGnew = {}
    for kk, partners in SSG.items():
        s1 = seqs[kk]
        V1 = Vgenes[kk]
        for vv in partners:
            # st=3: the first three residues are excluded from the alignment.
            score = falign(
                s1,
                seqs[vv],
                V1,
                Vgenes[vv],
                st=3,
                VScore=Vscore,
                UseV=UseV,
                gap=gap,
                gapn=gapn,
            )
            if score >= cutoff:
                SSGnew.setdefault(kk, []).append(vv)
    return SSGnew
568
901
 
902
+
569
903
  def dfs(graph, start):
570
- '''
904
+ """
571
905
  Non-resursive depth first search
572
- '''
906
+ """
573
907
  visited = set()
574
908
  stack = [start]
575
909
  while stack:
@@ -577,95 +911,100 @@ def dfs(graph, start):
577
911
  if vertex not in visited:
578
912
  visited.add(vertex)
579
913
  stack.extend(set(graph[vertex]) - visited)
580
-
914
+
581
915
  return visited
582
916
 
917
+
583
918
  def IdentifyMotifCluster(SSG):
584
919
  ## Input SeqShareGraph dictionary representation of sparse matrix
585
- POS=set(SSG.keys())
586
- NP=len(POS)
587
- ClusterList=[]
588
- tmpL=set(chain(*ClusterList))
589
- count=0
920
+ POS = set(SSG.keys())
921
+ NP = len(POS)
922
+ ClusterList = []
923
+ tmpL = set(chain(*ClusterList))
924
+ count = 0
590
925
  while 1:
591
- xx=POS ^ tmpL
592
- if len(xx)==0:
593
- break
594
- for ii in xx:
595
- # STACK=LoadComm([],ii)
596
- STACK=dfs(SSG,ii)
597
- tmpL = tmpL | STACK
598
- ClusterList.append(list(STACK))
599
- # tmpL=set(chain(*ClusterList))
600
- count+=1
601
- if count % 200 ==0:
602
- print (" Solved %d clusters" %(count))
603
- break
926
+ xx = POS ^ tmpL
927
+ if len(xx) == 0:
928
+ break
929
+ for ii in xx:
930
+ # STACK=LoadComm([],ii)
931
+ STACK = dfs(SSG, ii)
932
+ tmpL = tmpL | STACK
933
+ ClusterList.append(list(STACK))
934
+ # tmpL=set(chain(*ClusterList))
935
+ count += 1
936
+ if count % 200 == 0:
937
+ print(" Solved %d clusters" % (count))
938
+ break
604
939
  return ClusterList
605
940
 
941
+
606
942
  def IdentifyVgeneCluster(sMat):
607
943
  ## Input Vgene score matrix
608
- vG={}
609
- n=len(sMat)
610
- IDs=[x for x in range(n)]
944
+ vG = {}
945
+ n = len(sMat)
946
+ IDs = [x for x in range(n)]
611
947
  for kk in IDs:
612
- LL=sMat[:,kk]
613
- vL=np.where(LL>=thr_v)[0]
614
- if len(vL)>0:
615
- vG[kk]=vL
616
- CL=IdentifyMotifCluster(vG)
948
+ LL = sMat[:, kk]
949
+ vL = np.where(LL >= thr_v)[0]
950
+ if len(vL) > 0:
951
+ vG[kk] = vL
952
+ CL = IdentifyMotifCluster(vG)
617
953
  return CL
618
-
954
+
955
+
619
956
  def ParseFa(fname):
620
- InputStr=open(fname).readlines()
621
- FaDict={}
622
- seq=''
957
+ InputStr = open(fname).readlines()
958
+ FaDict = {}
959
+ seq = ""
623
960
  for line in InputStr:
624
- if line.startswith('>'):
625
- if len(seq)>0:
626
- FaDict[seqHead]=seq
627
- seq=''
628
- seqHead=line.strip()
961
+ if line.startswith(">"):
962
+ if len(seq) > 0:
963
+ FaDict[seqHead] = seq
964
+ seq = ""
965
+ seqHead = line.strip()
629
966
  else:
630
- seq+=line.strip()
967
+ seq += line.strip()
631
968
  if seqHead not in FaDict:
632
- FaDict[seqHead]=seq
969
+ FaDict[seqHead] = seq
633
970
  return FaDict
634
971
 
972
+
635
973
  def PreCalculateVgeneDist(VgeneFa="Imgt_Human_TRBV.fasta"):
636
974
  ## Only run one time if needed
637
- FaDict=ParseFa(cur_dir+VgeneFa)
638
- VScore={}
639
- CDR1Dict={}
640
- CDR2Dict={}
975
+ FaDict = ParseFa(cur_dir + VgeneFa)
976
+ VScore = {}
977
+ CDR1Dict = {}
978
+ CDR2Dict = {}
641
979
  for kk in FaDict:
642
- if '|' in kk:
643
- VV=kk.split('|')[1]
980
+ if "|" in kk:
981
+ VV = kk.split("|")[1]
644
982
  else:
645
- VV=kk[1:]
646
- CDR1Dict[VV]=FaDict[kk][26:37] ## Imgt CDR1: 27 - 38
647
- CDR2Dict[VV]=FaDict[kk][55:64] ## Imgt CDR2: 56 - 65
648
- Vkeys=list(CDR1Dict.keys())
649
- nn=len(Vkeys)
650
- for ii in range(0,nn):
651
- V1=Vkeys[ii]
652
- s1_CDR1=CDR1Dict[V1]
653
- s1_CDR2=CDR2Dict[V1]
654
- for jj in range(ii,nn):
655
- V2=Vkeys[jj]
656
- s2_CDR1=CDR1Dict[V2]
657
- s2_CDR2=CDR2Dict[V2]
658
- score1=SeqComparison(s1_CDR1,s2_CDR1)
659
- score2=SeqComparison(s1_CDR2,s2_CDR2)
660
- #print score1+score2
661
- VScore[(V1,V2)]=score1+score2
662
- gg=open('VgeneScores.txt','w')
983
+ VV = kk[1:]
984
+ CDR1Dict[VV] = FaDict[kk][26:37] ## Imgt CDR1: 27 - 38
985
+ CDR2Dict[VV] = FaDict[kk][55:64] ## Imgt CDR2: 56 - 65
986
+ Vkeys = list(CDR1Dict.keys())
987
+ nn = len(Vkeys)
988
+ for ii in range(0, nn):
989
+ V1 = Vkeys[ii]
990
+ s1_CDR1 = CDR1Dict[V1]
991
+ s1_CDR2 = CDR2Dict[V1]
992
+ for jj in range(ii, nn):
993
+ V2 = Vkeys[jj]
994
+ s2_CDR1 = CDR1Dict[V2]
995
+ s2_CDR2 = CDR2Dict[V2]
996
+ score1 = SeqComparison(s1_CDR1, s2_CDR1)
997
+ score2 = SeqComparison(s1_CDR2, s2_CDR2)
998
+ # print score1+score2
999
+ VScore[(V1, V2)] = score1 + score2
1000
+ gg = open("VgeneScores.txt", "w")
663
1001
  for kk in VScore:
664
- vv=VScore[kk]
665
- line=kk[0]+'\t'+kk[1]+'\t'+str(vv)+'\n'
1002
+ vv = VScore[kk]
1003
+ line = kk[0] + "\t" + kk[1] + "\t" + str(vv) + "\n"
666
1004
  gg.write(line)
667
1005
  gg.close()
668
1006
 
1007
+
669
1008
  def MergeCL(Cls):
670
1009
  ## merge pre-clusters according to shared sequences
671
1010
  ## shared sequences between pre-clusters are due to approximated centroid nearest neighbor search
@@ -673,16 +1012,16 @@ def MergeCL(Cls):
673
1012
  for idx, cc in enumerate(Cls):
674
1013
  for x in cc:
675
1014
  if x not in vDict:
676
- vDict[x]=[idx]
1015
+ vDict[x] = [idx]
677
1016
  else:
678
1017
  vDict[x].append(idx)
679
- Cls_new=[]
1018
+ Cls_new = []
680
1019
  cGraph = {}
681
1020
  for kk in vDict:
682
- vv=vDict[kk]
683
- if len(vv)>1:
1021
+ vv = vDict[kk]
1022
+ if len(vv) > 1:
684
1023
  for ii in vv:
685
- vv1=deepcopy(vv)
1024
+ vv1 = deepcopy(vv)
686
1025
  vv1.pop(vv1.index(ii))
687
1026
  if ii not in cGraph:
688
1027
  cGraph[ii] = vv1
@@ -690,21 +1029,21 @@ def MergeCL(Cls):
690
1029
  cGraph[ii] += list(set(vv1 + cGraph[ii]))
691
1030
  DupKeys = list(cGraph.keys())
692
1031
  for kk in vDict:
693
- vv=vDict[kk]
694
- if len(vv)==1:
1032
+ vv = vDict[kk]
1033
+ if len(vv) == 1:
695
1034
  if vv[0] in DupKeys:
696
1035
  continue
697
1036
  cc = Cls[vv[0]]
698
1037
  if cc not in Cls_new:
699
1038
  Cls_new.append(cc)
700
- Cls_Dup=[]
1039
+ Cls_Dup = []
701
1040
  for kk in cGraph:
702
1041
  cc = dfs(cGraph, kk)
703
1042
  cc = list(cc)
704
1043
  cc = sorted(cc)
705
1044
  if cc not in Cls_Dup:
706
1045
  Cls_Dup.append(cc)
707
- if len(Cls_Dup)>0:
1046
+ if len(Cls_Dup) > 0:
708
1047
  for cdup in Cls_Dup:
709
1048
  cc_merged = []
710
1049
  for ii in cdup:
@@ -715,355 +1054,411 @@ def MergeCL(Cls):
715
1054
  Cls_new.append(cc_merged)
716
1055
  return Cls_new
717
1056
 
718
- def EncodeRepertoire(inputfile, outdir, outfile='',exact=True, ST=3, thr_v=3.7, thr_s=3.5, VDict={},Vgene=True,thr_iso=10, gap=-6, GPU=False,Mat=False, verbose=False):
1057
+
1058
+ def EncodeRepertoire(
1059
+ inputfile,
1060
+ outdir,
1061
+ outfile="",
1062
+ exact=True,
1063
+ ST=3,
1064
+ thr_v=3.7,
1065
+ thr_s=3.5,
1066
+ VDict={},
1067
+ Vgene=True,
1068
+ thr_iso=10,
1069
+ gap=-6,
1070
+ GPU=False,
1071
+ Mat=False,
1072
+ verbose=False,
1073
+ ):
719
1074
  ## No V gene version
720
1075
  ## Encode CDR3 sequences into 96 dimensional space and perform k-means clustering
721
1076
  ## If exact is True, SW alignment will be performed within each cluster after isometric encoding and clustering
722
- h=open(inputfile)
723
- t1=time.time()
724
- alines=h.readlines()
725
- ww=alines[0].strip().split('\t')
726
- if not ww[0].startswith('C'):
1077
+ h = open(inputfile)
1078
+ t1 = time.time()
1079
+ alines = h.readlines()
1080
+ ww = alines[0].strip().split("\t")
1081
+ if not ww[0].startswith("C"):
727
1082
  ## header line
728
- hline=alines[0]
729
- alines=alines[1:]
730
- elif 'CDR3' in ww[0]:
731
- hline=alines[0]
732
- alines=alines[1:]
1083
+ hline = alines[0]
1084
+ alines = alines[1:]
1085
+ elif "CDR3" in ww[0]:
1086
+ hline = alines[0]
1087
+ alines = alines[1:]
733
1088
  else:
734
- hline='CDR3\t'+'\t'.join(['Info'+str(x) for x in range(len(ww)-1)])
735
- seqs=[]
736
- vgs=[]
737
- infoList=[]
738
- count=0
1089
+ hline = "CDR3\t" + "\t".join(["Info" + str(x) for x in range(len(ww) - 1)])
1090
+ seqs = []
1091
+ vgs = []
1092
+ infoList = []
1093
+ count = 0
739
1094
  if verbose:
740
- print('Creating CDR3 list')
1095
+ print("Creating CDR3 list")
741
1096
  for ll in alines:
742
- ww=ll.strip().split('\t')
743
- cdr3=ww[0]
744
- if '*' in cdr3:
1097
+ ww = ll.strip().split("\t")
1098
+ cdr3 = ww[0]
1099
+ if "*" in cdr3:
745
1100
  continue
746
- if '_' in cdr3:
1101
+ if "_" in cdr3:
747
1102
  continue
748
1103
  seqs.append(ww[0])
749
1104
  if Vgene:
750
1105
  vgs.append(ww[1])
751
- infoList.append('\t'.join(ww[1:]))
1106
+ infoList.append("\t".join(ww[1:]))
752
1107
  else:
753
- infoList.append('\t'.join(ww[1:]))
754
- count+=1
755
- if len(outfile)==0:
756
- outfile=inputfile.split('/')
757
- outfile=outfile[len(outfile)-1]
758
- outfile=outdir+'/'+re.sub('\\.[txcsv]+','',outfile)+'-'+'-RotationEncodingBL62.txt'
759
- g=open(outfile,'w')
760
- tm=strftime("%Y-%m-%d %H:%M:%S", gmtime())
761
- InfoLine='##TIME:'+tm+'|cmd: '+sys.argv[0]+'|'+inputfile+'|IsometricDistance_Thr='+str(thr_iso)+'|thr_v='+str(thr_v)+'|thr_s='+str(thr_s)+'|exact='+str(exact)+'|Vgene='+str(Vgene)+'|ST='+str(ST)
762
- g.write(InfoLine+'\n')
763
- g.write("##Column Info: CDR3 aa sequence, cluster id, other information in the input file\n")
764
- gr=0
1108
+ infoList.append("\t".join(ww[1:]))
1109
+ count += 1
1110
+ if len(outfile) == 0:
1111
+ outfile = inputfile.split("/")
1112
+ outfile = outfile[len(outfile) - 1]
1113
+ outfile = (
1114
+ outdir
1115
+ + "/"
1116
+ + re.sub("\\.[txcsv]+", "", outfile)
1117
+ + "-"
1118
+ + "-RotationEncodingBL62.txt"
1119
+ )
1120
+ g = open(outfile, "w")
1121
+ tm = strftime("%Y-%m-%d %H:%M:%S", gmtime())
1122
+ InfoLine = (
1123
+ "##TIME:"
1124
+ + tm
1125
+ + "|cmd: "
1126
+ + sys.argv[0]
1127
+ + "|"
1128
+ + inputfile
1129
+ + "|IsometricDistance_Thr="
1130
+ + str(thr_iso)
1131
+ + "|thr_v="
1132
+ + str(thr_v)
1133
+ + "|thr_s="
1134
+ + str(thr_s)
1135
+ + "|exact="
1136
+ + str(exact)
1137
+ + "|Vgene="
1138
+ + str(Vgene)
1139
+ + "|ST="
1140
+ + str(ST)
1141
+ )
1142
+ g.write(InfoLine + "\n")
1143
+ g.write(
1144
+ "##Column Info: CDR3 aa sequence, cluster id, other information in the input file\n"
1145
+ )
1146
+ gr = 0
765
1147
  ## Split into different lengths
766
- LD,VD, ID,SD= BuildLengthDict(seqs, vGene=vgs,INFO=infoList,sIDs=[x for x in range(len(seqs))])
1148
+ LD, VD, ID, SD = BuildLengthDict(
1149
+ seqs, vGene=vgs, INFO=infoList, sIDs=[x for x in range(len(seqs))]
1150
+ )
767
1151
  LDu, VDu, IDu, SDu = CollapseUnique(LD, VD, ID, SD)
768
1152
  if Mat:
769
- Mfile=outfile+'_EncodingMatrix.txt'
770
- h=open(Mfile, 'w')
1153
+ Mfile = outfile + "_EncodingMatrix.txt"
1154
+ h = open(Mfile, "w")
771
1155
  for kk in LDu:
772
1156
  if verbose:
773
- print("---Process CDR3s with length %d ---" %(kk))
774
- vSD=LDu[kk]
775
- vSD0=[x for x in range(len(vSD))]
776
- vss=SDu[kk]
777
- vInfo=IDu[kk]
778
- flagL=[len(x)-1 for x in vInfo]
1157
+ print("---Process CDR3s with length %d ---" % (kk))
1158
+ vSD = LDu[kk]
1159
+ vSD0 = [x for x in range(len(vSD))]
1160
+ vss = SDu[kk]
1161
+ vInfo = IDu[kk]
1162
+ flagL = [len(x) - 1 for x in vInfo]
779
1163
  if verbose:
780
- print(' Performing CDR3 encoding')
781
- dM=np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
782
- dM=dM.astype("float32")
1164
+ print(" Performing CDR3 encoding")
1165
+ dM = np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
1166
+ dM = dM.astype("float32")
783
1167
  if verbose:
784
- print(" The number of sequences is %d" %(dM.shape[0]))
1168
+ print(" The number of sequences is %d" % (dM.shape[0]))
785
1169
  if Mat:
786
1170
  for ii in range(len(vss)):
787
- line=vss[ii]+'\t'+vInfo[ii][0]+'\t'
788
- NUMs=[str(xx) for xx in dM[ii,:]]
789
- line += '\t'.join(NUMs) + '\n'
1171
+ line = vss[ii] + "\t" + vInfo[ii][0] + "\t"
1172
+ NUMs = [str(xx) for xx in dM[ii, :]]
1173
+ line += "\t".join(NUMs) + "\n"
790
1174
  h.write(line)
791
- sID=[x for x in range(dM.shape[0])]
792
- t2=time.time()
1175
+ sID = [x for x in range(dM.shape[0])]
1176
+ t2 = time.time()
793
1177
  if verbose:
794
- print(' Done! Total time elapsed %f' %(t2-t1))
795
- Cls = ClusterCDR3(dM, flagL, thr=thr_iso - 0.5*(15-kk), verbose=verbose) ## change cutoff with different lengths
1178
+ print(" Done! Total time elapsed %f" % (t2 - t1))
1179
+ Cls = ClusterCDR3(
1180
+ dM, flagL, thr=thr_iso - 0.5 * (15 - kk), verbose=verbose
1181
+ ) ## change cutoff with different lengths
796
1182
  Cls = MergeCL(Cls)
797
1183
  if verbose:
798
1184
  print(" Handling identical CDR3 groups")
799
- Cls_u=[]
1185
+ Cls_u = []
800
1186
  for ii in range(len(Cls)):
801
- cc=Cls[ii]
1187
+ cc = Cls[ii]
802
1188
  if len(cc) == 1:
803
1189
  ## Handle identical CDR3 groups first
804
- if flagL[cc[0]]>0:
1190
+ if flagL[cc[0]] > 0:
805
1191
  gr += 1
806
- jj=cc[0]
1192
+ jj = cc[0]
807
1193
  for v_info in vInfo[jj]:
808
- line=vss[jj]+'\t'+str(gr)+'\t'+v_info+'\n'
809
- _=g.write(line)
1194
+ line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
1195
+ _ = g.write(line)
810
1196
  else:
811
1197
  Cls_u.append(cc)
812
- Cls=Cls_u
813
- t2=time.time()
1198
+ Cls = Cls_u
1199
+ t2 = time.time()
814
1200
  if verbose:
815
- print(' Done! Total time elapsed %f' %(t2-t1))
1201
+ print(" Done! Total time elapsed %f" % (t2 - t1))
816
1202
  if Vgene:
817
- vVgene=VDu[kk]
1203
+ vVgene = VDu[kk]
818
1204
  if verbose:
819
- print(' Matching variable genes')
820
- Cls_v=[]
1205
+ print(" Matching variable genes")
1206
+ Cls_v = []
821
1207
  for cc in Cls:
822
- Nc=len(cc)
823
- sMat={}
1208
+ Nc = len(cc)
1209
+ sMat = {}
824
1210
  for ii in range(Nc):
825
- v1=vVgene[cc[ii]]
826
- for jj in range(ii,Nc):
827
- if jj==ii:
1211
+ v1 = vVgene[cc[ii]]
1212
+ for jj in range(ii, Nc):
1213
+ if jj == ii:
828
1214
  continue
829
- v2=vVgene[cc[jj]]
1215
+ v2 = vVgene[cc[jj]]
830
1216
  if (v1, v2) not in VDict:
831
1217
  if v1 == v2:
832
1218
  if ii not in sMat:
833
- sMat[ii]=[jj]
1219
+ sMat[ii] = [jj]
834
1220
  else:
835
1221
  sMat[ii].append(jj)
836
1222
  if jj not in sMat:
837
- sMat[jj]=[ii]
1223
+ sMat[jj] = [ii]
838
1224
  else:
839
1225
  sMat[jj].append(ii)
840
1226
  continue
841
- if VDict[(v1,v2)] >= thr_v:
842
- if ii not in sMat:
843
- sMat[ii]=[jj]
844
- else:
845
- sMat[ii].append(jj)
846
- if jj not in sMat:
847
- sMat[jj]=[ii]
848
- else:
849
- sMat[jj].append(ii)
850
- vCL=IdentifyMotifCluster(sMat)
851
- vCL_List=list(chain(*vCL))
1227
+ if VDict[(v1, v2)] >= thr_v:
1228
+ if ii not in sMat:
1229
+ sMat[ii] = [jj]
1230
+ else:
1231
+ sMat[ii].append(jj)
1232
+ if jj not in sMat:
1233
+ sMat[jj] = [ii]
1234
+ else:
1235
+ sMat[jj].append(ii)
1236
+ vCL = IdentifyMotifCluster(sMat)
1237
+ vCL_List = list(chain(*vCL))
852
1238
  for ii in range(Nc):
853
- uu=flagL[cc[ii]]
854
- if uu>0 and ii not in vCL_List:
1239
+ uu = flagL[cc[ii]]
1240
+ if uu > 0 and ii not in vCL_List:
855
1241
  vCL.append([ii])
856
1242
  for vcc in vCL:
857
1243
  Cls_v.append(list(np.array(cc)[np.array(vcc)]))
858
- Cls=[]
1244
+ Cls = []
859
1245
  for ii in range(len(Cls_v)):
860
- cc=Cls_v[ii]
1246
+ cc = Cls_v[ii]
861
1247
  if len(cc) == 1:
862
1248
  ## Handle identical CDR3 groups first
863
1249
  gr += 1
864
- jj=cc[0]
1250
+ jj = cc[0]
865
1251
  for v_info in vInfo[jj]:
866
- line=vss[jj]+'\t'+str(gr)+'\t'+v_info+'\n'
867
- _=g.write(line)
1252
+ line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
1253
+ _ = g.write(line)
868
1254
  else:
869
1255
  Cls.append(cc)
870
1256
  if exact:
871
1257
  if verbose:
872
- print(' Performing Smith-Waterman alignment')
873
- Cls_s=[]
1258
+ print(" Performing Smith-Waterman alignment")
1259
+ Cls_s = []
874
1260
  for cc in Cls:
875
- Nc=len(cc)
876
- if len(cc)<=3:
877
- sMat=np.zeros((Nc,Nc))
1261
+ Nc = len(cc)
1262
+ if len(cc) <= 3:
1263
+ sMat = np.zeros((Nc, Nc))
878
1264
  for ii in range(Nc):
879
- s1=vss[cc[ii]]
880
- for jj in range(ii,Nc):
881
- if jj==ii:
1265
+ s1 = vss[cc[ii]]
1266
+ for jj in range(ii, Nc):
1267
+ if jj == ii:
882
1268
  continue
883
- s2=vss[cc[jj]]
1269
+ s2 = vss[cc[jj]]
884
1270
  if len(s1) != len(s2):
885
1271
  continue
886
- if len(s1)<=5:
1272
+ if len(s1) <= 5:
887
1273
  continue
888
- sw=SeqComparison(s1[ST:-2],s2[ST:-2],gap=gap)
889
- sw=sw/(len(s1)-ST-2)
890
- sMat[ii,jj]=sw
891
- sMat[jj,ii]=sw
892
- s_max=[]
1274
+ sw = SeqComparison(s1[ST:-2], s2[ST:-2], gap=gap)
1275
+ sw = sw / (len(s1) - ST - 2)
1276
+ sMat[ii, jj] = sw
1277
+ sMat[jj, ii] = sw
1278
+ s_max = []
893
1279
  for ii in range(Nc):
894
- s_max.append(np.max(sMat[:,ii]))
895
- cc_new=[]
1280
+ s_max.append(np.max(sMat[:, ii]))
1281
+ cc_new = []
896
1282
  for ii in range(Nc):
897
- if s_max[ii]>=thr_s:
1283
+ if s_max[ii] >= thr_s:
898
1284
  cc_new.append(cc[ii])
899
- if len(cc_new)>1:
1285
+ if len(cc_new) > 1:
900
1286
  Cls_s.append(cc_new)
901
1287
  else:
902
1288
  for ii in range(Nc):
903
- uu=flagL[cc[ii]]
904
- if uu>0:
1289
+ uu = flagL[cc[ii]]
1290
+ if uu > 0:
905
1291
  Cls_s.append([cc[ii]])
906
- # print(Cls_s)
907
- Cls_sList=list(chain(*Cls_s))
1292
+ # print(Cls_s)
1293
+ Cls_sList = list(chain(*Cls_s))
908
1294
  for ii in range(len(cc)):
909
- uu=flagL[cc[ii]]
910
- if uu>0 and cc[ii] not in Cls_sList:
1295
+ uu = flagL[cc[ii]]
1296
+ if uu > 0 and cc[ii] not in Cls_sList:
911
1297
  Cls_s.append([cc[ii]])
912
1298
  else:
913
- CDR3s=[vss[x] for x in cc]
914
- sIDs=np.array([vSD0[x] for x in cc])
915
- sIDs0=[x for x in range(len(cc))]
916
- Kset=KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
917
- SSG=generateSSG(Kset, CDR3s, k_thr=1)
918
- tmpVgenes=['TRBV2']*len(CDR3s)
919
- SSGnew=UpdateSSG(SSG, CDR3s, tmpVgenes, Vscore=VDict, cutoff=thr_s+4)
920
- CLall=IdentifyMotifCluster(SSGnew)
921
- CLall_list=list(chain(*CLall))
1299
+ CDR3s = [vss[x] for x in cc]
1300
+ sIDs = np.array([vSD0[x] for x in cc])
1301
+ sIDs0 = [x for x in range(len(cc))]
1302
+ Kset = KmerSet(CDR3s, sIDs0, KS=5, st=ST, ed=2)
1303
+ SSG = generateSSG(Kset, CDR3s, k_thr=1)
1304
+ tmpVgenes = ["TRBV2"] * len(CDR3s)
1305
+ SSGnew = UpdateSSG(
1306
+ SSG, CDR3s, tmpVgenes, Vscore=VDict, cutoff=thr_s + 4
1307
+ )
1308
+ CLall = IdentifyMotifCluster(SSGnew)
1309
+ CLall_list = list(chain(*CLall))
922
1310
  for ii in range(len(cc)):
923
- uu=flagL[cc[ii]]
924
- if uu>0 and ii not in CLall_list:
1311
+ uu = flagL[cc[ii]]
1312
+ if uu > 0 and ii not in CLall_list:
925
1313
  CLall.append([ii])
926
1314
  for cl in CLall:
927
- ccs=list(sIDs[np.array(cl)])
1315
+ ccs = list(sIDs[np.array(cl)])
928
1316
  Cls_s.append(ccs)
929
- Cls=Cls_s
1317
+ Cls = Cls_s
930
1318
  if verbose:
931
- print(' Writing results into file')
1319
+ print(" Writing results into file")
932
1320
  for ii in range(len(Cls)):
933
- # if ii % 100000 == 0 and ii>0:
934
- #print(' %d sequences written' %(ii))
935
- cc=Cls[ii]
936
- gr+=1
1321
+ # if ii % 100000 == 0 and ii>0:
1322
+ # print(' %d sequences written' %(ii))
1323
+ cc = Cls[ii]
1324
+ gr += 1
937
1325
  for jj in cc:
938
1326
  for v_info in vInfo[jj]:
939
- line=vss[jj]+'\t'+str(gr)+'\t'+v_info+'\n'
940
- _=g.write(line)
1327
+ line = vss[jj] + "\t" + str(gr) + "\t" + v_info + "\n"
1328
+ _ = g.write(line)
941
1329
  g.close()
942
1330
  if Mat:
943
1331
  h.close()
944
1332
 
1333
+
945
1334
  def OrderUnique(Ig):
946
- vv=list(Ig.values())
947
- kk=list(Ig.keys())
948
- LL=[len(x[1]) for x in vv]
949
- v0=[x[0][0] for x in vv]
950
- v1=[x[0][1] for x in vv]
951
- zkk=zip(kk,v0,v1,LL)
952
- zkks=sorted(zkk,key=lambda x: (x[1],x[3]))
953
- nk=len(zkks)
954
- keep_id=[0]
955
- ii=1
956
- n_pre=str(zkks[0][1])+'_'+str(zkks[0][2])
957
- while ii<nk:
958
- n_cur=str(zkks[ii][1])+'_'+str(zkks[ii][2])
959
- if n_cur==n_pre:
960
- ii+=1
1335
+ vv = list(Ig.values())
1336
+ kk = list(Ig.keys())
1337
+ LL = [len(x[1]) for x in vv]
1338
+ v0 = [x[0][0] for x in vv]
1339
+ v1 = [x[0][1] for x in vv]
1340
+ zkk = zip(kk, v0, v1, LL)
1341
+ zkks = sorted(zkk, key=lambda x: (x[1], x[3]))
1342
+ nk = len(zkks)
1343
+ keep_id = [0]
1344
+ ii = 1
1345
+ n_pre = str(zkks[0][1]) + "_" + str(zkks[0][2])
1346
+ while ii < nk:
1347
+ n_cur = str(zkks[ii][1]) + "_" + str(zkks[ii][2])
1348
+ if n_cur == n_pre:
1349
+ ii += 1
961
1350
  continue
962
1351
  else:
963
1352
  keep_id.append(ii)
964
- n_pre=n_cur
965
- ii+=1
1353
+ n_pre = n_cur
1354
+ ii += 1
966
1355
  continue
967
- nid=[x[0] for x in zkks]
968
- filtered_id=np.array(nid)[np.array(keep_id)]
969
- Igs={}
1356
+ nid = [x[0] for x in zkks]
1357
+ filtered_id = np.array(nid)[np.array(keep_id)]
1358
+ Igs = {}
970
1359
  for ii in filtered_id:
971
- Igs[kk[ii]]=vv[ii]
1360
+ Igs[kk[ii]] = vv[ii]
972
1361
  return Igs, filtered_id
973
1362
 
1363
+
974
1364
  def ClusterCDR3(dM, flagL, thr=10, GPU=False, verbose=False):
975
1365
  ## flagL: flag vector for identical CDR3 groups, >0 for grouped non-identical CDR3s
976
- Cls=[]
977
- flag=0
978
- dM1=dM
979
- flagL=np.array(flagL)
1366
+ Cls = []
1367
+ flag = 0
1368
+ dM1 = dM
1369
+ flagL = np.array(flagL)
980
1370
  if GPU:
981
1371
  res = faiss.StandardGpuResources()
982
1372
  while 1:
983
- # print(" %d number of clusters, with %d sequences" %(len(Cls),dM1.shape[0]))
1373
+ # print(" %d number of clusters, with %d sequences" %(len(Cls),dM1.shape[0]))
984
1374
  if verbose:
985
- print('=',end='')
986
- index = faiss.IndexFlatL2(Ndim*6)
1375
+ print("=", end="")
1376
+ index = faiss.IndexFlatL2(Ndim * 6)
987
1377
  if GPU:
988
1378
  index = faiss.index_cpu_to_gpu(res, 0, index)
989
1379
  index.add(dM1)
990
- if flag==0:
1380
+ if flag == 0:
991
1381
  D, I = index.search(dM1, 2)
992
- vv=np.where((D[:,1]<=thr))[0]
993
- vv0=np.where((D[:,1]>thr) & (flagL>0))[0]
1382
+ vv = np.where((D[:, 1] <= thr))[0]
1383
+ vv0 = np.where((D[:, 1] > thr) & (flagL > 0))[0]
994
1384
  for v in vv0:
995
1385
  Cls.append([v])
996
- tmp_dM=np.zeros((len(vv),Ndim*6))
997
- Ig_new={}
1386
+ tmp_dM = np.zeros((len(vv), Ndim * 6))
1387
+ Ig_new = {}
998
1388
  for ii in range(len(vv)):
999
- v=vv[ii]
1000
- Idx=I[v,]
1389
+ v = vv[ii]
1390
+ Idx = I[v,]
1001
1391
  if v not in Idx:
1002
- Idx[0]=v
1003
- Ig_new[ii]=(sorted(list(set(Idx))),sorted(list(set(Idx))))
1004
- tmp_dM[ii,]=(dM1[Idx[0],]+dM1[Idx[1],])/2
1005
- if len(Ig_new)==0:
1392
+ Idx[0] = v
1393
+ Ig_new[ii] = (sorted(list(set(Idx))), sorted(list(set(Idx))))
1394
+ tmp_dM[ii,] = (dM1[Idx[0],] + dM1[Idx[1],]) / 2
1395
+ if len(Ig_new) == 0:
1006
1396
  if verbose:
1007
- print('type 0 break')
1397
+ print("type 0 break")
1008
1398
  break
1009
- # print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
1010
- Igs, fid=OrderUnique(Ig_new)
1011
- tmp_dM=tmp_dM[fid,]
1012
- Ig_new=Igs
1399
+ # print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
1400
+ Igs, fid = OrderUnique(Ig_new)
1401
+ tmp_dM = tmp_dM[fid,]
1402
+ Ig_new = Igs
1013
1403
  else:
1014
- D, I = index.search(dM1,2)
1015
- vv=np.where(D[:,1]<=thr)[0]
1016
- vv0=np.where(D[:,1]>thr)[0]
1404
+ D, I = index.search(dM1, 2)
1405
+ vv = np.where(D[:, 1] <= thr)[0]
1406
+ vv0 = np.where(D[:, 1] > thr)[0]
1017
1407
  ## move groups in vv0 to Cls
1018
- kkg=list(Ig.keys())
1408
+ kkg = list(Ig.keys())
1019
1409
  for v in vv0:
1020
- ng=list(Ig[kkg[v]][1])
1021
- # if ng not in Cls:
1410
+ ng = list(Ig[kkg[v]][1])
1411
+ # if ng not in Cls:
1022
1412
  Cls.append(ng)
1023
- tmp_dM=np.zeros((len(vv),Ndim*6))
1024
- Ig_new={}
1413
+ tmp_dM = np.zeros((len(vv), Ndim * 6))
1414
+ Ig_new = {}
1025
1415
  for ii in range(len(vv)):
1026
- v=vv[ii]
1027
- idx1=I[v,0]
1028
- idx2=I[v,1]
1416
+ v = vv[ii]
1417
+ idx1 = I[v, 0]
1418
+ idx2 = I[v, 1]
1029
1419
  if v not in I[v,]:
1030
- idx1=v
1031
- # Ig_new[ii]=sorted(list(set(list(Ig[kkg[idx1]])+list(Ig[kkg[idx2]]))))
1032
- Ig_new[ii]=(sorted(list(set([idx1,idx2]))), ## First entry records the relative index of a sequence clique
1033
- sorted(list(set(list(Ig[kkg[idx1]][1])+list(Ig[kkg[idx2]][1]))))) ## Second entry records the absolute index of a sequence
1034
- tmp_dM[ii,]=(dM1[idx1,]+dM1[idx2,])/2
1035
- if len(Ig_new)==0:
1420
+ idx1 = v
1421
+ # Ig_new[ii]=sorted(list(set(list(Ig[kkg[idx1]])+list(Ig[kkg[idx2]]))))
1422
+ Ig_new[ii] = (
1423
+ sorted(
1424
+ list(set([idx1, idx2]))
1425
+ ), ## First entry records the relative index of a sequence clique
1426
+ sorted(list(set(list(Ig[kkg[idx1]][1]) + list(Ig[kkg[idx2]][1])))),
1427
+ ) ## Second entry records the absolute index of a sequence
1428
+ tmp_dM[ii,] = (dM1[idx1,] + dM1[idx2,]) / 2
1429
+ if len(Ig_new) == 0:
1036
1430
  if verbose:
1037
1431
  print("\ntype I break")
1038
- kkg=list(Ig.keys())
1432
+ kkg = list(Ig.keys())
1039
1433
  for kk in kkg:
1040
- ng=list(Ig[kk][1])
1434
+ ng = list(Ig[kk][1])
1041
1435
  if ng not in Cls:
1042
1436
  Cls.append(ng)
1043
1437
  break
1044
- # print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
1045
- Igs, fid=OrderUnique(Ig_new)
1046
- tmp_dM=tmp_dM[fid,]
1047
- Ig_new=Igs
1048
- if flag>0:
1438
+ # print('%d of sequence left at cycle %d' %(len(Ig_new),flag))
1439
+ Igs, fid = OrderUnique(Ig_new)
1440
+ tmp_dM = tmp_dM[fid,]
1441
+ Ig_new = Igs
1442
+ if flag > 0:
1049
1443
  if Ig == Ig_new:
1050
1444
  if verbose:
1051
1445
  print("\ntype II break")
1052
- kkg=list(Ig.keys())
1446
+ kkg = list(Ig.keys())
1053
1447
  for kk in kkg:
1054
- ng=list(Ig[kk][1])
1448
+ ng = list(Ig[kk][1])
1055
1449
  if ng in Cls:
1056
1450
  continue
1057
1451
  Cls.append(ng)
1058
1452
  break
1059
- Ig=Ig_new
1060
- tmp_dM=tmp_dM.astype('float32')
1061
- dM1=tmp_dM
1062
- flag+=1
1453
+ Ig = Ig_new
1454
+ tmp_dM = tmp_dM.astype("float32")
1455
+ dM1 = tmp_dM
1456
+ flag += 1
1063
1457
  return Cls
1064
1458
 
1065
- def ClusterCDR3r(dM, flagL, thr = 10, verbose = False):
1066
- index = faiss.IndexFlatL2(Ndim*6)
1459
+
1460
+ def ClusterCDR3r(dM, flagL, thr=10, verbose=False):
1461
+ index = faiss.IndexFlatL2(Ndim * 6)
1067
1462
  index.add(dM)
1068
1463
  lims, D, I = index.range_search(dM, thr)
1069
1464
  # with open('cdr3.npy', 'wb') as f:
@@ -1071,53 +1466,70 @@ def ClusterCDR3r(dM, flagL, thr = 10, verbose = False):
1071
1466
  # np.save(f, D)
1072
1467
  # np.save(f, I)
1073
1468
  # np.save(f, dM)
1074
-
1469
+
1075
1470
  # now clustering results
1076
1471
  N = dM.shape[0]
1077
- neighborSize = np.array([lims[cur_idx_i+1] - lims[cur_idx_i] for cur_idx_i in range(N)])
1472
+ neighborSize = np.array(
1473
+ [lims[cur_idx_i + 1] - lims[cur_idx_i] for cur_idx_i in range(N)]
1474
+ )
1078
1475
  # to_cluster = np.ones( (N,))
1079
1476
  clusterNo = 0
1080
- cluster = - np.ones( (N, ), dtype = np.int32)
1477
+ cluster = -np.ones((N,), dtype=np.int32)
1081
1478
  idx = np.where(cluster < 0)[0]
1082
1479
  unclustered = [np.argmax(neighborSize[idx])]
1083
1480
  depth = 0
1084
1481
  while True:
1085
- if len(unclustered) == 0: break
1482
+ if len(unclustered) == 0:
1483
+ break
1086
1484
  # cur_idx = unclustered[0] # first unclustered index
1087
1485
  cur_idx = unclustered
1088
- cluster[cur_idx] = clusterNo # assign cluster
1089
-
1090
- neighbor = np.unique(np.array(list(chain (* [I[(lims[cur_idx_i]): lims[cur_idx_i+1]] for cur_idx_i in cur_idx]))))
1486
+ cluster[cur_idx] = clusterNo # assign cluster
1487
+
1488
+ neighbor = np.unique(
1489
+ np.array(
1490
+ list(
1491
+ chain(
1492
+ *[
1493
+ I[(lims[cur_idx_i]) : lims[cur_idx_i + 1]]
1494
+ for cur_idx_i in cur_idx
1495
+ ]
1496
+ )
1497
+ )
1498
+ )
1499
+ )
1091
1500
  # find those unclusterred
1092
1501
  idx = np.where(cluster[neighbor] < 0)[0]
1093
1502
  if len(idx) == 0:
1094
1503
  depth = 0
1095
1504
  clusterNo += 1
1096
1505
  idx = np.where(cluster < 0)[0]
1097
- if len(idx) == 0: break
1506
+ if len(idx) == 0:
1507
+ break
1098
1508
  unclustered = [idx[np.argmax(neighborSize[idx])]]
1099
-
1509
+
1100
1510
  else:
1101
1511
  if depth > 3:
1102
1512
  depth = 0
1103
1513
  clusterNo += 1
1104
1514
  unclustered = neighbor[idx]
1105
1515
  depth += 1
1106
- # print('clusterNo = ', clusterNo)
1107
- Cls = [ [] for i in range(clusterNo)]
1516
+ # print('clusterNo = ', clusterNo)
1517
+ Cls = [[] for i in range(clusterNo)]
1108
1518
  for idx, i in enumerate(cluster):
1109
- Cls[i].append(idx)
1110
- # print("Cls[:5] = ", Cls[:5])
1111
- # print("len(Cls) = ", len(Cls),
1112
- # ', #elem=', sum([len(i) for i in Cls]),
1113
- # ', #single=', sum([len(i) for i in Cls if len(i) == 1]),
1114
- # ', #non_single=', sum([len(i) for i in Cls if len(i) != 1]),
1115
- # ', #max=', max([len(i) for i in Cls]))
1519
+ Cls[i].append(idx)
1520
+ # print("Cls[:5] = ", Cls[:5])
1521
+ # print("len(Cls) = ", len(Cls),
1522
+ # ', #elem=', sum([len(i) for i in Cls]),
1523
+ # ', #single=', sum([len(i) for i in Cls if len(i) == 1]),
1524
+ # ', #non_single=', sum([len(i) for i in Cls if len(i) != 1]),
1525
+ # ', #max=', max([len(i) for i in Cls]))
1116
1526
  return Cls
1117
1527
 
1528
+
1118
1529
  def CommandLineParser():
1119
- parser=OptionParser()
1120
- print ('''
1530
+ parser = OptionParser()
1531
+ print(
1532
+ """
1121
1533
  GIANA: Geometric Isometry based ANtigen-specific tcr Alignment
1122
1534
  Ultrafast short peptide alignment exclusively designed for large-scale adaptome analysis
1123
1535
 
@@ -1130,129 +1542,276 @@ Input columns:
1130
1542
 
1131
1543
  !!! ALL amino acid letters must be CAPITAL !!!
1132
1544
 
1133
- ''')
1134
- parser.add_option("-d","--directory",dest="Directory",help="Input repertoire sequencing file directory. Please make sure that all the files in the directory are input files.",default="")
1135
- parser.add_option("-f","--file",dest="File",default='',help="Input single file of CDR3 sequences for grouping")
1136
- parser.add_option("-F","--fileList",dest="files",default='',help='Alternative input: a file containing the full path to all the files. If given, overwrite -d and -f option')
1137
- parser.add_option("-t","--threshold",dest="thr",default=7,help="Isometric distance threshold for calling similar CDR3 groups. Without -E, smaller value will increase speed. With -E, smaller value will increase specificity. Must be smaller than 12.")
1138
- parser.add_option("-S","--threshold_score",dest="thr_s",default=3.5, help="Threshold for Smith-Waterman alignment score (normalized by CDR3 length). Default 3.5")
1139
- parser.add_option("-G","--threshold_vgene",dest="thr_v",default=3.7,help="Threshold for variable gene comparison. Default 3.7.")
1140
- parser.add_option("-o","--output",dest="OutDir",default='./',help="Output directory for intermediate and final outputs.")
1141
- parser.add_option("-O","--outfile",dest="OutFile",default='',help="Output file name. If not given, a file with --RotationEncoding will be added to the input file as the output file name.")
1142
- parser.add_option("-T","--startPosition",dest='ST',default=3, help="Starting position of CDR3 sequence. The first ST letters are omitted. CDR3 sequence length L must be >= ST+7 ")
1143
- parser.add_option("-g","--GapPenalty",dest="Gap",default= -6,help="Gap penalty,default= -6. Not used.")
1144
- parser.add_option("-n","--GapNumber",dest="GapN",default=1,help="Maximum number of gaps allowed when performing alignment. Max=1, default=1. Not used.")
1145
- parser.add_option("-V","--VariableGeneFa",dest="VFa",default="Imgt_Human_TRBV.fasta",help="IMGT Human beta variable gene sequences")
1146
- parser.add_option("-v","--VariableGene",dest="V",default=True,action="store_false",help="If False, GIANA will omit variable gene information and use CDR3 sequences only. This will yield reduced specificity. The cut-off will automatically become the current value-4.0")
1147
- parser.add_option("-e","--Exact",dest="E",default=True,action="store_false",help="If False, GIANA will not perform Smith-Waterman alignment after isometric encoding.")
1148
- parser.add_option("-N","--NumberOfThreads",dest="NN",default=1,help="Number of threads for multiple processing. Not working so well.")
1149
- parser.add_option("-M","--EncodingMatrix", dest="Mat", default=False,action="store_true", help="If true, GIANA will export the isometric encoding matrix for each TCR. Default: False.")
1150
- parser.add_option("-U","--UseGPU",dest="GPU", default=False, action="store_true",help="Use GPU for Faiss indexing. Must be CUDA GPUs.")
1151
- parser.add_option("-q","--queryFile",dest="Query",default='',help="Input query file, if given, GIANA will run in query mode, also need to provide -r option.")
1152
- parser.add_option("-r","--refFile",dest="ref", default='',help="Input reference file. Query model required.")
1153
- parser.add_option("-b","--Verbose", dest='v', default=False, action="store_true", help="Verbose option: if given, GIANA will print intermediate messages.")
1545
+ """
1546
+ )
1547
+ parser.add_option(
1548
+ "-d",
1549
+ "--directory",
1550
+ dest="Directory",
1551
+ help="Input repertoire sequencing file directory. Please make sure that all the files in the directory are input files.",
1552
+ default="",
1553
+ )
1554
+ parser.add_option(
1555
+ "-f",
1556
+ "--file",
1557
+ dest="File",
1558
+ default="",
1559
+ help="Input single file of CDR3 sequences for grouping",
1560
+ )
1561
+ parser.add_option(
1562
+ "-F",
1563
+ "--fileList",
1564
+ dest="files",
1565
+ default="",
1566
+ help="Alternative input: a file containing the full path to all the files. If given, overwrite -d and -f option",
1567
+ )
1568
+ parser.add_option(
1569
+ "-t",
1570
+ "--threshold",
1571
+ dest="thr",
1572
+ default=7,
1573
+ help="Isometric distance threshold for calling similar CDR3 groups. Without -E, smaller value will increase speed. With -E, smaller value will increase specificity. Must be smaller than 12.",
1574
+ )
1575
+ parser.add_option(
1576
+ "-S",
1577
+ "--threshold_score",
1578
+ dest="thr_s",
1579
+ default=3.5,
1580
+ help="Threshold for Smith-Waterman alignment score (normalized by CDR3 length). Default 3.5",
1581
+ )
1582
+ parser.add_option(
1583
+ "-G",
1584
+ "--threshold_vgene",
1585
+ dest="thr_v",
1586
+ default=3.7,
1587
+ help="Threshold for variable gene comparison. Default 3.7.",
1588
+ )
1589
+ parser.add_option(
1590
+ "-o",
1591
+ "--output",
1592
+ dest="OutDir",
1593
+ default="./",
1594
+ help="Output directory for intermediate and final outputs.",
1595
+ )
1596
+ parser.add_option(
1597
+ "-O",
1598
+ "--outfile",
1599
+ dest="OutFile",
1600
+ default="",
1601
+ help="Output file name. If not given, a file with --RotationEncoding will be added to the input file as the output file name.",
1602
+ )
1603
+ parser.add_option(
1604
+ "-T",
1605
+ "--startPosition",
1606
+ dest="ST",
1607
+ default=3,
1608
+ help="Starting position of CDR3 sequence. The first ST letters are omitted. CDR3 sequence length L must be >= ST+7 ",
1609
+ )
1610
+ parser.add_option(
1611
+ "-g",
1612
+ "--GapPenalty",
1613
+ dest="Gap",
1614
+ default=-6,
1615
+ help="Gap penalty,default= -6. Not used.",
1616
+ )
1617
+ parser.add_option(
1618
+ "-n",
1619
+ "--GapNumber",
1620
+ dest="GapN",
1621
+ default=1,
1622
+ help="Maximum number of gaps allowed when performing alignment. Max=1, default=1. Not used.",
1623
+ )
1624
+ parser.add_option(
1625
+ "-V",
1626
+ "--VariableGeneFa",
1627
+ dest="VFa",
1628
+ default="Imgt_Human_TRBV.fasta",
1629
+ help="IMGT Human beta variable gene sequences",
1630
+ )
1631
+ parser.add_option(
1632
+ "-v",
1633
+ "--VariableGene",
1634
+ dest="V",
1635
+ default=True,
1636
+ action="store_false",
1637
+ help="If False, GIANA will omit variable gene information and use CDR3 sequences only. This will yield reduced specificity. The cut-off will automatically become the current value-4.0",
1638
+ )
1639
+ parser.add_option(
1640
+ "-e",
1641
+ "--Exact",
1642
+ dest="E",
1643
+ default=True,
1644
+ action="store_false",
1645
+ help="If False, GIANA will not perform Smith-Waterman alignment after isometric encoding.",
1646
+ )
1647
+ parser.add_option(
1648
+ "-N",
1649
+ "--NumberOfThreads",
1650
+ dest="NN",
1651
+ default=1,
1652
+ help="Number of threads for multiple processing. Not working so well.",
1653
+ )
1654
+ parser.add_option(
1655
+ "-M",
1656
+ "--EncodingMatrix",
1657
+ dest="Mat",
1658
+ default=False,
1659
+ action="store_true",
1660
+ help="If true, GIANA will export the isometric encoding matrix for each TCR. Default: False.",
1661
+ )
1662
+ parser.add_option(
1663
+ "-U",
1664
+ "--UseGPU",
1665
+ dest="GPU",
1666
+ default=False,
1667
+ action="store_true",
1668
+ help="Use GPU for Faiss indexing. Must be CUDA GPUs.",
1669
+ )
1670
+ parser.add_option(
1671
+ "-q",
1672
+ "--queryFile",
1673
+ dest="Query",
1674
+ default="",
1675
+ help="Input query file, if given, GIANA will run in query mode, also need to provide -r option.",
1676
+ )
1677
+ parser.add_option(
1678
+ "-r",
1679
+ "--refFile",
1680
+ dest="ref",
1681
+ default="",
1682
+ help="Input reference file. Query model required.",
1683
+ )
1684
+ parser.add_option(
1685
+ "-b",
1686
+ "--Verbose",
1687
+ dest="v",
1688
+ default=False,
1689
+ action="store_true",
1690
+ help="Verbose option: if given, GIANA will print intermediate messages.",
1691
+ )
1154
1692
  return parser.parse_args()
1155
1693
 
1694
+
1156
1695
def main():
    """Run GIANA from the command line, in query mode or clustering mode.

    Options are read through ``CommandLineParser()``.

    Query mode (``-q`` given):
        A reference is built with ``CreateReference`` from the ``-r`` file.
        Each query file is then passed to ``MakeQuery`` and the result is
        merged with the existing reference cluster file by ``MergeExist``.
        If ``-q`` ends with ``/`` it is treated as a directory and every entry
        in it is queried. Query files whose expected output already exists in
        the output directory are skipped.

    Clustering mode (no ``-q``):
        Input files are collected from ``-d``, ``-f`` and ``-F``; when several
        are given, ``-F`` overrides ``-f``, which overrides ``-d``. Each file
        is then processed by ``EncodeRepertoire``.

    Raises:
        ValueError: query mode was requested without a reference file.
        FileNotFoundError: the reference cluster file
            (``<ref>--RotationEncodingBL62.txt``) does not exist.
    """
    (opt, _) = CommandLineParser()
    cutoff = float(opt.thr)
    OutDir = opt.OutDir
    thr_s = float(opt.thr_s)

    ## Check if query mode first
    qFile = opt.Query
    if len(qFile) > 0:
        ## query mode
        t1 = time.time()
        if qFile.endswith("/"):
            ## input query is a directory
            qFileList = [qFile + ff for ff in os.listdir(qFile)]
        else:
            qFileList = [qFile]

        rFile = opt.ref
        if len(rFile) == 0:
            # A bare string cannot be raised in Python 3; use a real exception
            # so the message actually reaches the user.
            raise ValueError("Must provide reference file in query mode!")

        ## check if reference cluster file exists
        # NOTE(review): rFile0 keeps any directory part of -r and is also used
        # in the output file name below; presumably this must match the name
        # MakeQuery writes -- confirm before changing.
        rFile0 = re.sub("\\.txt", "", rFile)
        refClusterFile = rFile0 + "--RotationEncodingBL62.txt"
        if not os.path.exists(refClusterFile):
            raise FileNotFoundError(
                "Must run clustering on reference file first! Did you forget to put the clustering file in this directory?"
            )

        rData = CreateReference(rFile)
        t2 = time.time()
        print("Reference created. Elapsed %f" % (t2 - t1))

        for qf in qFileList:
            t2_0 = time.time()
            print("Querying " + qf)
            qf_s = qf.split("/")[-1]
            outFile = re.sub("\\.txt", "", qf_s) + "_query_" + rFile0 + ".txt"
            of = OutDir + "/" + outFile
            if path.exists(of):
                print(of + " already exits. Skipping.")
                continue
            MakeQuery(qf, rData, thr=cutoff, thr_s=thr_s)
            t2 = time.time()
            print(" Build query clustering file. Elapsed %f" % (t2 - t1))
            print("Now mering with reference cluster")
            MergeExist(refClusterFile, of)
            t2 = time.time()
            print(" Time of elapsed for query %s: %f" % (qf, t2 - t2_0))
        return

    ## regular clustering mode
    FileDir = opt.Directory
    if len(FileDir) > 0:
        files = [FileDir + "/" + ff for ff in os.listdir(FileDir)]
    else:
        files = []

    File = opt.File
    if len(File) > 0:
        files = [File]

    FileList = opt.files
    if len(FileList) > 0:
        # One path per line; blank lines are ignored so that a trailing
        # newline does not turn into an empty file name.
        with open(FileList) as fL:
            files = [ln.strip() for ln in fL if ln.strip()]

    VFa = opt.VFa
    PreCalculateVgeneDist(VFa)

    VScore = {}
    VV = opt.V
    EE = opt.E
    Mat = opt.Mat
    ST = int(opt.ST)
    thr_v = float(opt.thr_v)
    verbose = opt.v
    if VV:
        ## Use tcrDist's Vgene 80-score calculation
        # Tab-separated rows: gene1, gene2, integer score. The score is scaled
        # by 1/20 and stored symmetrically for both key orders.
        with open("./VgeneScores.txt") as vf:
            for line in vf:
                ww = line.strip().split("\t")
                score = int(ww[2]) / 20
                VScore[(ww[0], ww[1])] = score
                VScore[(ww[1], ww[0])] = score

    Gap = int(opt.Gap)
    OutFile = opt.OutFile
    GPU = opt.GPU
    NT = int(opt.NN)
    faiss.omp_set_num_threads(NT)

    for ff in files:
        print("Processing %s" % ff)
        EncodeRepertoire(
            ff,
            OutDir,
            OutFile,
            ST=ST,
            thr_s=thr_s,
            thr_v=thr_v,
            exact=EE,
            VDict=VScore,
            Vgene=VV,
            thr_iso=cutoff,
            gap=Gap,
            GPU=GPU,
            Mat=Mat,
            verbose=verbose,
        )
1808
+
1809
+
1253
1810
if __name__ == "__main__":
    # Script entry point: run main() and report wall-clock time and peak memory.
    t0 = time.time()
    main()
    print("Total time elapsed: %f" % (time.time() - t0))
    # getrusage() reports ru_maxrss in bytes on macOS but in kilobytes on
    # Linux, so the divisor needed to get megabytes depends on the platform.
    rss_units_per_mb = 1000000 if sys.platform == "darwin" else 1000
    print(
        "Maximum memory usage: %f MB"
        % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / rss_units_per_mb)
    )