biopipen 0.31.6__py3-none-any.whl → 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/ns/bam.py +28 -0
- biopipen/ns/bed.py +40 -0
- biopipen/ns/scrna.py +153 -0
- biopipen/reports/scrna/CellCellCommunicationPlots.svelte +14 -0
- biopipen/reports/scrna/SeuratMap2Ref.svelte +10 -6
- biopipen/scripts/bam/BamSubsetByBed.py +38 -0
- biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
- biopipen/scripts/scrna/AnnData2Seurat.R +22 -14
- biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
- biopipen/scripts/scrna/CellCellCommunication.py +101 -0
- biopipen/scripts/scrna/CellCellCommunicationPlots.R +191 -0
- biopipen/scripts/scrna/Seurat2AnnData.R +2 -42
- biopipen/scripts/scrna/SeuratMap2Ref.R +20 -1
- biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
- biopipen/scripts/tcr/GIANA/GIANA4.py +1364 -789
- biopipen/scripts/tcr/GIANA/query.py +164 -162
- biopipen/scripts/tcr/TCRClustering.R +25 -4
- biopipen/utils/single_cell.R +92 -1
- {biopipen-0.31.6.dist-info → biopipen-0.32.0.dist-info}/METADATA +1 -1
- {biopipen-0.31.6.dist-info → biopipen-0.32.0.dist-info}/RECORD +23 -17
- {biopipen-0.31.6.dist-info → biopipen-0.32.0.dist-info}/WHEEL +0 -0
- {biopipen-0.31.6.dist-info → biopipen-0.32.0.dist-info}/entry_points.txt +0 -0
|
@@ -5,220 +5,222 @@ import subprocess as sp
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from GIANA4 import *
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
|
|
9
|
+
def CreateReference(rFile, outdir="./", Vgene=True, ST=3):
|
|
9
10
|
## convert input reference file into a python workplace
|
|
10
|
-
h=open(rFile)
|
|
11
|
-
alines=h.readlines()
|
|
12
|
-
ww=alines[0].strip().split(
|
|
13
|
-
if not ww[0].startswith(
|
|
11
|
+
h = open(rFile)
|
|
12
|
+
alines = h.readlines()
|
|
13
|
+
ww = alines[0].strip().split("\t")
|
|
14
|
+
if not ww[0].startswith("C"):
|
|
14
15
|
## header line
|
|
15
|
-
hline=alines[0]
|
|
16
|
-
alines=alines[1:]
|
|
17
|
-
elif
|
|
18
|
-
hline=alines[0]
|
|
19
|
-
alines=alines[1:]
|
|
16
|
+
hline = alines[0]
|
|
17
|
+
alines = alines[1:]
|
|
18
|
+
elif "CDR3" in ww[0]:
|
|
19
|
+
hline = alines[0]
|
|
20
|
+
alines = alines[1:]
|
|
20
21
|
else:
|
|
21
|
-
hline=
|
|
22
|
-
seqs=[]
|
|
23
|
-
vgs=[]
|
|
24
|
-
infoList=[]
|
|
25
|
-
count=0
|
|
22
|
+
hline = "CDR3\t" + "\t".join(["Info" + str(x) for x in range(len(ww) - 1)])
|
|
23
|
+
seqs = []
|
|
24
|
+
vgs = []
|
|
25
|
+
infoList = []
|
|
26
|
+
count = 0
|
|
26
27
|
for ll in alines:
|
|
27
|
-
ww=ll.strip().split(
|
|
28
|
-
cdr3=ww[0]
|
|
29
|
-
if
|
|
28
|
+
ww = ll.strip().split("\t")
|
|
29
|
+
cdr3 = ww[0]
|
|
30
|
+
if "*" in cdr3:
|
|
30
31
|
continue
|
|
31
|
-
if
|
|
32
|
+
if "_" in cdr3:
|
|
32
33
|
continue
|
|
33
34
|
seqs.append(ww[0])
|
|
34
35
|
if Vgene:
|
|
35
36
|
vgs.append(ww[1])
|
|
36
|
-
infoList.append(
|
|
37
|
+
infoList.append("\t".join(ww[1:]))
|
|
37
38
|
else:
|
|
38
|
-
infoList.append(
|
|
39
|
-
count+=1
|
|
40
|
-
LD,VD, ID,SD= BuildLengthDict(
|
|
39
|
+
infoList.append("\t".join(ww[1:]))
|
|
40
|
+
count += 1
|
|
41
|
+
LD, VD, ID, SD = BuildLengthDict(
|
|
42
|
+
seqs, vGene=vgs, INFO=infoList, sIDs=[x for x in range(len(seqs))]
|
|
43
|
+
)
|
|
41
44
|
LDu_r, VDu_r, IDu_r, SDu_r = CollapseUnique(LD, VD, ID, SD)
|
|
42
|
-
flagLD_r={}
|
|
43
|
-
dMD_r={}
|
|
45
|
+
flagLD_r = {}
|
|
46
|
+
dMD_r = {}
|
|
44
47
|
for kk in LDu_r:
|
|
45
|
-
vss=SDu_r[kk]
|
|
46
|
-
vInfo=IDu_r[kk]
|
|
47
|
-
flagL=[len(x)-1 for x in vInfo]
|
|
48
|
-
flagLD_r[kk]=flagL
|
|
49
|
-
dM=np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
|
|
50
|
-
dM=dM.astype("float32")
|
|
51
|
-
dMD_r[kk]=dM
|
|
52
|
-
## ff0=re.sub('.txt','',rFile)
|
|
53
|
-
## outfile=outdir+ff0+'_giana_ref.shelve'
|
|
54
|
-
## giana_shelf = shelve.open(outfile, 'n')
|
|
55
|
-
## giana_shelf['flagLD']=flagLD_r
|
|
56
|
-
## giana_shelf['dMD']=dMD_r
|
|
57
|
-
## giana_shelf['LDu']=LDu_r
|
|
58
|
-
## giana_shelf['VDu']=VDu_r
|
|
59
|
-
## giana_shelf['IDu']=IDu_r
|
|
60
|
-
## giana_shelf['SDu']=SDu_r
|
|
61
|
-
## giana_shelf.close()
|
|
48
|
+
vss = SDu_r[kk]
|
|
49
|
+
vInfo = IDu_r[kk]
|
|
50
|
+
flagL = [len(x) - 1 for x in vInfo]
|
|
51
|
+
flagLD_r[kk] = flagL
|
|
52
|
+
dM = np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
|
|
53
|
+
dM = dM.astype("float32")
|
|
54
|
+
dMD_r[kk] = dM
|
|
55
|
+
## ff0=re.sub('.txt','',rFile)
|
|
56
|
+
## outfile=outdir+ff0+'_giana_ref.shelve'
|
|
57
|
+
## giana_shelf = shelve.open(outfile, 'n')
|
|
58
|
+
## giana_shelf['flagLD']=flagLD_r
|
|
59
|
+
## giana_shelf['dMD']=dMD_r
|
|
60
|
+
## giana_shelf['LDu']=LDu_r
|
|
61
|
+
## giana_shelf['VDu']=VDu_r
|
|
62
|
+
## giana_shelf['IDu']=IDu_r
|
|
63
|
+
## giana_shelf['SDu']=SDu_r
|
|
64
|
+
## giana_shelf.close()
|
|
62
65
|
return [LDu_r, VDu_r, IDu_r, SDu_r, dMD_r]
|
|
63
66
|
|
|
64
|
-
|
|
67
|
+
|
|
68
|
+
def MakeQuery(qFile, rData=[], dbFile=None, Vgene=True, thr=7, ST=3, thr_s=3.3):
|
|
65
69
|
if dbFile is not None:
|
|
66
70
|
with shelve.open(dbFile) as db:
|
|
67
71
|
for key in db:
|
|
68
|
-
globals()[key]=db[key]
|
|
72
|
+
globals()[key] = db[key]
|
|
69
73
|
else:
|
|
70
|
-
if len(rData)==0:
|
|
71
|
-
raise("Need to provide either a reference file or a shelve")
|
|
72
|
-
LDu_r=rData[0]
|
|
73
|
-
VDu_r=rData[1]
|
|
74
|
-
IDu_r=rData[2]
|
|
75
|
-
SDu_r=rData[3]
|
|
76
|
-
dMD_r=rData[4]
|
|
77
|
-
h=open(qFile)
|
|
78
|
-
alines=h.readlines()
|
|
79
|
-
ww=alines[0].strip().split(
|
|
80
|
-
if not ww[0].startswith(
|
|
74
|
+
if len(rData) == 0:
|
|
75
|
+
raise ("Need to provide either a reference file or a shelve")
|
|
76
|
+
LDu_r = rData[0]
|
|
77
|
+
VDu_r = rData[1]
|
|
78
|
+
IDu_r = rData[2]
|
|
79
|
+
SDu_r = rData[3]
|
|
80
|
+
dMD_r = rData[4]
|
|
81
|
+
h = open(qFile)
|
|
82
|
+
alines = h.readlines()
|
|
83
|
+
ww = alines[0].strip().split("\t")
|
|
84
|
+
if not ww[0].startswith("C"):
|
|
81
85
|
## header line
|
|
82
|
-
hline=alines[0]
|
|
83
|
-
alines=alines[1:]
|
|
84
|
-
elif
|
|
85
|
-
hline=alines[0]
|
|
86
|
-
alines=alines[1:]
|
|
86
|
+
hline = alines[0]
|
|
87
|
+
alines = alines[1:]
|
|
88
|
+
elif "CDR3" in ww[0]:
|
|
89
|
+
hline = alines[0]
|
|
90
|
+
alines = alines[1:]
|
|
87
91
|
else:
|
|
88
|
-
hline=
|
|
89
|
-
seqs=[]
|
|
90
|
-
vgs=[]
|
|
91
|
-
infoList=[]
|
|
92
|
-
count=0
|
|
92
|
+
hline = "CDR3\t" + "\t".join(["Info" + str(x) for x in range(len(ww) - 1)])
|
|
93
|
+
seqs = []
|
|
94
|
+
vgs = []
|
|
95
|
+
infoList = []
|
|
96
|
+
count = 0
|
|
93
97
|
for ll in alines:
|
|
94
|
-
ww=ll.strip().split(
|
|
95
|
-
cdr3=ww[0]
|
|
96
|
-
if
|
|
98
|
+
ww = ll.strip().split("\t")
|
|
99
|
+
cdr3 = ww[0]
|
|
100
|
+
if "*" in cdr3:
|
|
97
101
|
continue
|
|
98
|
-
if
|
|
102
|
+
if "_" in cdr3:
|
|
99
103
|
continue
|
|
100
104
|
seqs.append(ww[0])
|
|
101
105
|
if Vgene:
|
|
102
106
|
vgs.append(ww[1])
|
|
103
|
-
infoList.append(
|
|
107
|
+
infoList.append("\t".join(ww[1:]))
|
|
104
108
|
else:
|
|
105
|
-
infoList.append(
|
|
106
|
-
count+=1
|
|
107
|
-
LD,VD, ID,SD= BuildLengthDict(
|
|
109
|
+
infoList.append("\t".join(ww[1:]))
|
|
110
|
+
count += 1
|
|
111
|
+
LD, VD, ID, SD = BuildLengthDict(
|
|
112
|
+
seqs, vGene=vgs, INFO=infoList, sIDs=[x for x in range(len(seqs))]
|
|
113
|
+
)
|
|
108
114
|
LDu, VDu, IDu, SDu = CollapseUnique(LD, VD, ID, SD)
|
|
109
|
-
tmpFile=
|
|
110
|
-
g=open(tmpFile,
|
|
115
|
+
tmpFile = "tmp_query.txt"
|
|
116
|
+
g = open(tmpFile, "w")
|
|
111
117
|
for kk in LDu:
|
|
112
|
-
vss=SDu[kk]
|
|
113
|
-
vInfo=IDu[kk]
|
|
114
|
-
vss_r=SDu_r[kk]
|
|
115
|
-
vInfo_r=IDu_r[kk]
|
|
116
|
-
flagL=[len(x)-1 for x in vInfo]
|
|
117
|
-
dM_r=dMD_r[kk]
|
|
118
|
-
dM=np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
|
|
119
|
-
dM=dM.astype("float32")
|
|
120
|
-
nq=dM.shape[0]
|
|
121
|
-
nr=dM_r.shape[0]
|
|
122
|
-
vssc=vss+vss_r
|
|
123
|
-
vInfoc=vInfo+vInfo_r
|
|
124
|
-
dMc=np.concatenate((dM, dM_r))
|
|
125
|
-
index = faiss.IndexFlatL2(Ndim*6)
|
|
118
|
+
vss = SDu[kk]
|
|
119
|
+
vInfo = IDu[kk]
|
|
120
|
+
vss_r = SDu_r[kk]
|
|
121
|
+
vInfo_r = IDu_r[kk]
|
|
122
|
+
flagL = [len(x) - 1 for x in vInfo]
|
|
123
|
+
dM_r = dMD_r[kk]
|
|
124
|
+
dM = np.array([EncodingCDR3(x[ST:-2], M6, n0) for x in vss])
|
|
125
|
+
dM = dM.astype("float32")
|
|
126
|
+
nq = dM.shape[0]
|
|
127
|
+
nr = dM_r.shape[0]
|
|
128
|
+
vssc = vss + vss_r
|
|
129
|
+
vInfoc = vInfo + vInfo_r
|
|
130
|
+
dMc = np.concatenate((dM, dM_r))
|
|
131
|
+
index = faiss.IndexFlatL2(Ndim * 6)
|
|
126
132
|
index.add(dMc)
|
|
127
133
|
D, I = index.search(dM, 2)
|
|
128
|
-
vv=np.where((D[0:nq,1]<=thr))[0]
|
|
129
|
-
flagL=np.array(flagL)
|
|
130
|
-
vv0=np.where((D[0:nq,1]>thr) & (flagL>0))[0]
|
|
131
|
-
curList=[]
|
|
134
|
+
vv = np.where((D[0:nq, 1] <= thr))[0]
|
|
135
|
+
flagL = np.array(flagL)
|
|
136
|
+
vv0 = np.where((D[0:nq, 1] > thr) & (flagL > 0))[0]
|
|
137
|
+
curList = []
|
|
132
138
|
for v in vv0:
|
|
133
139
|
for ii in range(len(vInfoc[v])):
|
|
134
|
-
line=vssc[v]+
|
|
135
|
-
_=g.write(line)
|
|
140
|
+
line = vssc[v] + "\t" + vInfoc[v][ii] + "\t" + "query\n"
|
|
141
|
+
_ = g.write(line)
|
|
136
142
|
for v in vv:
|
|
137
|
-
tmpI=I[v,]
|
|
143
|
+
tmpI = I[v,]
|
|
138
144
|
if v not in tmpI:
|
|
139
|
-
tmpI[0]=v
|
|
140
|
-
idx1=tmpI[0]
|
|
141
|
-
idx2=tmpI[1]
|
|
142
|
-
c1=vssc[idx1]
|
|
143
|
-
c2=vssc[idx2]
|
|
144
|
-
info1=vInfoc[idx1]
|
|
145
|
-
info2=vInfoc[idx2]
|
|
145
|
+
tmpI[0] = v
|
|
146
|
+
idx1 = tmpI[0]
|
|
147
|
+
idx2 = tmpI[1]
|
|
148
|
+
c1 = vssc[idx1]
|
|
149
|
+
c2 = vssc[idx2]
|
|
150
|
+
info1 = vInfoc[idx1]
|
|
151
|
+
info2 = vInfoc[idx2]
|
|
146
152
|
for tmpInfo in info1:
|
|
147
|
-
tup1=(c1, tmpInfo)
|
|
153
|
+
tup1 = (c1, tmpInfo)
|
|
148
154
|
if tup1 not in curList:
|
|
149
|
-
if idx1<nq:
|
|
150
|
-
line1=c1+
|
|
155
|
+
if idx1 < nq:
|
|
156
|
+
line1 = c1 + "\t" + tmpInfo + "\t" + "query\n"
|
|
151
157
|
else:
|
|
152
|
-
line1=c1+
|
|
153
|
-
_=g.write(line1)
|
|
158
|
+
line1 = c1 + "\t" + tmpInfo + "\t" + "ref\n"
|
|
159
|
+
_ = g.write(line1)
|
|
154
160
|
curList.append(tup1)
|
|
155
161
|
for tmpInfo in info2:
|
|
156
|
-
tup2=(c2, tmpInfo)
|
|
162
|
+
tup2 = (c2, tmpInfo)
|
|
157
163
|
if tup2 not in curList:
|
|
158
|
-
if idx2<nq:
|
|
159
|
-
line2=c2+
|
|
164
|
+
if idx2 < nq:
|
|
165
|
+
line2 = c2 + "\t" + tmpInfo + "\t" + "query\n"
|
|
160
166
|
else:
|
|
161
|
-
line2=c2+
|
|
162
|
-
_=g.write(line2)
|
|
167
|
+
line2 = c2 + "\t" + tmpInfo + "\t" + "ref\n"
|
|
168
|
+
_ = g.write(line2)
|
|
163
169
|
curList.append(tup2)
|
|
164
170
|
g.close()
|
|
165
|
-
cmd=
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
171
|
+
cmd = "python3 GIANA4.1.py -f tmp_query.txt -S " + str(
|
|
172
|
+
thr_s
|
|
173
|
+
) ## updated to GIANA4.1
|
|
174
|
+
p = sp.run(cmd, shell=True)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def MergeExist(
|
|
178
|
+
refClusterFile,
|
|
179
|
+
outFile="queryFinal.txt",
|
|
180
|
+
queryClusterFile="tmp_query--RotationEncodingBL62.txt",
|
|
181
|
+
direction="q",
|
|
182
|
+
):
|
|
169
183
|
## This function compare the query file with ref cluster file and merge the two based on shared TCRs
|
|
170
184
|
## If direction is 'q', the overlapping clusters will be added to the query file
|
|
171
185
|
## If direction is 'r', the overlapping and non-overlapping clusters will be added to the reference file
|
|
172
|
-
refT=pd.read_table(refClusterFile, skiprows=2, delimiter=
|
|
173
|
-
queryT=pd.read_table(queryClusterFile, skiprows=2, delimiter=
|
|
174
|
-
nq=queryT.shape[1]
|
|
175
|
-
nr=refT.shape[1]
|
|
176
|
-
if nr != nq-1:
|
|
186
|
+
refT = pd.read_table(refClusterFile, skiprows=2, delimiter="\t", header=None)
|
|
187
|
+
queryT = pd.read_table(queryClusterFile, skiprows=2, delimiter="\t", header=None)
|
|
188
|
+
nq = queryT.shape[1]
|
|
189
|
+
nr = refT.shape[1]
|
|
190
|
+
if nr != nq - 1:
|
|
177
191
|
print("ERROR: Make sure reference and the query samples have the same columns!")
|
|
178
192
|
print("No query file is generated.")
|
|
179
193
|
return
|
|
180
|
-
gn=np.unique(queryT[1])
|
|
181
|
-
queryTs=pd.DataFrame([], columns=queryT.columns)
|
|
194
|
+
gn = np.unique(queryT[1])
|
|
195
|
+
queryTs = pd.DataFrame([], columns=queryT.columns)
|
|
182
196
|
for nn in gn:
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
queryTs.index=range(queryTs.shape[0])
|
|
190
|
-
keyr=refT[0]+
|
|
191
|
-
keyq=queryTs[0]+
|
|
192
|
-
vvr=np.where(queryTs[nq-1]==
|
|
193
|
-
vvr_in=np.where(keyr.isin(keyq[vvr]))[0]
|
|
194
|
-
gn_r=list(refT.loc[vvr_in,1].drop_duplicates())
|
|
195
|
-
ddo=pd.DataFrame([], columns=refT.columns)
|
|
197
|
+
tmp_ddq = queryT.loc[np.where(queryT[1] == nn)[0], :]
|
|
198
|
+
cls_lab = np.unique(tmp_ddq[nq - 1])
|
|
199
|
+
if len(cls_lab) == 1:
|
|
200
|
+
if cls_lab[0] == "ref":
|
|
201
|
+
continue
|
|
202
|
+
queryTs = queryTs.append(tmp_ddq)
|
|
203
|
+
queryTs.index = range(queryTs.shape[0])
|
|
204
|
+
keyr = refT[0] + "_" + refT[2]
|
|
205
|
+
keyq = queryTs[0] + "_" + queryTs[2]
|
|
206
|
+
vvr = np.where(queryTs[nq - 1] == "ref")[0]
|
|
207
|
+
vvr_in = np.where(keyr.isin(keyq[vvr]))[0]
|
|
208
|
+
gn_r = list(refT.loc[vvr_in, 1].drop_duplicates())
|
|
209
|
+
ddo = pd.DataFrame([], columns=refT.columns)
|
|
196
210
|
for nn in gn_r:
|
|
197
|
-
tmp_dd=refT.loc[np.where(refT[1]==nn)[0]
|
|
198
|
-
tmpkey=tmp_dd[0]+
|
|
199
|
-
vv=np.where(keyq.isin(tmpkey))[0][0]
|
|
200
|
-
gq=queryTs[1][vv]
|
|
201
|
-
tmp_dd[1]=gq
|
|
202
|
-
ddo=ddo.append(tmp_dd)
|
|
203
|
-
if direction==
|
|
204
|
-
ddo[nq-1]=
|
|
211
|
+
tmp_dd = refT.loc[np.where(refT[1] == nn)[0], :]
|
|
212
|
+
tmpkey = tmp_dd[0] + "_" + tmp_dd[2]
|
|
213
|
+
vv = np.where(keyq.isin(tmpkey))[0][0]
|
|
214
|
+
gq = queryTs[1][vv]
|
|
215
|
+
tmp_dd[1] = gq
|
|
216
|
+
ddo = ddo.append(tmp_dd)
|
|
217
|
+
if direction == "q":
|
|
218
|
+
ddo[nq - 1] = "ref"
|
|
205
219
|
## remove groups that contain only ref group
|
|
206
|
-
queryTs=queryTs.append(ddo)
|
|
207
|
-
queryTs=queryTs.drop_duplicates()
|
|
208
|
-
queryTs.to_csv(outFile, sep=
|
|
209
|
-
# queryTs.index=range(queryTs.shape[0])
|
|
210
|
-
if direction==
|
|
220
|
+
queryTs = queryTs.append(ddo)
|
|
221
|
+
queryTs = queryTs.drop_duplicates()
|
|
222
|
+
queryTs.to_csv(outFile, sep="\t", header=False, index=False)
|
|
223
|
+
# queryTs.index=range(queryTs.shape[0])
|
|
224
|
+
if direction == "r":
|
|
211
225
|
## to be developed
|
|
212
226
|
pass
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
@@ -60,10 +60,12 @@ prepare_clustcr = function(clustcr_dir) {
|
|
|
60
60
|
}
|
|
61
61
|
clustcr_source = '
|
|
62
62
|
import sys
|
|
63
|
-
import pandas as pd
|
|
64
|
-
import clustcr
|
|
65
63
|
import atexit
|
|
66
64
|
|
|
65
|
+
import pandas as pd
|
|
66
|
+
from scipy import sparse as scipy_sparse
|
|
67
|
+
|
|
68
|
+
|
|
67
69
|
@atexit.register
|
|
68
70
|
def clustcr_exit():
|
|
69
71
|
import pandas as pd
|
|
@@ -78,13 +80,32 @@ def clustcr_exit():
|
|
|
78
80
|
sys.stderr.write(f"- sklearn: {sklearn.__version__}\\n")
|
|
79
81
|
sys.stderr.write(f"- matplotlib: {matplotlib.__version__}\\n")
|
|
80
82
|
|
|
83
|
+
|
|
84
|
+
# Monkey-patch scipy.sparse.isspmatrix to adopt latest scipy v1.14
|
|
85
|
+
# If not, an error is raised:
|
|
86
|
+
# numpy.linalg.LinAlgError: 0-dimensional array given.
|
|
87
|
+
# Array must be at least two-dimensional
|
|
88
|
+
scipy_sparse.isspmatrix = lambda x: isinstance(
|
|
89
|
+
x,
|
|
90
|
+
(
|
|
91
|
+
scipy_sparse.spmatrix,
|
|
92
|
+
scipy_sparse.csr_array,
|
|
93
|
+
scipy_sparse.csr_matrix,
|
|
94
|
+
scipy_sparse.csc_array,
|
|
95
|
+
scipy_sparse.csc_matrix,
|
|
96
|
+
),
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
import clustcr # noqa: #402
|
|
101
|
+
|
|
81
102
|
clustcr_dir, clustcr_infile = sys.argv[1:3]
|
|
82
103
|
cdr3df = pd.read_csv(clustcr_infile, index_col=None)
|
|
83
104
|
cdr3 = cdr3df.iloc[:, 0]
|
|
84
105
|
|
|
85
|
-
clustering = clustcr.Clustering(
|
|
106
|
+
clustering = clustcr.Clustering()
|
|
86
107
|
output = clustering.fit(cdr3)
|
|
87
|
-
output.clusters_df.to_csv(clustcr_dir + "/clusters.txt", sep="
|
|
108
|
+
output.clusters_df.to_csv(clustcr_dir + "/clusters.txt", sep="\t", index=False)
|
|
88
109
|
'
|
|
89
110
|
clustcr_file = file.path(clustcr_dir, "_clustcr.py")
|
|
90
111
|
cat(sprintf(clustcr_source, clustering_args), file=clustcr_file)
|
biopipen/utils/single_cell.R
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
suppressPackageStartupMessages(library(rlang))
|
|
2
2
|
suppressPackageStartupMessages(library(dplyr))
|
|
3
3
|
suppressPackageStartupMessages(library(tidyr))
|
|
4
|
-
suppressPackageStartupMessages(library(immunarch))
|
|
4
|
+
try(suppressPackageStartupMessages(library(immunarch)))
|
|
5
5
|
|
|
6
6
|
#' Expand a Immunarch object into cell-level
|
|
7
7
|
#'
|
|
@@ -114,3 +114,94 @@ immdata_from_expanded <- function(
|
|
|
114
114
|
)
|
|
115
115
|
out
|
|
116
116
|
}
|
|
117
|
+
|
|
118
|
+
#' Convert Seurat object to Anndata
|
|
119
|
+
#'
|
|
120
|
+
#' @param sobjfile Seurat object file
|
|
121
|
+
#' @param outfile Output file
|
|
122
|
+
#' @param assay Assay to be used
|
|
123
|
+
#'
|
|
124
|
+
#' @export
|
|
125
|
+
seurat_to_anndata <- function(sobjfile, outfile, assay = NULL, log_info, tmpdir = NULL, log_indent = "") {
|
|
126
|
+
library(Seurat)
|
|
127
|
+
library(SeuratDisk)
|
|
128
|
+
library(hdf5r)
|
|
129
|
+
if (endsWith(sobjfile, ".rds") || endsWith(sobjfile, ".RDS")) {
|
|
130
|
+
library(digest)
|
|
131
|
+
|
|
132
|
+
dig <- digest::digest(sobjfile, algo = "md5")
|
|
133
|
+
dig <- substr(dig, 1, 8)
|
|
134
|
+
assay_name <- ifelse(is.null(assay), "", paste0("_", assay))
|
|
135
|
+
tmpdir <- tmpdir %||% dirname(outfile)
|
|
136
|
+
dir.create(tmpdir, showWarnings = FALSE)
|
|
137
|
+
h5seurat_file <- file.path(
|
|
138
|
+
tmpdir,
|
|
139
|
+
paste0(
|
|
140
|
+
tools::file_path_sans_ext(basename(outfile)),
|
|
141
|
+
assay_name, ".", dig, ".h5seurat"
|
|
142
|
+
)
|
|
143
|
+
)
|
|
144
|
+
if (file.exists(h5seurat_file) &&
|
|
145
|
+
(file.mtime(h5seurat_file) < file.mtime(sobjfile))) {
|
|
146
|
+
file.remove(h5seurat_file)
|
|
147
|
+
}
|
|
148
|
+
if (!file.exists(h5seurat_file)) {
|
|
149
|
+
log_info("{log_indent}Reading RDS file ...")
|
|
150
|
+
sobj <- readRDS(sobjfile)
|
|
151
|
+
assay <- assay %||% DefaultAssay(sobj)
|
|
152
|
+
# In order to convert to h5ad
|
|
153
|
+
# https://github.com/satijalab/seurat/issues/8220#issuecomment-1871874649
|
|
154
|
+
sobj$RNAv3 <- as(object = sobj[[assay]], Class = "Assay")
|
|
155
|
+
DefaultAssay(sobj) <- "RNAv3"
|
|
156
|
+
sobj$RNA <- NULL
|
|
157
|
+
sobj <- RenameAssays(sobj, RNAv3 = "RNA")
|
|
158
|
+
|
|
159
|
+
log_info("{log_indent}Saving to H5Seurat file ...")
|
|
160
|
+
SaveH5Seurat(sobj, h5seurat_file)
|
|
161
|
+
rm(sobj)
|
|
162
|
+
gc()
|
|
163
|
+
sobjfile <- h5seurat_file
|
|
164
|
+
} else {
|
|
165
|
+
log_info("{log_indent}Using existing H5Seurat file ...")
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
if (!endsWith(sobjfile, ".h5seurat")) {
|
|
170
|
+
stop(paste0("Unknown input file format: ",
|
|
171
|
+
tools::file_ext(sobjfile),
|
|
172
|
+
". Supported formats: .rds, .RDS, .h5seurat"))
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
log_info("{log_indent}Converting to Anndata ...")
|
|
176
|
+
Convert(sobjfile, dest = outfile, assay = assay %||% "RNA", overwrite = TRUE)
|
|
177
|
+
|
|
178
|
+
log_info("{log_indent}Fixing categorical data ...")
|
|
179
|
+
# See: https://github.com/mojaveazure/seurat-disk/issues/183
|
|
180
|
+
H5.create_reference <- function(self, ...) {
|
|
181
|
+
space <- self$get_space()
|
|
182
|
+
do.call("[", c(list(space), list(...)))
|
|
183
|
+
ref_type <- hdf5r::h5const$H5R_OBJECT
|
|
184
|
+
ref_obj <- hdf5r::H5R_OBJECT$new(1, self)
|
|
185
|
+
res <- .Call("R_H5Rcreate", ref_obj$ref, self$id, ".", ref_type,
|
|
186
|
+
space$id, FALSE, PACKAGE = "hdf5r")
|
|
187
|
+
if (res$return_val < 0) {
|
|
188
|
+
stop("Error creating object reference")
|
|
189
|
+
}
|
|
190
|
+
ref_obj$ref <- res$ref
|
|
191
|
+
return(ref_obj)
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
h5ad <- H5File$new(outfile, "r+")
|
|
195
|
+
cats <- names(h5ad[["obs/__categories"]])
|
|
196
|
+
for (cat in cats) {
|
|
197
|
+
catname <- paste0("obs/__categories/", cat)
|
|
198
|
+
obsname <- paste0("obs/", cat)
|
|
199
|
+
ref <- H5.create_reference(h5ad[[catname]])
|
|
200
|
+
h5ad[[obsname]]$create_attr(
|
|
201
|
+
attr_name = "categories",
|
|
202
|
+
robj = ref,
|
|
203
|
+
space = H5S$new(type = "scalar")
|
|
204
|
+
)
|
|
205
|
+
}
|
|
206
|
+
h5ad$close()
|
|
207
|
+
}
|