scte-quant 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scTE/__init__.py +11 -0
- scTE/_version.py +24 -0
- scTE/annotation.py +158 -0
- scTE/base.py +473 -0
- scTE/miniglbase/README.md +25 -0
- scTE/miniglbase/__init__.py +54 -0
- scTE/miniglbase/base_genelist.py +324 -0
- scTE/miniglbase/config.py +45 -0
- scTE/miniglbase/genelist.py +1681 -0
- scTE/miniglbase/location.py +249 -0
- scTE/miniglbase/utils.py +59 -0
- scTE/scatacseq.py +293 -0
- scte_quant-1.3.3.data/scripts/scTE +177 -0
- scte_quant-1.3.3.data/scripts/scTEATAC +219 -0
- scte_quant-1.3.3.data/scripts/scTEATAC_build +89 -0
- scte_quant-1.3.3.data/scripts/scTE_build +693 -0
- scte_quant-1.3.3.dist-info/METADATA +276 -0
- scte_quant-1.3.3.dist-info/RECORD +21 -0
- scte_quant-1.3.3.dist-info/WHEEL +5 -0
- scte_quant-1.3.3.dist-info/licenses/LICENSE +21 -0
- scte_quant-1.3.3.dist-info/top_level.txt +1 -0
scTE/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from importlib.metadata import version, PackageNotFoundError
|
|
2
|
+
|
|
3
|
+
# Resolve the installed version from package metadata.
# The distribution is published as "scte-quant"; "scTE" is only the import
# name, so looking up "scTE" alone fails for a normal install of this wheel.
try:
    __version__ = version("scte-quant")
except PackageNotFoundError:
    try:
        # Fall back to the historical lookup name.
        __version__ = version("scTE")
    except PackageNotFoundError:
        # package is not installed
        __version__ = "unknown"
|
|
8
|
+
|
|
9
|
+
from .miniglbase import genelist, location, glload
|
|
10
|
+
|
|
11
|
+
__all__ = ["genelist", "location", "glload",]
|
scTE/_version.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# file generated by vcs-versioning
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
# Names exported by this module; each version field exists under a plain and
# a dunder alias.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# Annotations only; the values are assigned further down.
version: str
__version__: str
__version_tuple__: tuple[int | str, ...]
version_tuple: tuple[int | str, ...]
commit_id: str | None
__commit_id__: str | None

# Version as a string and as a tuple of its components.
__version__ = version = '1.3.3'
__version_tuple__ = version_tuple = (1, 3, 3)

# No commit id is recorded in this file.
__commit_id__ = commit_id = None
|
scTE/annotation.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import os,sys,gzip,time
|
|
2
|
+
import numpy as np
|
|
3
|
+
from scTE.miniglbase import genelist, glload, location
|
|
4
|
+
|
|
5
|
+
# Format descriptor passed as ``format=`` to the genelist() calls in annoGtf.
# 'loc' is an expression string building a location from the first three
# columns; 'annot' maps to 3 (presumably the zero-based column index of the
# annotation name -- confirm against miniglbase).
form ={'force_tsv': True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot': 3}
|
|
6
|
+
|
|
7
|
+
def cleanexon(filename, genefilename, exons):
    """Merge each gene's exon intervals and write them as a gzipped 4-column file.

    Args:
        filename: Stub used to build the output directory
            ``<filename>_scTEtmp/index`` (created if missing).
        genefilename: Stem of the output file, written as
            ``<genefilename>.bed.gz`` inside that directory.
        exons: Mapping of gene name -> list of ``[chrom, left, right]``
            records with integer ``left``/``right``.

    Each record contributes the integer positions ``left .. right-1``.
    Positions that overlap or are directly adjacent are merged into one run,
    and every run is written as ``chrom<TAB>first<TAB>last<TAB>gene`` where
    ``first``/``last`` are the first and last covered positions.  Genes are
    written in sorted name order, runs in ascending order.  The chromosome is
    taken from the gene's first record.

    Records that cover no position (``right <= left``) contribute nothing, and
    a gene with no covered position writes no line.
    """
    index_dir = '%s_scTEtmp/index' % filename
    os.makedirs(index_dir, exist_ok=True)

    with gzip.open('%s/%s.bed.gz' % (index_dir, genefilename), 'wt') as oh:
        for k in sorted(exons):
            records = exons[k]
            if not records:
                continue
            chr_val = records[0][0]  # get chromosome from first exon

            # Merge half-open [left, right) spans directly instead of
            # expanding every base position into a list: same runs, but
            # memory and time no longer scale with gene length, and
            # single-position genes no longer crash.
            spans = sorted((it[1], it[2]) for it in records if it[2] > it[1])
            merged = []
            for left, right in spans:
                if merged and left <= merged[-1][1]:
                    # Overlapping or directly adjacent: extend the current run.
                    if right > merged[-1][1]:
                        merged[-1][1] = right
                else:
                    merged.append([left, right])

            for left, right in merged:
                # Report the last covered position (right - 1), as before.
                oh.write('%s\t%s\t%s\t%s\n' % (chr_val, left, right - 1, k))
|
|
31
|
+
|
|
32
|
+
def _open_text(path):
    """Open *path* for text reading, using gzip when the name ends in ``.gz``."""
    if path.endswith('.gz'):
        return gzip.open(path, 'rt')
    return open(path, 'r')


def _bucket_range(left, rite):
    """Return the 10 kb bucket start positions computed from *left*/*rite*."""
    left_buck = int((left - 1) / 10000) * 10000
    right_buck = int(rite / 10000) * 10000
    return range(left_buck, right_buck + 10000, 10000)


def annoGtf(filename, genefile, tefile, mode):
    """Build a custom gene + TE annotation index and return its path.

    Args:
        filename: Stub for the working directory ``<filename>_scTEtmp/index``.
        genefile: Path to a GTF file, optionally gzipped (``.gz`` suffix).
        tefile: Path to a tab-separated TE file (chrom, left, right, name),
            optionally gzipped.
        mode: ``'exclusive'`` drops TEs that overlap an exon/UTR of a
            ``protein_coding`` or ``lincRNA`` record; ``'inclusive'`` keeps
            every TE.

    Returns:
        Path of the saved ``custome.<mode>.glb`` index.

    Raises:
        ValueError: if *mode* is neither ``'exclusive'`` nor ``'inclusive'``.

    Only GTF records whose feature column is ``exon`` or ``UTR`` are used.
    Each must carry a ``gene_name "..."`` attribute.
    NOTE(review): a used record without ``gene_name`` still raises IndexError,
    as before; whether to skip it or fall back to another attribute is a
    design decision left to the maintainers.
    """
    if mode not in ('exclusive', 'inclusive'):
        # Previously this fell through to ``return annot`` with ``annot`` unbound.
        raise ValueError("mode must be 'exclusive' or 'inclusive', got %r" % (mode,))

    genefilename = genefile.split('/')[-1].replace('.gtf', '').replace('.gz', '')
    tefilename = tefile.split('/')[-1].replace('.bed', '').replace('.gz', '')
    index_dir = '%s_scTEtmp/index' % filename

    raw = {}    # every exon/UTR record, keyed by gene name
    clean = {}  # only protein_coding / lincRNA records
    with _open_text(genefile) as gtf:
        for l in gtf:
            if l.startswith('#'):
                continue
            t = l.strip().split('\t')
            # Skip blank/truncated lines instead of failing on t[2] / t[8].
            if len(t) < 9 or t[2] not in ('exon', 'UTR'):
                continue

            chrom = t[0].replace('chr', '')
            left = int(t[3])
            rite = int(t[4])
            name = t[8].split('gene_name "')[1].split('";')[0]

            raw.setdefault(name, []).append([chrom, left, rite])
            if 'protein_coding' in l or 'lincRNA' in l:
                clean.setdefault(name, []).append([chrom, left, rite])

    cleanexon(filename, '%s.raw' % genefilename, raw)
    cleanexon(filename, '%s.clean' % genefilename, clean)

    if mode == 'exclusive':
        # Index the merged "clean" exons by chromosome and 10 kb bucket.
        gene = {}
        with gzip.open('%s/%s.clean.bed.gz' % (index_dir, genefilename), 'rt') as o:
            for l in o:
                t = l.strip().split('\t')
                chrom = t[0].replace('chr', '')
                left = int(t[1])
                rite = int(t[2])
                by_bucket = gene.setdefault(chrom, {})
                for buck in _bucket_range(left, rite):
                    by_bucket.setdefault(buck, []).append([left, rite])

        # Keep only the TEs that do not overlap any indexed exon.
        with _open_text(tefile) as te_in, \
                gzip.open('%s/%s.exclusive.gz' % (index_dir, tefilename), 'wt') as oh:
            for l in te_in:
                t = l.strip().split('\t')
                chrom = t[0]
                left = int(t[1])
                rite = int(t[2])

                # The gene index keys have 'chr' removed, so normalise the TE
                # chromosome the same way for the lookup; the written record
                # keeps the TE file's own chromosome spelling.
                by_bucket = gene.get(chrom.replace('chr', ''))
                if by_bucket is not None:
                    buckets_reqd = _bucket_range(left, rite)
                    if not buckets_reqd:
                        # Degenerate record producing no bucket: dropped, as before.
                        continue
                    if any(left < k[1] and rite > k[0]
                           for buck in buckets_reqd
                           for k in by_bucket.get(buck, ())):
                        continue

                oh.write('%s\t%s\t%s\t%s\n' % (chrom, left, rite, t[3]))

        genes = genelist('%s/%s.raw.bed.gz' % (index_dir, genefilename), format=form, gzip=True)
        TEs = genelist('%s/%s.exclusive.gz' % (index_dir, tefilename), format=form, gzip=True)
        print(genes)
        print(TEs)

        all_annot = genes + TEs
        annot = '%s/custome.exclusive.glb' % index_dir
        all_annot.save(annot)

    else:  # mode == 'inclusive'
        genes = genelist('%s/%s.raw.bed.gz' % (index_dir, genefilename), format=form, gzip=True)
        # Test the real path: ``tefilename`` already had '.gz' stripped, so
        # checking it never detected gzipped TE files.
        if tefile.endswith('.gz'):
            TEs = genelist(tefile, format=form, gzip=True)
        else:
            TEs = genelist(tefile, format=form)

        all_annot = genes + TEs
        annot = '%s/custome.inclusive.glb' % index_dir
        all_annot.save(annot)

    return annot
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
|
scTE/base.py
ADDED
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import multiprocessing
|
|
3
|
+
import argparse
|
|
4
|
+
from functools import partial
|
|
5
|
+
import logging
|
|
6
|
+
logging.getLogger("zarr").setLevel(logging.WARNING)
|
|
7
|
+
import os, sys, glob, datetime, time, gzip
|
|
8
|
+
import collections
|
|
9
|
+
from collections import defaultdict
|
|
10
|
+
from math import log
|
|
11
|
+
from scTE.miniglbase import genelist, glload, location
|
|
12
|
+
from scTE.annotation import annoGtf
|
|
13
|
+
import subprocess
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import scipy
|
|
17
|
+
import anndata as ad
|
|
18
|
+
|
|
19
|
+
def read_opts(parser):
    """Parse the command line with *parser* and decorate the resulting namespace.

    The parsed ``format`` must be ``"BAM"`` or ``"SAM"``; it is copied to
    ``args.parser``.  Any other value logs an error and exits with status 1.
    The namespace also receives the ``logging`` callables as ``error``,
    ``warn``, ``debug`` and ``info``, plus a printable parameter summary in
    ``args.argtxt``.

    Returns:
        The decorated ``argparse`` namespace.
    """
    args = parser.parse_args()

    # Guard clause: reject anything that is not SAM/BAM up front.
    if args.format not in ("BAM", "SAM"):
        logging.error("The input file must be SAM/BAM format: %s !\n" % (args.format))
        sys.exit(1)
    args.parser = args.format

    # Convenience aliases for the logging functions.
    args.error = logging.critical
    args.warn = logging.warning
    args.debug = logging.debug
    args.info = logging.info

    summary_lines = [
        "Parameter list:",
        "Sample = %s" % (args.out),
        "Reference annotation index = %s" % (args.annoglb[0]),
        "Minimum number of genes required = %s" % (args.genenumber),
        "Minimum number of counts required = %s" % (args.countnumber),
        "Number of threads = %s " % (args.thread),
    ]
    args.argtxt = "\n".join(summary_lines)
    return args
|
|
42
|
+
|
|
43
|
+
def Readanno(filename, annoglb): #genome
    """Load an annotation index and summarise its contents.

    ``filename`` is accepted but not used here.

    Returns:
        A 4-tuple ``(allelement, chr_list, annoglb, glannot)``: the set of
        values in the index's ``'annot'`` column, the list of distinct
        ``'chr'`` values found in its ``'loc'`` column, the index path as
        given, and the loaded index itself.
    """
    glannot = glload(annoglb)

    allelement = set(glannot['annot'])

    # Chromosomes are taken from the index itself rather than from a fixed
    # per-genome list, which also covers custom chromosome names.
    distinct_chroms = {entry['chr'] for entry in glannot['loc']}
    chr_list = list(distinct_chroms)

    return (allelement, chr_list, annoglb, glannot)
|
|
53
|
+
|
|
54
|
+
def checkCBUMI(filename, out, CB, UMI):
    """Check that the first reads of a SAM/BAM file carry the requested tags.

    Args:
        filename: Input alignment file passed to ``samtools view``.
        out: Output stub; counts are written under ``<out>_scTEtmp/o1``.
        CB: ``'CR'`` or ``'CB'`` to check that cell-barcode tag; any other
            value skips the barcode check.
        UMI: ``'UR'`` or ``'UB'`` to check that UMI tag; any other value
            skips the UMI check.

    For each requested tag, the first 100 lines of ``samtools view`` output
    are searched for ``<TAG>:Z:``.  If fewer than 100 of them match, an error
    is logged and the process exits with status 1.  The barcode count is
    written to ``testCR.txt`` (for either barcode tag) and the UMI count to
    ``testUMI.txt``.
    """
    import shlex  # local import: shlex is not among this module's top-level imports

    def _tagged_read_counts(tag, outfile, timeout=30):
        """Run the tag-count pipeline into *outfile* and return the parsed counts."""
        # Create the target directory so the shell redirect cannot fail and
        # leave us waiting out the whole poll below.
        os.makedirs(os.path.dirname(outfile), exist_ok=True)
        # Quote both paths: unquoted names with spaces or shell metacharacters
        # broke the command or redirected to the wrong file.
        cmd = 'samtools view %s | head -100| grep "%s:Z:" | wc -l > %s' % (
            shlex.quote(filename), tag, shlex.quote(outfile))
        subprocess.run(cmd, shell=True)

        # Poll for file instead of fixed sleep
        for _ in range(timeout):
            if os.path.exists(outfile):
                time.sleep(0.1)  # brief settle for file flush
                break
            time.sleep(1)
        else:
            logging.error("Timeout waiting for output: %s" % outfile)
            sys.exit(1)

        with open(outfile, 'r') as handle:
            return [int(line.strip()) for line in handle]

    if CB in ('CR', 'CB'):
        counts = _tagged_read_counts(CB, '%s_scTEtmp/o1/testCR.txt' % out)
        if any(count < 100 for count in counts):
            logging.error("The input file %s has no cell barcodes information, plese make sure the aligner have add the cell barcode key, or set CB to False" % filename)
            sys.exit(1)

    if UMI in ('UR', 'UB'):
        counts = _tagged_read_counts(UMI, '%s_scTEtmp/o1/testUMI.txt' % out)
        if any(count < 100 for count in counts):
            logging.error("The input file %s has no %s:Z information, plese make sure the aligner have add the UMI key, or set UMI to False" % (filename, UMI))
            sys.exit(1)
|
|
108
|
+
|
|
109
|
+
def Bam2bed(filename, CB, UMI, out, num_threads):
    """Convert an alignment file into ``<out>_scTEtmp/o1/<out>.bed.gz``.

    Args:
        filename: Input file passed to ``samtools view``.
        CB: ``'CR'``/``'CB'`` to emit that tag as the barcode column, or
            ``'False'`` to put *out* in the barcode column instead.
        UMI: ``'UR'``/``'UB'`` to emit that tag as a fifth column, or
            ``'False'`` for no UMI column.
        out: Output stub, used for the directory and the file name.
        num_threads: Value passed to ``samtools view -@``.

    Every output row is ``$3, $4, $4+100`` of the samtools record followed by
    the barcode (and UMI) columns, with a leading ``chr`` removed.  When a UMI
    tag is used, only the first row per barcode+UMI pair is kept.

    Supported combinations are UMI ``'False'`` with CB ``'False'``/``'CR'``/
    ``'CB'``, and UMI ``'UR'``/``'UB'`` with CB ``'CR'``/``'CB'``.  Any other
    combination writes nothing, as before, but now logs a warning.
    """
    import shlex  # local import: shlex is not among this module's top-level imports

    os.makedirs('%s_scTEtmp/o1' % out, exist_ok=True)

    # Mac OSX has BSD sed
    switch = '-E' if sys.platform == 'darwin' else '-r'

    use_cb = CB in ('CR', 'CB')
    use_umi = UMI in ('UR', 'UB')
    if not ((UMI == 'False' and (use_cb or CB == 'False')) or (use_umi and use_cb)):
        logging.warning("Unsupported CB/UMI combination (CB=%s, UMI=%s); no BED file written for %s" % (CB, UMI, filename))
        return

    # Paths are shell-quoted so names containing spaces or metacharacters work.
    stages = ['samtools view -@ %s %s' % (num_threads, shlex.quote(filename))]

    if not use_cb:
        # Put the sample name in the barcode slot
        stages.append('awk -v name=%s %s' % (
            shlex.quote(out), shlex.quote('{OFS="\t"}{print $3,$4,$4+100,name}')))
    else:
        tags = [CB, UMI] if use_umi else [CB]
        # One awk action per tag records the field index holding it (n, then m).
        finders = ''.join('{for(i=12;i<=NF;i++)if($i~/%s:Z:/)%s=i}' % (tag, var)
                          for tag, var in zip(tags, 'nm'))
        columns = ','.join('$' + var for var in 'nm'[:len(tags)])
        program = '{OFS="\t"}%s{print $3,$4,$4+100,%s}' % (finders, columns)
        stages.append('awk %s' % shlex.quote(program))
        # Strip the "XX:Z:" prefix from each emitted tag.
        stages.extend('sed %s %s' % (switch, shlex.quote('s/%s:Z://g' % tag)) for tag in tags)

    stages.append('sed %s %s' % (switch, shlex.quote('s/^chr//g')))
    if use_umi:
        # Keep the first row for each barcode+UMI pair.
        stages.append('awk %s' % shlex.quote('!x[$4$5]++'))
    stages.append('gzip -c > %s' % shlex.quote('%s_scTEtmp/o1/%s.bed.gz' % (out, out)))

    os.system(' | '.join(stages))
|
|
137
|
+
|
|
138
|
+
def Para_bam2bed(filename, CB, UMI, out, num_threads=1):
    """Convert one alignment file into ``<out>_scTEtmp/o0/<sample>.bed.gz``.

    ``<sample>`` is the base name of *filename* with ``.bam`` removed.

    Args:
        filename: Input file passed to ``samtools view``.
        CB: ``'CR'``/``'CB'`` to emit that tag as the barcode column, or
            ``'False'`` to put the sample name in the barcode column instead.
        UMI: ``'UR'``/``'UB'`` to emit that tag as a fifth column, or
            ``'False'`` for no UMI column.
        out: Output stub used for the working directory.
        num_threads: Value passed to ``samtools view -@`` (default 1).

    Every output row is ``$3, $4, $4+100`` of the samtools record followed by
    the barcode (and UMI) columns, with a leading ``chr`` removed.  When a UMI
    tag is used, only the first row per barcode+UMI pair is kept.

    Supported combinations are UMI ``'False'`` with CB ``'False'``/``'CR'``/
    ``'CB'``, and UMI ``'UR'``/``'UB'`` with CB ``'CR'``/``'CB'``.  Any other
    combination writes nothing, as before, but now logs a warning.
    """
    import shlex  # local import: shlex is not among this module's top-level imports

    os.makedirs('%s_scTEtmp/o0' % out, exist_ok=True)

    sample = filename.split('/')[-1].replace('.bam', '')

    # Mac OSX has BSD sed
    switch = '-E' if sys.platform == 'darwin' else '-r'

    use_cb = CB in ('CR', 'CB')
    use_umi = UMI in ('UR', 'UB')
    if not ((UMI == 'False' and (use_cb or CB == 'False')) or (use_umi and use_cb)):
        logging.warning("Unsupported CB/UMI combination (CB=%s, UMI=%s); no BED file written for %s" % (CB, UMI, filename))
        return

    # Paths are shell-quoted so names containing spaces or metacharacters work.
    stages = ['samtools view -@ %s %s' % (num_threads, shlex.quote(filename))]

    if not use_cb:
        # No barcode tag: the sample name fills the barcode column.
        stages.append('awk -v name=%s %s' % (
            shlex.quote(sample), shlex.quote('{OFS="\t"}{print $3,$4,$4+100,name}')))
    else:
        tags = [CB, UMI] if use_umi else [CB]
        # One awk action per tag records the field index holding it (n, then m).
        finders = ''.join('{for(i=12;i<=NF;i++)if($i~/%s:Z:/)%s=i}' % (tag, var)
                          for tag, var in zip(tags, 'nm'))
        columns = ','.join('$' + var for var in 'nm'[:len(tags)])
        program = '{OFS="\t"}%s{print $3,$4,$4+100,%s}' % (finders, columns)
        stages.append('awk %s' % shlex.quote(program))
        # Strip the "XX:Z:" prefix from each emitted tag.
        stages.extend('sed %s %s' % (switch, shlex.quote('s/%s:Z://g' % tag)) for tag in tags)

    stages.append('sed %s %s' % (switch, shlex.quote('s/^chr//g')))
    if use_umi:
        # Keep the first row for each barcode+UMI pair.
        stages.append('awk %s' % shlex.quote('!x[$4$5]++'))
    stages.append('gzip > %s' % shlex.quote('%s_scTEtmp/o0/%s.bed.gz' % (out, sample)))

    os.system(' | '.join(stages))
|
|
166
|
+
|
|
167
|
+
def splitAllChrs(chromosome_list, filename, genenumber, countnumber, UMI=True):
    '''
    **Purpose**
        Split the data into separate beds, and count up all the times each barcode appears

        This variant uses more memory, but does it all at the same time and gets the filtered whitelist for free

    **Arguments**
        chromosome_list
            List of chromosome names (a 'chr' substring is removed from each)

        filename (Required)
            filename stub to use for tmp files; input is read from
            <filename>_scTEtmp/o1/<filename>.bed.gz and one
            <filename>_scTEtmp/o2/<filename>.chr<chrom>.bed.gz is written
            per chromosome

        genenumber (Required)
            Minimum number of genes expressed required for a cell to pass filtering;
            only used as 2 * genenumber when countnumber is falsy

        countnumber (Required)
            Minimum number of counts required for a cell to pass filtering.

        UMI (optional, default=True)
            use the UMI: when truthy, a line identical to one already seen on
            the same chromosome is skipped and not counted

    **Returns**
        The barcode whitelist: barcodes (column 4) whose count reaches the threshold
    '''

    os.makedirs('%s_scTEtmp/o2' % filename, exist_ok=True)

    chromosome_list = set([c.replace('chr', '') for c in chromosome_list])

    CRs = defaultdict(int)
    uniques = {chrom: set() for chrom in chromosome_list} if UMI else None

    file_handles_out = {}
    with gzip.open('%s_scTEtmp/o1/%s.bed.gz' % (filename, filename), 'rt') as file_handle_in:
        try:
            for chrom in chromosome_list:
                file_handles_out[chrom] = gzip.open(
                    '%s_scTEtmp/o2/%s.chr%s.bed.gz' % (filename, filename, chrom), 'wt')

            # Make a BED for each chromosome
            for line in file_handle_in:
                t = line.strip().split('\t')
                chrom = t[0].replace('chr', '')  # strip chr

                if chrom not in chromosome_list:  # remove the unusual chromosomes
                    # Force chrMT -> chrM, but only when chrM is actually
                    # requested; otherwise the lookups below raised KeyError.
                    if chrom == 'MT' and 'M' in chromosome_list:
                        chrom = 'M'
                    else:
                        continue

                if UMI:
                    if line in uniques[chrom]:
                        continue
                    uniques[chrom].add(line)

                CRs[t[3]] += 1
                file_handles_out[chrom].write(line)
        finally:
            # Close every per-chromosome file even if reading fails midway.
            for handle in file_handles_out.values():
                handle.close()

    mincounts = countnumber if countnumber else 2 * genenumber

    return [barcode for barcode, count in CRs.items() if count >= mincounts]
|
|
240
|
+
|
|
241
|
+
def filterCRs(filename, genenumber, countnumber):
    """Sum per-chromosome barcode counts and return the barcodes passing the filter.

    Reads every ``<filename>_scTEtmp/o2/<filename>*.count.gz`` file (sorted by
    name); each line is ``barcode<TAB>count``.  Counts are summed per barcode
    and a barcode is kept when its total is at least ``countnumber``, or
    ``2 * genenumber`` when ``countnumber`` is falsy.

    Returns:
        List of the barcodes that pass the threshold.
    """
    CRs = defaultdict(int)
    for f in sorted(glob.glob('%s_scTEtmp/o2/%s*.count.gz' % (filename, filename))):
        logging.info('Reading %s ' % os.path.split(f)[1])
        with gzip.open(f, 'rt') as o:
            for l in o:
                t = l.strip().split('\t')
                CRs[t[0]] += int(t[1])

    mincounts = countnumber if countnumber else 2 * genenumber

    logging.info('Before filter %s' % len(CRs))
    CRs = {k: v for k, v in CRs.items() if v >= mincounts}
    # This message used to repeat 'Before filter', which made the log misleading.
    logging.info('After filter %s' % len(CRs))

    return list(CRs.keys())
|
|
261
|
+
|
|
262
|
+
def splitChr(chr, filename, CB, UMI):
    """Extract one chromosome's reads and count reads per barcode.

    Reads ``<filename>_scTEtmp/o1/<filename>.bed.gz`` and writes two files
    under ``<filename>_scTEtmp/o2``:

    * ``<filename>.chr<chr>.bed.gz`` -- the lines whose first column is
      exactly the chromosome (with any 'chr' removed from the argument);
    * ``<filename>.chr<chr>.count.gz`` -- ``barcode<TAB>count`` lines, where
      the barcode is column 4 of the selected lines.

    Args:
        chr: Chromosome name; a 'chr' substring is removed.
        filename: Stub used for the working directory and file names.
        CB, UMI: Accepted for interface compatibility.  The previous
            implementation ran the same command for every CB/UMI combination,
            so they do not influence the result.

    The previous implementation selected lines with ``grep ^<chr>``, a prefix
    match: chromosome ``4`` also collected ``40``, and ``1`` collected names
    such as ``1_random``.  Selection is now an exact match on the first
    column.  The one alias kept is ``MT`` for chromosome ``M``, which the
    prefix match used to include.
    """
    os.makedirs('%s_scTEtmp/o2' % filename, exist_ok=True)

    chr = chr.replace('chr', '')
    wanted = {chr}
    if chr == 'M':
        wanted.add('MT')

    source = '%s_scTEtmp/o1/%s.bed.gz' % (filename, filename)
    target = '%s_scTEtmp/o2/%s.chr%s.bed.gz' % (filename, filename, chr)

    # Filter and count in a single pass instead of re-reading the output.
    CRs = defaultdict(int)
    with gzip.open(source, 'rt') as reads, gzip.open(target, 'wt') as selected:
        for l in reads:
            t = l.strip().split('\t')
            if t[0] not in wanted:
                continue
            selected.write(l)
            CRs[t[3]] += 1

    with gzip.open('%s_scTEtmp/o2/%s.chr%s.count.gz' % (filename, filename, chr), 'wt') as o:
        for k in CRs:
            o.write('%s\t%s\n' % (k, CRs[k]))
|
|
322
|
+
|
|
323
|
+
def align(chr, filename, all_annot, glannot, whitelist): #CB
    '''
    **Purpose**
        For each read, align it to the index and assign a TE, gene.

        This is the speed critical part.

    **Arguments**
        chr
            Chromosome name without the 'chr' prefix

        filename
            filename stub; reads come from
            <filename>_scTEtmp/o2/<filename>.chr<chr>.bed.gz and results go to
            <filename>_scTEtmp/o3/<filename>.chr<chr>.bed.gz

        all_annot
            Path of the annotation index, loaded with glload() only when
            glannot is falsy

        glannot
            Already-loaded annotation index, or a falsy value

        whitelist
            Container of accepted barcodes; reads with other barcodes are skipped

    **Output**
        One line per barcode and annotation, 'barcode<TAB>annot<TAB>count',
        sorted by barcode and then annotation
    '''
    chr = 'chr' + chr

    # os.makedirs rather than a shell 'mkdir -p': stubs containing spaces or
    # shell metacharacters used to create the wrong directory.
    os.makedirs('%s_scTEtmp/o3' % filename, exist_ok=True)

    if not glannot: # Load separately for the multicore pipeline, share the index for the single core pipeline
        glannot = glload(all_annot)

    # Only keep the glbase parts we need.
    buckets = glannot.buckets[chr.replace('chr', '')]
    all_annot = glannot.linearData

    res = {}
    with gzip.open('%s_scTEtmp/o2/%s.%s.bed.gz' % (filename, filename, chr), 'rt') as oh:
        for line in oh:
            t = line.strip().split('\t')
            barcode = t[3]
            if barcode not in whitelist:
                continue
            counts = res.get(barcode)
            if counts is None:
                counts = res[barcode] = defaultdict(int)

            # No chromosome check needed: each input file holds one chromosome.
            left = int(t[1])
            rite = int(t[2])

            # 10 kb buckets spanned by the read.
            left_buck = ((left - 1) // 10000) * 10000
            right_buck = (rite // 10000) * 10000

            # Candidate annotation ids; a set so that a feature listed in
            # several buckets is only counted once per read.
            loc_ids = set()
            for buck in range(left_buck, right_buck + 10000, 10000):
                if buck in buckets:
                    loc_ids.update(buckets[buck])

            for index in loc_ids:
                item = all_annot[index]
                span = item['loc'].loc
                if rite >= span['left'] and left <= span['right']:
                    counts[item['annot']] += 1

    with gzip.open('%s_scTEtmp/o3/%s.%s.bed.gz' % (filename, filename, chr), 'wt') as oh:
        for bc in sorted(res):
            for gene in sorted(res[bc]):
                oh.write('%s\t%s\t%s\n' % (bc, gene, res[bc][gene]))
|
|
383
|
+
|
|
384
|
+
def Countexpression(filename, allelement, genenumber, cellnumber, hdf5):
    """Assemble the final barcode x feature count table and save it.

    Reads ``<filename>_scTEtmp/o4/<filename>.bed.gz`` whose lines are
    ``barcode<TAB>feature<TAB>count``.

    Args:
        filename: Stub for the input path and for the output file name.
        allelement: Iterable of feature names; sorted, it defines the columns.
            Features in the input that are not in it are not written.
        genenumber: A barcode needs at least this many input lines to be kept.
        cellnumber: At most this many barcodes are kept, taken in decreasing
            order of their number of input lines.
        hdf5: Falsy writes ``<filename>.csv``; truthy writes
            ``<filename>.h5ad`` with a sparse integer matrix.

    Returns:
        Tuple ``(number of barcodes written, genenumber, filename)``.
    """
    counts_path = '%s_scTEtmp/o4/%s.bed.gz' % (filename, filename)

    # Pass 1: number of input lines per barcode.
    whitelist = defaultdict(int)
    with gzip.open(counts_path, 'rt') as o:
        for l in o:
            whitelist[l.strip().split('\t')[0]] += 1

    # Keep the best barcodes until either threshold stops the scan.
    CRlist = set()
    sortcb = sorted(whitelist.items(), key=lambda item: item[1], reverse=True)
    for n, (barcode, n_lines) in enumerate(sortcb):
        if n_lines < genenumber or n >= cellnumber:
            break
        CRlist.add(barcode)

    # Pass 2: sum the counts of the kept barcodes per feature.
    res = {}
    with gzip.open(counts_path, 'rt') as genes_oh:
        for l in genes_oh:
            t = l.strip().split('\t')
            if t[0] not in CRlist:
                continue
            per_cell = res.setdefault(t[0], {})
            per_cell[t[1]] = per_cell.get(t[1], 0) + int(t[2])

    # Save out the final file
    gene_seen = sorted(allelement)  # Do the sort once;
    barcodes = sorted(res)

    #==== save results =====
    if not hdf5:  # save as csv
        with open('%s.csv' % filename, 'w') as res_oh:
            res_oh.write('barcodes,')
            res_oh.write('%s\n' % (','.join([str(i) for i in gene_seen])))

            for k in barcodes:
                per_cell = res[k]
                l = [str(per_cell[gene]) if gene in per_cell else "0" for gene in gene_seen]
                res_oh.write('%s,%s\n' % (k, ','.join(l)))

    else:  # save as hdf5
        # Build the sparse matrix directly from the non-zero entries.  The
        # previous code first created a dense cells x features table of
        # strings, which is very memory hungry and also failed with a shape
        # mismatch when no barcode passed the filters.
        columns = defaultdict(list)
        for idx, gene in enumerate(gene_seen):
            columns[gene].append(idx)

        rows, cols, vals = [], [], []
        for row, k in enumerate(barcodes):
            for gene, value in res[k].items():
                if not value:
                    continue
                for col in columns.get(gene, ()):
                    rows.append(row)
                    cols.append(col)
                    vals.append(value)

        matrix = scipy.sparse.csr_matrix(
            (vals, (rows, cols)), shape=(len(barcodes), len(gene_seen)), dtype=int)

        obs = pd.DataFrame(index=barcodes)
        var = pd.DataFrame(index=gene_seen)
        adata = ad.AnnData(matrix, var=var, obs=obs)
        adata.write('%s.h5ad' % filename)

    #========================

    return len(res), genenumber, filename
|
|
462
|
+
|
|
463
|
+
def timediff(timestart, timestop):
    """Format the elapsed time between two datetimes as ``"Dd Hh Mm Ss"``.

    The difference ``timestop - timestart`` is split into its ``days`` part
    and its whole seconds, which are broken down into hours, minutes and
    seconds.  Fractions of a second are discarded.
    """
    elapsed = timestop - timestart
    whole_seconds = int(elapsed.seconds + elapsed.microseconds / 1000000)

    hours, leftover = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)

    return "%dd %dh %dm %ds" % (elapsed.days, hours, minutes, seconds,)
|