scte-quant 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scTE/__init__.py ADDED
@@ -0,0 +1,11 @@
1
from importlib.metadata import version, PackageNotFoundError

# Resolve the installed version. The wheel is published under the
# distribution name "scte-quant"; the historical name "scTE" is still tried
# so installs made under the old name keep reporting a version.
__version__ = "unknown"  # used when the package is not installed at all
for _dist_name in ("scte-quant", "scTE"):
    try:
        __version__ = version(_dist_name)
    except PackageNotFoundError:
        continue
    break
8
+
9
+ from .miniglbase import genelist, location, glload
10
+
11
+ __all__ = ["genelist", "location", "glload",]
scTE/_version.py ADDED
@@ -0,0 +1,24 @@
1
+ # file generated by vcs-versioning
2
+ # don't change, don't track in version control
3
+ from __future__ import annotations
4
+
5
+ __all__ = [
6
+ "__version__",
7
+ "__version_tuple__",
8
+ "version",
9
+ "version_tuple",
10
+ "__commit_id__",
11
+ "commit_id",
12
+ ]
13
+
14
+ version: str
15
+ __version__: str
16
+ __version_tuple__: tuple[int | str, ...]
17
+ version_tuple: tuple[int | str, ...]
18
+ commit_id: str | None
19
+ __commit_id__: str | None
20
+
21
+ __version__ = version = '1.3.3'
22
+ __version_tuple__ = version_tuple = (1, 3, 3)
23
+
24
+ __commit_id__ = commit_id = None
scTE/annotation.py ADDED
@@ -0,0 +1,158 @@
1
+ import os,sys,gzip,time
2
+ import numpy as np
3
+ from scTE.miniglbase import genelist, glload, location
4
+
5
+ form ={'force_tsv': True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot': 3}
6
+
7
def cleanexon(filename, genefilename, exons):
    '''
    **Purpose**
        Merge the exons of every gene and write them to a gzipped BED file.

        Each exon is treated as the half-open range [left, right) of covered
        positions. Overlapping or directly adjacent ranges of one gene are
        merged, and every merged run is written as
        chrom <TAB> first position <TAB> last position <TAB> gene name.

    **Arguments**
        filename
            run stub; the file is written below '<filename>_scTEtmp/index'

        genefilename
            base name of the output file ('.bed.gz' is appended)

        exons
            dict mapping gene name -> list of [chrom, left, right]

    **Returns**
        None
    '''
    index_dir = '%s_scTEtmp/index' % filename
    os.makedirs(index_dir, exist_ok=True)

    with gzip.open('%s/%s.bed.gz' % (index_dir, genefilename), 'wt') as oh:
        for gene in sorted(exons):
            records = exons[gene]
            if not records:
                continue
            chrom = records[0][0]  # chromosome is taken from the first exon

            # Merge intervals directly instead of expanding every exon into a
            # list of single positions; empty ranges contribute nothing.
            spans = sorted((rec[1], rec[2]) for rec in records if rec[2] > rec[1])
            merged = []
            for left, right in spans:
                if merged and left <= merged[-1][1]:
                    # Overlaps or touches the current run: extend it.
                    if right > merged[-1][1]:
                        merged[-1][1] = right
                else:
                    merged.append([left, right])

            for left, right in merged:
                # right is exclusive, the file stores the last covered position
                oh.write('%s\t%s\t%s\t%s\n' % (chrom, left, right - 1, gene))
31
+
32
def annoGtf(filename, genefile, tefile, mode):
    '''
    **Purpose**
        Build a custom annotation index from a gene GTF and a TE BED file.

        Exon and UTR records of the GTF are collected per gene name and
        written with cleanexon() twice: '<gene>.raw' holds every gene,
        '<gene>.clean' only records whose line mentions 'protein_coding' or
        'lincRNA'.

    **Arguments**
        filename
            run stub; everything is written below '<filename>_scTEtmp/index'

        genefile
            path of the gene annotation (GTF, optionally gzipped)

        tefile
            path of the TE annotation (BED, optionally gzipped)

        mode
            'exclusive': TEs overlapping a 'clean' gene interval are dropped
            before the index is built.
            'inclusive': all TEs are kept.

    **Returns**
        Path of the saved index ('custome.<mode>.glb').

    **Raises**
        ValueError if mode is neither 'exclusive' nor 'inclusive'.
    '''
    if mode not in ('exclusive', 'inclusive'):
        # Fail before any file is written instead of hitting an unbound
        # 'annot' at the very end.
        raise ValueError("mode must be 'exclusive' or 'inclusive', got %r" % (mode,))

    def _open_text(path):
        # Decide on the real extension; a '.gz' elsewhere in the path must not
        # switch the reader to gzip.
        if path.endswith('.gz'):
            return gzip.open(path, 'rt', encoding='ascii')
        return open(path, 'r')

    def _buckets(left, rite):
        # 10 kb bins touched by [left, rite]; the same formula is used for
        # genes and TEs so both sides land in the same bins.
        left_buck = int((left - 1) / 10000) * 10000
        right_buck = int(rite / 10000) * 10000
        return range(left_buck, right_buck + 10000, 10000)

    genefilename = genefile.split('/')[-1:][0].replace('.gtf','').replace('.gz','')
    tefilename = tefile.split('/')[-1:][0].replace('.bed','').replace('.gz','')

    raw = {}
    clean = {}
    with _open_text(genefile) as gtf:
        for line in gtf:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) < 9:  # blank or truncated line
                continue
            if fields[2] not in ('exon', 'UTR'):
                continue

            chrom = fields[0].replace('chr', '')
            left = int(fields[3])
            right = int(fields[4])
            name = fields[8].split('gene_name "')[1].split('";')[0]

            raw.setdefault(name, []).append([chrom, left, right])
            if 'protein_coding' in line or 'lincRNA' in line:
                clean.setdefault(name, []).append([chrom, left, right])

    cleanexon(filename, '%s.raw' % genefilename, raw)
    cleanexon(filename, '%s.clean' % genefilename, clean)

    index_dir = '%s_scTEtmp/index' % filename
    genes_bed = '%s/%s.raw.bed.gz' % (index_dir, genefilename)

    if mode == 'exclusive':
        # chromosome (without 'chr') -> bucket start -> list of [left, rite]
        gene = {}
        with gzip.open('%s/%s.clean.bed.gz' % (index_dir, genefilename), 'rt', encoding='ascii') as bed:
            for line in bed:
                fields = line.strip().split('\t')
                chrom = fields[0].replace('chr', '')
                left = int(fields[1])
                rite = int(fields[2])
                chrom_buckets = gene.setdefault(chrom, {})
                for buck in _buckets(left, rite):
                    chrom_buckets.setdefault(buck, []).append([left, rite])

        noverlap = []
        with _open_text(tefile) as bed:
            for line in bed:
                fields = line.strip().split('\t')
                chrom = fields[0]
                left = int(fields[1])
                rite = int(fields[2])

                # Gene keys never carry 'chr', so strip it for the lookup;
                # the TE record itself is written back unchanged.
                chrom_buckets = gene.get(chrom.replace('chr', ''))
                overlaps = chrom_buckets is not None and any(
                    left < gene_rite and rite > gene_left
                    for buck in _buckets(left, rite)
                    for gene_left, gene_rite in chrom_buckets.get(buck, ())
                )
                if not overlaps:
                    noverlap.append('%s\t%s\t%s\t%s\n' % (chrom, left, rite, fields[3]))

        exclusive_bed = '%s/%s.exclusive.gz' % (index_dir, tefilename)
        with gzip.open(exclusive_bed, 'wt') as oh:
            oh.writelines(noverlap)

        genes = genelist(genes_bed, format=form, gzip=True)
        TEs = genelist(exclusive_bed, format=form, gzip=True)
        print(genes)
        print(TEs)

        annot = '%s/custome.exclusive.glb' % index_dir
    else:  # inclusive
        genes = genelist(genes_bed, format=form, gzip=True)
        # Test the path itself: '.gz' has already been removed from tefilename.
        if tefile.endswith('.gz'):
            TEs = genelist(tefile, format=form, gzip=True)
        else:
            TEs = genelist(tefile, format=form)

        annot = '%s/custome.inclusive.glb' % index_dir

    all_annot = genes + TEs
    all_annot.save(annot)
    return annot
154
+
155
+
156
+
157
+
158
+
scTE/base.py ADDED
@@ -0,0 +1,473 @@
1
+ import pandas as pd
2
+ import multiprocessing
3
+ import argparse
4
+ from functools import partial
5
+ import logging
6
+ logging.getLogger("zarr").setLevel(logging.WARNING)
7
+ import os, sys, glob, datetime, time, gzip
8
+ import collections
9
+ from collections import defaultdict
10
+ from math import log
11
+ from scTE.miniglbase import genelist, glload, location
12
+ from scTE.annotation import annoGtf
13
+ import subprocess
14
+
15
+ import numpy as np
16
+ import scipy
17
+ import anndata as ad
18
+
19
def read_opts(parser):
    '''
    **Purpose**
        Parse the command line and decorate the result with the reader type,
        logging shortcuts and a printable parameter summary.

    **Arguments**
        parser
            object whose parse_args() returns the namespace; the attributes
            format, out, annoglb, genenumber, countnumber and thread are read

    **Returns**
        The namespace, extended with parser, error, warn, debug, info and
        argtxt. The process exits with status 1 when format is not BAM/SAM.
    '''
    args = parser.parse_args()

    # Guard clause: only the two alignment formats are accepted.
    if args.format not in ("BAM", "SAM"):
        logging.error("The input file must be SAM/BAM format: %s !\n" % (args.format))
        sys.exit(1)
    args.parser = args.format

    # Logging shortcuts carried on the namespace.
    args.error = logging.critical
    args.warn = logging.warning
    args.debug = logging.debug
    args.info = logging.info

    summary = [
        "Parameter list:",
        "Sample = %s" % (args.out),
        "Reference annotation index = %s" % (args.annoglb[0]),
        "Minimum number of genes required = %s" % (args.genenumber),
        "Minimum number of counts required = %s" % (args.countnumber),
        "Number of threads = %s " % (args.thread),
    ]
    args.argtxt = "\n".join(summary)
    return args
42
+
43
def Readanno(filename, annoglb): #genome
    '''
    **Purpose**
        Load an annotation index and list what it contains.

    **Arguments**
        filename
            not used by this function

        annoglb
            path of the index handed to glload()

    **Returns**
        (set of all 'annot' values, list of distinct chromosomes found in
        'loc', annoglb, the loaded index)
    '''
    glannot = glload(annoglb)
    allelement = set(glannot['annot'])

    # Chromosomes come from the index itself rather than a fixed per-genome
    # table, so custom chromosome sets work as well.
    chr_list = list({entry['chr'] for entry in glannot['loc']})

    return (allelement, chr_list, annoglb, glannot)
53
+
54
def checkCBUMI(filename,out,CB,UMI):
    '''
    **Purpose**
        Check that the first 100 alignments all carry the requested cell
        barcode and UMI tags, and exit with status 1 otherwise.

    **Arguments**
        filename
            alignment file handed to 'samtools view'

        out
            run stub; the probe results are written to
            '<out>_scTEtmp/o1/testCR.txt' and '<out>_scTEtmp/o1/testUMI.txt'

        CB
            'CR' or 'CB' to check that tag; any other value skips the check

        UMI
            'UR' or 'UB' to check that tag; any other value skips the check

    **Returns**
        None
    '''
    import shlex

    def _require_tag(tag, report_name, message, timeout=30):
        # Count how many of the first 100 records carry '<tag>:Z:'.
        report = '%s_scTEtmp/o1/%s' % (out, report_name)
        # Paths are quoted so names with spaces or shell metacharacters work.
        cmd = 'samtools view %s | head -100| grep "%s:Z:" | wc -l > %s' % (
            shlex.quote(filename), tag, shlex.quote(report))
        subprocess.run(cmd, shell=True)

        # Poll for the file instead of a fixed sleep.
        for _ in range(timeout):
            if os.path.exists(report):
                time.sleep(0.1)  # brief settle for file flush
                break
            time.sleep(1)
        else:
            logging.error("Timeout waiting for output: %s" % report)
            sys.exit(1)

        with open(report, 'r') as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                if int(line) < 100:
                    logging.error(message)
                    sys.exit(1)

    if CB in ('CR', 'CB'):
        _require_tag(
            CB, 'testCR.txt',
            "The input file %s has no cell barcodes information, plese make sure the aligner have add the cell barcode key, or set CB to False" % filename)

    if UMI in ('UR', 'UB'):
        _require_tag(
            UMI, 'testUMI.txt',
            "The input file %s has no %s:Z information, plese make sure the aligner have add the UMI key, or set UMI to False" % (filename, UMI))
108
+
109
def Bam2bed(filename, CB, UMI, out, num_threads):
    '''
    **Purpose**
        Convert an alignment file into '<out>_scTEtmp/o1/<out>.bed.gz' with a
        samtools | awk | sed | gzip pipeline.

        Each record becomes: reference name (leading 'chr' removed), position,
        position+100, barcode and, when a UMI tag is used, the UMI. With a UMI
        only the first record of every barcode+UMI pair is kept.

    **Arguments**
        filename
            alignment file handed to 'samtools view'

        CB
            'CR' or 'CB' to take the barcode from that tag, or 'False' to put
            the sample name (out) in the barcode column

        UMI
            'UR' or 'UB' to take the UMI from that tag (needs a CB tag), or
            'False' for no UMI column

        out
            run stub, also used as the output file name

        num_threads
            value for 'samtools view -@'

    **Returns**
        None. Exits with status 1 for a CB/UMI combination that has no
        pipeline, instead of silently producing no output file.
    '''
    import shlex

    cb_tag = CB if CB in ('CR', 'CB') else None
    umi_tag = UMI if UMI in ('UR', 'UB') else None
    if UMI == 'False':
        supported = CB == 'False' or cb_tag is not None
    else:
        supported = umi_tag is not None and cb_tag is not None
    if not supported:
        logging.error("Unsupported CB/UMI combination: CB=%s UMI=%s" % (CB, UMI))
        sys.exit(1)

    os.makedirs('%s_scTEtmp/o1' % out, exist_ok=True)

    # Mac OSX has BSD sed
    switch = '-E' if sys.platform == 'darwin' else '-r'
    quote = shlex.quote  # every user supplied value is quoted for the shell

    stages = ['samtools view -@ %s %s' % (quote(str(num_threads)), quote(filename))]
    if cb_tag is None:
        # Put the sample name in the barcode slot
        stages.append('awk -v bc=%s %s' % (
            quote(out), quote('{OFS="\t"}{print $3,$4,$4+100,bc}')))
    else:
        # Locate the optional fields holding the tags, then print them.
        find = '{for(i=12;i<=NF;i++)if($i~/%s:Z:/)n=i}' % cb_tag
        columns = '$3,$4,$4+100,$n'
        if umi_tag is not None:
            find += '{for(i=12;i<=NF;i++)if($i~/%s:Z:/)m=i}' % umi_tag
            columns += ',$m'
        stages.append('awk %s' % quote('{OFS="\t"}%s{print %s}' % (find, columns)))
        stages.append("sed %s 's/%s:Z://g'" % (switch, cb_tag))
        if umi_tag is not None:
            stages.append("sed %s 's/%s:Z://g'" % (switch, umi_tag))
    stages.append("sed %s 's/^chr//g'" % switch)
    if umi_tag is not None:
        # keep the first record of every barcode+UMI pair
        stages.append("awk '!x[$4$5]++'")
    stages.append('gzip -c > %s' % quote('%s_scTEtmp/o1/%s.bed.gz' % (out, out)))

    os.system(' | '.join(stages))
137
+
138
def Para_bam2bed(filename, CB, UMI, out, num_threads=1):
    '''
    **Purpose**
        Convert one alignment file into '<out>_scTEtmp/o0/<sample>.bed.gz'
        with a samtools | awk | sed | gzip pipeline, where <sample> is the
        file name of the input without directory and '.bam'.

        Each record becomes: reference name (leading 'chr' removed), position,
        position+100, barcode and, when a UMI tag is used, the UMI. With a UMI
        only the first record of every barcode+UMI pair is kept.

    **Arguments**
        filename
            alignment file handed to 'samtools view'

        CB
            'CR' or 'CB' to take the barcode from that tag, or 'False' to put
            the sample name in the barcode column

        UMI
            'UR' or 'UB' to take the UMI from that tag (needs a CB tag), or
            'False' for no UMI column

        out
            run stub

        num_threads (optional, default=1)
            value for 'samtools view -@'

    **Returns**
        None. Exits with status 1 for a CB/UMI combination that has no
        pipeline, instead of silently producing no output file.
    '''
    import shlex

    cb_tag = CB if CB in ('CR', 'CB') else None
    umi_tag = UMI if UMI in ('UR', 'UB') else None
    if UMI == 'False':
        supported = CB == 'False' or cb_tag is not None
    else:
        supported = umi_tag is not None and cb_tag is not None
    if not supported:
        logging.error("Unsupported CB/UMI combination: CB=%s UMI=%s" % (CB, UMI))
        sys.exit(1)

    os.makedirs('%s_scTEtmp/o0' % out, exist_ok=True)

    sample = filename.split('/')[-1].replace('.bam','')

    # Mac OSX has BSD sed
    switch = '-E' if sys.platform == 'darwin' else '-r'
    quote = shlex.quote  # every user supplied value is quoted for the shell

    stages = ['samtools view -@ %s %s' % (quote(str(num_threads)), quote(filename))]
    if cb_tag is None:
        # The sample name stands in for the barcode.
        stages.append('awk -v bc=%s %s' % (
            quote(sample), quote('{OFS="\t"}{print $3,$4,$4+100,bc}')))
    else:
        # Locate the optional fields holding the tags, then print them.
        find = '{for(i=12;i<=NF;i++)if($i~/%s:Z:/)n=i}' % cb_tag
        columns = '$3,$4,$4+100,$n'
        if umi_tag is not None:
            find += '{for(i=12;i<=NF;i++)if($i~/%s:Z:/)m=i}' % umi_tag
            columns += ',$m'
        stages.append('awk %s' % quote('{OFS="\t"}%s{print %s}' % (find, columns)))
        stages.append("sed %s 's/%s:Z://g'" % (switch, cb_tag))
        if umi_tag is not None:
            stages.append("sed %s 's/%s:Z://g'" % (switch, umi_tag))
    stages.append("sed %s 's/^chr//g'" % switch)
    if umi_tag is not None:
        # keep the first record of every barcode+UMI pair
        stages.append("awk '!x[$4$5]++'")
    stages.append('gzip > %s' % quote('%s_scTEtmp/o0/%s.bed.gz' % (out, sample)))

    os.system(' | '.join(stages))
166
+
167
def splitAllChrs(chromosome_list, filename, genenumber, countnumber, UMI=True):
    '''
    **Purpose**
        Split the data into separate beds, and count up all the times each barcode appears

        This variant uses more memory, but does it all at the same time and gets the filtered whitelist for free

    **Arguments**
        chromosome_list
            List of chromosome names (a 'chr' prefix is ignored)

        filename (Required)
            filename stub to use for tmp files; reads
            '<filename>_scTEtmp/o1/<filename>.bed.gz' and writes one
            '<filename>_scTEtmp/o2/<filename>.chr<chrom>.bed.gz' per chromosome

        genenumber (Required)
            Minimum number of genes expressed required for a cell to pass filtering

        countnumber (Required)
            Minimum number of counts required for a cell to pass filtering.
            When it is falsy, 2 * genenumber is used instead.

        UMI (optional, default=True)
            use the UMI: identical lines on the same chromosome are written
            and counted only once

    **Returns**
        The barcode whitelist
    '''
    from contextlib import ExitStack

    os.makedirs('%s_scTEtmp/o2' % filename, exist_ok=True)

    chromosome_list = set(c.replace('chr', '') for c in chromosome_list)

    CRs = defaultdict(int)
    uniques = {chrom: set() for chrom in chromosome_list} if UMI else None

    # ExitStack closes every handle even if a line fails to parse.
    with ExitStack() as stack:
        file_handle_in = stack.enter_context(
            gzip.open('%s_scTEtmp/o1/%s.bed.gz' % (filename, filename), 'rt'))
        file_handles_out = {
            chrom: stack.enter_context(
                gzip.open('%s_scTEtmp/o2/%s.chr%s.bed.gz' % (filename, filename, chrom), 'wt'))
            for chrom in chromosome_list
        }

        # Make a BED for each chromosome
        for line in file_handle_in:
            t = line.strip().split('\t')
            chrom = t[0].replace('chr', '')  # strip chr

            if chrom not in chromosome_list:  # remove the unusual chromosomes
                # Force chrMT -> chrM, but only when chrM was actually
                # requested; otherwise there is no output file for it.
                if chrom != 'MT' or 'M' not in chromosome_list:
                    continue
                chrom = 'M'

            if UMI:
                if line in uniques[chrom]:
                    continue
                uniques[chrom].add(line)

            CRs[t[3]] += 1
            file_handles_out[chrom].write(line)

    mincounts = countnumber if countnumber else 2 * genenumber

    return [barcode for barcode, count in CRs.items() if count >= mincounts]
240
+
241
def filterCRs(filename, genenumber, countnumber):
    '''
    **Purpose**
        Sum the per-chromosome barcode counts and keep the barcodes with
        enough reads.

    **Arguments**
        filename
            filename stub; every '<filename>_scTEtmp/o2/<filename>*.count.gz'
            file (barcode <TAB> count per line) is read

        genenumber
            used as 2 * genenumber for the threshold when countnumber is falsy

        countnumber
            minimum total count for a barcode to be kept

    **Returns**
        List of the barcodes that pass the threshold
    '''
    CRs = defaultdict(int)
    for f in sorted(glob.glob('%s_scTEtmp/o2/%s*.count.gz' % (filename, filename))):
        logging.info('Reading %s ' % os.path.split(f)[1])
        with gzip.open(f, 'rt') as o:
            for l in o:
                t = l.strip().split('\t')
                CRs[t[0]] += int(t[1])

    mincounts = countnumber if countnumber else 2 * genenumber

    logging.info('Before filter %s' % len(CRs))
    CRs = {k: v for k, v in CRs.items() if v >= mincounts}
    # This line reports the size after filtering; it used to repeat 'Before'.
    logging.info('After filter %s' % len(CRs))

    return list(CRs.keys())
261
+
262
def splitChr(chr, filename, CB, UMI):
    '''
    **Purpose**
        Extract the reads of one chromosome from
        '<filename>_scTEtmp/o1/<filename>.bed.gz' into
        '<filename>_scTEtmp/o2/<filename>.chr<chr>.bed.gz' and write the
        number of reads per barcode to the matching '.count.gz' file.

        The chromosome column is compared exactly, so 'I' no longer also
        collects 'II'/'III' and '1' no longer collects '1_random' style
        contigs. 'MT' reads are still assigned to 'M'.

    **Arguments**
        chr
            chromosome name (a 'chr' prefix is ignored)

        filename
            filename stub to use for tmp files

        CB, UMI
            kept for interface compatibility; the extraction is the same for
            every CB/UMI setting

    **Returns**
        None
    '''
    import shlex

    os.makedirs('%s_scTEtmp/o2' % filename, exist_ok=True)

    chr = chr.replace('chr','')

    source = '%s_scTEtmp/o1/%s.bed.gz' % (filename, filename)
    target = '%s_scTEtmp/o2/%s.chr%s.bed.gz' % (filename, filename, chr)

    # Exact match on column 1 instead of a 'grep ^name' prefix match.
    keep = '$1 == c || (c == "M" && $1 == "MT")'
    os.system('gunzip -c -f %s | awk -F %s -v c=%s %s | gzip -c > %s' % (
        shlex.quote(source), shlex.quote('\\t'), shlex.quote(chr),
        shlex.quote(keep), shlex.quote(target)))

    # Count the reads of every barcode on this chromosome.
    CRs = defaultdict(int)
    with gzip.open(target, 'rt') as o:
        for l in o:
            t = l.strip().split('\t')
            CRs[t[3]] += 1

    with gzip.open('%s_scTEtmp/o2/%s.chr%s.count.gz' % (filename, filename, chr), 'wt') as o:
        for k in CRs:
            o.write('%s\t%s\n' % (k, CRs[k]))
322
+
323
def align(chr, filename, all_annot, glannot, whitelist): #CB
    '''
    **Purpose**
        For each read, align it to the index and assign a TE, gene.

        This is the speed critical part.

    **Arguments**
        chr
            chromosome name without the 'chr' prefix

        filename
            filename stub; reads '<filename>_scTEtmp/o2/<filename>.chr<chr>.bed.gz'
            and writes '<filename>_scTEtmp/o3/<filename>.chr<chr>.bed.gz'

        all_annot
            path of the index, loaded with glload() only when glannot is falsy

        glannot
            already loaded index (its buckets and linearData are used), or a
            falsy value

        whitelist
            barcodes to keep; any iterable of barcodes

    **Returns**
        None. The output holds barcode <TAB> annotation <TAB> count lines,
        sorted by barcode and annotation.
    '''
    chr = 'chr' + chr

    os.makedirs('%s_scTEtmp/o3' % filename, exist_ok=True)

    if not glannot: # Load separately for the multicore pipeline, share the index for the single core pipeline
        glannot = glload(all_annot)

    # Only keep the glbase parts we need.
    buckets = glannot.buckets[chr.replace('chr', '')]
    all_annot = glannot.linearData

    # The whitelist is probed once per read: make the lookup O(1) even when a
    # list was passed in.
    whitelist = set(whitelist)

    res = {}
    with gzip.open('%s_scTEtmp/o2/%s.%s.bed.gz' % (filename, filename, chr), 'rt') as oh:
        for line in oh:
            t = line.strip().split('\t')
            barcode = t[3]
            if barcode not in whitelist:
                continue
            counts = res.get(barcode)
            if counts is None:
                counts = res[barcode] = defaultdict(int)

            # The chromosome column is not needed, each file holds one chromosome.
            left = int(t[1])
            rite = int(t[2])

            left_buck = ((left-1)//10000) * 10000
            right_buck = ((rite)//10000) * 10000

            # get the ids reqd.
            loc_ids = set()
            for buck in range(left_buck, right_buck+10000, 10000):
                if buck in buckets:
                    loc_ids.update(buckets[buck])

            for index in loc_ids:
                entry = all_annot[index]
                span = entry['loc'].loc
                if rite >= span['left'] and left <= span['right']:
                    counts[entry['annot']] += 1

    with gzip.open('%s_scTEtmp/o3/%s.%s.bed.gz' % (filename, filename, chr), 'wt') as oh:
        for bc in sorted(res):
            for gene in sorted(res[bc]):
                oh.write('%s\t%s\t%s\n' % (bc, gene, res[bc][gene]))
383
+
384
def Countexpression(filename, allelement, genenumber, cellnumber, hdf5):
    '''
    **Purpose**
        Build the final barcode x annotation count table from
        '<filename>_scTEtmp/o4/<filename>.bed.gz' (barcode <TAB> annotation
        <TAB> count per line) and save it.

        Barcodes are ranked by their number of lines in that file; at most
        cellnumber barcodes with at least genenumber lines are kept.

    **Arguments**
        filename
            filename stub, also the base name of the saved table

        allelement
            iterable of all annotation names; defines the columns (sorted)

        genenumber
            minimum number of lines a barcode needs to be kept

        cellnumber
            maximum number of barcodes to keep

        hdf5
            falsy: write '<filename>.csv'; truthy: write '<filename>.h5ad'

    **Returns**
        (number of barcodes kept, genenumber, filename)
    '''
    bed_path = '%s_scTEtmp/o4/%s.bed.gz' % (filename, filename)

    # Pass 1: how many lines does every barcode have?
    whitelist = {}
    with gzip.open(bed_path, 'rt') as o:
        for l in o:
            barcode = l.strip().split('\t')[0]
            whitelist[barcode] = whitelist.get(barcode, 0) + 1

    CRlist = set()
    sortcb = sorted(whitelist.items(), key=lambda item: item[1], reverse=True)
    for n, (barcode, entries) in enumerate(sortcb):
        if entries < genenumber or n >= cellnumber:
            break
        CRlist.add(barcode)

    # Pass 2: sum the counts of the kept barcodes.
    res = {}
    with gzip.open(bed_path, 'rt') as genes_oh:
        for l in genes_oh:
            t = l.strip().split('\t')
            if t[0] not in CRlist:
                continue
            counts = res.setdefault(t[0], {})
            counts[t[1]] = counts.get(t[1], 0) + int(t[2])

    # Save out the final file
    gene_seen = sorted(allelement)  # Do the sort once;

    #==== save results =====
    if not hdf5: # save as csv
        with open('%s.csv' % filename, 'w') as res_oh:
            res_oh.write('barcodes,')
            res_oh.write('%s\n' % (','.join([str(i) for i in gene_seen])))
            for k in sorted(res):
                counts = res[k]
                row = [str(counts.get(gene, 0)) for gene in gene_seen]
                res_oh.write('%s,%s\n' % (k, ','.join(row)))

    else: # save as hdf5
        CBs = sorted(res)
        column_of = {gene: idx for idx, gene in enumerate(gene_seen)}

        # Fill the sparse matrix from coordinates instead of first building a
        # dense cells x genes table of strings; this also works when no cell
        # passed the filter.
        rows, cols, vals = [], [], []
        for row, k in enumerate(CBs):
            for gene, count in res[k].items():
                col = column_of.get(gene)
                if col is None or count == 0:
                    # not a column of the table / would be an explicit zero
                    continue
                rows.append(row)
                cols.append(col)
                vals.append(count)

        matrix = scipy.sparse.csr_matrix(
            (np.asarray(vals, dtype=int),
             (np.asarray(rows, dtype=np.int64), np.asarray(cols, dtype=np.int64))),
            shape=(len(CBs), len(gene_seen)))

        obs = pd.DataFrame(index = CBs)
        var = pd.DataFrame(index = gene_seen)
        adata = ad.AnnData(matrix, var = var, obs = obs)
        adata.write('%s.h5ad'%filename)

    #========================

    return len(res), genenumber, filename
462
+
463
def timediff(timestart, timestop):
    '''
    **Purpose**
        Format the time between two datetimes as '<d>d <h>h <m>m <s>s'.

    **Arguments**
        timestart, timestop
            values whose difference has days, seconds and microseconds
            attributes (e.g. datetime.datetime)

    **Returns**
        The formatted string; fractions of a second are dropped.
    '''
    elapsed = timestop - timestart

    # Whole seconds within the last (partial) day.
    whole_seconds = int(elapsed.seconds + elapsed.microseconds / 1000000)
    hours, leftover = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)

    return "%dd %dh %dm %ds" % (elapsed.days, hours, minutes, seconds)