offtracker 2.7.10__zip → 2.10.0__zip

This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (33)
  1. {offtracker-2.7.10/offtracker.egg-info → offtracker-2.10.0}/PKG-INFO +62 -18
  2. {offtracker-2.7.10 → offtracker-2.10.0}/README.md +62 -18
  3. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/X_offplot.py +13 -2
  4. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/X_sequence.py +113 -7
  5. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/_version.py +8 -2
  6. offtracker-2.10.0/offtracker/snakefile/Snakefile_QC.smk +66 -0
  7. offtracker-2.10.0/offtracker/snakefile/Snakefile_offtracker.smk +249 -0
  8. offtracker-2.7.10/offtracker/mapping/1.1_bed2fr_v4.5.py → offtracker-2.10.0/offtracker/utility/1.1_bed2fr.py +6 -4
  9. {offtracker-2.7.10 → offtracker-2.10.0/offtracker.egg-info}/PKG-INFO +62 -18
  10. offtracker-2.10.0/offtracker.egg-info/SOURCES.txt +28 -0
  11. {offtracker-2.7.10 → offtracker-2.10.0}/scripts/offtracker_analysis.py +10 -3
  12. offtracker-2.10.0/scripts/offtracker_candidates.py +318 -0
  13. {offtracker-2.7.10 → offtracker-2.10.0}/scripts/offtracker_config.py +28 -44
  14. offtracker-2.10.0/scripts/offtracker_qc.py +62 -0
  15. {offtracker-2.7.10 → offtracker-2.10.0}/setup.py +5 -4
  16. offtracker-2.7.10/offtracker/mapping/Snakefile_offtracker +0 -245
  17. offtracker-2.7.10/offtracker.egg-info/SOURCES.txt +0 -26
  18. offtracker-2.7.10/scripts/offtracker_candidates.py +0 -307
  19. {offtracker-2.7.10 → offtracker-2.10.0}/LICENSE.txt +0 -0
  20. {offtracker-2.7.10 → offtracker-2.10.0}/MANIFEST.in +0 -0
  21. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/X_offtracker.py +0 -0
  22. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/__init__.py +0 -0
  23. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/1.3_bdg_normalize_v4.0.py +0 -0
  24. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/bedGraphToBigWig +0 -0
  25. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/hg38.chrom.sizes +0 -0
  26. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/mm10.chrom.sizes +0 -0
  27. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/offtracker_blacklist_hg38.merged.bed +0 -0
  28. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/offtracker_blacklist_mm10.merged.bed +0 -0
  29. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker.egg-info/dependency_links.txt +0 -0
  30. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker.egg-info/requires.txt +0 -0
  31. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker.egg-info/top_level.txt +0 -0
  32. {offtracker-2.7.10 → offtracker-2.10.0}/scripts/offtracker_plot.py +0 -0
  33. {offtracker-2.7.10 → offtracker-2.10.0}/setup.cfg +0 -0
offtracker-2.7.10/offtracker/mapping/Snakefile_offtracker (deleted)
@@ -1,245 +0,0 @@
- # 2023.08.11. adding an option for not normalizing the bw file
- # 2024.01.23. add --fixedStep to bigwigCompare for not merging neighbouring bins with equal values.
-
- configfile: "config.yaml"
-
- _threads = config["thread"]
- BinSize = str(config["binsize"])
- normalize = config["normalize"]
- output_dir = config["output_dir"]
- nametype = config["nametype"]
- suffix = config["suffix"]
- name1 = nametype.replace('2','1') + '.' + suffix
- name2 = nametype + '.' + suffix
-
- import os
-
- if normalize == "True":
-     rule all:
-         input:
-             expand( os.path.join(output_dir,"{sample}.fw.bed"), sample=config["sample"] ),
-             expand( os.path.join(output_dir,"{sample}.rv.bed"), sample=config["sample"] ),
-             expand( os.path.join(output_dir,"{sample}.fw.scaled.bw"), sample=config["sample"] ),
-             expand( os.path.join(output_dir,"{sample}.rv.scaled.bw"), sample=config["sample"] ),
-             expand( os.path.join(output_dir,"{sample}." + BinSize + ".add.bdg"),sample=config["sample"] ),
- elif normalize == "False":
-     rule all:
-         input:
-             expand( os.path.join(output_dir,"{sample}.fw.bed"), sample=config["sample"] ),
-             expand( os.path.join(output_dir,"{sample}.rv.bed"), sample=config["sample"] ),
-             expand( os.path.join(output_dir,"{sample}.fw.raw.bw"), sample=config["sample"] ),
-             expand( os.path.join(output_dir,"{sample}.rv.raw.bw"), sample=config["sample"] ),
- else:
-     raise ValueError('Please provide "True" or "False" for "--normalize" when running offtracker_config.py')
-
-
- rule chromap:
-     input:
-         R1= lambda w: config["sample"][w.sample] + name1,
-         R2= lambda w: config["sample"][w.sample] + name2
-     threads:
-         _threads
-     params:
-         index=config["index"],
-         fasta=config["fasta"]
-     output:
-         temp(os.path.join(output_dir,"{sample}.chromapx.bed"))
-     shell:
-         """
-         chromap -l 3000 --low-mem --BED --remove-pcr-duplicates \
-         --min-read-length 10 --allocate-multi-mappings \
-         -x {params.index} -r {params.fasta} -t {threads} -1 {input.R1} -2 {input.R2} -o {output}
-         """
-
- if config["blacklist"] != 'none':
-     rule remove_blacklist:
-         input:
-             os.path.join(output_dir,"{sample}.chromapx.bed")
-         threads:
-             _threads
-         params:
-             blacklist=config["blacklist"]
-         output:
-             temp(os.path.join(output_dir,"{sample}.filtered.bed"))
-         shell:
-             "bedtools intersect -a {input} -b {params.blacklist} -v > {output}"
-
-     rule bed2fr:
-         input:
-             os.path.join(output_dir,"{sample}.filtered.bed")
-         threads:
-             _threads
-         params:
-             dir_script=config["script_folder"]
-         output:
-             fw=os.path.join(output_dir,"{sample}.fw.bed"),
-             rv=os.path.join(output_dir,"{sample}.rv.bed")
-         shell:
-             "python {params.dir_script}/1.1_bed2fr_v4.5.py -b {input}"
- else:
-     rule bed2fr:
-         input:
-             os.path.join(output_dir,"{sample}.chromapx.bed")
-         threads:
-             _threads
-         params:
-             dir_script=config["script_folder"]
-         output:
-             fw=os.path.join(output_dir,"{sample}.fw.bed"),
-             rv=os.path.join(output_dir,"{sample}.rv.bed")
-         shell:
-             "python {params.dir_script}/1.1_bed2fr_v4.5.py -b {input}"
-
- rule bed2bdg_fw:
-     input:
-         os.path.join(output_dir,"{sample}.fw.bed")
-     threads:
-         _threads
-     params:
-         gl=config["genomelen"]
-     output:
-         temp(os.path.join(output_dir,"{sample}.fw.bdg"))
-     shell:
-         "bedtools genomecov -bg -i {input} -g {params.gl} > {output}"
-
- rule bed2bdg_rv:
-     input:
-         os.path.join(output_dir,"{sample}.rv.bed")
-     threads:
-         _threads
-     params:
-         gl=config["genomelen"]
-     output:
-         temp(os.path.join(output_dir,"{sample}.rv.bdg"))
-     shell:
-         "bedtools genomecov -bg -i {input} -g {params.gl} > {output}"
-
- rule bdg_sort_fw:
-     input:
-         fw=os.path.join(output_dir,"{sample}.fw.bdg")
-     threads:
-         _threads
-     output:
-         temp(os.path.join(output_dir,"{sample}.fw.sorted.bdg"))
-     shell:
-         "bedtools sort -i {input.fw} > {output}"
-
- rule bdg_sort_rv:
-     input:
-         rv=os.path.join(output_dir,"{sample}.rv.bdg")
-     threads:
-         _threads
-     output:
-         temp(os.path.join(output_dir,"{sample}.rv.sorted.bdg"))
-     shell:
-         "bedtools sort -i {input.rv} > {output}"
-
- if normalize == "True":
-     rule bdg_normalize_fw:
-         input:
-             bdg=os.path.join(output_dir,"{sample}.fw.sorted.bdg"),
-             bed=os.path.join(output_dir,"{sample}.fw.bed")
-         threads:
-             _threads
-         params:
-             dir_script=config["script_folder"]
-         output:
-             temp(os.path.join(output_dir,"{sample}.fw.scaled.bdg"))
-         shell:
-             "python {params.dir_script}/1.3_bdg_normalize_v4.0.py --bdg {input.bdg} --bed {input.bed}"
-
-     rule bdg_normalize_rv:
-         input:
-             bdg=os.path.join(output_dir,"{sample}.rv.sorted.bdg"),
-             bed=os.path.join(output_dir,"{sample}.rv.bed")
-         threads:
-             _threads
-         params:
-             dir_script=config["script_folder"]
-         output:
-             temp(os.path.join(output_dir,"{sample}.rv.scaled.bdg"))
-         shell:
-             "python {params.dir_script}/1.3_bdg_normalize_v4.0.py --bdg {input.bdg} --bed {input.bed}"
-
-     rule bdg2bw_fw:
-         input:
-             os.path.join(output_dir,"{sample}.fw.scaled.bdg")
-         threads:
-             _threads
-         params:
-             gl=config["genomelen"],
-             dir_script=config["script_folder"]
-         output:
-             os.path.join(output_dir,"{sample}.fw.scaled.bw")
-         shell:
-             "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"
-
-     rule bdg2bw_rv:
-         input:
-             os.path.join(output_dir,"{sample}.rv.scaled.bdg")
-         threads:
-             _threads
-         params:
-             gl=config["genomelen"],
-             dir_script=config["script_folder"]
-         output:
-             os.path.join(output_dir,"{sample}.rv.scaled.bw")
-         shell:
-             "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"
-
-     rule bwAdd:
-         input:
-             fw=os.path.join(output_dir,"{sample}.fw.scaled.bw"),
-             rv=os.path.join(output_dir,"{sample}.rv.scaled.bw")
-         threads:
-             _threads
-         output:
-             os.path.join(output_dir,"{sample}." + BinSize + ".add.bdg")
-         shell:
-             """
-             bigwigCompare --binSize {BinSize} -p {threads} --verbose -o {output} \
-             --outFileFormat bedgraph --fixedStep \
-             --bigwig1 {input.fw} \
-             --bigwig2 {input.rv} \
-             --operation add
-             """
- else:
-     rule bdg_reverse_rv:
-         input:
-             os.path.join(output_dir,"{sample}.rv.sorted.bdg")
-         threads:
-             _threads
-         output:
-             temp(os.path.join(output_dir,"{sample}.rv.sorted_r.bdg"))
-         shell:
-             "awk -F '\t' -v OFS='\t' '{{$4=-$4; print}}' {input} > {output}"
-
-     rule bdg2bw_fw:
-         input:
-             os.path.join(output_dir,"{sample}.fw.sorted.bdg")
-         threads:
-             _threads
-         params:
-             gl=config["genomelen"],
-             dir_script=config["script_folder"]
-         output:
-             os.path.join(output_dir,"{sample}.fw.raw.bw")
-         shell:
-             "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"
-
-     rule bdg2bw_rv:
-         input:
-             os.path.join(output_dir,"{sample}.rv.sorted_r.bdg")
-         threads:
-             _threads
-         params:
-             gl=config["genomelen"],
-             dir_script=config["script_folder"]
-         output:
-             os.path.join(output_dir,"{sample}.rv.raw.bw")
-         shell:
-             "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"
-
-
-
-
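For context, the deleted Snakefile_offtracker above reads all of its settings from the config.yaml written by offtracker_config.py. Below is a minimal sketch of that configuration, shown as the equivalent Python mapping; the key names are taken from the rules above, and every value is illustrative only.

    # Keys consumed by the deleted Snakefile_offtracker; all values below are hypothetical examples.
    config = {
        "thread": 4,                                # _threads, passed to every rule
        "binsize": 100,                             # bin size for bigwigCompare in rule bwAdd
        "normalize": "True",                        # the workflow expects the literal string "True" or "False"
        "output_dir": "./offtracker_output",
        "nametype": "_R2",                          # combined with suffix to build the R1/R2 file names
        "suffix": "fastq.gz",
        "sample": {"sample1": "/path/to/sample1"},  # sample name -> file prefix used by rule chromap
        "index": "/path/to/chromap_index",
        "fasta": "/path/to/genome.fa",
        "blacklist": "none",                        # or a BED file of regions to exclude
        "script_folder": "/path/to/offtracker/mapping",
        "genomelen": "/path/to/hg38.chrom.sizes",
    }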
offtracker-2.7.10/offtracker.egg-info/SOURCES.txt (deleted)
@@ -1,26 +0,0 @@
- LICENSE.txt
- MANIFEST.in
- README.md
- setup.py
- offtracker/X_offplot.py
- offtracker/X_offtracker.py
- offtracker/X_sequence.py
- offtracker/__init__.py
- offtracker/_version.py
- offtracker.egg-info/PKG-INFO
- offtracker.egg-info/SOURCES.txt
- offtracker.egg-info/dependency_links.txt
- offtracker.egg-info/requires.txt
- offtracker.egg-info/top_level.txt
- offtracker/mapping/1.1_bed2fr_v4.5.py
- offtracker/mapping/1.3_bdg_normalize_v4.0.py
- offtracker/mapping/Snakefile_offtracker
- offtracker/mapping/bedGraphToBigWig
- offtracker/mapping/hg38.chrom.sizes
- offtracker/mapping/mm10.chrom.sizes
- offtracker/mapping/offtracker_blacklist_hg38.merged.bed
- offtracker/mapping/offtracker_blacklist_mm10.merged.bed
- scripts/offtracker_analysis.py
- scripts/offtracker_candidates.py
- scripts/offtracker_config.py
- scripts/offtracker_plot.py
offtracker-2.7.10/scripts/offtracker_candidates.py (deleted)
@@ -1,307 +0,0 @@
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
-
- # 2023.10.27. v2.0: centers on the target_location midpoint, so the pct calculation is dropped
- # 2023.12.06. v2.1: adds cleavage_site inference, fixes the deletion offset, and centers on the cleavage_site
- import os,sys,re,time
- from itertools import product
-
- if sys.version_info < (3,0):
-     import platform
-     raise Exception(f'python3 is needed, while running {platform.python_version()} now')
-
- import offtracker
- import offtracker.X_sequence as xseq
- script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
- script_folder= os.path.join(script_dir, 'mapping')
-
- import argparse
- import pandas as pd
- import pybedtools
- import multiprocessing as mp
- from Bio.Blast.Applications import NcbiblastnCommandline
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.description='Generate candidate regions by sgRNA sequence'
-     parser.add_argument('--sgrna' , type=str, required=True, help='sgRNA sequence without PAM' )
-     parser.add_argument('--pam' , type=str, required=True, help='The protospacer adjacent motif' )
-     parser.add_argument('--name' , type=str, required=True, help='custom name of the sgRNA' )
-     parser.add_argument('-r','--ref' , type=str, required=True, help='The fasta file of reference genome')
-     parser.add_argument('-b','--blastdb', type=str, required=True, help='blast database')
-     parser.add_argument('-o','--outdir' , type=str, required=True, help='The output folder')
-     parser.add_argument('-g','--genome' , type=str, default='hg38', help='File of chromosome sizes, or "hg38", "mm10" ')
-     parser.add_argument('-t','--thread' , type=int, default=4, help='Number of threads for parallel computing')
-     parser.add_argument('--quick_mode' , action='store_true', help='BLAST faster but less candidates.')
-
-     args = parser.parse_args()
-
-
-     if (args.genome == 'hg38') or (args.genome == 'mm10'):
-         dir_chrom_sizes = os.path.join(script_folder, f'{args.genome}.chrom.sizes')
-     else:
-         dir_chrom_sizes = args.genome
-
-     sgRNA_name = args.name
-     sgRNA_seq = args.sgrna
-     PAM = args.pam
-     n_threads = args.thread
-     dir_output = args.outdir
-     if not os.path.exists(dir_output):
-         os.makedirs(dir_output)
-     dir_ref_fa = args.ref
-     blast_db = args.blastdb
-     quick_mode = args.quick_mode
-
-     # parameters for alignment
-     half_width = 100
-     pct_params = 1.0
-     frag_len= half_width*2
-     dir_df_candidate = os.path.join(dir_output, f'df_candidate_{sgRNA_name}.csv')
-
-
-     sgRNA_seq = sgRNA_seq.upper()
-     PAM = PAM.upper()
-     dir_sgRNA_fasta = os.path.join(dir_output, f'{sgRNA_name}_PAM.fasta')
-     dir_sgRNA_blast = os.path.join(dir_output, f'{sgRNA_name}_PAM.blast')
-     dir_sgRNA_bed = os.path.join(dir_output, f'{sgRNA_name}_PAM.bed')
-
-
-     possible_sgRNA_PAM = list(product([sgRNA_seq],xseq.possible_seq(PAM)))
-     possible_sgRNA_PAM = [''.join(combination) for combination in possible_sgRNA_PAM]
-     n_seq = len(possible_sgRNA_PAM)
-
-     ID = pd.Series(['seq']*n_seq) + pd.Series(range(1,n_seq+1)).astype(str)
-     df_sgRNA_PAM = pd.DataFrame({'ID':ID,'sequence':possible_sgRNA_PAM})
-     xseq.write_fasta(df_sgRNA_PAM, dir_sgRNA_fasta)
-
-
-
-     #########
-     # BLAST #
-     #########
-     if os.path.isfile(dir_sgRNA_blast):
-         print(f'{dir_sgRNA_blast} exists, skipped.')
-     else:
-         if quick_mode:
-             print('Using quick mode for BLAST')
-             blastx_cline = NcbiblastnCommandline(query=dir_sgRNA_fasta, task='blastn-short',out=dir_sgRNA_blast,
-                                                  db=blast_db, evalue=10000,outfmt=6, num_threads=n_threads,
-                                                  gapopen=4, gapextend=2, reward=2, word_size=5, dust='no', soft_masking=False)
-         else:
-             blastx_cline = NcbiblastnCommandline(query=dir_sgRNA_fasta, task='blastn-short',out=dir_sgRNA_blast,
-                                                  db=blast_db, evalue=10000,outfmt=6, num_threads=n_threads,
-                                                  gapopen=4, gapextend=2, reward=2, word_size=4, dust='no', soft_masking=False)
-         print(f'BLAST for candidate off-target sites of {sgRNA_name}.')
-         blastx_cline()
-         print(f'BLAST finished.')
-
-     ##############
-     # Output bed #
-     ##############
-
-     blast_regions = pd.read_csv(dir_sgRNA_blast, sep='\t',header=None)
-     blast_regions.columns = ['query acc.','chr','% identity','alignment length','mismatches','gap opens','q. start','q. end','st','ed','evalue','bit score']
-     blast_regions = blast_regions[blast_regions.evalue<10000]
-
-     # reverse strand
-     blast_regions['reverse'] = (blast_regions['st']>blast_regions['ed']).astype(int)
-     blast_regions_f = blast_regions[blast_regions.reverse==0].copy()
-     blast_regions_r = blast_regions[blast_regions.reverse==1].copy()
-     temp = blast_regions_r['st'].copy()
-     blast_regions_r['st'] = blast_regions_r['ed']
-     blast_regions_r['ed'] = temp
-     blast_regions = pd.concat([blast_regions_f, blast_regions_r])
-     # sort and add location
-     blast_regions = blast_regions.sort_values('evalue').reset_index(drop=True)
-     blast_regions['location']=blast_regions['chr'].str[:] + ':' + blast_regions['st'].astype(str).str[:] + '-' + blast_regions['ed'].astype(str).str[:]
-     blast_regions = blast_regions.drop_duplicates(subset='location').copy()
-
-     # filter by alignment length
-     len_sgRNA=len(sgRNA_seq)
-     min_len = len_sgRNA-8
-     blast_regions = blast_regions[blast_regions['alignment length']>=min_len].copy().reset_index(drop=True)
-     blast_regions = blast_regions.reindex(columns = ['chr', 'st', 'ed' , 'query acc.', '% identity', 'alignment length', 'mismatches',
-                                                      'gap opens', 'q. start', 'q. end', 'evalue', 'bit score', 'reverse', 'location'] )
-
-     # write the bed file used for the downstream alignment score calculation
-     blast_regions_bed = blast_regions[['chr','st','ed']]
-     xseq.write_bed(blast_regions_bed, dir_sgRNA_bed)
-     # sort the bed file without merging intervals
-     a = pybedtools.BedTool(dir_sgRNA_bed)
-     a.sort(g=dir_chrom_sizes).saveas( dir_sgRNA_bed )
-     print(f'Output {sgRNA_name}_PAM.bed')
-
-
-     ###################
-     # alignment score #
-     ###################
-     if os.path.isfile(dir_df_candidate):
-         print(f'{dir_df_candidate} exists, skipped.')
-     else:
-         #########
-         # load the blast bed
-         #########
-         common_chr = pd.Series(['chr']*23).str[:] + pd.Series(range(23)).astype(str).str[:]
-         common_chr = pd.concat([common_chr, pd.Series(['chrX','chrY'])]).to_numpy()
-
-         bed_short = xseq.X_readbed(dir_sgRNA_bed)
-         bed_short = bed_short[bed_short['chr'].isin(common_chr)].copy()
-         bed_short['midpoint'] = ((bed_short['st'] + bed_short['ed'])/2).astype(int)
-         bed_short['st'] = bed_short['midpoint'] - half_width
-         bed_short['ed'] = bed_short['midpoint'] + half_width
-         bed_short.loc[bed_short['st']<0,'st']=0
-         bed_short = bed_short.drop_duplicates()
-
-         #########
-         # extract the genomic sequence within half_width on either side of each bed_f site
-         #########
-
-         temp_bed = os.path.join(dir_output, 'temp.bed')
-         xseq.write_bed(bed_short.iloc[:,:3], temp_bed)
-         a = pybedtools.BedTool(temp_bed)
-         fasta = pybedtools.example_filename(dir_ref_fa)
-         a = a.sequence(fi=fasta)
-         with open(a.seqfn) as f:
-             fasta = {}
-             for line in f:
-                 line = line.strip() # strip the trailing newline
-                 if line[0] == '>':
-                     header = line[1:]
-                 else:
-                     sequence = line
-                     fasta[header] = fasta.get(header,'') + sequence
-
-         # when pybedtools reports a location as chrA:X-Y, X is shifted 1 bp to the left
-
-         #########
-         # local alignment
-         #########
-         DNA_matrix = {('A','A'): 2, ('A','T'):0.01, ('A','C'):0.01, ('A','G'):0.01, ('A','N'):0.01,
-                       ('T','T'): 2, ('T','A'):0.01, ('T','C'):0.01, ('T','G'):0.01, ('T','N'):0.01,
-                       ('G','G'): 2, ('G','A'):0.01, ('G','C'):0.01, ('G','T'):0.01, ('G','N'):0.01,
-                       ('C','C'): 2, ('C','A'):0.01, ('C','G'):0.01, ('C','T'):0.01, ('C','N'):0.01,
-                       ('N','N'): 2, ('N','C'):2, ('N','A'): 2, ('N','G'): 2, ('N','T'): 2}
-         mismatch_score = 0.01
-         # append the PAM
-         sgRNA_PAM_fw = sgRNA_seq + PAM
-         sgRNA_PAM_rv = xseq.reverse_complement(sgRNA_PAM_fw)
-
-         list_args_fw=[]
-         list_args_rv=[]
-         for a_key in fasta.keys():
-             seq = re.sub('[^ATCG]','N',fasta[a_key])
-             list_args_fw.append( [a_key, sgRNA_PAM_fw, seq, frag_len, DNA_matrix, mismatch_score] )
-             list_args_rv.append( [a_key, sgRNA_PAM_rv, seq, frag_len, DNA_matrix, mismatch_score] )
-         st = time.time()
-         with mp.Pool(n_threads) as p:
-             list_align_forward = p.starmap(xseq.sgRNA_alignment, list_args_fw)
-         ed = time.time()
-         print('align_forward:{:.2f}'.format(ed-st))
-         st = time.time()
-         with mp.Pool(n_threads) as p:
-             list_align_reverse = p.starmap(xseq.sgRNA_alignment, list_args_rv)
-         ed = time.time()
-         print('align_reverse:{:.2f}'.format(ed-st))
-         #
-         df_align_forward = pd.DataFrame(list_align_forward, columns= ['fw_score','fw_pct','fw_target','fw_location','fw_deletion','fw_insertion','fw_mismatch'])
-         df_align_reverse = pd.DataFrame(list_align_reverse, columns= ['rv_score','rv_pct','rv_target','rv_location','rv_deletion','rv_insertion','rv_mismatch'])
-         df_align_reverse['rv_target'] = df_align_reverse['rv_target'].apply(xseq.reverse_complement)
-         df_candidate = pd.concat([df_align_forward,df_align_reverse],axis=1)
-         df_candidate['location'] = fasta.keys()
-         df_candidate['alignment_score'] = df_candidate[['fw_score','rv_score']].max(axis=1)
-         #df_candidate['fw_score_2'] = df_candidate['fw_score']*(pct_params-df_candidate['fw_pct'].abs())
-         #df_candidate['rv_score_2'] = df_candidate['rv_score']*(pct_params-df_candidate['rv_pct'].abs())
-         #df_candidate['best_seq_score'] = df_candidate[['fw_score_2', 'rv_score_2']].max(axis=1)
-         #df_candidate['best_strand'] = df_candidate[['fw_score_2', 'rv_score_2']].idxmax(axis='columns').replace({'fw_score_2':'+', 'rv_score_2':'-'})
-         #df_candidate.loc[df_candidate['fw_score_2']==df_candidate['rv_score_2'],'best_strand']='equal_score'
-         df_candidate['best_seq_score'] = df_candidate[['fw_score', 'rv_score']].max(axis=1)
-         df_candidate['best_strand'] = df_candidate[['fw_score', 'rv_score']].idxmax(axis='columns').replace({'fw_score':'+', 'rv_score':'-'})
-         df_candidate.loc[df_candidate['fw_score']==df_candidate['rv_score'],'best_strand']='equal_score'
-
-         # GG check
-         # 2023.12.05 added cleavage_site inference
-         list_best_target = []
-         list_best_location = []
-         list_cleavage_site = []
-         list_delete = []
-         list_insert = []
-         list_mismat = []
-         list_GG = []
-         for a_row in df_candidate.iterrows():
-             if a_row[1]['best_strand']=='+':
-                 list_best_target.append(a_row[1]['fw_target'])
-                 list_best_location.append(a_row[1]['fw_location'])
-                 list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
-                 list_delete.append(a_row[1]['fw_deletion'])
-                 list_insert.append(a_row[1]['fw_insertion'])
-                 list_mismat.append(a_row[1]['fw_mismatch'])
-                 if a_row[1]['fw_target'][-2:]=='GG':
-                     list_GG.append('OK')
-                 else:
-                     list_GG.append('NO')
-             elif a_row[1]['best_strand']=='-':
-                 list_best_target.append(a_row[1]['rv_target'])
-                 list_best_location.append(a_row[1]['rv_location'])
-                 list_cleavage_site.append(int(a_row[1]['rv_location'].split('-')[0].split(':')[1]) + 5)
-                 list_delete.append(a_row[1]['rv_deletion'])
-                 list_insert.append(a_row[1]['rv_insertion'])
-                 list_mismat.append(a_row[1]['rv_mismatch'])
-                 if a_row[1]['rv_target'][-2:]=='GG':
-                     list_GG.append('OK')
-                 else:
-                     list_GG.append('NO')
-             else:
-                 if a_row[1]['fw_target'][-2:]=='GG':
-                     list_best_target.append(a_row[1]['fw_target'])
-                     list_best_location.append(a_row[1]['fw_location'])
-                     list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
-                     list_delete.append(a_row[1]['fw_deletion'])
-                     list_insert.append(a_row[1]['fw_insertion'])
-                     list_mismat.append(a_row[1]['fw_mismatch'])
-                     list_GG.append('OK_same_score')
-                 # if no GG is found, check the reverse complement
-                 elif a_row[1]['rv_target'][-2:]=='GG':
-                     list_best_target.append(a_row[1]['rv_target'])
-                     list_best_location.append(a_row[1]['rv_location'])
-                     list_cleavage_site.append(int(a_row[1]['rv_location'].split('-')[0].split(':')[1]) + 5)
-                     list_delete.append(a_row[1]['rv_deletion'])
-                     list_insert.append(a_row[1]['rv_insertion'])
-                     list_mismat.append(a_row[1]['rv_mismatch'])
-                     list_GG.append('OK_same_score')
-                 else:
-                     list_best_target.append(a_row[1]['fw_target'])
-                     list_best_location.append(a_row[1]['fw_location'])
-                     list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
-                     list_delete.append(a_row[1]['fw_deletion'])
-                     list_insert.append(a_row[1]['fw_insertion'])
-                     list_mismat.append(a_row[1]['fw_mismatch'])
-                     list_GG.append('NO_same_score')
-         # record into df_candidate
-         df_candidate['deletion'] = list_delete
-         df_candidate['insertion'] = list_insert
-         df_candidate['mismatch'] = list_mismat
-         df_candidate['GG'] = list_GG
-         df_candidate['best_target'] = list_best_target
-         df_candidate['target_location'] = list_best_location
-         df_candidate['cleavage_site'] = list_cleavage_site
-
-         # 2.0 format update
-         df_candidate = df_candidate.drop_duplicates(subset=['target_location']).reset_index(drop=True)
-         df_candidate = pd.concat([xseq.bedfmt(df_candidate['target_location']), df_candidate],axis=1)
-         # df_candidate['midpoint'] = ((df_candidate['ed'] + df_candidate['st'])/2).astype(int)
-         df_candidate = xseq.add_ID(df_candidate, midpoint='cleavage_site')
-
-         df_candidate.to_csv(dir_df_candidate)
-         print(f'Output df_candidate_{sgRNA_name}.csv')
-         os.remove(temp_bed)
-
-     return 'Done!'
-
-
- if __name__ == '__main__' :
-     result = main()
-     print(result)
-
-
-
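The cleavage_site values recorded above follow a fixed offset from the aligned location string: end - 6 on the '+' strand and start + 5 on the '-' strand, presumably placing the cut 3 bp upstream of the NGG PAM. A compact restatement of that rule (a hypothetical helper for illustration, not a function the package exports):

    def infer_cleavage_site(location, strand):
        """Mirror of the deleted script's rule; location is a 'chr:start-end' string."""
        start, end = location.split(':')[1].split('-')
        if strand == '+':
            return int(end) - 6   # fw_location end minus 6
        return int(start) + 5     # rv_location start plus 5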