offtracker 2.7.8__zip → 2.10.0__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. offtracker-2.10.0/PKG-INFO +233 -0
  2. offtracker-2.10.0/README.md +221 -0
  3. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker/X_offplot.py +37 -8
  4. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker/X_sequence.py +113 -7
  5. offtracker-2.10.0/offtracker/_version.py +36 -0
  6. offtracker-2.10.0/offtracker/snakefile/Snakefile_QC.smk +66 -0
  7. offtracker-2.10.0/offtracker/snakefile/Snakefile_offtracker.smk +249 -0
  8. offtracker-2.7.8/offtracker/mapping/1.1_bed2fr_v4.5.py → offtracker-2.10.0/offtracker/utility/1.1_bed2fr.py +6 -4
  9. offtracker-2.10.0/offtracker.egg-info/PKG-INFO +233 -0
  10. offtracker-2.10.0/offtracker.egg-info/SOURCES.txt +28 -0
  11. {offtracker-2.7.8 → offtracker-2.10.0}/scripts/offtracker_analysis.py +20 -5
  12. offtracker-2.10.0/scripts/offtracker_candidates.py +318 -0
  13. {offtracker-2.7.8 → offtracker-2.10.0}/scripts/offtracker_config.py +28 -44
  14. offtracker-2.10.0/scripts/offtracker_plot.py +39 -0
  15. offtracker-2.10.0/scripts/offtracker_qc.py +62 -0
  16. {offtracker-2.7.8 → offtracker-2.10.0}/setup.py +8 -4
  17. offtracker-2.7.8/PKG-INFO +0 -146
  18. offtracker-2.7.8/README.md +0 -134
  19. offtracker-2.7.8/offtracker/_version.py +0 -28
  20. offtracker-2.7.8/offtracker/mapping/Snakefile_offtracker +0 -245
  21. offtracker-2.7.8/offtracker.egg-info/PKG-INFO +0 -146
  22. offtracker-2.7.8/offtracker.egg-info/SOURCES.txt +0 -25
  23. offtracker-2.7.8/scripts/offtracker_candidates.py +0 -307
  24. {offtracker-2.7.8 → offtracker-2.10.0}/LICENSE.txt +0 -0
  25. {offtracker-2.7.8 → offtracker-2.10.0}/MANIFEST.in +0 -0
  26. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker/X_offtracker.py +0 -0
  27. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker/__init__.py +0 -0
  28. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/1.3_bdg_normalize_v4.0.py +0 -0
  29. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/bedGraphToBigWig +0 -0
  30. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/hg38.chrom.sizes +0 -0
  31. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/mm10.chrom.sizes +0 -0
  32. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/offtracker_blacklist_hg38.merged.bed +0 -0
  33. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/offtracker_blacklist_mm10.merged.bed +0 -0
  34. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker.egg-info/dependency_links.txt +0 -0
  35. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker.egg-info/requires.txt +0 -0
  36. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker.egg-info/top_level.txt +0 -0
  37. {offtracker-2.7.8 → offtracker-2.10.0}/setup.cfg +0 -0
@@ -0,0 +1,233 @@
1
+ Metadata-Version: 2.1
2
+ Name: offtracker
3
+ Version: 2.10.0
4
+ Summary: Tracking-seq data analysis
5
+ Home-page: https://github.com/Lan-lab/offtracker
6
+ Author: Runda Xu
7
+ Author-email: xrd18@tsinghua.org.cn
8
+ Requires-Python: >=3.6.0
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE.txt
11
+
12
+
13
+ # OFF-TRACKER
14
+
15
+ OFF-TRACKER is an end-to-end pipeline for Tracking-seq data analysis that detects off-target sites of any genome-editing tool generating double-strand breaks (DSBs) or single-strand breaks (SSBs).
16
+
17
+ ## System requirements
18
+
19
+ * Linux/Unix
20
+ * Python >= 3.6
21
+
22
+ ## Dependency
23
+
24
+ ```bash
25
+ # We recommend creating a new environment using mamba/conda to avoid compatibility problems
26
+ # If you don't use mamba, just replace mamba with conda in the command below
27
+ # Windows systems may not be compatible with pybedtools.
28
+ mamba create -n offtracker -c bioconda blast snakemake pybedtools chromap
29
+ ```
30
+
31
+
32
+ ## Installation
33
+
34
+ ```bash
35
+ # Activate the environment
36
+ conda activate offtracker
37
+
38
+ # Direct installation with pip
39
+ pip install offtracker
40
+
41
+ # (Alternative) Download the offtracker from github
42
+ git clone https://github.com/Lan-lab/offtracker.git
43
+ cd offtracker
44
+ pip install .
45
+ ```
46
+
47
+
48
+ ## Before analyzing samples
49
+
50
+ ```bash
51
+ # Build the blast index (only needed once per genome)
52
+ makeblastdb -input_type fasta -title hg38 -dbtype nucl -parse_seqids \
53
+ -in /Your_Path_To_Reference/hg38_genome.fa \
54
+ -out /Your_Path_To_Reference/hg38_genome.blastdb \
55
+ -logfile /Your_Path_To_Reference/hg38_genome.blastdb.log
56
+
57
+ # Build the chromap index (only needed once per genome)
58
+ chromap -i -r /Your_Path_To_Reference/hg38_genome.fa \
59
+ -o /Your_Path_To_Reference/hg38_genome.chromap.index
60
+
61
+ # Generate candidate regions from the sgRNA sequence (needed once for each genome and sgRNA)
62
+ # --name: a user-defined name of the sgRNA, which will be used in the following analysis.
63
+ offtracker_candidates.py -t 8 -g hg38 \
64
+ -r /Your_Path_To_Reference/hg38_genome.fa \
65
+ -b /Your_Path_To_Reference/hg38_genome.blastdb \
66
+ --name 'VEGFA2' --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG' \
67
+ -o /Your_Path_To_Candidates_Folder
68
+
69
+ ```
70
+
71
+
72
+ ## Quality control and adapter trimming
73
+
74
+ ```bash
75
+ # Generate snakemake config file for quality control and adapter trimming.
76
+ offtracker_qc.py -t 4 \
77
+ -f /Your_Path_To_Input_Folder \
78
+ --subfolder 0
79
+
80
+ cd /Your_Path_To_Input_Folder/Trimmed_data
81
+ snakemake -np # dry run to check whether everything is alright
82
+ nohup snakemake --cores 16 1>${outdir}/sm_qc.log 2>&1 &
83
+
84
+ """
85
+ Set “--subfolder 0” if the file structure is like:
86
+ | - Input_Folder
87
+ | - sample1_R1.fastq.gz
88
+ | - sample1_R2.fastq.gz
89
+ | - sample2_R1.fastq.gz
90
+ | - sample2_R2.fastq.gz
91
+ Set “--subfolder 1” if the file structure is like:
92
+ | - Input_Folder
93
+ | - Sample1_Folder
94
+ | - sample1_R1.fastq.gz
95
+ | - sample1_R2.fastq.gz
96
+ | - Sample2_Folder
97
+ | - sample2_R1.fastq.gz
98
+ | - sample2_R2.fastq.gz
99
+
100
+ The script “offtracker_qc.py” will create a “Trimmed_data” folder under /Your_Path_To_Input_Folder.
101
+ If “-o /Your_Path_To_Output” is set, the output will be redirected to /Your_Path_To_Output.
102
+ """
103
+ ```
104
+
105
+ ## Strand-specific mapping of Tracking-seq data
106
+
107
+ ```bash
108
+
109
+ # Generate snakemake config file for mapping
110
+ # Results will be generated in /Your_Path_To_Output. If -o is not set, the output will be placed in the same folder as the fastq files
111
+ offtracker_config.py -t 8 -g hg38 --blacklist hg38 \
112
+ -r /Your_Path_To_Reference/hg38_genome.fa \
113
+ -i /Your_Path_To_Reference/hg38_genome.chromap.index \
114
+ -f /Your_Path_To_Trimmed_Data \
115
+ -o /Your_Path_To_Output \
116
+ --subfolder 0
117
+
118
+ # Warning: Do not include "fastq" or "fq" in the folder name; otherwise the program may treat the folder as a fastq file
119
+ # This problem may be fixed in the future
120
+
121
+ # Run the snakemake program
122
+ cd /Your_Path_To_Fastq
123
+ snakemake -np # dry run
124
+ nohup snakemake --cores 16 1>sm_mapping.log 2>sm_mapping.err &
125
+
126
+ ## about cores
127
+ # --cores of snakemake must be larger than -t of offtracker_config.py
128
+ # parallel number = cores/t
129
+
130
+ ## about output
131
+ # This part will generate "*.fw.scaled.bw" and "*.rv.scaled.bw" for IGV visualization
132
+ # "*.fw.bed" and "*.rv.bed" are used in the next part.
133
+ ```
134
+
135
+
136
+ ## Analyzing the genome-wide off-target sites
137
+
138
+ ```bash
139
+ # In this part, multiple samples from the same condition can be analyzed in a single run by pattern matching of sample names
140
+
141
+ offtracker_analysis.py -g hg38 --name "VEGFA2" \
142
+ --exp 'Cas9_VEGFA2' \
143
+ --control 'WT' \
144
+ --outname 'Cas9_VEGFA_293' \
145
+ -f /Your_Path_To_Output \
146
+ --seqfolder /Your_Path_To_Candidates
147
+
148
+ # --name: the same sgRNA name you set when running offtracker_candidates.py
149
+ # --exp/--control: one or more sample-name patterns, given as regular expressions
150
+ # If multiple samples match the pattern, their signals are averaged. Thus, only samples from the same condition should be included in a single analysis.
151
+
152
+ # This step will generate Offtracker_result_{outname}.csv
153
+ # The default FDR is 0.05 and can be changed with --fdr; empirically this puts the Track score threshold around 2 (see the sketch after this code block).
154
+ # Sites with Track score >= 2, an empirical threshold, are output regardless of FDR.
155
+ # Intermediate files are saved in the ./temp folder, which can be deleted.
156
+ # Keeping the intermediate files makes the analysis faster when it involves previously analyzed samples (e.g., reusing the same control samples across analyses)
157
+ ```
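For context, offtracker_analysis.py fits a normal distribution to the log2 track scores, converts each score to a p-value with scipy's `norm.sf`, and turns the p-values into FDR values with `offtracker.fdr` (see the script diff further below). The following is a minimal, generic Benjamini–Hochberg sketch of such an FDR calculation; the package's own `offtracker.fdr` may differ in detail:

```python
import numpy as np

def bh_fdr(pvals):
    """Generic Benjamini-Hochberg adjusted p-values (illustrative sketch only)."""
    pvals = np.asarray(pvals, dtype=float)
    n = len(pvals)
    order = np.argsort(pvals)                               # ascending p-values
    adjusted = pvals[order] * n / np.arange(1, n + 1)       # raw BH adjustment
    adjusted = np.minimum.accumulate(adjusted[::-1])[::-1]  # enforce monotonicity
    fdr = np.empty(n)
    fdr[order] = np.clip(adjusted, 0, 1)                    # back to original order
    return fdr
```

Sites passing either filter (fdr <= --fdr or Track score >= --score) end up in the final table, matching the thresholds described above.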
158
+
159
+ ## Off-target sequences visualization
160
+
161
+ ```bash
162
+ # After getting Offtracker_result_{outname}.csv, you can visualize the off-target sites together with their genomic sequences using the following command:
163
+
164
+ offtracker_plot.py --result Your_Offtracker_Result_CSV \
165
+ --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG'
166
+
167
+ # The default output is a pdf file named Offtracker_result_{outname}.pdf
168
+ # Specifying an output file with a different suffix changes the format; e.g., "--output Offtracker_plot.png" will generate a png file.
169
+ # The orange dashed line indicates the empirical threshold of Track score = 2
170
+ # Empirically, the off-target sites with Track score < 2 are less likely to be real off-target sites.
171
+ ```
172
+
173
+
174
+ ## Note 1: when not using hg38 or mm10
175
+
176
+ The default setting only includes chr1-chr22, chrX, chrY, and chrM (only suitable for human and mouse). \
177
+ If you are using a reference genome whose chromosome names do not start with "chr", or want to analyze all chromosomes or other species, you can set "--ignore_chr" when running offtracker_config.py to skip the chromosome filter.
178
+
179
+ Currently, this software is only ready-to-use for mm10 and hg38. For any other genome, e.g., hg19, please add a genome size file named "hg19.chrom.sizes" to ./offtracker/utility. In addition, add "--blacklist none" or "--blacklist Your_Blacklist" (e.g., the ENCODE blacklist) when running offtracker_config.py, because blacklists are only bundled for mm10 and hg38.
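A chrom.sizes file is a plain, two-column, tab-separated list of chromosome names and lengths. As a minimal sketch, assuming the new genome's fasta has already been indexed with `samtools faidx` (file names below are placeholders), it can be derived from the .fai index, whose first two columns are exactly name and length:

```python
# Minimal sketch: build hg19.chrom.sizes from a samtools faidx index (.fai).
# .fai columns: name, length, offset, linebases, linewidth.
with open('hg19_genome.fa.fai') as fai, open('hg19.chrom.sizes', 'w') as out:
    for line in fai:
        name, length = line.split('\t')[:2]
        out.write(f'{name}\t{length}\n')
```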
180
+
181
+ ## Note 2
182
+
183
+ The FDRs in the Tracking-seq result do not reflect the real off-target probability.
184
+ It is strongly recommended to inspect the "fw.scaled.bw" and "rv.scaled.bw" files in a genome browser such as IGV to visually check each target location in the Tracking-seq result.
185
+
186
+
187
+
188
+ # Example Data
189
+
190
+ Here is example data containing chr6 reads from HEK293T cells edited with Cas9 + sgRNA VEGFA2 and from wild-type cells:
191
+
192
+ https://figshare.com/articles/dataset/WT_HEK239T_chr6/25956034
193
+
194
+ It takes about 5-10 minutes to run the mapping (offtracker_config.py & snakemake) on the example data with -t 8 and --cores 16 (2 parallel tasks).
195
+
196
+ ## Signal visualization
197
+
198
+ After mapping, there will be 4 .bw files in the output folder:
199
+ ```bash
200
+ Cas9_VEGFA2_chr6.fw.scaled.bw
201
+
202
+ Cas9_VEGFA2_chr6.rv.scaled.bw
203
+
204
+ WT_chr6.fw.scaled.bw
205
+
206
+ WT_chr6.rv.scaled.bw
207
+ ```
208
+ These files can be visualized in a genome browser such as IGV:
209
+
210
+ ![signal](https://github.com/Lan-lab/offtracker/blob/main/example_output/signals_example.png?raw=true)
211
+
212
+ The signal (coverage) for each sample is scaled by 1e7/total_reads. Because only reads mapping to chr6 were extracted for the example data, the signal range is much higher than it would be for whole-genome samples.
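As a worked example of this scaling (numbers are illustrative, not taken from the example dataset):

```python
# Illustrative only: a library of 20 million reads gets a scale factor of 1e7 / 2e7 = 0.5,
# so a raw coverage value of 37 becomes 18.5 in the *.scaled.bw track.
total_reads = 20_000_000
scale_factor = 1e7 / total_reads      # 0.5
scaled_coverage = 37 * scale_factor   # 18.5
```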
213
+
214
+ ## Whole genome off-target analysis
215
+
216
+ Analyzing the signals (offtracker_analysis.py) takes about 3-5 minutes and outputs a file named "Offtracker_result_{outname}.csv".
217
+
218
+ After that, you can visualize the off-target sites with their genomic sequence (offtracker_plot.py) and get an image like this:
219
+
220
+ ![offtarget](https://github.com/Lan-lab/offtracker/blob/main/example_output/sequences_example.png?raw=true)
221
+
222
+ # Citation
223
+
224
+ If you use Tracking-seq or OFF-TRACKER in your research, please cite the following paper:
225
+
226
+ Zhu, M., Xu, R., Yuan, J., Wang, J. et al. Tracking-seq reveals the heterogeneity of off-target effects in CRISPR–Cas9-mediated genome editing. Nat Biotechnol (2024). https://doi.org/10.1038/s41587-024-02307-y
227
+
228
+ The .bw signal visualization shown here was generated with the Integrative Genomics Viewer (IGV). The signal visualizations in the Tracking-seq article above were generated with either IGV or pyGenomeTracks:
229
+
230
+ Robinson, J., Thorvaldsdóttir, H., Winckler, W. et al. Integrative genomics viewer. Nat Biotechnol 29, 24–26 (2011). https://doi.org/10.1038/nbt.1754
231
+
232
+ Lopez-Delisle L, Rabbani L, Wolff J, Bhardwaj V, Backofen R, Grüning B, Ramírez F, Manke T. pyGenomeTracks: reproducible plots for multivariate genomic data sets. Bioinformatics. 2020 Aug 3:btaa692. doi: 10.1093/bioinformatics/btaa692.
233
+
@@ -0,0 +1,28 @@
1
+ LICENSE.txt
2
+ MANIFEST.in
3
+ README.md
4
+ setup.py
5
+ offtracker/X_offplot.py
6
+ offtracker/X_offtracker.py
7
+ offtracker/X_sequence.py
8
+ offtracker/__init__.py
9
+ offtracker/_version.py
10
+ offtracker.egg-info/PKG-INFO
11
+ offtracker.egg-info/SOURCES.txt
12
+ offtracker.egg-info/dependency_links.txt
13
+ offtracker.egg-info/requires.txt
14
+ offtracker.egg-info/top_level.txt
15
+ offtracker/snakefile/Snakefile_QC.smk
16
+ offtracker/snakefile/Snakefile_offtracker.smk
17
+ offtracker/utility/1.1_bed2fr.py
18
+ offtracker/utility/1.3_bdg_normalize_v4.0.py
19
+ offtracker/utility/bedGraphToBigWig
20
+ offtracker/utility/hg38.chrom.sizes
21
+ offtracker/utility/mm10.chrom.sizes
22
+ offtracker/utility/offtracker_blacklist_hg38.merged.bed
23
+ offtracker/utility/offtracker_blacklist_mm10.merged.bed
24
+ scripts/offtracker_analysis.py
25
+ scripts/offtracker_candidates.py
26
+ scripts/offtracker_config.py
27
+ scripts/offtracker_plot.py
28
+ scripts/offtracker_qc.py
@@ -26,6 +26,8 @@ def main():
26
26
  parser.add_argument('--name' , type=str, required=True, help='custom name of the sgRNA' )
27
27
  parser.add_argument('--exp' , type=str, default='all', nargs='+', help='A substring mark in the name of experimental samples. The default is to use all samples other than control' )
28
28
  parser.add_argument('--control' , type=str, default='none', nargs='+', help='A substring mark in the name of control samples. The default is no control. "others" for all samples other than --exp.' )
29
+ parser.add_argument('--fdr' , type=float, default=0.05, help='FDR threshold for the final result. Default is 0.05.')
30
+ parser.add_argument('--score' , type=float, default=2, help='Track score threshold for the final result. Default is 2.')
29
31
  parser.add_argument('--smooth' , type=int, default=1, help='Smooth strength for the signal.')
30
32
  parser.add_argument('--window' , type=int, default=3, help='Window size for smoothing the signal.')
31
33
  parser.add_argument('--binsize' , type=int, default=100, help='Bin size of the signal.')
@@ -41,6 +43,7 @@ def main():
41
43
  parser.add_argument('--overwrite' , action='store_true', help='Whether to overwrite existed dataframes.' )
42
44
  parser.add_argument('--clean' , action='store_true', help='Whether to remove temp files')
43
45
 
46
+
44
47
  args = parser.parse_args()
45
48
 
46
49
  print(f'Running offtracker version: {offtracker.__version__}')
@@ -49,6 +52,8 @@ def main():
49
52
  sgRNA_name = args.name
50
53
  pattern_exp = args.exp
51
54
  pattern_ctr = args.control
55
+ fdr_thresh = args.fdr
56
+ score_thresh = args.score
52
57
  binsize = args.binsize
53
58
  flank_max = args.flank_max
54
59
  flank_regions = args.flank_regions
@@ -93,6 +98,8 @@ def main():
93
98
  all_sample_files.extend( bdg_files )
94
99
  all_sample_files = pd.Series(all_sample_files)
95
100
  all_sample_names = pd.Series(all_sample_names)
101
+ print('all sample names in the folders:')
102
+ print(all_sample_names)
96
103
  print('your string pattern for experimental groups: ', pattern_exp)
97
104
  ctr_samples = []
98
105
  if pattern_ctr == 'none':
@@ -155,8 +162,11 @@ def main():
155
162
  df_bdg.columns = ['chr','start','end','residual']
156
163
  # Group df_bdg by chromosome
157
164
  sample_groups = df_bdg.groupby('chr')
165
+ # 2024.06.03. fix a bug where df_bdg has fewer chromosomes than df_candidate
166
+ total_chr = df_bdg['chr'].unique()
167
+ df_candidate_sub_temp = df_candidate_sub[df_candidate_sub['chr'].isin(total_chr)]
158
168
  # Group df_candidate_sub by chromosome
159
- candidate_groups = df_candidate_sub.groupby('chr')
169
+ candidate_groups = df_candidate_sub_temp.groupby('chr')
160
170
 
161
171
  # Define an empty list to store the data for each chromosome
162
172
  chrom_list = []
@@ -234,7 +244,8 @@ def main():
234
244
  df_score = pd.concat([df_score, df_exp, df_ctr], axis=1)
235
245
  else:
236
246
  df_score = pd.concat([df_score, df_exp], axis=1)
237
- df_score = df_score.copy()
247
+ # 2024.06.03. With the example data, only chr6 is present and all other chromosomes are NaN; keeping them breaks the downstream calculation
248
+ df_score = df_score.dropna().copy()
238
249
  df_score.to_csv(output)
239
250
 
240
251
  ##########################
@@ -335,12 +346,16 @@ def main():
335
346
  print('mean_score:{:.3f};std:{:.3f}'.format(mu,std))
336
347
  # pv and fdr
337
348
  df_result['pv'] = df_result[f'log2_track_score'].apply( lambda x: norm.sf(x,loc=mu,scale=std) )
338
- df_result['pv'].clip(lower=1e-320,inplace=True)
349
+ df_result['pv'] = df_result['pv'].clip(lower=1e-320)
339
350
  df_result['fdr'] = offtracker.fdr(df_result['pv'])
340
351
  df_result['rank'] = range(1,len(df_result)+1)
341
352
  df_result.to_csv(output)
342
-
343
- df_output = df_result[df_result['fdr']<=0.05].copy()
353
+ # 2024.06.03. In case fdr<=fdr_thresh filters out sites with track_score>=2
354
+ bool_fdr = df_result['fdr']<=fdr_thresh
355
+ bool_score = df_result['track_score']>=score_thresh
356
+ # 2025.06.05. Base editing (BE) may produce one-sided signals with a negative track_score; keep those sites as well
357
+ bool_neg_score = df_result['track_score']<0
358
+ df_output = df_result[bool_fdr|bool_score|bool_neg_score].copy()
344
359
  if pattern_ctr != 'none':
345
360
  df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
346
361
  'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
@@ -0,0 +1,318 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # 2023.10.27. v2.0: centers on the target_location midpoint, so the pct calculation is removed
5
+ # 2023.12.06. v2.1: adds cleavage_site inference, fixes the deletion offset, and centers on the cleavage_site
6
+ # 2025.04.25. Fix the letter-case issue
7
+ # 2025.06.11. Reorder the code that skips already existing candidates
8
+
9
+ import os,sys,re,time
10
+ from itertools import product, permutations
11
+
12
+ if sys.version_info < (3,0):
13
+ import platform
14
+ raise Exception(f'python3 is needed, while running {platform.python_version()} now')
15
+
16
+ import offtracker
17
+ import offtracker.X_sequence as xseq
18
+ script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
19
+ script_folder = os.path.join(script_dir, 'utility')  # chrom.sizes and blacklist files live under utility/ (renamed from mapping/)
20
+
21
+ import argparse
22
+ import pandas as pd
23
+ import pybedtools
24
+ import multiprocessing as mp
25
+ from Bio.Blast.Applications import NcbiblastnCommandline
26
+
27
+ def main():
28
+ parser = argparse.ArgumentParser()
29
+ parser.description='Generate candidate regions by sgRNA sequence'
30
+ parser.add_argument('--sgrna' , type=str, required=True, help='One sgRNA sequence without PAM' )
31
+ parser.add_argument('--pam' , type=str, required=True, help='The protospacer adjacent motif' )
32
+ parser.add_argument('--pam_location', type=str, default='downstream', help='Upstream or downstream, default is downstream (Cas9)' )
33
+ parser.add_argument('--name' , type=str, required=True, help='custom name of the sgRNA' )
34
+ parser.add_argument('-r','--ref' , type=str, required=True, help='The fasta file of reference genome')
35
+ parser.add_argument('-b','--blastdb', type=str, required=True, help='blast database')
36
+ parser.add_argument('-o','--outdir' , type=str, required=True, help='The output folder')
37
+ parser.add_argument('-g','--genome' , type=str, default='hg38', help='File of chromosome sizes, or "hg38", "mm10" ')
38
+ parser.add_argument('-t','--thread' , type=int, default=4, help='Number of threads for parallel computing')
39
+ parser.add_argument('--quick_mode' , action='store_true', help='BLAST faster but less candidates.')
40
+
41
+ args = parser.parse_args()
42
+
43
+
44
+ if (args.genome == 'hg38') or (args.genome == 'mm10'):
45
+ dir_chrom_sizes = os.path.join(script_folder, f'{args.genome}.chrom.sizes')
46
+ else:
47
+ dir_chrom_sizes = args.genome
48
+
49
+ sgRNA_name = args.name
50
+ sgRNA_seq = args.sgrna
51
+ PAM = args.pam
52
+ PAM_loc = args.pam_location.lower()
53
+ n_threads = args.thread
54
+ dir_output = args.outdir
55
+ if not os.path.exists(dir_output):
56
+ os.makedirs(dir_output)
57
+ dir_ref_fa = args.ref
58
+ blast_db = args.blastdb
59
+ quick_mode = args.quick_mode
60
+
61
+ # parameters for alignment
62
+ half_width = 100
63
+ pct_params = 1.0
64
+ frag_len= half_width*2
65
+ dir_df_candidate = os.path.join(dir_output, f'df_candidate_{sgRNA_name}.csv')
66
+ if os.path.isfile(dir_df_candidate):
67
+ print(f'{dir_df_candidate} exists, skipped.')
68
+ return 'skipped'
69
+
70
+ sgRNA_seq = sgRNA_seq.upper()
71
+ PAM = PAM.upper()
72
+ dir_sgRNA_fasta = os.path.join(dir_output, f'{sgRNA_name}_PAM.fasta')
73
+ dir_sgRNA_blast = os.path.join(dir_output, f'{sgRNA_name}_PAM.blast')
74
+ dir_sgRNA_bed = os.path.join(dir_output, f'{sgRNA_name}_PAM.bed')
75
+
76
+ if PAM_loc == 'downstream':
77
+ possible_sgRNA_PAM = list(product([sgRNA_seq],xseq.possible_seq(PAM)))
78
+ elif PAM_loc == 'upstream':
79
+ possible_sgRNA_PAM = list(product(xseq.possible_seq(PAM),[sgRNA_seq]))
80
+ else:
81
+ raise Exception(f'PAM_location should be "upstream" or "downstream", while {PAM_loc} is given.')
82
+ possible_sgRNA_PAM = [''.join(combination) for combination in possible_sgRNA_PAM]
83
+ n_seq = len(possible_sgRNA_PAM)
84
+
85
+ ID = pd.Series(['seq']*n_seq) + pd.Series(range(1,n_seq+1)).astype(str)
86
+ df_sgRNA_PAM = pd.DataFrame({'ID':ID,'sequence':possible_sgRNA_PAM})
87
+ xseq.write_fasta(df_sgRNA_PAM, dir_sgRNA_fasta)
88
+
89
+ #########
90
+ # BLAST #
91
+ #########
92
+ if os.path.isfile(dir_sgRNA_blast):
93
+ print(f'{dir_sgRNA_blast} exists, skipped.')
94
+ else:
95
+ if quick_mode:
96
+ print('Using quick mode for BLAST')
97
+ blastx_cline = NcbiblastnCommandline(query=dir_sgRNA_fasta, task='blastn-short',out=dir_sgRNA_blast,
98
+ db=blast_db, evalue=10000,outfmt=6, num_threads=n_threads,
99
+ gapopen=4, gapextend=2, reward=2, word_size=5, dust='no', soft_masking=False)
100
+ else:
101
+ blastx_cline = NcbiblastnCommandline(query=dir_sgRNA_fasta, task='blastn-short',out=dir_sgRNA_blast,
102
+ db=blast_db, evalue=10000,outfmt=6, num_threads=n_threads,
103
+ gapopen=4, gapextend=2, reward=2, word_size=4, dust='no', soft_masking=False)
104
+ print(f'BLAST for candidate off-target sites of {sgRNA_name}.')
105
+ blastx_cline()
106
+ print(f'BLAST finished.')
107
+
108
+ ##############
109
+ # Output bed #
110
+ ##############
111
+
112
+ blast_regions = pd.read_csv(dir_sgRNA_blast, sep='\t',header=None)
113
+ blast_regions.columns = ['query acc.','chr','% identity','alignment length','mismatches','gap opens','q. start','q. end','st','ed','evalue','bit score']
114
+ blast_regions = blast_regions[blast_regions.evalue<10000]
115
+
116
+ # reverse strand
117
+ blast_regions['reverse'] = (blast_regions['st']>blast_regions['ed']).astype(int)
118
+ blast_regions_f = blast_regions[blast_regions.reverse==0].copy()
119
+ blast_regions_r = blast_regions[blast_regions.reverse==1].copy()
120
+ temp = blast_regions_r['st'].copy()
121
+ blast_regions_r['st'] = blast_regions_r['ed']
122
+ blast_regions_r['ed'] = temp
123
+ blast_regions = pd.concat([blast_regions_f, blast_regions_r])
124
+ # sort and add location
125
+ blast_regions = blast_regions.sort_values('evalue').reset_index(drop=True)
126
+ blast_regions['location']=blast_regions['chr'].str[:] + ':' + blast_regions['st'].astype(str).str[:] + '-' + blast_regions['ed'].astype(str).str[:]
127
+ blast_regions = blast_regions.drop_duplicates(subset='location').copy()
128
+
129
+ # Filter by alignment length
130
+ len_sgRNA=len(sgRNA_seq)
131
+ min_len = len_sgRNA-8
132
+ blast_regions = blast_regions[blast_regions['alignment length']>=min_len].copy().reset_index(drop=True)
133
+ blast_regions = blast_regions.reindex(columns = ['chr', 'st', 'ed' , 'query acc.', '% identity', 'alignment length', 'mismatches',
134
+ 'gap opens', 'q. start', 'q. end', 'evalue', 'bit score', 'reverse', 'location'] )
135
+
136
+ # Write the bed file for the downstream alignment score calculation
137
+ blast_regions_bed = blast_regions[['chr','st','ed']]
138
+ xseq.write_bed(blast_regions_bed, dir_sgRNA_bed)
139
+ # Sort the bed file without merging intervals
140
+ a = pybedtools.BedTool(dir_sgRNA_bed)
141
+ a.sort(g=dir_chrom_sizes).saveas( dir_sgRNA_bed )
142
+ print(f'Output {sgRNA_name}_PAM.bed')
143
+
144
+
145
+ ###################
146
+ # alignment score #
147
+ ###################
148
+
149
+ #########
150
+ # Read the blast bed
151
+ #########
152
+ common_chr = pd.Series(['chr']*22).str[:] + pd.Series(range(1,23)).astype(str).str[:]
153
+ common_chr = pd.concat([common_chr, pd.Series(['chrX','chrY'])]).to_numpy()
154
+
155
+ bed_short = xseq.X_readbed(dir_sgRNA_bed)
156
+ bed_short = bed_short[bed_short['chr'].isin(common_chr)].copy()
157
+ bed_short['midpoint'] = ((bed_short['st'] + bed_short['ed'])/2).astype(int)
158
+ bed_short['st'] = bed_short['midpoint'] - half_width
159
+ bed_short['ed'] = bed_short['midpoint'] + half_width
160
+ bed_short.loc[bed_short['st']<0,'st']=0
161
+ bed_short = bed_short.drop_duplicates()
162
+
163
+ #########
164
+ # Extract the genomic sequence half_width on each side of each bed_f site's midpoint
165
+ #########
166
+
167
+ temp_bed = os.path.join(dir_output, 'temp.bed')
168
+ xseq.write_bed(bed_short.iloc[:,:3], temp_bed)
169
+ a = pybedtools.BedTool(temp_bed)
170
+ fasta = pybedtools.example_filename(dir_ref_fa)
171
+ a = a.sequence(fi=fasta)
172
+ with open(a.seqfn) as f:
173
+ fasta = {}
174
+ for line in f:
175
+ line = line.strip() # strip the trailing newline
176
+ if line[0] == '>':
177
+ header = line[1:]
178
+ else:
179
+ sequence = line
180
+ fasta[header] = fasta.get(header,'') + sequence
181
+
182
+ # When pybedtools reports a location as chrA:X-Y, X is 1 bp to the left (0-based start coordinate)
183
+
184
+ #########
185
+ # local alignment
186
+ #########
187
+ # Build the DNA_matrix
188
+ mismatch_score = 0.01
189
+ base_codes = list(xseq.ambiguous_nt.keys())
190
+ all_base_pairs = list(permutations(base_codes,2)) + [(x,x) for x in base_codes]
191
+ DNA_matrix = {x : xseq.get_base_score(*x, mismatch_score=mismatch_score) for x in all_base_pairs}
192
+ # Append the PAM
193
+ if PAM_loc == 'downstream':
194
+ sgRNA_PAM_fw = sgRNA_seq + PAM
195
+ else:
196
+ sgRNA_PAM_fw = PAM + sgRNA_seq
197
+
198
+ sgRNA_PAM_rv = xseq.reverse_complement(sgRNA_PAM_fw)
199
+
200
+ list_args_fw=[]
201
+ list_args_rv=[]
202
+ for a_key, a_seq in fasta.items():
203
+ # 2025.04.25 Fix the letter-case issue
204
+ a_seq = re.sub('[^ATCG]','N',a_seq.upper())
205
+ list_args_fw.append( [a_key, sgRNA_PAM_fw, a_seq, frag_len, DNA_matrix, mismatch_score] )
206
+ list_args_rv.append( [a_key, sgRNA_PAM_rv, a_seq, frag_len, DNA_matrix, mismatch_score] )
207
+ st = time.time()
208
+ with mp.Pool(n_threads) as p:
209
+ list_align_forward = p.starmap(xseq.sgRNA_alignment, list_args_fw)
210
+ ed = time.time()
211
+ print('align_forward:{:.2f}'.format(ed-st))
212
+ st = time.time()
213
+ with mp.Pool(n_threads) as p:
214
+ list_align_reverse = p.starmap(xseq.sgRNA_alignment, list_args_rv)
215
+ ed = time.time()
216
+ print('align_reverse:{:.2f}'.format(ed-st))
217
+ #
218
+ df_align_forward = pd.DataFrame(list_align_forward, columns= ['fw_score','fw_pct','fw_target','fw_location','fw_deletion','fw_insertion','fw_mismatch'])
219
+ df_align_reverse = pd.DataFrame(list_align_reverse, columns= ['rv_score','rv_pct','rv_target','rv_location','rv_deletion','rv_insertion','rv_mismatch'])
220
+ df_align_reverse['rv_target'] = df_align_reverse['rv_target'].apply(xseq.reverse_complement)
221
+ df_candidate = pd.concat([df_align_forward,df_align_reverse],axis=1)
222
+ df_candidate['location'] = fasta.keys()
223
+ df_candidate['alignment_score'] = df_candidate[['fw_score','rv_score']].max(axis=1)
224
+ #df_candidate['fw_score_2'] = df_candidate['fw_score']*(pct_params-df_candidate['fw_pct'].abs())
225
+ #df_candidate['rv_score_2'] = df_candidate['rv_score']*(pct_params-df_candidate['rv_pct'].abs())
226
+ #df_candidate['best_seq_score'] = df_candidate[['fw_score_2', 'rv_score_2']].max(axis=1)
227
+ #df_candidate['best_strand'] = df_candidate[['fw_score_2', 'rv_score_2']].idxmax(axis='columns').replace({'fw_score_2':'+', 'rv_score_2':'-'})
228
+ #df_candidate.loc[df_candidate['fw_score_2']==df_candidate['rv_score_2'],'best_strand']='equal_score'
229
+ df_candidate['best_seq_score'] = df_candidate[['fw_score', 'rv_score']].max(axis=1)
230
+ df_candidate['best_strand'] = df_candidate[['fw_score', 'rv_score']].idxmax(axis='columns').replace({'fw_score':'+', 'rv_score':'-'})
231
+ df_candidate.loc[df_candidate['fw_score']==df_candidate['rv_score'],'best_strand']='equal_score'
232
+
233
+ # GG check
234
+ # 2023.12.05 Add cleavage_site inference
235
+ list_best_target = []
236
+ list_best_location = []
237
+ list_cleavage_site = []
238
+ list_delete = []
239
+ list_insert = []
240
+ list_mismat = []
241
+ list_GG = []
242
+ for a_row in df_candidate.iterrows():
243
+ if a_row[1]['best_strand']=='+':
244
+ list_best_target.append(a_row[1]['fw_target'])
245
+ list_best_location.append(a_row[1]['fw_location'])
246
+ list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
247
+ list_delete.append(a_row[1]['fw_deletion'])
248
+ list_insert.append(a_row[1]['fw_insertion'])
249
+ list_mismat.append(a_row[1]['fw_mismatch'])
250
+ if a_row[1]['fw_target'][-2:]=='GG':
251
+ list_GG.append('OK')
252
+ else:
253
+ list_GG.append('NO')
254
+ elif a_row[1]['best_strand']=='-':
255
+ list_best_target.append(a_row[1]['rv_target'])
256
+ list_best_location.append(a_row[1]['rv_location'])
257
+ list_cleavage_site.append(int(a_row[1]['rv_location'].split('-')[0].split(':')[1]) + 5)
258
+ list_delete.append(a_row[1]['rv_deletion'])
259
+ list_insert.append(a_row[1]['rv_insertion'])
260
+ list_mismat.append(a_row[1]['rv_mismatch'])
261
+ if a_row[1]['rv_target'][-2:]=='GG':
262
+ list_GG.append('OK')
263
+ else:
264
+ list_GG.append('NO')
265
+ else:
266
+ if a_row[1]['fw_target'][-2:]=='GG':
267
+ list_best_target.append(a_row[1]['fw_target'])
268
+ list_best_location.append(a_row[1]['fw_location'])
269
+ list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
270
+ list_delete.append(a_row[1]['fw_deletion'])
271
+ list_insert.append(a_row[1]['fw_insertion'])
272
+ list_mismat.append(a_row[1]['fw_mismatch'])
273
+ list_GG.append('OK_same_score')
274
+ # If no GG is found, check the reverse complement
275
+ elif a_row[1]['rv_target'][-2:]=='GG':
276
+ list_best_target.append(a_row[1]['rv_target'])
277
+ list_best_location.append(a_row[1]['rv_location'])
278
+ list_cleavage_site.append(int(a_row[1]['rv_location'].split('-')[0].split(':')[1]) + 5)
279
+ list_delete.append(a_row[1]['rv_deletion'])
280
+ list_insert.append(a_row[1]['rv_insertion'])
281
+ list_mismat.append(a_row[1]['rv_mismatch'])
282
+ list_GG.append('OK_same_score')
283
+ else:
284
+ list_best_target.append(a_row[1]['fw_target'])
285
+ list_best_location.append(a_row[1]['fw_location'])
286
+ list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
287
+ list_delete.append(a_row[1]['fw_deletion'])
288
+ list_insert.append(a_row[1]['fw_insertion'])
289
+ list_mismat.append(a_row[1]['fw_mismatch'])
290
+ list_GG.append('NO_same_score')
291
+ # Record the results in df_candidate
292
+ df_candidate['deletion'] = list_delete
293
+ df_candidate['insertion'] = list_insert
294
+ df_candidate['mismatch'] = list_mismat
295
+ df_candidate['GG'] = list_GG
296
+ df_candidate['best_target'] = list_best_target
297
+ df_candidate['target_location'] = list_best_location
298
+ df_candidate['cleavage_site'] = list_cleavage_site
299
+
300
+ # 2.0 update the output format
301
+ df_candidate = df_candidate.drop_duplicates(subset=['target_location']).reset_index(drop=True)
302
+ df_candidate = pd.concat([xseq.bedfmt(df_candidate['target_location']), df_candidate],axis=1)
303
+ # df_candidate['midpoint'] = ((df_candidate['ed'] + df_candidate['st'])/2).astype(int)
304
+ df_candidate = xseq.add_ID(df_candidate, midpoint='cleavage_site')
305
+
306
+ df_candidate.to_csv(dir_df_candidate)
307
+ print(f'Output df_candidate_{sgRNA_name}.csv')
308
+ os.remove(temp_bed)
309
+
310
+ return 'Done!'
311
+
312
+
313
+ if __name__ == '__main__' :
314
+ result = main()
315
+ print(result)
316
+
317
+
318
+