offtracker 2.13.2__zip → 2.14.0__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {offtracker-2.13.2 → offtracker-2.14.0}/PKG-INFO +18 -4
  2. offtracker-2.13.2/offtracker.egg-info/PKG-INFO → offtracker-2.14.0/README.md +261 -259
  3. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/_version.py +3 -2
  4. offtracker-2.13.2/README.md → offtracker-2.14.0/offtracker.egg-info/PKG-INFO +273 -247
  5. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker.egg-info/requires.txt +1 -0
  6. {offtracker-2.13.2 → offtracker-2.14.0}/scripts/offtracker_analysis.py +18 -0
  7. {offtracker-2.13.2 → offtracker-2.14.0}/scripts/offtracker_candidates.py +14 -0
  8. {offtracker-2.13.2 → offtracker-2.14.0}/scripts/offtracker_config.py +37 -3
  9. {offtracker-2.13.2 → offtracker-2.14.0}/scripts/offtracker_qc.py +15 -2
  10. {offtracker-2.13.2 → offtracker-2.14.0}/setup.py +1 -1
  11. {offtracker-2.13.2 → offtracker-2.14.0}/LICENSE.txt +0 -0
  12. {offtracker-2.13.2 → offtracker-2.14.0}/MANIFEST.in +0 -0
  13. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/X_offplot.py +0 -0
  14. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/X_offtracker.py +0 -0
  15. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/X_sequence.py +0 -0
  16. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/__init__.py +0 -0
  17. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/snakefile/Snakefile_QC.smk +0 -0
  18. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/snakefile/Snakefile_offtracker.smk +0 -0
  19. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/utility/1.1_bed2fr.py +0 -0
  20. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/utility/1.3_bdg_normalize_v4.0.py +0 -0
  21. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/utility/bedGraphToBigWig +0 -0
  22. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/utility/hg38.chrom.sizes +0 -0
  23. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/utility/mm10.chrom.sizes +0 -0
  24. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/utility/offtracker_blacklist_hg38.merged.bed +0 -0
  25. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker/utility/offtracker_blacklist_mm10.merged.bed +0 -0
  26. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker.egg-info/SOURCES.txt +0 -0
  27. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker.egg-info/dependency_links.txt +0 -0
  28. {offtracker-2.13.2 → offtracker-2.14.0}/offtracker.egg-info/top_level.txt +0 -0
  29. {offtracker-2.13.2 → offtracker-2.14.0}/scripts/offtracker_correction.py +0 -0
  30. {offtracker-2.13.2 → offtracker-2.14.0}/scripts/offtracker_init.py +0 -0
  31. {offtracker-2.13.2 → offtracker-2.14.0}/scripts/offtracker_plot.py +0 -0
  32. {offtracker-2.13.2 → offtracker-2.14.0}/setup.cfg +0 -0
@@ -1,247 +1,273 @@
1
- # Offtracker
2
-
3
- Offtracker is an end to end pipeline of Tracking-seq data analysis for detecting off-target sites of any genome editing tools that generate double-strand breaks (DSBs) or single-strand breaks (SSBs).
4
-
5
- ## System requirements
6
-
7
- * Linux/Unix
8
- * Python >= 3.6
9
-
10
- ## Dependency
11
-
12
- ```bash
13
- # We recommend creating a new environment using mamba/conda to avoid compatibility problems
14
- # If you don't use mamba, just replace the code with conda
15
- # Windows systems may not be compatible with pybedtools.
16
- mamba create -n offtracker -c bioconda blast snakemake pybedtools deeptools chromap
17
- ```
18
-
19
-
20
- ## Installation
21
-
22
-
23
- ```bash
24
- # Activate the environment
25
- conda activate offtracker
26
-
27
- # Direct installation with pip
28
- pip install offtracker
29
-
30
- # (Alternative) Download the offtracker from github
31
- git clone https://github.com/Lan-lab/offtracker.git
32
- cd offtracker
33
- pip install .
34
- ```
35
-
36
-
37
- ## Before analyzing samples
38
-
39
- **Important: Do not use hard-masked genome.fa**, in which repeats are masked by capital Ns and reads should have been mapped to these region (e.g. MHC region) will be lost. Besides, the genome.fa **should not** contain alternate loci like chr2_KI270776v1_alt and chr6_GL000256v2_alt, which may cause multi-mappings and the reads may be discarded.
40
-
41
- For example, https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz is soft-masked genome with alternate loci. https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.masked.gz is hard-masked genome. **Do not** use these two as reference genome.
42
-
43
- http://cistrome.org/~galib/MAESTRO/references/scATAC/Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz is the genome used for the example data.
44
-
45
- ```bash
46
- # The following command can be used to check whether alternate loci of chr6 are present in the reference genome.
47
- grep "^>chr6" genome.fa
48
- ```
49
-
50
- ```bash
51
- # Build chromap index (only need once for each genome)
52
- chromap -i -r /Your_Path_To_Reference/hg38_genome.fa \
53
- -o /Your_Path_To_Reference/hg38_genome.chromap.index
54
-
55
- # Build blast index (only need once for each genome)
56
- makeblastdb -input_type fasta -title hg38 -dbtype nucl -parse_seqids \
57
- -in /Your_Path_To_Reference/hg38_genome.fa \
58
- -out /Your_Path_To_Reference/hg38_genome.blastdb \
59
- -logfile /Your_Path_To_Reference/hg38_genome.blastdb.log
60
-
61
- # Generate candidate regions by sgRNA sequence (need once for each genome and sgRNA)
62
- # --name: a user-defined name of the sgRNA, which will be used in the following analysis.
63
- offtracker_candidates.py -t 8 -g hg38 \
64
- -r /Your_Path_To_Reference/hg38_genome.fa \
65
- -b /Your_Path_To_Reference/hg38_genome.blastdb \
66
- --name 'VEGFA2' --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG' \
67
- -o /Your_Path_To_Candidates_Folder
68
-
69
- # if analyzing Cas12a, whose pam is upstream of the sgRNA, add this:
70
- --pam_location 'upstream'
71
-
72
- ```
73
-
74
-
75
- ## Quality control and adapter trimming
76
-
77
- ```bash
78
- # Generate snakemake config file for quality control and adapter trimming.
79
- offtracker_qc.py -t 4 \
80
- -f /Your_Path_To_Input_Folder \
81
- --subfolder 0
82
-
83
- cd /Your_Path_To_Input_Folder/Trimmed_data
84
- snakemake -np # dry run to check whether everything is alright
85
- nohup snakemake --cores 16 1>${outdir}/sm_qc.log 2>&1 &
86
-
87
- """
88
- Set “--subfolder 0” if the file structure is like:
89
- | - Input_Folder
90
- | - sample1_R1.fastq.gz
91
- | - sample1_R2.fastq.gz
92
- | - sample2_R1.fastq.gz
93
- | - sample2_R2.fastq.gz
94
- Set “--subfolder 1” if the file structure is like:
95
- | - Input_Folder
96
- | - Sample1_Folder
97
- | - sample1_R1.fastq.gz
98
- | - sample1_R2.fastq.gz
99
- | - Sample2_Folder
100
- | - sample2_R1.fastq.gz
101
- | - sample2_R2.fastq.gz
102
-
103
- The script “offtracker_qc.py” will create a “Trimmed_data” folder under /Your_Path_To_Input_Folder.
104
- If “-o /Your_Path_To_Output” is set, the output will be redirected to /Your_Path_To_Output.
105
- """
106
- ```
107
-
108
- ## Strand-specific mapping of Tracking-seq data
109
-
110
- ```bash
111
-
112
- # Generate snakemake config file for mapping
113
- # Results will be generated in /Your_Path_To_Output, if -o is not set, the output will be in the same folder as the fastq files
114
- offtracker_config.py -t 8 -g hg38 --blacklist hg38 \
115
- -r /Your_Path_To_Reference/hg38_genome.fa \
116
- -i /Your_Path_To_Reference/hg38_genome.chromap.index \
117
- -f /Your_Path_To_Trimmed_Data \
118
- -o /Your_Path_To_Output \
119
- --subfolder 0
120
-
121
- # Warning: Do not contain "fastq" or "fq" in the folder name, otherwise the program may treat the folder as a fastq file
122
- # This problem may be fixed in the future
123
-
124
- # Run the snakemake program
125
- cd /Your_Path_To_Fastq
126
- snakemake -np # dry run
127
- nohup snakemake --cores 16 1>sm_mapping.log 2>sm_mapping.err &
128
-
129
- ## about cores
130
- # --cores of snakemake must be larger than -t of offtracker_config.py
131
- # parallel number = cores/t
132
-
133
- ## about output
134
- # This part will generate "*.fw.scaled.bw" and ".rv.scaled.bw" for IGV visualization
135
- # "*.fw.bed" and "*.rv.bed" are used in the next part.
136
- ```
137
-
138
-
139
- ## Analyzing the genome-wide off-target sites
140
-
141
- ```bash
142
- # In this part, multiple samples in the same condition can be analyzed in a single run by pattern recognition of sample names
143
-
144
- offtracker_analysis.py -g hg38 --name "VEGFA2" \
145
- --exp 'Cas9_VEGFA2' \
146
- --control 'WT' \
147
- --outname 'Cas9_VEGFA_293' \
148
- -f /Your_Path_To_Output \
149
- --seqfolder /Your_Path_To_Candidates
150
-
151
- # --name: the same gRNA name you set when running offtracker_candidates.py
152
- # --exp/--control: add one or multiple patterns of file name in regular expressions
153
- # If multiple samples meet the pattern, their signals will be averaged. Thus, only samples with the same condition should be included in a single analysis.
154
-
155
- # This step will generate Offtracker_result_{outname}.csv
156
- # Default FDR is 0.05, which can be changed by --fdr. This will empirically make the threshold of Track score around 2.
157
- # Sites with Track score >=2, which is a empirical threshold, are output regardless of FDR.
158
- # Intermediate files are saved in ./temp folder, which can be deleted.
159
- # Keeping the intermediate files can make the analysis faster if involving previously analyzed samples (e.g. using the same control samples for different analyses)
160
- ```
161
-
162
- ## Off-target sequences visualization
163
-
164
- ```bash
165
- # After get the Offtracker_result_{outname}.csv, you can visualize the off-target sites with their genomic sequence with the following command:
166
-
167
- offtracker_plot.py --result Your_Offtracker_Result_CSV \
168
- --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG'
169
-
170
- # The default output is a pdf file with Offtracker_result_{outname}.pdf
171
- # Assigning a specific output file with another suffix can change the format. e.g., "--output Offtracker_plot.png" will generate a png file.
172
- # The orange dash line indicates the empirical threshold of Track score = 2
173
- # Empirically, the off-target sites with Track score < 2 are less likely to be real off-target sites.
174
- ```
175
-
176
-
177
- ## Note1, when not using hg38 or mm10
178
-
179
- The default setting only includes chr1-chr22, chrX, chrY, and chrM. (only suitable for human and mouse) \
180
- If you are using reference genomes without "chr" at the beginning, or want to analyze all chromosomes or other species, you can set "--ignore_chr" when running offtracker_config.py to skip chromosome filter.
181
-
182
- Currently, this software is only ready-to-use for mm10 and hg38. For any other genome, e.g., hg19, please add a genome size file named "hg19.chrom.sizes" to .\offtracker\utility. Besides, add "--blacklist none" or "--blacklist Your_Blacklist" (e.g., ENCODE blacklist) when running offtracker_config.py, because we only include blacklists for mm10 and hg38.
183
-
184
- ## Note2
185
-
186
- The FDRs in the Tracking-seq result do not reflect the real off-target probability.
187
- It is strongly recommended to observe the "fw.scaled.bw" and "rv.scaled.bw" using genome browser like IGV to visually inspect each target location from the Tracking-seq result.
188
-
189
-
190
-
191
- # Example Data
192
-
193
- Here are example data that contains reads of chr6 from HEK293T cells edited with Cas9 + sgRNA VEGFA_site_2 (VEGFA2) and reads of chr6 from wild type HEK293T cells:
194
-
195
- https://figshare.com/articles/dataset/WT_HEK239T_chr6/25956034
196
-
197
- It takes about 5-10 minutes to run the mapping (offtracker_config.py & snakemake) of example data with -t 8 and --cores 16 (2 parallel tasks)
198
-
199
- ## Signal visualization
200
-
201
- After mapping, there will be 4 .bw files in the output folder:
202
- ```bash
203
- Cas9_VEGFA2_chr6.fw.scaled.bw
204
-
205
- Cas9_VEGFA2_chr6.rv.scaled.bw
206
-
207
- WT_chr6.fw.scaled.bw
208
-
209
- WT_chr6.rv.scaled.bw
210
- ```
211
- These files can be visualized in genome browser like IGV:
212
-
213
- ![signal](https://github.com/Lan-lab/offtracker/blob/main/example_output/signals_example.png?raw=true)
214
-
215
- The signal (coverage) for each sample is normalized to 1e7/total_reads. As only reads mapping to chr6 were extracted in the example data, the signal range is much higher than that of the whole genome samples.
216
-
217
- ## Whole genome off-target analysis
218
-
219
- For analyzing the signals (offtracker_analysis.py), it takes about 3-5 minutes and outputs a file named "Offtracker_result_{outname}.csv"
220
-
221
- After that, you can visualize the off-target sites with their genomic sequence (offtracker_plot.py) and get an image like this:
222
-
223
- ![offtarget](https://github.com/Lan-lab/offtracker/blob/main/example_output/sequences_example.png?raw=true)
224
-
225
-
226
- After finishing the pipeline, if “chr6:31400832-31400854” and “chr6:31495044-31495066” are missing in the plot, it is most likely due to either:
227
-
228
- • Using a hard-masked reference genome (where repeats are replaced with 'N's)
229
-
230
- • The presence of alternate loci (e.g., chr6_GL000256v2_alt) in the genome.
231
-
232
- These two off-target sites locate in the region of MHC class I chain-related protein A and B (MICA and MICB), which is polymorphic (resulting in alternate loci in contigs like “chr6_GL000256v2_alt”) and contains interspersed repeats (resulting in sequences masked by capital 'N's in a hard-masked genome). Please try again with unmasked or soft-masked genome without alternate loci.
233
-
234
-
235
-
236
- # Citation
237
-
238
- If you use Tracking-seq or OFF-TRACKER in your research, please cite the following paper:
239
-
240
- Zhu, M., Xu, R., Yuan, J., Wang, J. et al. Tracking-seq reveals the heterogeneity of off-target effects in CRISPR–Cas9-mediated genome editing. Nat Biotechnol (2024). https://doi.org/10.1038/s41587-024-02307-y
241
-
242
- The signal visualization of .bw file here was generated by the Integrative Genomics Viewer (IGV) software. The signal visualization in the Tracking-seq article above was generated by either IGV or pyGenomeTracks:
243
-
244
- Robinson, J., Thorvaldsdóttir, H., Winckler, W. et al. Integrative genomics viewer. Nat Biotechnol 29, 24–26 (2011). https://doi.org/10.1038/nbt.1754
245
-
246
- Lopez-Delisle L, Rabbani L, Wolff J, Bhardwaj V, Backofen R, Grüning B, Ramírez F, Manke T. pyGenomeTracks: reproducible plots for multivariate genomic data sets. Bioinformatics. 2020 Aug 3:btaa692. doi: 10.1093/bioinformatics/btaa692.
247
-
1
+ Metadata-Version: 2.1
2
+ Name: offtracker
3
+ Version: 2.14.0
4
+ Summary: Tracking-seq data analysis
5
+ Home-page: https://github.com/Lan-lab/offtracker
6
+ Author: Runda Xu
7
+ Author-email: xrd18@tsinghua.org.cn
8
+ Requires-Python: >=3.6.0
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE.txt
11
+
12
+
13
+ # Offtracker
14
+
15
+ Offtracker is an end to end pipeline of Tracking-seq data analysis for detecting off-target sites of any genome editing tools that generate double-strand breaks (DSBs) or single-strand breaks (SSBs).
16
+
17
+ ## System requirements
18
+
19
+ * Linux/Unix
20
+ * Python >= 3.6
21
+
22
+ ## Dependency
23
+
24
+ ```bash
25
+ # We recommend creating a new environment using mamba/conda to avoid compatibility problems
26
+ # If you don't use mamba, just replace the code with conda
27
+ # Windows systems may not be compatible with pybedtools.
28
+ mamba create -n offtracker -c bioconda blast snakemake pybedtools deeptools chromap
29
+ ```
30
+
31
+
32
+ ## Installation
33
+
34
+
35
+ ```bash
36
+ # Activate the environment
37
+ conda activate offtracker
38
+
39
+ # Direct installation with pip
40
+ pip install offtracker
41
+
42
+ # (Alternative) Download the offtracker from github
43
+ git clone https://github.com/Lan-lab/offtracker.git
44
+ cd offtracker
45
+ pip install .
46
+ ```
47
+
48
+
49
+ ## Before analyzing samples
50
+
51
+ **Important: Do not use hard-masked genome.fa**, in which repeats are masked by capital Ns, so reads that should have been mapped to these regions (e.g. the MHC region) will be lost. Besides, the genome.fa **should not contain alternate loci** like chr2_KI270776v1_alt and chr6_GL000256v2_alt, which may cause multi-mapping and lead to the reads being discarded.
52
+
53
+
54
+ **!! Do not use either of these two .fa files !!** \
55
+ For example, https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz is a soft-masked genome with alternate loci. https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.masked.gz is a hard-masked genome. **Do not** use either of these two as the reference genome. \
56
+ **!! Do not use either of these two .fa files !!**
57
+
58
+
59
+
60
+ http://cistrome.org/~galib/MAESTRO/references/scATAC/Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz is the genome used for the example data.
61
+
62
+
63
+ ```bash
64
+ # The following command can be used to check whether alternate loci of chr6 are present in the reference genome.
65
+ grep "^>chr6" genome.fa
66
+ ```
67
+
68
+ ```bash
69
+ # Build chromap index (only need once for each genome)
70
+ chromap -i -r /Your_Path_To_Reference/hg38_genome.fa \
71
+ -o /Your_Path_To_Reference/hg38_genome.chromap.index
72
+
73
+ # Build blast index (only need once for each genome)
74
+ makeblastdb -input_type fasta -title hg38 -dbtype nucl -parse_seqids \
75
+ -in /Your_Path_To_Reference/hg38_genome.fa \
76
+ -out /Your_Path_To_Reference/hg38_genome.blastdb \
77
+ -logfile /Your_Path_To_Reference/hg38_genome.blastdb.log
78
+
79
+ # Generate candidate regions by sgRNA sequence (need once for each genome and sgRNA)
80
+ # --name: a user-defined name of the sgRNA, which will be used in the following analysis.
81
+ offtracker_candidates.py -t 8 -g hg38 \
82
+ -r /Your_Path_To_Reference/hg38_genome.fa \
83
+ -b /Your_Path_To_Reference/hg38_genome.blastdb \
84
+ --name 'VEGFA2' --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG' \
85
+ -o /Your_Path_To_Candidates_Folder
86
+
87
+ # If analyzing Cas12a, whose pam is upstream of the sgRNA, add this:
88
+ --pam_location 'upstream'
89
+
90
+ ```
91
+
92
+
93
+ ## Quality control and adapter trimming
94
+
95
+ ```bash
96
+ # Generate snakemake config file for quality control and adapter trimming.
97
+ offtracker_qc.py -t 4 \
98
+ -f /Your_Path_To_Input_Folder \
99
+ --subfolder 0
100
+
101
+ cd /Your_Path_To_Input_Folder/Trimmed_data
102
+ snakemake -np # dry run to check whether everything is alright
103
+ nohup snakemake --cores 16 1>${outdir}/sm_qc.log 2>&1 &
104
+
105
+ """
106
+ Set “--subfolder 0” if the file structure is like:
107
+ | - Input_Folder
108
+ | - sample1_R1.fastq.gz
109
+ | - sample1_R2.fastq.gz
110
+ | - sample2_R1.fastq.gz
111
+ | - sample2_R2.fastq.gz
112
+ Set “--subfolder 1” if the file structure is like:
113
+ | - Input_Folder
114
+ | - Sample1_Folder
115
+ | - sample1_R1.fastq.gz
116
+ | - sample1_R2.fastq.gz
117
+ | - Sample2_Folder
118
+ | - sample2_R1.fastq.gz
119
+ | - sample2_R2.fastq.gz
120
+
121
+ The script “offtracker_qc.py” will create a “Trimmed_data” folder under /Your_Path_To_Input_Folder.
122
+ If “-o /Your_Path_To_Output” is set, the output will be redirected to /Your_Path_To_Output.
123
+ """
124
+ ```
125
+
126
+ ## Strand-specific mapping of Tracking-seq data
127
+
128
+ ```bash
129
+
130
+ # Generate snakemake config file for mapping
131
+ # Results will be generated in /Your_Path_To_Output. If -o is not set, the output will be in the same folder as the fastq files
132
+ offtracker_config.py -t 8 -g hg38 --blacklist hg38 \
133
+ -r /Your_Path_To_Reference/hg38_genome.fa \
134
+ -i /Your_Path_To_Reference/hg38_genome.chromap.index \
135
+ -f /Your_Path_To_Trimmed_Data \
136
+ -o /Your_Path_To_Output \
137
+ --subfolder 0
138
+
139
+ # Warning: Do not include "fastq" or "fq" in the folder name; otherwise the program may treat the folder as a fastq file
140
+ # This problem may be fixed in the future
141
+
142
+ # Run the snakemake program
143
+ cd /Your_Path_To_Fastq
144
+ snakemake -np # dry run
145
+ nohup snakemake --cores 16 1>sm_mapping.log 2>sm_mapping.err &
146
+
147
+ ## about cores
148
+ # --cores of snakemake must be larger than -t of offtracker_config.py
149
+ # parallel number = cores/t
150
+
151
+ ## about output
152
+ # This part will generate "*.fw.scaled.bw" and "*.rv.scaled.bw" for IGV visualization
153
+ # "*.fw.bed" and "*.rv.bed" are used in the next part.
154
+ ```
155
+
156
+
157
+ ## Analyzing the genome-wide off-target sites
158
+
159
+ ```bash
160
+ # In this part, multiple samples in the same condition can be analyzed in a single run by pattern recognition of sample names
161
+
162
+ offtracker_analysis.py -g hg38 --name "VEGFA2" \
163
+ --exp 'Cas9_VEGFA2' \
164
+ --control 'WT' \
165
+ --outname 'Cas9_VEGFA_293' \
166
+ -f /Your_Path_To_Output \
167
+ --seqfolder /Your_Path_To_Candidates
168
+
169
+ # --name: the same gRNA name you set when running offtracker_candidates.py
170
+ # --exp/--control: add one or multiple patterns of file name in regular expressions
171
+ # If multiple samples meet the pattern, their signals will be averaged. Thus, only samples with the same condition should be included in a single analysis.
172
+
173
+ # This step will generate Offtracker_result_{outname}.csv
174
+ # Default FDR is 0.05, which can be changed by --fdr. This will empirically make the threshold of Track score around 2.
175
+ # Sites with Track score >=2, which is an empirical threshold, are output regardless of FDR.
176
+ # Intermediate files are saved in ./temp folder, which can be deleted.
177
+ # Keeping the intermediate files can make the analysis faster if involving previously analyzed samples (e.g. using the same control samples for different analyses)
178
+ ```
179
+
180
+ ## Off-target sequences visualization
181
+
182
+ ```bash
183
+ # After getting the Offtracker_result_{outname}.csv, you can visualize the off-target sites with their genomic sequence with the following command:
184
+
185
+ offtracker_plot.py --result Your_Offtracker_Result_CSV \
186
+ --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG'
187
+
188
+ # The default output is a pdf file named Offtracker_result_{outname}.pdf
189
+ # Assigning a specific output file with another suffix can change the format. e.g., "--output Offtracker_plot.png" will generate a png file.
190
+ # The orange dash line indicates the empirical threshold of Track score = 2
191
+ # Empirically, the off-target sites with Track score < 2 are less likely to be real off-target sites.
192
+ ```
193
+
194
+
195
+ ## Note1, when not using hg38 or mm10
196
+
197
+ The default setting only includes chr1-chr22, chrX, chrY, and chrM. (only suitable for human and mouse) \
198
+ If you are using reference genomes without "chr" at the beginning, or want to analyze all chromosomes or other species, you can set "--ignore_chr" when running offtracker_config.py to skip chromosome filter.
199
+
200
+ Currently, this software is only ready-to-use for mm10 and hg38. For any other genome, e.g., hg19, please add a genome size file named "hg19.chrom.sizes" to .\offtracker\utility. Besides, add "--blacklist none" or "--blacklist Your_Blacklist" (e.g., ENCODE blacklist) when running offtracker_config.py, because we only include blacklists for mm10 and hg38.
201
+
202
+ ## Note2
203
+
204
+ The FDRs in the Tracking-seq result do not reflect the real off-target probability.
205
+ It is strongly recommended to observe the "fw.scaled.bw" and "rv.scaled.bw" using a genome browser like IGV to visually inspect each target location from the Tracking-seq result.
206
+
207
+
208
+
209
+ # Example Data
210
+
211
+ Here are example data that contain reads of chr6 from HEK293T cells edited with Cas9 + sgRNA VEGFA_site_2 (VEGFA2) and reads of chr6 from wild type HEK293T cells:
212
+
213
+ https://figshare.com/articles/dataset/WT_HEK239T_chr6/25956034
214
+
215
+ It takes about 5-10 minutes to run the mapping (offtracker_config.py & snakemake) of example data with -t 8 and --cores 16 (2 parallel tasks)
216
+
217
+ To download the data with wget:
218
+ ```
219
+ wget --user-agent="Mozilla" https://figshare.com/ndownloader/files/46770337 -O WT_HEK239T_chr6_1.fq.gz
220
+ wget --user-agent="Mozilla" https://figshare.com/ndownloader/files/46770334 -O WT_HEK239T_chr6_2.fq.gz
221
+ wget --user-agent="Mozilla" https://figshare.com/ndownloader/files/46775599 -O Cas9_VEGFA2_chr6_1.fq.gz
222
+ wget --user-agent="Mozilla" https://figshare.com/ndownloader/files/46775602 -O Cas9_VEGFA2_chr6_2.fq.gz
223
+ ```
224
+
225
+ ## Signal visualization
226
+
227
+ After mapping, there will be 4 .bw files in the output folder:
228
+ ```bash
229
+ Cas9_VEGFA2_chr6.fw.scaled.bw
230
+
231
+ Cas9_VEGFA2_chr6.rv.scaled.bw
232
+
233
+ WT_chr6.fw.scaled.bw
234
+
235
+ WT_chr6.rv.scaled.bw
236
+ ```
237
+ These files can be visualized in a genome browser like IGV:
238
+
239
+ ![signal](https://github.com/Lan-lab/offtracker/blob/main/example_output/signals_example.png?raw=true)
240
+
241
+ The signal (coverage) for each sample is normalized to 1e7/total_reads. As only reads mapping to chr6 were extracted in the example data, the signal range is much higher than that of the whole genome samples.
242
+
243
+ ## Whole genome off-target analysis
244
+
245
+ For analyzing the signals (offtracker_analysis.py), it takes about 3-5 minutes and outputs a file named "Offtracker_result_{outname}.csv"
246
+
247
+ After that, you can visualize the off-target sites with their genomic sequence (offtracker_plot.py) and get an image like this:
248
+
249
+ ![offtarget](https://github.com/Lan-lab/offtracker/blob/main/example_output/sequences_example.png?raw=true)
250
+
251
+
252
+ After finishing the pipeline, if “chr6:31400832-31400854” and “chr6:31495044-31495066” are missing in the plot, it is most likely due to either:
253
+
254
+ • Using a hard-masked reference genome (where repeats are replaced with 'N's)
255
+
256
+ • The presence of alternate loci (e.g., chr6_GL000256v2_alt) in the genome.
257
+
258
+ These two off-target sites are located in the region of MHC class I chain-related protein A and B (MICA and MICB), which is polymorphic (resulting in alternate loci in contigs like “chr6_GL000256v2_alt”) and contains interspersed repeats (resulting in sequences masked by capital 'N's in a hard-masked genome). Please try again with an unmasked or soft-masked genome without alternate loci.
259
+
260
+
261
+
262
+ # Citation
263
+
264
+ If you use Tracking-seq or OFF-TRACKER in your research, please cite the following paper:
265
+
266
+ Zhu, M., Xu, R., Yuan, J., Wang, J. et al. Tracking-seq reveals the heterogeneity of off-target effects in CRISPR–Cas9-mediated genome editing. Nat Biotechnol (2024). https://doi.org/10.1038/s41587-024-02307-y
267
+
268
+ The signal visualization of .bw file here was generated by the Integrative Genomics Viewer (IGV) software. The signal visualization in the Tracking-seq article above was generated by either IGV or pyGenomeTracks:
269
+
270
+ Robinson, J., Thorvaldsdóttir, H., Winckler, W. et al. Integrative genomics viewer. Nat Biotechnol 29, 24–26 (2011). https://doi.org/10.1038/nbt.1754
271
+
272
+ Lopez-Delisle L, Rabbani L, Wolff J, Bhardwaj V, Backofen R, Grüning B, Ramírez F, Manke T. pyGenomeTracks: reproducible plots for multivariate genomic data sets. Bioinformatics. 2020 Aug 3:btaa692. doi: 10.1093/bioinformatics/btaa692.
273
+
@@ -4,3 +4,4 @@ numpy
4
4
  biopython<=1.85
5
5
  pybedtools
6
6
  pyyaml
7
+ psutil
@@ -4,6 +4,8 @@
4
4
  import os,glob,sys,time,shutil
5
5
  import polars as pl
6
6
 
7
+ # 2025.10.05. 添加 threads 监测
8
+
7
9
  if sys.version_info < (3,0):
8
10
  import platform
9
11
  raise Exception(f'python3 is needed, while running {platform.python_version()} now')
@@ -69,6 +71,21 @@ def main():
69
71
  seq_score_power = args.SeqScorePower
70
72
  n_threads = args.thread
71
73
 
74
+ ################
75
+ # threads 监测 #
76
+ ################
77
+ import psutil
78
+ n_threads = args.thread
79
+ assert n_threads > 0, f'n_threads should be greater than 0, while {n_threads} is given.'
80
+ cpu_count_total = psutil.cpu_count(logical=True) # 逻辑 CPU 总数(包括超线程)
81
+ if n_threads > cpu_count_total:
82
+ n_threads = cpu_count_total-1
83
+ print(f'n_threads is reset to {n_threads} due to the total number of threads ({cpu_count_total}).')
84
+ if n_threads > 16:
85
+ n_threads = 16
86
+ print(f'n_threads is reset to {n_threads} as too many threads are unnecessary.')
87
+
88
+
72
89
  outdir = args.outdir
73
90
  if outdir == 'first':
74
91
  outdir = folders[0]
@@ -383,6 +400,7 @@ def main():
383
400
  # 2025.06.05. BE可能会形成单边信号,但是很少见,如果 control 用的是别的 sgRNA 的样本,对应脱靶位置附近一般就是负数
384
401
  # bool_neg_score = df_result['track_score']< -1
385
402
  df_output = df_result[bool_fdr|bool_score].copy()
403
+ df_output = df_output['track_score']>1.75
386
404
  if pattern_ctr != 'none':
387
405
  df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
388
406
  'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
@@ -5,6 +5,7 @@
5
5
  # 2023.12.06. v2.1: 2.1增加 cleavage_site 推测, 修正 deletion 错位, 以 cleavage_site 为中心
6
6
  # 2025.04.25. 修正大小写问题
7
7
  # 2025.06.11. 调整跳过已存在的candidates的代码顺序
8
+ # 2025.10.05. 添加 threads 监测
8
9
 
9
10
  import os,sys,re,time
10
11
  from itertools import product, permutations
@@ -87,6 +88,19 @@ def main():
87
88
  df_sgRNA_PAM = pd.DataFrame({'ID':ID,'sequence':possible_sgRNA_PAM})
88
89
  xseq.write_fasta(df_sgRNA_PAM, dir_sgRNA_fasta)
89
90
 
91
+ ################
92
+ # threads 监测 #
93
+ ################
94
+ import psutil
95
+ assert n_threads > 0, f'n_threads should be greater than 0, while {n_threads} is given.'
96
+ cpu_count_total = psutil.cpu_count(logical=True) # 逻辑 CPU 总数(包括超线程)
97
+ if n_threads > cpu_count_total:
98
+ n_threads = cpu_count_total-1
99
+ print(f'n_threads is reset to {n_threads} due to the total number of threads ({cpu_count_total}).')
100
+ if n_threads > 16:
101
+ n_threads = 16
102
+ print(f'n_threads is reset to {n_threads} as too many threads are unnecessary.')
103
+
90
104
  #########
91
105
  # BLAST #
92
106
  #########