offtracker 2.10.6__zip → 2.10.8__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {offtracker-2.10.6 → offtracker-2.10.8}/PKG-INFO +29 -6
  2. offtracker-2.10.6/offtracker.egg-info/PKG-INFO → offtracker-2.10.8/README.md +244 -233
  3. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/_version.py +3 -2
  4. offtracker-2.10.6/README.md → offtracker-2.10.8/offtracker.egg-info/PKG-INFO +256 -221
  5. {offtracker-2.10.6 → offtracker-2.10.8}/scripts/offtracker_analysis.py +5 -4
  6. {offtracker-2.10.6 → offtracker-2.10.8}/scripts/offtracker_config.py +1 -1
  7. {offtracker-2.10.6 → offtracker-2.10.8}/scripts/offtracker_qc.py +1 -1
  8. {offtracker-2.10.6 → offtracker-2.10.8}/setup.py +34 -6
  9. {offtracker-2.10.6 → offtracker-2.10.8}/LICENSE.txt +0 -0
  10. {offtracker-2.10.6 → offtracker-2.10.8}/MANIFEST.in +0 -0
  11. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/X_offplot.py +0 -0
  12. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/X_offtracker.py +0 -0
  13. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/X_sequence.py +0 -0
  14. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/__init__.py +0 -0
  15. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/snakefile/Snakefile_QC.smk +0 -0
  16. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/snakefile/Snakefile_offtracker.smk +0 -0
  17. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/utility/1.1_bed2fr.py +0 -0
  18. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/utility/1.3_bdg_normalize_v4.0.py +0 -0
  19. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/utility/bedGraphToBigWig +0 -0
  20. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/utility/hg38.chrom.sizes +0 -0
  21. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/utility/mm10.chrom.sizes +0 -0
  22. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/utility/offtracker_blacklist_hg38.merged.bed +0 -0
  23. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker/utility/offtracker_blacklist_mm10.merged.bed +0 -0
  24. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker.egg-info/SOURCES.txt +0 -0
  25. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker.egg-info/dependency_links.txt +0 -0
  26. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker.egg-info/requires.txt +2 -2
  27. {offtracker-2.10.6 → offtracker-2.10.8}/offtracker.egg-info/top_level.txt +0 -0
  28. {offtracker-2.10.6 → offtracker-2.10.8}/scripts/offtracker_candidates.py +0 -0
  29. {offtracker-2.10.6 → offtracker-2.10.8}/scripts/offtracker_plot.py +0 -0
  30. {offtracker-2.10.6 → offtracker-2.10.8}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: offtracker
3
- Version: 2.10.6
3
+ Version: 2.10.8
4
4
  Summary: Tracking-seq data analysis
5
5
  Home-page: https://github.com/Lan-lab/offtracker
6
6
  Author: Runda Xu
@@ -31,6 +31,7 @@ mamba create -n offtracker -c bioconda blast snakemake pybedtools deeptools chro
31
31
 
32
32
  ## Installation
33
33
 
34
+
34
35
  ```bash
35
36
  # Activate the environment
36
37
  conda activate offtracker
@@ -47,17 +48,28 @@ pip install .
47
48
 
48
49
  ## Before analyzing samples
49
50
 
51
+ **Important: Do not use a hard-masked genome.fa**, in which repeats are masked by capital Ns, because reads that should have been mapped to these regions (e.g. the MHC region) will be lost. In addition, the genome.fa **should not** contain alternate loci such as chr2_KI270776v1_alt and chr6_GL000256v2_alt, which may cause multi-mapping and lead to the affected reads being discarded.
52
+
53
+ For example, https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz is a soft-masked genome with alternate loci, and https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.masked.gz is a hard-masked genome. **Do not** use either of these as the reference genome.
54
+
55
+ http://cistrome.org/~galib/MAESTRO/references/scATAC/Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz is the genome used for the example data.
56
+
50
57
  ```bash
58
+ # The following command can be used to check whether alternate loci of chr6 are present in the reference genome.
59
+ grep "^>chr6" genome.fa
60
+ ```
61
+
62
+ ```bash
63
+ # Build chromap index (only need once for each genome)
64
+ chromap -i -r /Your_Path_To_Reference/hg38_genome.fa \
65
+ -o /Your_Path_To_Reference/hg38_genome.chromap.index
66
+
51
67
  # Build blast index (only need once for each genome)
52
68
  makeblastdb -input_type fasta -title hg38 -dbtype nucl -parse_seqids \
53
69
  -in /Your_Path_To_Reference/hg38_genome.fa \
54
70
  -out /Your_Path_To_Reference/hg38_genome.blastdb \
55
71
  -logfile /Your_Path_To_Reference/hg38_genome.blastdb.log
56
72
 
57
- # Build chromap index (only need once for each genome)
58
- chromap -i -r /Your_Path_To_Reference/hg38_genome.fa \
59
- -o /Your_Path_To_Reference/hg38_genome.chromap.index
60
-
61
73
  # Generate candidate regions by sgRNA sequence (need once for each genome and sgRNA)
62
74
  # --name: a user-defined name of the sgRNA, which will be used in the following analysis.
63
75
  offtracker_candidates.py -t 8 -g hg38 \
@@ -187,7 +199,7 @@ It is strongly recommended to observe the "fw.scaled.bw" and "rv.scaled.bw" usin
187
199
 
188
200
  # Example Data
189
201
 
190
- Here are example data that contains reads of chr6 from HEK293T cells edited with Cas9 + sgRNA VEGFA2 and wild type cells:
202
+ Here is an example dataset that contains reads of chr6 from HEK293T cells edited with Cas9 + sgRNA VEGFA_site_2 (VEGFA2) and reads of chr6 from wild-type HEK293T cells:
191
203
 
192
204
  https://figshare.com/articles/dataset/WT_HEK239T_chr6/25956034
193
205
 
@@ -219,6 +231,17 @@ After that, you can visualize the off-target sites with their genomic sequence (
219
231
 
220
232
  ![offtarget](https://github.com/Lan-lab/offtracker/blob/main/example_output/sequences_example.png?raw=true)
221
233
 
234
+
235
+ After finishing the pipeline, if “chr6:31400832-31400854” and “chr6:31495044-31495066” are missing in the plot, it is most likely due to either:
236
+
237
+ • Using a hard-masked reference genome (where repeats are replaced with 'N's)
238
+
239
+ • The presence of alternate loci (e.g., chr6_GL000256v2_alt) in the genome.
240
+
241
+ These two off-target sites are located in the region of MHC class I chain-related protein A and B (MICA and MICB), which is polymorphic (resulting in alternate loci in contigs like “chr6_GL000256v2_alt”) and contains interspersed repeats (resulting in sequences masked by capital 'N's in a hard-masked genome). Please try again with an unmasked or soft-masked genome without alternate loci.
242
+
243
+
244
+
222
245
  # Citation
223
246
 
224
247
  If you use Tracking-seq or OFF-TRACKER in your research, please cite the following paper:
@@ -1,233 +1,244 @@
1
- Metadata-Version: 2.1
2
- Name: offtracker
3
- Version: 2.10.6
4
- Summary: Tracking-seq data analysis
5
- Home-page: https://github.com/Lan-lab/offtracker
6
- Author: Runda Xu
7
- Author-email: xrd18@tsinghua.org.cn
8
- Requires-Python: >=3.6.0
9
- Description-Content-Type: text/markdown
10
- License-File: LICENSE.txt
11
-
12
-
13
- # Offtracker
14
-
15
- Offtracker is an end to end pipeline of Tracking-seq data analysis for detecting off-target sites of any genome editing tools that generate double-strand breaks (DSBs) or single-strand breaks (SSBs).
16
-
17
- ## System requirements
18
-
19
- * Linux/Unix
20
- * Python >= 3.6
21
-
22
- ## Dependency
23
-
24
- ```bash
25
- # We recommend creating a new environment using mamba/conda to avoid compatibility problems
26
- # If you don't use mamba, just replace the code with conda
27
- # Windows systems may not be compatible with pybedtools.
28
- mamba create -n offtracker -c bioconda blast snakemake pybedtools deeptools chromap
29
- ```
30
-
31
-
32
- ## Installation
33
-
34
- ```bash
35
- # Activate the environment
36
- conda activate offtracker
37
-
38
- # Direct installation with pip
39
- pip install offtracker
40
-
41
- # (Alternative) Download the offtracker from github
42
- git clone https://github.com/Lan-lab/offtracker.git
43
- cd offtracker
44
- pip install .
45
- ```
46
-
47
-
48
- ## Before analyzing samples
49
-
50
- ```bash
51
- # Build blast index (only need once for each genome)
52
- makeblastdb -input_type fasta -title hg38 -dbtype nucl -parse_seqids \
53
- -in /Your_Path_To_Reference/hg38_genome.fa \
54
- -out /Your_Path_To_Reference/hg38_genome.blastdb \
55
- -logfile /Your_Path_To_Reference/hg38_genome.blastdb.log
56
-
57
- # Build chromap index (only need once for each genome)
58
- chromap -i -r /Your_Path_To_Reference/hg38_genome.fa \
59
- -o /Your_Path_To_Reference/hg38_genome.chromap.index
60
-
61
- # Generate candidate regions by sgRNA sequence (need once for each genome and sgRNA)
62
- # --name: a user-defined name of the sgRNA, which will be used in the following analysis.
63
- offtracker_candidates.py -t 8 -g hg38 \
64
- -r /Your_Path_To_Reference/hg38_genome.fa \
65
- -b /Your_Path_To_Reference/hg38_genome.blastdb \
66
- --name 'VEGFA2' --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG' \
67
- -o /Your_Path_To_Candidates_Folder
68
-
69
- ```
70
-
71
-
72
- ## Quality control and adapter trimming
73
-
74
- ```bash
75
- # Generate snakemake config file for quality control and adapter trimming.
76
- offtracker_qc.py -t 4 \
77
- -f /Your_Path_To_Input_Folder \
78
- --subfolder 0
79
-
80
- cd /Your_Path_To_Input_Folder/Trimmed_data
81
- snakemake -np # dry run to check whether everything is alright
82
- nohup snakemake --cores 16 1>${outdir}/sm_qc.log 2>&1 &
83
-
84
- """
85
- Set “--subfolder 0” if the file structure is like:
86
- | - Input_Folder
87
- | - sample1_R1.fastq.gz
88
- | - sample1_R2.fastq.gz
89
- | - sample2_R1.fastq.gz
90
- | - sample2_R2.fastq.gz
91
- Set “--subfolder 1” if the file structure is like:
92
- | - Input_Folder
93
- | - Sample1_Folder
94
- | - sample1_R1.fastq.gz
95
- | - sample1_R2.fastq.gz
96
- | - Sample2_Folder
97
- | - sample2_R1.fastq.gz
98
- | - sample2_R2.fastq.gz
99
-
100
- The script “offtracker_qc.py” will create a “Trimmed_data” folder under /Your_Path_To_Input_Folder.
101
- If “-o /Your_Path_To_Output” is set, the output will be redirected to /Your_Path_To_Output.
102
- """
103
- ```
104
-
105
- ## Strand-specific mapping of Tracking-seq data
106
-
107
- ```bash
108
-
109
- # Generate snakemake config file for mapping
110
- # Results will be generated in /Your_Path_To_Output, if -o is not set, the output will be in the same folder as the fastq files
111
- offtracker_config.py -t 8 -g hg38 --blacklist hg38 \
112
- -r /Your_Path_To_Reference/hg38_genome.fa \
113
- -i /Your_Path_To_Reference/hg38_genome.chromap.index \
114
- -f /Your_Path_To_Trimmed_Data \
115
- -o /Your_Path_To_Output \
116
- --subfolder 0
117
-
118
- # Warning: Do not contain "fastq" or "fq" in the folder name, otherwise the program may treat the folder as a fastq file
119
- # This problem may be fixed in the future
120
-
121
- # Run the snakemake program
122
- cd /Your_Path_To_Fastq
123
- snakemake -np # dry run
124
- nohup snakemake --cores 16 1>sm_mapping.log 2>sm_mapping.err &
125
-
126
- ## about cores
127
- # --cores of snakemake must be larger than -t of offtracker_config.py
128
- # parallel number = cores/t
129
-
130
- ## about output
131
- # This part will generate "*.fw.scaled.bw" and ".rv.scaled.bw" for IGV visualization
132
- # "*.fw.bed" and "*.rv.bed" are used in the next part.
133
- ```
134
-
135
-
136
- ## Analyzing the genome-wide off-target sites
137
-
138
- ```bash
139
- # In this part, multiple samples in the same condition can be analyzed in a single run by pattern recognition of sample names
140
-
141
- offtracker_analysis.py -g hg38 --name "VEGFA2" \
142
- --exp 'Cas9_VEGFA2' \
143
- --control 'WT' \
144
- --outname 'Cas9_VEGFA_293' \
145
- -f /Your_Path_To_Output \
146
- --seqfolder /Your_Path_To_Candidates
147
-
148
- # --name: the same gRNA name you set when running offtracker_candidates.py
149
- # --exp/--control: add one or multiple patterns of file name in regular expressions
150
- # If multiple samples meet the pattern, their signals will be averaged. Thus, only samples with the same condition should be included in a single analysis.
151
-
152
- # This step will generate Offtracker_result_{outname}.csv
153
- # Default FDR is 0.05, which can be changed by --fdr. This will empirically make the threshold of Track score around 2.
154
- # Sites with Track score >=2, which is a empirical threshold, are output regardless of FDR.
155
- # Intermediate files are saved in ./temp folder, which can be deleted.
156
- # Keeping the intermediate files can make the analysis faster if involving previously analyzed samples (e.g. using the same control samples for different analyses)
157
- ```
158
-
159
- ## Off-target sequences visualization
160
-
161
- ```bash
162
- # After get the Offtracker_result_{outname}.csv, you can visualize the off-target sites with their genomic sequence with the following command:
163
-
164
- offtracker_plot.py --result Your_Offtracker_Result_CSV \
165
- --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG'
166
-
167
- # The default output is a pdf file with Offtracker_result_{outname}.pdf
168
- # Assigning a specific output file with another suffix can change the format. e.g., "--output Offtracker_plot.png" will generate a png file.
169
- # The orange dash line indicates the empirical threshold of Track score = 2
170
- # Empirically, the off-target sites with Track score < 2 are less likely to be real off-target sites.
171
- ```
172
-
173
-
174
- ## Note1, when not using hg38 or mm10
175
-
176
- The default setting only includes chr1-chr22, chrX, chrY, and chrM. (only suitable for human and mouse) \
177
- If you are using reference genomes without "chr" at the beginning, or want to analyze all chromosomes or other species, you can set "--ignore_chr" when running offtracker_config.py to skip chromosome filter.
178
-
179
- Currently, this software is only ready-to-use for mm10 and hg38. For any other genome, e.g., hg19, please add a genome size file named "hg19.chrom.sizes" to .\offtracker\utility. Besides, add "--blacklist none" or "--blacklist Your_Blacklist" (e.g., ENCODE blacklist) when running offtracker_config.py, because we only include blacklists for mm10 and hg38.
180
-
181
- ## Note2
182
-
183
- The FDRs in the Tracking-seq result do not reflect the real off-target probability.
184
- It is strongly recommended to observe the "fw.scaled.bw" and "rv.scaled.bw" using genome browser like IGV to visually inspect each target location from the Tracking-seq result.
185
-
186
-
187
-
188
- # Example Data
189
-
190
- Here are example data that contains reads of chr6 from HEK293T cells edited with Cas9 + sgRNA VEGFA2 and wild type cells:
191
-
192
- https://figshare.com/articles/dataset/WT_HEK239T_chr6/25956034
193
-
194
- It takes about 5-10 minutes to run the mapping (offtracker_config.py & snakemake) of example data with -t 8 and --cores 16 (2 parallel tasks)
195
-
196
- ## Signal visualization
197
-
198
- After mapping, there will be 4 .bw files in the output folder:
199
- ```bash
200
- Cas9_VEGFA2_chr6.fw.scaled.bw
201
-
202
- Cas9_VEGFA2_chr6.rv.scaled.bw
203
-
204
- WT_chr6.fw.scaled.bw
205
-
206
- WT_chr6.rv.scaled.bw
207
- ```
208
- These files can be visualized in genome browser like IGV:
209
-
210
- ![signal](https://github.com/Lan-lab/offtracker/blob/main/example_output/signals_example.png?raw=true)
211
-
212
- The signal (coverage) for each sample is normalized to 1e7/total_reads. As only reads mapping to chr6 were extracted in the example data, the signal range is much higher than that of the whole genome samples.
213
-
214
- ## Whole genome off-target analysis
215
-
216
- For analyzing the signals (offtracker_analysis.py), it takes about 3-5 minutes and outputs a file named "Offtracker_result_{outname}.csv"
217
-
218
- After that, you can visualize the off-target sites with their genomic sequence (offtracker_plot.py) and get an image like this:
219
-
220
- ![offtarget](https://github.com/Lan-lab/offtracker/blob/main/example_output/sequences_example.png?raw=true)
221
-
222
- # Citation
223
-
224
- If you use Tracking-seq or OFF-TRACKER in your research, please cite the following paper:
225
-
226
- Zhu, M., Xu, R., Yuan, J., Wang, J. et al. Tracking-seq reveals the heterogeneity of off-target effects in CRISPR–Cas9-mediated genome editing. Nat Biotechnol (2024). https://doi.org/10.1038/s41587-024-02307-y
227
-
228
- The signal visualization of .bw file here was generated by the Integrative Genomics Viewer (IGV) software. The signal visualization in the Tracking-seq article above was generated by either IGV or pyGenomeTracks:
229
-
230
- Robinson, J., Thorvaldsdóttir, H., Winckler, W. et al. Integrative genomics viewer. Nat Biotechnol 29, 24–26 (2011). https://doi.org/10.1038/nbt.1754
231
-
232
- Lopez-Delisle L, Rabbani L, Wolff J, Bhardwaj V, Backofen R, Grüning B, Ramírez F, Manke T. pyGenomeTracks: reproducible plots for multivariate genomic data sets. Bioinformatics. 2020 Aug 3:btaa692. doi: 10.1093/bioinformatics/btaa692.
233
-
1
+ # Offtracker
2
+
3
+ Offtracker is an end to end pipeline of Tracking-seq data analysis for detecting off-target sites of any genome editing tools that generate double-strand breaks (DSBs) or single-strand breaks (SSBs).
4
+
5
+ ## System requirements
6
+
7
+ * Linux/Unix
8
+ * Python >= 3.6
9
+
10
+ ## Dependency
11
+
12
+ ```bash
13
+ # We recommend creating a new environment using mamba/conda to avoid compatibility problems
14
+ # If you don't use mamba, just replace the code with conda
15
+ # Windows systems may not be compatible with pybedtools.
16
+ mamba create -n offtracker -c bioconda blast snakemake pybedtools deeptools chromap
17
+ ```
18
+
19
+
20
+ ## Installation
21
+
22
+
23
+ ```bash
24
+ # Activate the environment
25
+ conda activate offtracker
26
+
27
+ # Direct installation with pip
28
+ pip install offtracker
29
+
30
+ # (Alternative) Download the offtracker from github
31
+ git clone https://github.com/Lan-lab/offtracker.git
32
+ cd offtracker
33
+ pip install .
34
+ ```
35
+
36
+
37
+ ## Before analyzing samples
38
+
39
+ **Important: Do not use a hard-masked genome.fa**, in which repeats are masked by capital Ns, because reads that should have been mapped to these regions (e.g. the MHC region) will be lost. In addition, the genome.fa **should not** contain alternate loci such as chr2_KI270776v1_alt and chr6_GL000256v2_alt, which may cause multi-mapping and lead to the affected reads being discarded.
40
+
41
+ For example, https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz is a soft-masked genome with alternate loci, and https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.masked.gz is a hard-masked genome. **Do not** use either of these as the reference genome.
42
+
43
+ http://cistrome.org/~galib/MAESTRO/references/scATAC/Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz is the genome used for the example data.
44
+
45
+ ```bash
46
+ # The following command can be used to check whether alternate loci of chr6 are present in the reference genome.
47
+ grep "^>chr6" genome.fa
48
+ ```
49
+
50
+ ```bash
51
+ # Build chromap index (only need once for each genome)
52
+ chromap -i -r /Your_Path_To_Reference/hg38_genome.fa \
53
+ -o /Your_Path_To_Reference/hg38_genome.chromap.index
54
+
55
+ # Build blast index (only need once for each genome)
56
+ makeblastdb -input_type fasta -title hg38 -dbtype nucl -parse_seqids \
57
+ -in /Your_Path_To_Reference/hg38_genome.fa \
58
+ -out /Your_Path_To_Reference/hg38_genome.blastdb \
59
+ -logfile /Your_Path_To_Reference/hg38_genome.blastdb.log
60
+
61
+ # Generate candidate regions by sgRNA sequence (need once for each genome and sgRNA)
62
+ # --name: a user-defined name of the sgRNA, which will be used in the following analysis.
63
+ offtracker_candidates.py -t 8 -g hg38 \
64
+ -r /Your_Path_To_Reference/hg38_genome.fa \
65
+ -b /Your_Path_To_Reference/hg38_genome.blastdb \
66
+ --name 'VEGFA2' --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG' \
67
+ -o /Your_Path_To_Candidates_Folder
68
+
69
+ ```
70
+
71
+
72
+ ## Quality control and adapter trimming
73
+
74
+ ```bash
75
+ # Generate snakemake config file for quality control and adapter trimming.
76
+ offtracker_qc.py -t 4 \
77
+ -f /Your_Path_To_Input_Folder \
78
+ --subfolder 0
79
+
80
+ cd /Your_Path_To_Input_Folder/Trimmed_data
81
+ snakemake -np # dry run to check whether everything is alright
82
+ nohup snakemake --cores 16 1>${outdir}/sm_qc.log 2>&1 &
83
+
84
+ """
85
+ Set “--subfolder 0” if the file structure is like:
86
+ | - Input_Folder
87
+ | - sample1_R1.fastq.gz
88
+ | - sample1_R2.fastq.gz
89
+ | - sample2_R1.fastq.gz
90
+ | - sample2_R2.fastq.gz
91
+ Set “--subfolder 1” if the file structure is like:
92
+ | - Input_Folder
93
+ | - Sample1_Folder
94
+ | - sample1_R1.fastq.gz
95
+ | - sample1_R2.fastq.gz
96
+ | - Sample2_Folder
97
+ | - sample2_R1.fastq.gz
98
+ | - sample2_R2.fastq.gz
99
+
100
+ The script “offtracker_qc.py” will create a “Trimmed_data” folder under /Your_Path_To_Input_Folder.
101
+ If “-o /Your_Path_To_Output” is set, the output will be redirected to /Your_Path_To_Output.
102
+ """
103
+ ```
104
+
105
+ ## Strand-specific mapping of Tracking-seq data
106
+
107
+ ```bash
108
+
109
+ # Generate snakemake config file for mapping
110
+ # Results will be generated in /Your_Path_To_Output, if -o is not set, the output will be in the same folder as the fastq files
111
+ offtracker_config.py -t 8 -g hg38 --blacklist hg38 \
112
+ -r /Your_Path_To_Reference/hg38_genome.fa \
113
+ -i /Your_Path_To_Reference/hg38_genome.chromap.index \
114
+ -f /Your_Path_To_Trimmed_Data \
115
+ -o /Your_Path_To_Output \
116
+ --subfolder 0
117
+
118
+ # Warning: Do not include "fastq" or "fq" in the folder name, otherwise the program may treat the folder as a fastq file
119
+ # This problem may be fixed in the future
120
+
121
+ # Run the snakemake program
122
+ cd /Your_Path_To_Fastq
123
+ snakemake -np # dry run
124
+ nohup snakemake --cores 16 1>sm_mapping.log 2>sm_mapping.err &
125
+
126
+ ## about cores
127
+ # --cores of snakemake must be larger than -t of offtracker_config.py
128
+ # parallel number = cores/t
129
+
130
+ ## about output
131
+ # This part will generate "*.fw.scaled.bw" and "*.rv.scaled.bw" for IGV visualization
132
+ # "*.fw.bed" and "*.rv.bed" are used in the next part.
133
+ ```
134
+
135
+
136
+ ## Analyzing the genome-wide off-target sites
137
+
138
+ ```bash
139
+ # In this part, multiple samples in the same condition can be analyzed in a single run by pattern recognition of sample names
140
+
141
+ offtracker_analysis.py -g hg38 --name "VEGFA2" \
142
+ --exp 'Cas9_VEGFA2' \
143
+ --control 'WT' \
144
+ --outname 'Cas9_VEGFA_293' \
145
+ -f /Your_Path_To_Output \
146
+ --seqfolder /Your_Path_To_Candidates
147
+
148
+ # --name: the same gRNA name you set when running offtracker_candidates.py
149
+ # --exp/--control: add one or multiple patterns of file name in regular expressions
150
+ # If multiple samples meet the pattern, their signals will be averaged. Thus, only samples with the same condition should be included in a single analysis.
151
+
152
+ # This step will generate Offtracker_result_{outname}.csv
153
+ # Default FDR is 0.05, which can be changed by --fdr. This will empirically make the threshold of Track score around 2.
154
+ # Sites with Track score >=2, which is an empirical threshold, are output regardless of FDR.
155
+ # Intermediate files are saved in ./temp folder, which can be deleted.
156
+ # Keeping the intermediate files can make the analysis faster if involving previously analyzed samples (e.g. using the same control samples for different analyses)
157
+ ```
158
+
159
+ ## Off-target sequences visualization
160
+
161
+ ```bash
162
+ # After getting the Offtracker_result_{outname}.csv, you can visualize the off-target sites with their genomic sequence with the following command:
163
+
164
+ offtracker_plot.py --result Your_Offtracker_Result_CSV \
165
+ --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG'
166
+
167
+ # The default output is a pdf file with Offtracker_result_{outname}.pdf
168
+ # Assigning a specific output file with another suffix can change the format. e.g., "--output Offtracker_plot.png" will generate a png file.
169
+ # The orange dashed line indicates the empirical threshold of Track score = 2
170
+ # Empirically, the off-target sites with Track score < 2 are less likely to be real off-target sites.
171
+ ```
172
+
173
+
174
+ ## Note1, when not using hg38 or mm10
175
+
176
+ The default setting only includes chr1-chr22, chrX, chrY, and chrM. (only suitable for human and mouse) \
177
+ If you are using reference genomes without "chr" at the beginning, or want to analyze all chromosomes or other species, you can set "--ignore_chr" when running offtracker_config.py to skip chromosome filter.
178
+
179
+ Currently, this software is only ready-to-use for mm10 and hg38. For any other genome, e.g., hg19, please add a genome size file named "hg19.chrom.sizes" to .\offtracker\utility. Besides, add "--blacklist none" or "--blacklist Your_Blacklist" (e.g., ENCODE blacklist) when running offtracker_config.py, because we only include blacklists for mm10 and hg38.
180
+
181
+ ## Note2
182
+
183
+ The FDRs in the Tracking-seq result do not reflect the real off-target probability.
184
+ It is strongly recommended to observe the "fw.scaled.bw" and "rv.scaled.bw" using genome browser like IGV to visually inspect each target location from the Tracking-seq result.
185
+
186
+
187
+
188
+ # Example Data
189
+
190
+ Here is an example dataset that contains reads of chr6 from HEK293T cells edited with Cas9 + sgRNA VEGFA_site_2 (VEGFA2) and reads of chr6 from wild-type HEK293T cells:
191
+
192
+ https://figshare.com/articles/dataset/WT_HEK239T_chr6/25956034
193
+
194
+ It takes about 5-10 minutes to run the mapping (offtracker_config.py & snakemake) of example data with -t 8 and --cores 16 (2 parallel tasks)
195
+
196
+ ## Signal visualization
197
+
198
+ After mapping, there will be 4 .bw files in the output folder:
199
+ ```bash
200
+ Cas9_VEGFA2_chr6.fw.scaled.bw
201
+
202
+ Cas9_VEGFA2_chr6.rv.scaled.bw
203
+
204
+ WT_chr6.fw.scaled.bw
205
+
206
+ WT_chr6.rv.scaled.bw
207
+ ```
208
+ These files can be visualized in genome browser like IGV:
209
+
210
+ ![signal](https://github.com/Lan-lab/offtracker/blob/main/example_output/signals_example.png?raw=true)
211
+
212
+ The signal (coverage) for each sample is normalized to 1e7/total_reads. As only reads mapping to chr6 were extracted in the example data, the signal range is much higher than that of the whole genome samples.
213
+
214
+ ## Whole genome off-target analysis
215
+
216
+ For analyzing the signals (offtracker_analysis.py), it takes about 3-5 minutes and outputs a file named "Offtracker_result_{outname}.csv"
217
+
218
+ After that, you can visualize the off-target sites with their genomic sequence (offtracker_plot.py) and get an image like this:
219
+
220
+ ![offtarget](https://github.com/Lan-lab/offtracker/blob/main/example_output/sequences_example.png?raw=true)
221
+
222
+
223
+ After finishing the pipeline, if “chr6:31400832-31400854” and “chr6:31495044-31495066” are missing in the plot, it is most likely due to either:
224
+
225
+ • Using a hard-masked reference genome (where repeats are replaced with 'N's)
226
+
227
+ • The presence of alternate loci (e.g., chr6_GL000256v2_alt) in the genome.
228
+
229
+ These two off-target sites are located in the region of MHC class I chain-related protein A and B (MICA and MICB), which is polymorphic (resulting in alternate loci in contigs like “chr6_GL000256v2_alt”) and contains interspersed repeats (resulting in sequences masked by capital 'N's in a hard-masked genome). Please try again with an unmasked or soft-masked genome without alternate loci.
230
+
231
+
232
+
233
+ # Citation
234
+
235
+ If you use Tracking-seq or OFF-TRACKER in your research, please cite the following paper:
236
+
237
+ Zhu, M., Xu, R., Yuan, J., Wang, J. et al. Tracking-seq reveals the heterogeneity of off-target effects in CRISPR–Cas9-mediated genome editing. Nat Biotechnol (2024). https://doi.org/10.1038/s41587-024-02307-y
238
+
239
+ The signal visualization of .bw file here was generated by the Integrative Genomics Viewer (IGV) software. The signal visualization in the Tracking-seq article above was generated by either IGV or pyGenomeTracks:
240
+
241
+ Robinson, J., Thorvaldsdóttir, H., Winckler, W. et al. Integrative genomics viewer. Nat Biotechnol 29, 24–26 (2011). https://doi.org/10.1038/nbt.1754
242
+
243
+ Lopez-Delisle L, Rabbani L, Wolff J, Bhardwaj V, Backofen R, Grüning B, Ramírez F, Manke T. pyGenomeTracks: reproducible plots for multivariate genomic data sets. Bioinformatics. 2020 Aug 3:btaa692. doi: 10.1093/bioinformatics/btaa692.
244
+
@@ -1,4 +1,4 @@
1
- __version__ = "2.10.6"
1
+ __version__ = "2.10.8"
2
2
  # 2023.08.11. v1.1.0 adding a option for not normalizing the bw file
3
3
  # 2023.10.26. v1.9.0 prerelease for v2.0
4
4
  # 2023.10.27. v2.0.0 大更新,还没微调
@@ -33,4 +33,5 @@ __version__ = "2.10.6"
33
33
  # 2025.04.25. v2.8.0 修复了 offtracker candidates 会把小写序列转换成 N 的 bug
34
34
  # 2025.05.22. v2.9.0 翻新部分代码结构
35
35
  # 2025.06.05. v2.10.0 增加了QC模块。保留了负数score的记录,并在plot时显示为红字。增加了 "--ignore_chr" 用于跳过common chr过滤。
36
- # 2025.06.17. v2.10.6 修复翻新代码结构导致的bug
36
+ # 2025.06.17. v2.10.7 修复翻新代码结构导致的bug
37
+ # 2025.06.27. v2.10.8 将 chmod 放在了 setup.py 里
@@ -1,221 +1,256 @@
1
- # Offtracker
2
-
3
- Offtracker is an end to end pipeline of Tracking-seq data analysis for detecting off-target sites of any genome editing tools that generate double-strand breaks (DSBs) or single-strand breaks (SSBs).
4
-
5
- ## System requirements
6
-
7
- * Linux/Unix
8
- * Python >= 3.6
9
-
10
- ## Dependency
11
-
12
- ```bash
13
- # We recommend creating a new environment using mamba/conda to avoid compatibility problems
14
- # If you don't use mamba, just replace the code with conda
15
- # Windows systems may not be compatible with pybedtools.
16
- mamba create -n offtracker -c bioconda blast snakemake pybedtools deeptools chromap
17
- ```
18
-
19
-
20
- ## Installation
21
-
22
- ```bash
23
- # Activate the environment
24
- conda activate offtracker
25
-
26
- # Direct installation with pip
27
- pip install offtracker
28
-
29
- # (Alternative) Download the offtracker from github
30
- git clone https://github.com/Lan-lab/offtracker.git
31
- cd offtracker
32
- pip install .
33
- ```
34
-
35
-
36
- ## Before analyzing samples
37
-
38
- ```bash
39
- # Build blast index (only need once for each genome)
40
- makeblastdb -input_type fasta -title hg38 -dbtype nucl -parse_seqids \
41
- -in /Your_Path_To_Reference/hg38_genome.fa \
42
- -out /Your_Path_To_Reference/hg38_genome.blastdb \
43
- -logfile /Your_Path_To_Reference/hg38_genome.blastdb.log
44
-
45
- # Build chromap index (only need once for each genome)
46
- chromap -i -r /Your_Path_To_Reference/hg38_genome.fa \
47
- -o /Your_Path_To_Reference/hg38_genome.chromap.index
48
-
49
- # Generate candidate regions by sgRNA sequence (need once for each genome and sgRNA)
50
- # --name: a user-defined name of the sgRNA, which will be used in the following analysis.
51
- offtracker_candidates.py -t 8 -g hg38 \
52
- -r /Your_Path_To_Reference/hg38_genome.fa \
53
- -b /Your_Path_To_Reference/hg38_genome.blastdb \
54
- --name 'VEGFA2' --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG' \
55
- -o /Your_Path_To_Candidates_Folder
56
-
57
- ```
58
-
59
-
60
- ## Quality control and adapter trimming
61
-
62
- ```bash
63
- # Generate snakemake config file for quality control and adapter trimming.
64
- offtracker_qc.py -t 4 \
65
- -f /Your_Path_To_Input_Folder \
66
- --subfolder 0
67
-
68
- cd /Your_Path_To_Input_Folder/Trimmed_data
69
- snakemake -np # dry run to check whether everything is alright
70
- nohup snakemake --cores 16 1>${outdir}/sm_qc.log 2>&1 &
71
-
72
- """
73
- Set “--subfolder 0” if the file structure is like:
74
- | - Input_Folder
75
- | - sample1_R1.fastq.gz
76
- | - sample1_R2.fastq.gz
77
- | - sample2_R1.fastq.gz
78
- | - sample2_R2.fastq.gz
79
- Set “--subfolder 1” if the file structure is like:
80
- | - Input_Folder
81
- | - Sample1_Folder
82
- | - sample1_R1.fastq.gz
83
- | - sample1_R2.fastq.gz
84
- | - Sample2_Folder
85
- | - sample2_R1.fastq.gz
86
- | - sample2_R2.fastq.gz
87
-
88
- The script “offtracker_qc.py will create a “Trimmed_data” folder under /Your_Path_To_Input_Folder.
89
- If “-o /Your_Path_To_Output” is set, the output will be redirected to /Your_Path_To_Output.
90
- """
91
- ```
92
-
93
- ## Strand-specific mapping of Tracking-seq data
94
-
95
- ```bash
96
-
97
- # Generate snakemake config file for mapping
98
- # Results will be generated in /Your_Path_To_Output, if -o is not set, the output will be in the same folder as the fastq files
99
- offtracker_config.py -t 8 -g hg38 --blacklist hg38 \
100
- -r /Your_Path_To_Reference/hg38_genome.fa \
101
- -i /Your_Path_To_Reference/hg38_genome.chromap.index \
102
- -f /Your_Path_To_Trimmed_Data \
103
- -o /Your_Path_To_Output \
104
- --subfolder 0
105
-
106
- # Warning: Do not contain "fastq" or "fq" in the folder name, otherwise the program may treat the folder as a fastq file
107
- # This problem may be fixed in the future
108
-
109
- # Run the snakemake program
110
- cd /Your_Path_To_Fastq
111
- snakemake -np # dry run
112
- nohup snakemake --cores 16 1>sm_mapping.log 2>sm_mapping.err &
113
-
114
- ## about cores
115
- # --cores of snakemake must be larger than -t of offtracker_config.py
116
- # parallel number = cores/t
117
-
118
- ## about output
119
- # This part will generate "*.fw.scaled.bw" and ".rv.scaled.bw" for IGV visualization
120
- # "*.fw.bed" and "*.rv.bed" are used in the next part.
121
- ```
122
-
123
-
124
- ## Analyzing the genome-wide off-target sites
125
-
126
- ```bash
127
- # In this part, multiple samples in the same condition can be analyzed in a single run by pattern recognition of sample names
128
-
129
- offtracker_analysis.py -g hg38 --name "VEGFA2" \
130
- --exp 'Cas9_VEGFA2' \
131
- --control 'WT' \
132
- --outname 'Cas9_VEGFA_293' \
133
- -f /Your_Path_To_Output \
134
- --seqfolder /Your_Path_To_Candidates
135
-
136
- # --name: the same gRNA name you set when running offtracker_candidates.py
137
- # --exp/--control: add one or multiple patterns of file name in regular expressions
138
- # If multiple samples meet the pattern, their signals will be averaged. Thus, only samples with the same condition should be included in a single analysis.
139
-
140
- # This step will generate Offtracker_result_{outname}.csv
141
- # Default FDR is 0.05, which can be changed by --fdr. This will empirically make the threshold of Track score around 2.
142
- # Sites with Track score >=2, which is a empirical threshold, are output regardless of FDR.
143
- # Intermediate files are saved in ./temp folder, which can be deleted.
144
- # Keeping the intermediate files can make the analysis faster if involving previously analyzed samples (e.g. using the same control samples for different analyses)
145
- ```
146
-
147
- ## Off-target sequences visualization
148
-
149
- ```bash
150
- # After get the Offtracker_result_{outname}.csv, you can visualize the off-target sites with their genomic sequence with the following command:
151
-
152
- offtracker_plot.py --result Your_Offtracker_Result_CSV \
153
- --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG'
154
-
155
- # The default output is a pdf file with Offtracker_result_{outname}.pdf
156
- # Assigning a specific output file with another suffix can change the format. e.g., "--output Offtracker_plot.png" will generate a png file.
157
- # The orange dash line indicates the empirical threshold of Track score = 2
158
- # Empirically, the off-target sites with Track score < 2 are less likely to be real off-target sites.
159
- ```
160
-
161
-
162
- ## Note1, when not using hg38 or mm10
163
-
164
- The default setting only includes chr1-chr22, chrX, chrY, and chrM. (only suitable for human and mouse) \
165
- If you are using reference genomes without "chr" at the beginning, or want to analyze all chromosomes or other species, you can set "--ignore_chr" when running offtracker_config.py to skip chromosome filter.
166
-
167
- Currently, this software is only ready-to-use for mm10 and hg38. For any other genome, e.g., hg19, please add a genome size file named "hg19.chrom.sizes" to .\offtracker\utility. Besides, add "--blacklist none" or "--blacklist Your_Blacklist" (e.g., ENCODE blacklist) when running offtracker_config.py, because we only include blacklists for mm10 and hg38.
168
-
169
- ## Note2
170
-
171
- The FDRs in the Tracking-seq result do not reflect the real off-target probability.
172
- It is strongly recommended to observe the "fw.scaled.bw" and "rv.scaled.bw" using genome browser like IGV to visually inspect each target location from the Tracking-seq result.
173
-
174
-
175
-
176
- # Example Data
177
-
178
- Here are example data that contains reads of chr6 from HEK293T cells edited with Cas9 + sgRNA VEGFA2 and wild type cells:
179
-
180
- https://figshare.com/articles/dataset/WT_HEK239T_chr6/25956034
181
-
182
- It takes about 5-10 minutes to run the mapping (offtracker_config.py & snakemake) of example data with -t 8 and --cores 16 (2 parallel tasks)
183
-
184
- ## Signal visualization
185
-
186
- After mapping, there will be 4 .bw files in the output folder:
187
- ```bash
188
- Cas9_VEGFA2_chr6.fw.scaled.bw
189
-
190
- Cas9_VEGFA2_chr6.rv.scaled.bw
191
-
192
- WT_chr6.fw.scaled.bw
193
-
194
- WT_chr6.rv.scaled.bw
195
- ```
196
- These files can be visualized in genome browser like IGV:
197
-
198
- ![signal](https://github.com/Lan-lab/offtracker/blob/main/example_output/signals_example.png?raw=true)
199
-
200
- The signal (coverage) for each sample is normalized to 1e7/total_reads. As only reads mapping to chr6 were extracted in the example data, the signal range is much higher than that of the whole genome samples.
201
-
202
- ## Whole genome off-target analysis
203
-
204
- For analyzing the signals (offtracker_analysis.py), it takes about 3-5 minutes and outputs a file named "Offtracker_result_{outname}.csv"
205
-
206
- After that, you can visualize the off-target sites with their genomic sequence (offtracker_plot.py) and get an image like this:
207
-
208
- ![offtarget](https://github.com/Lan-lab/offtracker/blob/main/example_output/sequences_example.png?raw=true)
209
-
210
- # Citation
211
-
212
- If you use Tracking-seq or OFF-TRACKER in your research, please cite the following paper:
213
-
214
- Zhu, M., Xu, R., Yuan, J., Wang, J. et al. Tracking-seq reveals the heterogeneity of off-target effects in CRISPR–Cas9-mediated genome editing. Nat Biotechnol (2024). https://doi.org/10.1038/s41587-024-02307-y
215
-
216
- The signal visualization of .bw file here was generated by the Integrative Genomics Viewer (IGV) software. The signal visualization in the Tracking-seq article above was generated by either IGV or pyGenomeTracks:
217
-
218
- Robinson, J., Thorvaldsdóttir, H., Winckler, W. et al. Integrative genomics viewer. Nat Biotechnol 29, 24–26 (2011). https://doi.org/10.1038/nbt.1754
219
-
220
- Lopez-Delisle L, Rabbani L, Wolff J, Bhardwaj V, Backofen R, Grüning B, Ramírez F, Manke T. pyGenomeTracks: reproducible plots for multivariate genomic data sets. Bioinformatics. 2020 Aug 3:btaa692. doi: 10.1093/bioinformatics/btaa692.
221
-
1
+ Metadata-Version: 2.1
2
+ Name: offtracker
3
+ Version: 2.10.8
4
+ Summary: Tracking-seq data analysis
5
+ Home-page: https://github.com/Lan-lab/offtracker
6
+ Author: Runda Xu
7
+ Author-email: xrd18@tsinghua.org.cn
8
+ Requires-Python: >=3.6.0
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE.txt
11
+
12
+
13
+ # Offtracker
14
+
15
+ Offtracker is an end to end pipeline of Tracking-seq data analysis for detecting off-target sites of any genome editing tools that generate double-strand breaks (DSBs) or single-strand breaks (SSBs).
16
+
17
+ ## System requirements
18
+
19
+ * Linux/Unix
20
+ * Python >= 3.6
21
+
22
+ ## Dependency
23
+
24
+ ```bash
25
+ # We recommend creating a new environment using mamba/conda to avoid compatibility problems
26
+ # If you don't use mamba, just replace the code with conda
27
+ # Windows systems may not be compatible with pybedtools.
28
+ mamba create -n offtracker -c bioconda blast snakemake pybedtools deeptools chromap
29
+ ```
30
+
31
+
32
+ ## Installation
33
+
34
+
35
+ ```bash
36
+ # Activate the environment
37
+ conda activate offtracker
38
+
39
+ # Direct installation with pip
40
+ pip install offtracker
41
+
42
+ # (Alternative) Download the offtracker from github
43
+ git clone https://github.com/Lan-lab/offtracker.git
44
+ cd offtracker
45
+ pip install .
46
+ ```
47
+
48
+
49
+ ## Before analyzing samples
50
+
51
+ **Important: Do not use a hard-masked genome.fa**, in which repeats are masked by capital Ns, so reads that should have been mapped to these regions (e.g. the MHC region) will be lost. Besides, the genome.fa **should not** contain alternate loci like chr2_KI270776v1_alt and chr6_GL000256v2_alt, which may cause multi-mapping, so that the affected reads are discarded.
52
+
53
+ For example, https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz is a soft-masked genome with alternate loci, and https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.masked.gz is a hard-masked genome. **Do not** use either of these two as the reference genome.
54
+
55
+ http://cistrome.org/~galib/MAESTRO/references/scATAC/Refdata_scATAC_MAESTRO_GRCh38_1.1.0.tar.gz is the genome used for the example data.
56
+
57
+ ```bash
58
+ # The following command can be used to check whether alternate loci of chr6 are present in the reference genome.
59
+ grep "^>chr6" genome.fa
60
+ ```
61
+
62
+ ```bash
63
+ # Build chromap index (only needed once for each genome)
64
+ chromap -i -r /Your_Path_To_Reference/hg38_genome.fa \
65
+ -o /Your_Path_To_Reference/hg38_genome.chromap.index
66
+
67
+ # Build blast index (only needed once for each genome)
68
+ makeblastdb -input_type fasta -title hg38 -dbtype nucl -parse_seqids \
69
+ -in /Your_Path_To_Reference/hg38_genome.fa \
70
+ -out /Your_Path_To_Reference/hg38_genome.blastdb \
71
+ -logfile /Your_Path_To_Reference/hg38_genome.blastdb.log
72
+
73
+ # Generate candidate regions by sgRNA sequence (needed once for each genome and sgRNA)
74
+ # --name: a user-defined name of the sgRNA, which will be used in the following analysis.
75
+ offtracker_candidates.py -t 8 -g hg38 \
76
+ -r /Your_Path_To_Reference/hg38_genome.fa \
77
+ -b /Your_Path_To_Reference/hg38_genome.blastdb \
78
+ --name 'VEGFA2' --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG' \
79
+ -o /Your_Path_To_Candidates_Folder
80
+
81
+ ```
82
+
83
+
84
+ ## Quality control and adapter trimming
85
+
86
+ ```bash
87
+ # Generate snakemake config file for quality control and adapter trimming.
88
+ offtracker_qc.py -t 4 \
89
+ -f /Your_Path_To_Input_Folder \
90
+ --subfolder 0
91
+
92
+ cd /Your_Path_To_Input_Folder/Trimmed_data
93
+ snakemake -np # dry run to check whether everything is alright
94
+ nohup snakemake --cores 16 1>${outdir}/sm_qc.log 2>&1 &
95
+
96
+ """
97
+ Set “--subfolder 0” if the file structure is like:
98
+ | - Input_Folder
99
+ | - sample1_R1.fastq.gz
100
+ | - sample1_R2.fastq.gz
101
+ | - sample2_R1.fastq.gz
102
+ | - sample2_R2.fastq.gz
103
+ Set “--subfolder 1” if the file structure is like:
104
+ | - Input_Folder
105
+ | - Sample1_Folder
106
+ | - sample1_R1.fastq.gz
107
+ | - sample1_R2.fastq.gz
108
+ | - Sample2_Folder
109
+ | - sample2_R1.fastq.gz
110
+ | - sample2_R2.fastq.gz
111
+
112
+ The script “offtracker_qc.py” will create a “Trimmed_data” folder under /Your_Path_To_Input_Folder.
113
+ If “-o /Your_Path_To_Output” is set, the output will be redirected to /Your_Path_To_Output.
114
+ """
115
+ ```
116
+
117
+ ## Strand-specific mapping of Tracking-seq data
118
+
119
+ ```bash
120
+
121
+ # Generate snakemake config file for mapping
122
+ # Results will be generated in /Your_Path_To_Output. If -o is not set, the output will be in the same folder as the fastq files
123
+ offtracker_config.py -t 8 -g hg38 --blacklist hg38 \
124
+ -r /Your_Path_To_Reference/hg38_genome.fa \
125
+ -i /Your_Path_To_Reference/hg38_genome.chromap.index \
126
+ -f /Your_Path_To_Trimmed_Data \
127
+ -o /Your_Path_To_Output \
128
+ --subfolder 0
129
+
130
+ # Warning: Do not include "fastq" or "fq" in the folder name, otherwise the program may treat the folder as a fastq file
131
+ # This problem may be fixed in the future
132
+
133
+ # Run the snakemake program
134
+ cd /Your_Path_To_Fastq
135
+ snakemake -np # dry run
136
+ nohup snakemake --cores 16 1>sm_mapping.log 2>sm_mapping.err &
137
+
138
+ ## about cores
139
+ # --cores of snakemake must be larger than -t of offtracker_config.py
140
+ # parallel number = cores/t
141
+
142
+ ## about output
143
+ # This part will generate "*.fw.scaled.bw" and "*.rv.scaled.bw" for IGV visualization
144
+ # "*.fw.bed" and "*.rv.bed" are used in the next part.
145
+ ```
146
+
147
+
148
+ ## Analyzing the genome-wide off-target sites
149
+
150
+ ```bash
151
+ # In this part, multiple samples in the same condition can be analyzed in a single run by pattern recognition of sample names
152
+
153
+ offtracker_analysis.py -g hg38 --name "VEGFA2" \
154
+ --exp 'Cas9_VEGFA2' \
155
+ --control 'WT' \
156
+ --outname 'Cas9_VEGFA_293' \
157
+ -f /Your_Path_To_Output \
158
+ --seqfolder /Your_Path_To_Candidates
159
+
160
+ # --name: the same gRNA name you set when running offtracker_candidates.py
161
+ # --exp/--control: add one or multiple patterns of file name in regular expressions
162
+ # If multiple samples meet the pattern, their signals will be averaged. Thus, only samples with the same condition should be included in a single analysis.
163
+
164
+ # This step will generate Offtracker_result_{outname}.csv
165
+ # Default FDR is 0.05, which can be changed by --fdr. This will empirically make the threshold of Track score around 2.
166
+ # Sites with Track score >=2, which is an empirical threshold, are output regardless of FDR.
167
+ # Intermediate files are saved in ./temp folder, which can be deleted.
168
+ # Keeping the intermediate files can make the analysis faster if involving previously analyzed samples (e.g. using the same control samples for different analyses)
169
+ ```
170
+
171
+ ## Off-target sequences visualization
172
+
173
+ ```bash
174
+ # After getting the Offtracker_result_{outname}.csv, you can visualize the off-target sites with their genomic sequence with the following command:
175
+
176
+ offtracker_plot.py --result Your_Offtracker_Result_CSV \
177
+ --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG'
178
+
179
+ # The default output is a pdf file named Offtracker_result_{outname}.pdf
180
+ # Assigning a specific output file with another suffix can change the format. e.g., "--output Offtracker_plot.png" will generate a png file.
181
+ # The orange dash line indicates the empirical threshold of Track score = 2
182
+ # Empirically, the off-target sites with Track score < 2 are less likely to be real off-target sites.
183
+ ```
184
+
185
+
186
+ ## Note1, when not using hg38 or mm10
187
+
188
+ The default setting only includes chr1-chr22, chrX, chrY, and chrM. (only suitable for human and mouse) \
189
+ If you are using reference genomes without "chr" at the beginning, or want to analyze all chromosomes or other species, you can set "--ignore_chr" when running offtracker_config.py to skip chromosome filter.
190
+
191
+ Currently, this software is only ready-to-use for mm10 and hg38. For any other genome, e.g., hg19, please add a genome size file named "hg19.chrom.sizes" to .\offtracker\utility. Besides, add "--blacklist none" or "--blacklist Your_Blacklist" (e.g., ENCODE blacklist) when running offtracker_config.py, because we only include blacklists for mm10 and hg38.
192
+
193
+ ## Note2
194
+
195
+ The FDRs in the Tracking-seq result do not reflect the real off-target probability.
196
+ It is strongly recommended to observe the "fw.scaled.bw" and "rv.scaled.bw" using a genome browser like IGV to visually inspect each target location from the Tracking-seq result.
197
+
198
+
199
+
200
+ # Example Data
201
+
202
+ Here are example data that contain reads of chr6 from HEK293T cells edited with Cas9 + sgRNA VEGFA_site_2 (VEGFA2) and reads of chr6 from wild type HEK293T cells:
203
+
204
+ https://figshare.com/articles/dataset/WT_HEK239T_chr6/25956034
205
+
206
+ It takes about 5-10 minutes to run the mapping (offtracker_config.py & snakemake) of example data with -t 8 and --cores 16 (2 parallel tasks)
207
+
208
+ ## Signal visualization
209
+
210
+ After mapping, there will be 4 .bw files in the output folder:
211
+ ```bash
212
+ Cas9_VEGFA2_chr6.fw.scaled.bw
213
+
214
+ Cas9_VEGFA2_chr6.rv.scaled.bw
215
+
216
+ WT_chr6.fw.scaled.bw
217
+
218
+ WT_chr6.rv.scaled.bw
219
+ ```
220
+ These files can be visualized in a genome browser like IGV:
221
+
222
+ ![signal](https://github.com/Lan-lab/offtracker/blob/main/example_output/signals_example.png?raw=true)
223
+
224
+ The signal (coverage) for each sample is normalized to 1e7/total_reads. As only reads mapping to chr6 were extracted in the example data, the signal range is much higher than that of the whole genome samples.
225
+
226
+ ## Whole genome off-target analysis
227
+
228
+ For analyzing the signals (offtracker_analysis.py), it takes about 3-5 minutes and outputs a file named "Offtracker_result_{outname}.csv"
229
+
230
+ After that, you can visualize the off-target sites with their genomic sequence (offtracker_plot.py) and get an image like this:
231
+
232
+ ![offtarget](https://github.com/Lan-lab/offtracker/blob/main/example_output/sequences_example.png?raw=true)
233
+
234
+
235
+ After finishing the pipeline, if “chr6:31400832-31400854” and “chr6:31495044-31495066” are missing in the plot, it is most likely due to either:
236
+
237
+ • Using a hard-masked reference genome (where repeats are replaced with 'N's)
238
+
239
+ • The presence of alternate loci (e.g., chr6_GL000256v2_alt) in the genome.
240
+
241
+ These two off-target sites are located in the region of the MHC class I chain-related protein A and B genes (MICA and MICB), which is polymorphic (resulting in alternate loci in contigs like “chr6_GL000256v2_alt”) and contains interspersed repeats (resulting in sequences masked by capital 'N's in a hard-masked genome). Please try again with an unmasked or soft-masked genome without alternate loci.
242
+
243
+
244
+
245
+ # Citation
246
+
247
+ If you use Tracking-seq or OFF-TRACKER in your research, please cite the following paper:
248
+
249
+ Zhu, M., Xu, R., Yuan, J., Wang, J. et al. Tracking-seq reveals the heterogeneity of off-target effects in CRISPR–Cas9-mediated genome editing. Nat Biotechnol (2024). https://doi.org/10.1038/s41587-024-02307-y
250
+
251
+ The signal visualization of .bw file here was generated by the Integrative Genomics Viewer (IGV) software. The signal visualization in the Tracking-seq article above was generated by either IGV or pyGenomeTracks:
252
+
253
+ Robinson, J., Thorvaldsdóttir, H., Winckler, W. et al. Integrative genomics viewer. Nat Biotechnol 29, 24–26 (2011). https://doi.org/10.1038/nbt.1754
254
+
255
+ Lopez-Delisle L, Rabbani L, Wolff J, Bhardwaj V, Backofen R, Grüning B, Ramírez F, Manke T. pyGenomeTracks: reproducible plots for multivariate genomic data sets. Bioinformatics. 2020 Aug 3:btaa692. doi: 10.1093/bioinformatics/btaa692.
256
+
@@ -9,8 +9,7 @@ if sys.version_info < (3,0):
9
9
 
10
10
  import offtracker
11
11
  import offtracker.X_sequence as xseq
12
- script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
13
- script_folder= os.path.join(script_dir, 'mapping')
12
+
14
13
 
15
14
  import argparse
16
15
  import pandas as pd
@@ -26,8 +25,8 @@ def main():
26
25
  parser.add_argument('--name' , type=str, required=True, help='custom name of the sgRNA' )
27
26
  parser.add_argument('--exp' , type=str, default='all', nargs='+', help='A substring mark in the name of experimental samples. The default is to use all samples other than control' )
28
27
  parser.add_argument('--control' , type=str, default='none', nargs='+', help='A substring mark in the name of control samples. The default is no control. "others" for all samples other than --exp.' )
29
- parser.add_argument('--fdr' , type=int, default=0.01, help='FDR threshold for the final result. Default is 0.01.')
30
- parser.add_argument('--score' , type=int, default=1.9, help='Track score threshold for the final result. Default is 1.9.')
28
+ parser.add_argument('--fdr' , type=float, default=0.05, help='FDR threshold for the final result. Default is 0.05.')
29
+ parser.add_argument('--score' , type=float, default=1.9, help='Track score threshold for the final result. Default is 1.9.')
31
30
  parser.add_argument('--smooth' , type=int, default=1, help='Smooth strength for the signal.')
32
31
  parser.add_argument('--window' , type=int, default=3, help='Window size for smoothing the signal.')
33
32
  parser.add_argument('--binsize' , type=int, default=100, help='Window size for smoothing the signal.')
@@ -40,6 +39,8 @@ def main():
40
39
  parser.add_argument('-o','--outdir' , type=str, default='first', help='The output folder. Default is the first folder of --folder' )
41
40
  parser.add_argument('--outname' , type=str, default='same', help='The suffix of output files. Default is the same --exp' )
42
41
  parser.add_argument('--signal_only' , action='store_true', help='A developer option: stop before group analysis. ' )
42
+ # parser.add_argument('--individual_results', action='store_true', help='When multiple samples meet the exp pattern, only one merged result is generated.\n' \
43
+ # 'Set --individual_results to additionally output the individual result of each exp sample. ' )
43
44
  parser.add_argument('--overwrite' , action='store_true', help='Whether to overwrite existed dataframes.' )
44
45
  parser.add_argument('--clean' , action='store_true', help='Whether to remove temp files')
45
46
 
@@ -13,7 +13,7 @@ import offtracker
13
13
  import offtracker.X_sequence as xseq
14
14
  script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
15
15
  utility_dir = os.path.join(script_dir, 'utility')
16
- os.chmod( os.path.join(utility_dir, 'bedGraphToBigWig'), 0o755)
16
+ # os.chmod( os.path.join(utility_dir, 'bedGraphToBigWig'), 0o755)
17
17
 
18
18
  ###
19
19
  parser = argparse.ArgumentParser()
@@ -12,7 +12,7 @@ import offtracker.X_sequence as xseq
12
12
 
13
13
  script_dir = os.path.abspath(os.path.dirname(offtracker.__file__))
14
14
  utility_dir = os.path.join(script_dir, 'utility')
15
- os.chmod( os.path.join(utility_dir, 'bedGraphToBigWig'), 0o755)
15
+ # os.chmod( os.path.join(utility_dir, 'bedGraphToBigWig'), 0o755)
16
16
 
17
17
  ###
18
18
  parser = argparse.ArgumentParser()
@@ -1,11 +1,15 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- import io
5
- import os
6
- import sys
7
- from shutil import rmtree
8
- from setuptools import find_packages, setup, Command
4
+ import io, os
5
+ # import sys
6
+ # from shutil import rmtree
7
+ from setuptools import setup
8
+
9
+ # import pkg_resources
10
+ from importlib_metadata import distribution
11
+ from setuptools.command.install import install
12
+ from setuptools.command.develop import develop
9
13
 
10
14
  #
11
15
  NAME = 'offtracker'
@@ -26,7 +30,7 @@ with open(os.path.join(here, package_folder, '_version.py'),'r',encoding='utf-8'
26
30
 
27
31
  # requirements
28
32
  REQUIRED = [
29
- 'biopython', 'pybedtools', 'pyyaml', 'pandas', 'numpy',
33
+ 'pandas', 'numpy', 'biopython', 'pybedtools', 'pyyaml',
30
34
  ]
31
35
  ## pybedtools may be not supported in Windows
32
36
 
@@ -37,6 +41,26 @@ except FileNotFoundError:
37
41
  long_description = DESCRIPTION
38
42
 
39
43
 
44
+
45
+
46
+ class PostInstallCommand(install):
47
+ def run(self):
48
+ install.run(self)
49
+ # 获取文件位置
50
+ dist = distribution('offtracker')
51
+ package_path = dist.locate_file('')
52
+ utility_dir = os.path.join(package_path, 'offtracker/utility')
53
+ os.chmod( os.path.join(utility_dir, 'bedGraphToBigWig'), 0o755)
54
+
55
+ class PostDevelopCommand(develop):
56
+ def run(self):
57
+ develop.run(self)
58
+ # 获取文件位置
59
+ dist = distribution('offtracker')
60
+ package_path = dist.locate_file('')
61
+ utility_dir = os.path.join(package_path, 'offtracker/utility')
62
+ os.chmod( os.path.join(utility_dir, 'bedGraphToBigWig'), 0o755)
63
+
40
64
  setup(
41
65
  name=NAME,
42
66
  version=VERSION,
@@ -49,6 +73,10 @@ setup(
49
73
  python_requires=REQUIRES_PYTHON,
50
74
  packages=['offtracker'],
51
75
  package_data={'offtracker': ['snakefile/*','utility/*']},
76
+ cmdclass={
77
+ 'install': PostInstallCommand,
78
+ 'develop': PostDevelopCommand,
79
+ },
52
80
  scripts = ['scripts/offtracker_qc.py',
53
81
  'scripts/offtracker_config.py',
54
82
  'scripts/offtracker_candidates.py',
File without changes
File without changes
@@ -1,5 +1,5 @@
1
+ pandas
2
+ numpy
1
3
  biopython
2
4
  pybedtools
3
5
  pyyaml
4
- pandas
5
- numpy
File without changes