offtracker 2.7.10__zip → 2.10.0__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {offtracker-2.7.10/offtracker.egg-info → offtracker-2.10.0}/PKG-INFO +62 -18
  2. {offtracker-2.7.10 → offtracker-2.10.0}/README.md +62 -18
  3. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/X_offplot.py +13 -2
  4. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/X_sequence.py +113 -7
  5. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/_version.py +8 -2
  6. offtracker-2.10.0/offtracker/snakefile/Snakefile_QC.smk +66 -0
  7. offtracker-2.10.0/offtracker/snakefile/Snakefile_offtracker.smk +249 -0
  8. offtracker-2.7.10/offtracker/mapping/1.1_bed2fr_v4.5.py → offtracker-2.10.0/offtracker/utility/1.1_bed2fr.py +6 -4
  9. {offtracker-2.7.10 → offtracker-2.10.0/offtracker.egg-info}/PKG-INFO +62 -18
  10. offtracker-2.10.0/offtracker.egg-info/SOURCES.txt +28 -0
  11. {offtracker-2.7.10 → offtracker-2.10.0}/scripts/offtracker_analysis.py +10 -3
  12. offtracker-2.10.0/scripts/offtracker_candidates.py +318 -0
  13. {offtracker-2.7.10 → offtracker-2.10.0}/scripts/offtracker_config.py +28 -44
  14. offtracker-2.10.0/scripts/offtracker_qc.py +62 -0
  15. {offtracker-2.7.10 → offtracker-2.10.0}/setup.py +5 -4
  16. offtracker-2.7.10/offtracker/mapping/Snakefile_offtracker +0 -245
  17. offtracker-2.7.10/offtracker.egg-info/SOURCES.txt +0 -26
  18. offtracker-2.7.10/scripts/offtracker_candidates.py +0 -307
  19. {offtracker-2.7.10 → offtracker-2.10.0}/LICENSE.txt +0 -0
  20. {offtracker-2.7.10 → offtracker-2.10.0}/MANIFEST.in +0 -0
  21. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/X_offtracker.py +0 -0
  22. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker/__init__.py +0 -0
  23. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/1.3_bdg_normalize_v4.0.py +0 -0
  24. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/bedGraphToBigWig +0 -0
  25. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/hg38.chrom.sizes +0 -0
  26. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/mm10.chrom.sizes +0 -0
  27. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/offtracker_blacklist_hg38.merged.bed +0 -0
  28. {offtracker-2.7.10/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/offtracker_blacklist_mm10.merged.bed +0 -0
  29. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker.egg-info/dependency_links.txt +0 -0
  30. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker.egg-info/requires.txt +0 -0
  31. {offtracker-2.7.10 → offtracker-2.10.0}/offtracker.egg-info/top_level.txt +0 -0
  32. {offtracker-2.7.10 → offtracker-2.10.0}/scripts/offtracker_plot.py +0 -0
  33. {offtracker-2.7.10 → offtracker-2.10.0}/setup.cfg +0 -0
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: offtracker
3
- Version: 2.7.10
3
+ Version: 2.10.0
4
4
  Summary: Tracking-seq data analysis
5
5
  Home-page: https://github.com/Lan-lab/offtracker
6
6
  Author: Runda Xu
7
- Author-email: runda.xu@foxmail.com
7
+ Author-email: xrd18@tsinghua.org.cn
8
8
  Requires-Python: >=3.6.0
9
9
  Description-Content-Type: text/markdown
10
10
  License-File: LICENSE.txt
@@ -22,9 +22,10 @@ OFF-TRACKER is an end to end pipeline of Tracking-seq data analysis for detectin
22
22
  ## Dependency
23
23
 
24
24
  ```bash
25
- # We recommend creating a new enviroment using mamba/conda to avoid compatibility problems
25
+ # We recommend creating a new environment using mamba/conda to avoid compatibility problems
26
26
  # If you don't use mamba, just replace the code with conda
27
- mamba create -n offtracker -c bioconda blast snakemake pybedtools
27
+ # Windows systems may not be compatible with pybedtools.
28
+ mamba create -n offtracker -c bioconda blast snakemake pybedtools chromap
28
29
  ```
29
30
 
30
31
 
@@ -58,32 +59,69 @@ chromap -i -r /Your_Path_To_Reference/hg38_genome.fa \
58
59
  -o /Your_Path_To_Reference/hg38_genome.chromap.index
59
60
 
60
61
  # Generate candidate regions by sgRNA sequence (need once for each genome and sgRNA)
61
- # --name: the name of the sgRNA, which will be used in the following analysis
62
+ # --name: a user-defined name of the sgRNA, which will be used in the following analysis.
62
63
  offtracker_candidates.py -t 8 -g hg38 \
63
64
  -r /Your_Path_To_Reference/hg38_genome.fa \
64
65
  -b /Your_Path_To_Reference/hg38_genome.blastdb \
65
66
  --name 'VEGFA2' --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG' \
66
- -o /Your_Path_To_Candidates
67
+ -o /Your_Path_To_Candidates_Folder
67
68
 
68
69
  ```
69
70
 
71
+
72
+ ## Quality control and adapter trimming
73
+
74
+ ```bash
75
+ # Generate snakemake config file for quality control and adapter trimming.
76
+ offtracker_qc.py -t 4 \
77
+ -f /Your_Path_To_Input_Folder \
78
+ --subfolder 0
79
+
80
+ cd /Your_Path_To_Input_Folder/Trimmed_data
81
+ snakemake -np # dry run to check whether everything is alright
82
+ nohup snakemake --cores 16 1>sm_qc.log 2>&1 &
83
+
84
+ """
85
+ Set “--subfolder 0” if the file structure is like:
86
+ | - Input_Folder
87
+ | - sample1_R1.fastq.gz
88
+ | - sample1_R2.fastq.gz
89
+ | - sample2_R1.fastq.gz
90
+ | - sample2_R2.fastq.gz
91
+ Set “--subfolder 1” if the file structure is like:
92
+ | - Input_Folder
93
+ | - Sample1_Folder
94
+ | - sample1_R1.fastq.gz
95
+ | - sample1_R2.fastq.gz
96
+ | - Sample2_Folder
97
+ | - sample2_R1.fastq.gz
98
+ | - sample2_R2.fastq.gz
99
+
100
+ The script “offtracker_qc.py” will create a “Trimmed_data” folder under /Your_Path_To_Input_Folder.
101
+ If “-o /Your_Path_To_Output” is set, the output will be redirected to /Your_Path_To_Output.
102
+ """
103
+ ```
104
+
70
105
  ## Strand-specific mapping of Tracking-seq data
71
106
 
72
107
  ```bash
73
- # Generate snakemake config file
74
- # --subfolder: If different samples are in seperate folders, set this to 1
75
- # if -o is not set, the output will be in the same folder as the fastq files
108
+
109
+ # Generate snakemake config file for mapping
110
+ # Results will be generated in /Your_Path_To_Output. If -o is not set, the output will be in the same folder as the fastq files.
76
111
  offtracker_config.py -t 8 -g hg38 --blacklist hg38 \
77
112
  -r /Your_Path_To_Reference/hg38_genome.fa \
78
113
  -i /Your_Path_To_Reference/hg38_genome.chromap.index \
79
- -f /Your_Path_To_Fastq \
114
+ -f /Your_Path_To_Trimmed_Data \
80
115
  -o /Your_Path_To_Output \
81
116
  --subfolder 0
82
117
 
118
+ # Warning: The folder name must not contain "fastq" or "fq"; otherwise the program may treat the folder as a fastq file
119
+ # This problem may be fixed in the future
120
+
83
121
  # Run the snakemake program
84
122
  cd /Your_Path_To_Fastq
85
123
  snakemake -np # dry run
86
- nohup snakemake --cores 16 1>snakemake.log 2>snakemake.err &
124
+ nohup snakemake --cores 16 1>sm_mapping.log 2>sm_mapping.err &
87
125
 
88
126
  ## about cores
89
127
  # --cores of snakemake must be larger than -t of offtracker_config.py
@@ -98,7 +136,7 @@ nohup snakemake --cores 16 1>snakemake.log 2>snakemake.err &
98
136
  ## Analyzing the genome-wide off-target sites
99
137
 
100
138
  ```bash
101
- # In this part, multiple samples in the same condition can be analyzed in a single run by pattern recogonization of sample names
139
+ # In this part, multiple samples in the same condition can be analyzed in a single run by pattern recognition of sample names
102
140
 
103
141
  offtracker_analysis.py -g hg38 --name "VEGFA2" \
104
142
  --exp 'Cas9_VEGFA2' \
@@ -127,19 +165,18 @@ offtracker_plot.py --result Your_Offtracker_Result_CSV \
127
165
  --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG'
128
166
 
129
167
  # The default output is a pdf file with Offtracker_result_{outname}.pdf
130
- # Change the suffix of the output file to change the format (e.g.: .png)
168
+ # Specifying an output file with a different suffix changes the format, e.g., "--output Offtracker_plot.png" will generate a png file.
131
169
  # The orange dash line indicates the empirical threshold of Track score = 2
132
170
  # Empirically, the off-target sites with Track score < 2 are less likely to be real off-target sites.
133
171
  ```
134
172
 
135
173
 
136
- ## Note1
174
+ ## Note1, when not using hg38 or mm10
137
175
 
138
- The default setting only includes chr1-chr22, chrX, chrY, and chrM. Please make sure the reference genome contains "chr" at the beginning.
176
+ The default setting only includes chr1-chr22, chrX, chrY, and chrM (only suitable for human and mouse). \
177
+ If you are using reference genomes without "chr" at the beginning, or want to analyze all chromosomes or other species, you can set "--ignore_chr" when running offtracker_config.py to skip the chromosome filter.
139
178
 
140
- Currently, this software is only ready-to-use for mm10 and hg38. For any other genome, e.g., hg19, please add genome size file named "hg19.chrom.sizes" to .\offtracker\mapping and instal manually. Besides, add "--blacklist none" or "--blacklist Your_Blacklist" (e.g., ENCODE blacklist) when running offtracker_config.py, because we only provide blacklists for mm10 and hg38.
141
-
142
- If you have a requirement for species other than human/mouse, please post an issue.
179
+ Currently, this software is only ready-to-use for mm10 and hg38. For any other genome, e.g., hg19, please add a genome size file named "hg19.chrom.sizes" to .\offtracker\utility. Besides, add "--blacklist none" or "--blacklist Your_Blacklist" (e.g., ENCODE blacklist) when running offtracker_config.py, because we only include blacklists for mm10 and hg38.
143
180
 
144
181
  ## Note2
145
182
 
@@ -172,6 +209,7 @@ These files can be visualized in genome browser like IGV:
172
209
 
173
210
  ![signal](https://github.com/Lan-lab/offtracker/blob/main/example_output/signals_example.png?raw=true)
174
211
 
212
+ The signal (coverage) for each sample is normalized to 1e7/total_reads. As only reads mapping to chr6 were extracted in the example data, the signal range is much higher than that of the whole genome samples.
175
213
 
176
214
  ## Whole genome off-target analysis
177
215
 
@@ -183,7 +221,13 @@ After that, you can visualize the off-target sites with their genomic sequence (
183
221
 
184
222
  # Citation
185
223
 
224
+ If you use Tracking-seq or Offtracker in your research, please cite the following paper:
225
+
226
+ Zhu, M., Xu, R., Yuan, J., Wang, J. et al. Tracking-seq reveals the heterogeneity of off-target effects in CRISPR–Cas9-mediated genome editing. Nat Biotechnol (2024). https://doi.org/10.1038/s41587-024-02307-y
186
227
 
228
+ The signal visualizations of the .bw files shown here were generated with the Integrative Genomics Viewer (IGV) software. The signal visualizations in the Tracking-seq article above were generated with either IGV or pyGenomeTracks:
187
229
 
230
+ Robinson, J., Thorvaldsdóttir, H., Winckler, W. et al. Integrative genomics viewer. Nat Biotechnol 29, 24–26 (2011). https://doi.org/10.1038/nbt.1754
188
231
 
232
+ Lopez-Delisle L, Rabbani L, Wolff J, Bhardwaj V, Backofen R, Grüning B, Ramírez F, Manke T. pyGenomeTracks: reproducible plots for multivariate genomic data sets. Bioinformatics. 2020 Aug 3:btaa692. doi: 10.1093/bioinformatics/btaa692.
189
233
 
@@ -1,6 +1,6 @@
1
- # OFF-TRACKER
1
+ # Offtracker
2
2
 
3
- OFF-TRACKER is an end to end pipeline of Tracking-seq data analysis for detecting off-target sites of any genome editing tools that generate double-strand breaks (DSBs) or single-strand breaks (SSBs).
3
+ Offtracker is an end to end pipeline of Tracking-seq data analysis for detecting off-target sites of any genome editing tools that generate double-strand breaks (DSBs) or single-strand breaks (SSBs).
4
4
 
5
5
  ## System requirements
6
6
 
@@ -10,9 +10,10 @@ OFF-TRACKER is an end to end pipeline of Tracking-seq data analysis for detectin
10
10
  ## Dependency
11
11
 
12
12
  ```bash
13
- # We recommend creating a new enviroment using mamba/conda to avoid compatibility problems
13
+ # We recommend creating a new environment using mamba/conda to avoid compatibility problems
14
14
  # If you don't use mamba, just replace the code with conda
15
- mamba create -n offtracker -c bioconda blast snakemake pybedtools
15
+ # Windows systems may not be compatible with pybedtools.
16
+ mamba create -n offtracker -c bioconda blast snakemake pybedtools chromap
16
17
  ```
17
18
 
18
19
 
@@ -46,32 +47,69 @@ chromap -i -r /Your_Path_To_Reference/hg38_genome.fa \
46
47
  -o /Your_Path_To_Reference/hg38_genome.chromap.index
47
48
 
48
49
  # Generate candidate regions by sgRNA sequence (need once for each genome and sgRNA)
49
- # --name: the name of the sgRNA, which will be used in the following analysis
50
+ # --name: a user-defined name of the sgRNA, which will be used in the following analysis.
50
51
  offtracker_candidates.py -t 8 -g hg38 \
51
52
  -r /Your_Path_To_Reference/hg38_genome.fa \
52
53
  -b /Your_Path_To_Reference/hg38_genome.blastdb \
53
54
  --name 'VEGFA2' --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG' \
54
- -o /Your_Path_To_Candidates
55
+ -o /Your_Path_To_Candidates_Folder
55
56
 
56
57
  ```
57
58
 
59
+
60
+ ## Quality control and adapter trimming
61
+
62
+ ```bash
63
+ # Generate snakemake config file for quality control and adapter trimming.
64
+ offtracker_qc.py -t 4 \
65
+ -f /Your_Path_To_Input_Folder \
66
+ --subfolder 0
67
+
68
+ cd /Your_Path_To_Input_Folder/Trimmed_data
69
+ snakemake -np # dry run to check whether everything is alright
70
+ nohup snakemake --cores 16 1>sm_qc.log 2>&1 &
71
+
72
+ """
73
+ Set “--subfolder 0” if the file structure is like:
74
+ | - Input_Folder
75
+ | - sample1_R1.fastq.gz
76
+ | - sample1_R2.fastq.gz
77
+ | - sample2_R1.fastq.gz
78
+ | - sample2_R2.fastq.gz
79
+ Set “--subfolder 1” if the file structure is like:
80
+ | - Input_Folder
81
+ | - Sample1_Folder
82
+ | - sample1_R1.fastq.gz
83
+ | - sample1_R2.fastq.gz
84
+ | - Sample2_Folder
85
+ | - sample2_R1.fastq.gz
86
+ | - sample2_R2.fastq.gz
87
+
88
+ The script “offtracker_qc.py” will create a “Trimmed_data” folder under /Your_Path_To_Input_Folder.
89
+ If “-o /Your_Path_To_Output” is set, the output will be redirected to /Your_Path_To_Output.
90
+ """
91
+ ```
92
+
58
93
  ## Strand-specific mapping of Tracking-seq data
59
94
 
60
95
  ```bash
61
- # Generate snakemake config file
62
- # --subfolder: If different samples are in seperate folders, set this to 1
63
- # if -o is not set, the output will be in the same folder as the fastq files
96
+
97
+ # Generate snakemake config file for mapping
98
+ # Results will be generated in /Your_Path_To_Output. If -o is not set, the output will be in the same folder as the fastq files.
64
99
  offtracker_config.py -t 8 -g hg38 --blacklist hg38 \
65
100
  -r /Your_Path_To_Reference/hg38_genome.fa \
66
101
  -i /Your_Path_To_Reference/hg38_genome.chromap.index \
67
- -f /Your_Path_To_Fastq \
102
+ -f /Your_Path_To_Trimmed_Data \
68
103
  -o /Your_Path_To_Output \
69
104
  --subfolder 0
70
105
 
106
+ # Warning: The folder name must not contain "fastq" or "fq"; otherwise the program may treat the folder as a fastq file
107
+ # This problem may be fixed in the future
108
+
71
109
  # Run the snakemake program
72
110
  cd /Your_Path_To_Fastq
73
111
  snakemake -np # dry run
74
- nohup snakemake --cores 16 1>snakemake.log 2>snakemake.err &
112
+ nohup snakemake --cores 16 1>sm_mapping.log 2>sm_mapping.err &
75
113
 
76
114
  ## about cores
77
115
  # --cores of snakemake must be larger than -t of offtracker_config.py
@@ -86,7 +124,7 @@ nohup snakemake --cores 16 1>snakemake.log 2>snakemake.err &
86
124
  ## Analyzing the genome-wide off-target sites
87
125
 
88
126
  ```bash
89
- # In this part, multiple samples in the same condition can be analyzed in a single run by pattern recogonization of sample names
127
+ # In this part, multiple samples in the same condition can be analyzed in a single run by pattern recognition of sample names
90
128
 
91
129
  offtracker_analysis.py -g hg38 --name "VEGFA2" \
92
130
  --exp 'Cas9_VEGFA2' \
@@ -115,19 +153,18 @@ offtracker_plot.py --result Your_Offtracker_Result_CSV \
115
153
  --sgrna 'GACCCCCTCCACCCCGCCTC' --pam 'NGG'
116
154
 
117
155
  # The default output is a pdf file with Offtracker_result_{outname}.pdf
118
- # Change the suffix of the output file to change the format (e.g.: .png)
156
+ # Specifying an output file with a different suffix changes the format, e.g., "--output Offtracker_plot.png" will generate a png file.
119
157
  # The orange dash line indicates the empirical threshold of Track score = 2
120
158
  # Empirically, the off-target sites with Track score < 2 are less likely to be real off-target sites.
121
159
  ```
122
160
 
123
161
 
124
- ## Note1
162
+ ## Note1, when not using hg38 or mm10
125
163
 
126
- The default setting only includes chr1-chr22, chrX, chrY, and chrM. Please make sure the reference genome contains "chr" at the beginning.
164
+ The default setting only includes chr1-chr22, chrX, chrY, and chrM (only suitable for human and mouse). \
165
+ If you are using reference genomes without "chr" at the beginning, or want to analyze all chromosomes or other species, you can set "--ignore_chr" when running offtracker_config.py to skip the chromosome filter.
127
166
 
128
- Currently, this software is only ready-to-use for mm10 and hg38. For any other genome, e.g., hg19, please add genome size file named "hg19.chrom.sizes" to .\offtracker\mapping and instal manually. Besides, add "--blacklist none" or "--blacklist Your_Blacklist" (e.g., ENCODE blacklist) when running offtracker_config.py, because we only provide blacklists for mm10 and hg38.
129
-
130
- If you have a requirement for species other than human/mouse, please post an issue.
167
+ Currently, this software is only ready-to-use for mm10 and hg38. For any other genome, e.g., hg19, please add a genome size file named "hg19.chrom.sizes" to .\offtracker\utility. Besides, add "--blacklist none" or "--blacklist Your_Blacklist" (e.g., ENCODE blacklist) when running offtracker_config.py, because we only include blacklists for mm10 and hg38.
131
168
 
132
169
  ## Note2
133
170
 
@@ -160,6 +197,7 @@ These files can be visualized in genome browser like IGV:
160
197
 
161
198
  ![signal](https://github.com/Lan-lab/offtracker/blob/main/example_output/signals_example.png?raw=true)
162
199
 
200
+ The signal (coverage) for each sample is normalized to 1e7/total_reads. As only reads mapping to chr6 were extracted in the example data, the signal range is much higher than that of the whole genome samples.
163
201
 
164
202
  ## Whole genome off-target analysis
165
203
 
@@ -171,7 +209,13 @@ After that, you can visualize the off-target sites with their genomic sequence (
171
209
 
172
210
  # Citation
173
211
 
212
+ If you use Tracking-seq or Offtracker in your research, please cite the following paper:
213
+
214
+ Zhu, M., Xu, R., Yuan, J., Wang, J. et al. Tracking-seq reveals the heterogeneity of off-target effects in CRISPR–Cas9-mediated genome editing. Nat Biotechnol (2024). https://doi.org/10.1038/s41587-024-02307-y
174
215
 
216
+ The signal visualizations of the .bw files shown here were generated with the Integrative Genomics Viewer (IGV) software. The signal visualizations in the Tracking-seq article above were generated with either IGV or pyGenomeTracks:
175
217
 
218
+ Robinson, J., Thorvaldsdóttir, H., Winckler, W. et al. Integrative genomics viewer. Nat Biotechnol 29, 24–26 (2011). https://doi.org/10.1038/nbt.1754
176
219
 
220
+ Lopez-Delisle L, Rabbani L, Wolff J, Bhardwaj V, Backofen R, Grüning B, Ramírez F, Manke T. pyGenomeTracks: reproducible plots for multivariate genomic data sets. Bioinformatics. 2020 Aug 3:btaa692. doi: 10.1093/bioinformatics/btaa692.
177
221
 
@@ -12,6 +12,7 @@ dict_rc = {
12
12
  rcParams.update(dict_rc)
13
13
 
14
14
  # 2024.06.03. offtable 添加 threshold 分界线,默认为 None,常用的是 2
15
+
15
16
  def offtable(offtargets, target_guide, length_pam = 3,
16
17
  col_seq='best_target', col_score='track_score', col_mismatch='mismatch', col_loc='target_location',
17
18
  title=None, font='Arial', font_size=9,
@@ -28,12 +29,15 @@ def offtable(offtargets, target_guide, length_pam = 3,
28
29
  '-': 'orange'
29
30
  }
30
31
 
32
+
33
+
31
34
  # If offtargets is a DataFrame, convert to list of dictionaries
32
35
  if isinstance(offtargets, pd.DataFrame):
33
36
  if threshold is not None:
34
37
  n_positive = sum(offtargets[col_score]>=threshold)
35
38
  offtargets = offtargets.to_dict(orient='records')
36
39
 
40
+
37
41
  # Configuration
38
42
  # title=None
39
43
  # font='Arial'
@@ -106,10 +110,16 @@ def offtable(offtargets, target_guide, length_pam = 3,
106
110
  ax.text(x + box_size_x / 2, y + box_size_y / 2, "." if c == target_guide[i] else c, ha='center', va='center', family=font, fontsize=font_size, weight='bold')
107
111
 
108
112
  # Annotations for score, mismatches, and location coordinates
109
- ax.text(x_offset + (len(target_guide) + 2) * box_size_x, y + box_size_y / 2, round(seq[col_score],2), ha='center', va='center', family=font, fontsize=font_size)
113
+ # 2025.06.05. 如果有负数的,用红色显示
114
+ if seq[col_score]>0:
115
+ text_color = 'black'
116
+ else:
117
+ text_color = 'red'
118
+ ax.text(x_offset + (len(target_guide) + 2) * box_size_x, y + box_size_y / 2, round(seq[col_score],2), ha='center', va='center', family=font, fontsize=font_size, color=text_color)
110
119
  #ax.text(x_offset + (len(target_guide) + 7) * box_size_x, y + box_size_y / 2, "Target" if seq[col_mismatch] == 0 else seq[col_mismatch], ha='center', va='center', family=font, fontsize=font_size, color='red' if seq[col_mismatch] == 0 else 'black')
111
- ax.text(x_offset + (len(target_guide) + 4) * box_size_x, y + box_size_y / 2, seq[col_loc], ha='left', va='center', family=font, fontsize=font_size)
120
+ ax.text(x_offset + (len(target_guide) + 4) * box_size_x, y + box_size_y / 2, seq[col_loc], ha='left', va='center', family=font, fontsize=font_size, color=text_color)
112
121
 
122
+
113
123
  # add a vertical line to indicate the PAM
114
124
  x_line = x_offset + (len(target_guide) - length_pam) * box_size_x
115
125
  y_start = y_offset # + box_size_y / 2
@@ -123,6 +133,7 @@ def offtable(offtargets, target_guide, length_pam = 3,
123
133
  thresh_y = y_offset + (n_positive+1) * (box_size_y + box_gap) - box_gap*0.5
124
134
  ax.hlines(y=thresh_y, xmin=thresh_x_start, xmax=thresh_x_end, color='orange', linestyle='--')
125
135
 
136
+
126
137
  # Styling and save
127
138
  ax.set_xlim(0, width*1.1) # location 的文字太长了,所以要加长一点
128
139
  ax.set_ylim(height, 0)
@@ -3,6 +3,7 @@ import math
3
3
  import pandas as pd
4
4
  from itertools import product
5
5
  import numpy as np
6
+ import os, glob
6
7
 
7
8
  ambiguous_nt = {'A': ['A'],
8
9
  'T': ['T'],
@@ -19,7 +20,7 @@ ambiguous_nt = {'A': ['A'],
19
20
  'H': ['A', 'C', 'T'],
20
21
  'D': ['A', 'G', 'T'],
21
22
  'B': ['C', 'G', 'T'],
22
- 'N': ['A', 'T', 'C', 'G']}
23
+ 'N': ['A', 'C', 'G', 'T']}
23
24
 
24
25
  def is_seq_valid(sequence, extra=True, ambiguous_nt=ambiguous_nt):
25
26
  if extra:
@@ -43,12 +44,24 @@ def possible_seq(sequence):
43
44
  raise KeyError(f'Unvalid character \'{valid_check}\' in sequence')
44
45
  return sequences
45
46
 
47
+ # 包含 degenerate base pairs
48
+ def get_base_score(base1, base2, exact_score=2, partial_match=2, mismatch_score=0.01):
49
+ base1 = ambiguous_nt[base1]
50
+ base2 = ambiguous_nt[base2]
51
+ if base1 == base2:
52
+ return exact_score
53
+ if list(np.union1d(base1,base2)) == base1 or list(np.union1d(base1,base2)) == base2:
54
+ # 其中一个是子集,注意顺序不一致会导致不等,所以必须排好序
55
+ return partial_match
56
+ return mismatch_score
57
+
58
+
46
59
  def complement(seq):
47
- complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', '-':'-',
60
+ dict_complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', '-':'-',
48
61
  'M': 'K', 'R': 'Y', 'W': 'W', 'S': 'S', 'Y': 'R', 'K':'M',
49
62
  'V': 'B', 'H': 'D', 'D': 'H', 'B': 'V'}
50
63
  bases = list(seq)
51
- letters = [complement[base] for base in bases]
64
+ letters = [dict_complement[base] for base in bases]
52
65
  return ''.join(letters)
53
66
 
54
67
  def reverse(seq):
@@ -100,14 +113,107 @@ def add_ID(df, chr_col=0, midpoint='cleavage_site'):#, midpoint='midpoint'):
100
113
  df.loc[point_tail>=500,'ID_2'] = df[chr_col_name] + ':' + (point_head+1).astype(str)
101
114
  return df
102
115
 
116
+
117
+
118
+ def detect_fastq(folder, n_subfolder, NGS_type='paired-end'):
119
+ """
120
+ 搜索 folder 的 n级子目录下的所有 fastq/fastq.gz/fq/fq.gz 文件
121
+ paired-end 模式 : 识别 2.fq/2.fastq 为 paired-end 的 R2 文件,并验证对应 R1 文件
122
+ single-end 模式 : 所有 fastq/fastq.gz/fq/fq.gz 文件都视为 single-end 文件
123
+
124
+ 不建议 2. 和 fq/fastq 之间有其他字符,如 2.trimmed.fq.gz,因为中间字符不确定,使用通配符容易误判文件名其他的2.
125
+ 样本名不要带点,建议用_分割特征,同特征内分割不要用_可以用-,如 sample_day-hour_type_batch_rep_1.fq.gz
126
+
127
+ Input
128
+ ----------
129
+ folder : 根目录
130
+ n_subfolder : n级子目录
131
+
132
+ Parameter
133
+ ----------
134
+ NGS_type : 'paired-end' or 'single-end'
135
+
136
+ Output
137
+ ----------
138
+ sample_names : 识别的样品名
139
+ files_R1 : R1文件的完整路径
140
+ files_R2 : R2文件的完整路径
141
+
142
+ """
143
+ # import os, sys, glob
144
+ # import pandas as pd
145
+ if NGS_type == 'paired-end':
146
+ print('paired-end mode')
147
+ files_R2 = []
148
+ # 支持四种文件扩展名
149
+ # 个人习惯包含绝对路径
150
+ for fastq in ['*2.fq','*2.fastq','*2.fq.gz','*2.fastq.gz']:
151
+ fq_files = glob.glob( os.path.join(folder, n_subfolder*'*/', fastq ) )
152
+ print(f'{len(fq_files)} {fastq[2:]} samples detected')
153
+ files_R2.extend( fq_files )
154
+ #
155
+ if len(files_R2) > 0:
156
+ files_R2 = pd.Series(files_R2).sort_values().reset_index(drop=True)
157
+ # 拆分文件名
158
+ suffix = files_R2.str.extract('(\.fastq.*|\.fq.*)',expand=False)
159
+ prefix = files_R2.str.extract('(.*)(?:.fq|.fastq)',expand=False)
160
+ # 将 prefix 进一步拆分为 sample_dir (真样品名) 和 nametype (某种统一后缀),支持五种样本名后缀
161
+ nametype = []
162
+ sample_dir = []
163
+ for a_prefix in prefix:
164
+ for a_type in ['_trimmed_2', '_2_val_2','_R2_val_2','_R2','_2']:
165
+ len_type = len(a_type)
166
+ if a_prefix[-len_type:] == a_type:
167
+ nametype.append(a_type)
168
+ sample_dir.append(a_prefix[:-len_type])
169
+ break
170
+ assert len(nametype) == len(files_R2), 'The file name pattern is invaild!'
171
+ nametype = pd.Series(nametype)
172
+ sample_dir = pd.Series(sample_dir)
173
+ # 根据 R2 文件,检查 R1 文件是否存在
174
+ files_R1 = sample_dir + nametype.str.replace('2','1') + suffix
175
+ for i in range(len(files_R1)):
176
+ assert os.path.exists(files_R1[i]), f'{files_R1[i]} not found!'
177
+ sample_names = sample_dir.apply(os.path.basename)
178
+ else:
179
+ print('No paired-end samples detected!')
180
+ sample_names = 'no sample'
181
+ files_R1 = []
182
+
183
+ elif NGS_type == 'single-end':
184
+ print('single-end mode')
185
+ files_R1 = []
186
+ files_R2 = [] # 占位
187
+ # 支持四种文件扩展名
188
+ # 个人习惯包含绝对路径
189
+ for fastq in ['*.fq','*.fastq','*.fq.gz','*.fastq.gz']:
190
+ fq_files = glob.glob( os.path.join(folder, n_subfolder*'*/', fastq ) )
191
+ print(f'{len(fq_files)} {fastq[1:]} samples detected')
192
+ files_R1.extend( fq_files )
193
+ files_R1 = pd.Series(files_R1).sort_values()
194
+ #
195
+ if len(files_R1) > 0:
196
+ # 拆分文件名
197
+ suffix = files_R1.str.extract('(\.fastq.*|\.fq.*)',expand=False)
198
+ prefix = files_R1.str.extract('(.*)(?:.fq|.fastq)',expand=False)
199
+ # 单端模式下,所有前缀都视为样品名
200
+ sample_names = prefix.apply(os.path.basename)
201
+ else:
202
+ print('No single-end samples detected!')
203
+ sample_names = 'no sample'
204
+ files_R1 = []
205
+
206
+ return sample_names, files_R1, files_R2
207
+
208
+
103
209
  def sgRNA_alignment(a_key, sgRNA, seq, frag_len, DNA_matrix=None, mismatch_score = 0.01, return_align=False):
104
210
  from Bio import pairwise2
105
211
  import numpy as np
106
212
  if DNA_matrix is None:
107
- DNA_matrix = {('A','A'): 2, ('A','T'):0.01, ('A','C'):0.01, ('A','G'):0.01, ('A','N'):0.01,
108
- ('T','T'): 2, ('T','A'):0.01, ('T','C'):0.01, ('T','G'):0.01, ('T','N'):0.01,
109
- ('G','G'): 2, ('G','A'):0.01, ('G','C'):0.01, ('G','T'):0.01, ('G','N'):0.01,
110
- ('C','C'): 2, ('C','A'):0.01, ('C','G'):0.01, ('C','T'):0.01, ('C','N'):0.01,
213
+ DNA_matrix = {('A','A'): 2, ('A','T'):0.01, ('A','C'):0.01, ('A','G'):0.01, ('A','N'):2,
214
+ ('T','T'): 2, ('T','A'):0.01, ('T','C'):0.01, ('T','G'):0.01, ('T','N'):2,
215
+ ('G','G'): 2, ('G','A'):0.01, ('G','C'):0.01, ('G','T'):0.01, ('G','N'):2,
216
+ ('C','C'): 2, ('C','A'):0.01, ('C','G'):0.01, ('C','T'):0.01, ('C','N'):2,
111
217
  ('N','N'): 2, ('N','C'):2, ('N','A'): 2, ('N','G'): 2, ('N','T'): 2}
112
218
  # a_key 是 pybedtools 得到的位置 chrA:X-Y 而 X 数字会往左多1bp
113
219
  alignments = pairwise2.align.localds( sgRNA, seq, DNA_matrix, -2, -2, penalize_extend_when_opening=False)
@@ -1,4 +1,4 @@
1
- __version__ = "2.7.10"
1
+ __version__ = "2.10.0"
2
2
  # 2023.08.11. v1.1.0 adding a option for not normalizing the bw file
3
3
  # 2023.10.26. v1.9.0 prerelease for v2.0
4
4
  # 2023.10.27. v2.0.0 大更新,还没微调
@@ -27,4 +27,10 @@ __version__ = "2.7.10"
27
27
  # 2024.01.23. v2.7.7 Snakefile_offtracker: add --fixedStep to bigwigCompare for not merging neighbouring bins with equal values.
28
28
  # 2024.02.01. v2.7.8 逐步添加 X_offplot.py 功能
29
29
  # 2024.06.02. v2.7.9 添加 offtracker_plot.py
30
- # 2024.06.03. v2.7.10 修复 bugs,offtable 添加 threshold = 2 的分界
30
+ # 2024.06.03. v2.7.10 修复 bugs,offtable 添加 threshold = 2 的分界
31
+ # 2024.06.04. v2.7.11 readme 修改
32
+ # 2024.11.19. v2.7.12 offtracker_candidates.py 新增 --pam_location 参数指定 upstream 或 downstream,用于非 Cas9 情况
33
+ # 2025.04.25. v2.8.0 修复了 offtracker candidates 会把小写序列转换成 N 的 bug
34
+ # 2025.05.22. v2.9.0 翻新部分代码结构
35
+ # 2025.06.05. v2.10.0 增加了QC模块。保留了负数score的记录,并在plot时显示为红字。增加了 "--ignore_chr" 用于跳过common chr过滤。
36
+
@@ -0,0 +1,66 @@
1
+ # 更新记录:
2
+ # 2022.05.04. v1.0: 初步运行, fastp + multiqc
3
+ # 2024.01.17. v2.0: 翻新结构,匹配 X_NGS 框架
4
+
5
+ # 参数列表
6
+ configfile: "config.yaml"
7
+
8
+ ### config['files_R1'], config['files_R2'] 为 dict型
9
+
10
+ # # fastq 信息
11
+ _files_R1 = config['files_R1'] # dict型, key 为 sample
12
+ _files_R2 = config['files_R2'] # dict型, key 为 sample
13
+ # # 输入输出文件夹
14
+ # config['input_dir']
15
+ _output_dir = config["output_dir"]
16
+ # # 运行参数
17
+ _thread = config['thread']
18
+ # config['utility_dir']
19
+
20
+ import os
21
+
22
+ ############################
23
+ # conditional output_files #
24
+ ############################
25
+ output_HT = expand( os.path.join(_output_dir,"{sample}_fastp.html"), sample=_files_R1)
26
+ output_JS = expand( os.path.join(_output_dir,"{sample}_fastp.json"), sample=_files_R1)
27
+ output_MQC = os.path.join(_output_dir,"MultiQC_Report_Raw.html")
28
+ output_R1 = expand( os.path.join(_output_dir,"{sample}_trimmed_1.fq.gz"), sample=_files_R1) # dict 会自动迭代 keys
29
+ output_R2 = expand( os.path.join(_output_dir,"{sample}_trimmed_2.fq.gz"), sample=_files_R1)
30
+
31
+ output_files = output_HT + output_JS + [output_MQC] + output_R1 + output_R2
32
+
33
+ rule all:
34
+ input:
35
+ output_files
36
+
37
+ #######################
38
+ ## fastp and multiQC ##
39
+ #######################
40
+ rule QCtrim:
41
+ input:
42
+ R1=lambda w: _files_R1[w.sample],
43
+ R2=lambda w: _files_R2[w.sample]
44
+ threads:
45
+ _thread
46
+ output:
47
+ R1=os.path.join(_output_dir,"{sample}_trimmed_1.fq.gz"),
48
+ R2=os.path.join(_output_dir,"{sample}_trimmed_2.fq.gz"),
49
+ HT=os.path.join(_output_dir,"{sample}_fastp.html"),
50
+ JS=os.path.join(_output_dir,"{sample}_fastp.json")
51
+ shell:
52
+ """
53
+ fastp -i {input.R1} -I {input.R2} -o {output.R1} -O {output.R2} \
54
+ -h {wildcards.sample}_fastp.html -j {wildcards.sample}_fastp.json \
55
+ --length_required 10 --thread {threads} --detect_adapter_for_pe --disable_quality_filtering
56
+ """
57
+
58
+ rule multiqc:
59
+ input:
60
+ expand( os.path.join(_output_dir,"{sample}_fastp.html"), sample=_files_R1 )
61
+ threads:
62
+ _thread
63
+ output:
64
+ os.path.join(_output_dir,"MultiQC_Report_Raw.html")
65
+ shell:
66
+ "multiqc {_output_dir} -n MultiQC_Report_Raw --outdir {_output_dir}"