offtracker 2.7.8__zip → 2.10.0__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. offtracker-2.10.0/PKG-INFO +233 -0
  2. offtracker-2.10.0/README.md +221 -0
  3. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker/X_offplot.py +37 -8
  4. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker/X_sequence.py +113 -7
  5. offtracker-2.10.0/offtracker/_version.py +36 -0
  6. offtracker-2.10.0/offtracker/snakefile/Snakefile_QC.smk +66 -0
  7. offtracker-2.10.0/offtracker/snakefile/Snakefile_offtracker.smk +249 -0
  8. offtracker-2.7.8/offtracker/mapping/1.1_bed2fr_v4.5.py → offtracker-2.10.0/offtracker/utility/1.1_bed2fr.py +6 -4
  9. offtracker-2.10.0/offtracker.egg-info/PKG-INFO +233 -0
  10. offtracker-2.10.0/offtracker.egg-info/SOURCES.txt +28 -0
  11. {offtracker-2.7.8 → offtracker-2.10.0}/scripts/offtracker_analysis.py +20 -5
  12. offtracker-2.10.0/scripts/offtracker_candidates.py +318 -0
  13. {offtracker-2.7.8 → offtracker-2.10.0}/scripts/offtracker_config.py +28 -44
  14. offtracker-2.10.0/scripts/offtracker_plot.py +39 -0
  15. offtracker-2.10.0/scripts/offtracker_qc.py +62 -0
  16. {offtracker-2.7.8 → offtracker-2.10.0}/setup.py +8 -4
  17. offtracker-2.7.8/PKG-INFO +0 -146
  18. offtracker-2.7.8/README.md +0 -134
  19. offtracker-2.7.8/offtracker/_version.py +0 -28
  20. offtracker-2.7.8/offtracker/mapping/Snakefile_offtracker +0 -245
  21. offtracker-2.7.8/offtracker.egg-info/PKG-INFO +0 -146
  22. offtracker-2.7.8/offtracker.egg-info/SOURCES.txt +0 -25
  23. offtracker-2.7.8/scripts/offtracker_candidates.py +0 -307
  24. {offtracker-2.7.8 → offtracker-2.10.0}/LICENSE.txt +0 -0
  25. {offtracker-2.7.8 → offtracker-2.10.0}/MANIFEST.in +0 -0
  26. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker/X_offtracker.py +0 -0
  27. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker/__init__.py +0 -0
  28. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/1.3_bdg_normalize_v4.0.py +0 -0
  29. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/bedGraphToBigWig +0 -0
  30. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/hg38.chrom.sizes +0 -0
  31. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/mm10.chrom.sizes +0 -0
  32. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/offtracker_blacklist_hg38.merged.bed +0 -0
  33. {offtracker-2.7.8/offtracker/mapping → offtracker-2.10.0/offtracker/utility}/offtracker_blacklist_mm10.merged.bed +0 -0
  34. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker.egg-info/dependency_links.txt +0 -0
  35. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker.egg-info/requires.txt +0 -0
  36. {offtracker-2.7.8 → offtracker-2.10.0}/offtracker.egg-info/top_level.txt +0 -0
  37. {offtracker-2.7.8 → offtracker-2.10.0}/setup.cfg +0 -0
@@ -3,6 +3,7 @@ import math
3
3
  import pandas as pd
4
4
  from itertools import product
5
5
  import numpy as np
6
+ import os, glob
6
7
 
7
8
  ambiguous_nt = {'A': ['A'],
8
9
  'T': ['T'],
@@ -19,7 +20,7 @@ ambiguous_nt = {'A': ['A'],
19
20
  'H': ['A', 'C', 'T'],
20
21
  'D': ['A', 'G', 'T'],
21
22
  'B': ['C', 'G', 'T'],
22
- 'N': ['A', 'T', 'C', 'G']}
23
+ 'N': ['A', 'C', 'G', 'T']}
23
24
 
24
25
  def is_seq_valid(sequence, extra=True, ambiguous_nt=ambiguous_nt):
25
26
  if extra:
@@ -43,12 +44,24 @@ def possible_seq(sequence):
43
44
  raise KeyError(f'Unvalid character \'{valid_check}\' in sequence')
44
45
  return sequences
45
46
 
47
# supports degenerate (IUPAC) base pairs
def get_base_score(base1, base2, exact_score=2, partial_match=2, mismatch_score=0.01, nt_map=None):
    """Return an alignment score for a pair of IUPAC nucleotide codes.

    Parameters
    ----------
    base1, base2 : str
        Single IUPAC nucleotide codes (keys of ``nt_map``).
    exact_score : numeric
        Score when both codes denote exactly the same base set.
    partial_match : numeric
        Score when one code's base set is a subset of the other's
        (e.g. 'A' vs 'N').
    mismatch_score : numeric
        Score when neither base set contains the other.
    nt_map : dict or None
        Mapping from IUPAC code to list of concrete bases; defaults to the
        module-level ``ambiguous_nt`` (same convention as ``is_seq_valid``).
    """
    if nt_map is None:
        nt_map = ambiguous_nt
    set1 = set(nt_map[base1])
    set2 = set(nt_map[base2])
    if set1 == set2:
        return exact_score
    # Order-independent subset test. The previous implementation compared
    # np.union1d(...) against the raw lists, which silently returned a
    # mismatch whenever the lists in ambiguous_nt were not pre-sorted.
    if set1 <= set2 or set2 <= set1:
        return partial_match
    return mismatch_score
57
+
58
+
46
59
def complement(seq):
    """Return the complement of a nucleotide sequence.

    Handles the four canonical bases, IUPAC ambiguity codes and the gap
    character '-'. Raises KeyError for any other character.
    """
    pairs = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', '-': '-',
             'M': 'K', 'R': 'Y', 'W': 'W', 'S': 'S', 'Y': 'R', 'K': 'M',
             'V': 'B', 'H': 'D', 'D': 'H', 'B': 'V'}
    return ''.join(pairs[base] for base in seq)
53
66
 
54
67
  def reverse(seq):
@@ -100,14 +113,107 @@ def add_ID(df, chr_col=0, midpoint='cleavage_site'):#, midpoint='midpoint'):
100
113
  df.loc[point_tail>=500,'ID_2'] = df[chr_col_name] + ':' + (point_head+1).astype(str)
101
114
  return df
102
115
 
116
+
117
+
118
def detect_fastq(folder, n_subfolder, NGS_type='paired-end'):
    """
    Search the n-th-level subfolders of ``folder`` for fastq/fastq.gz/fq/fq.gz files.

    paired-end mode : files matching *2.fq / *2.fastq (+ .gz) are treated as R2
                      and the matching R1 file is verified to exist.
    single-end mode : every fastq/fastq.gz/fq/fq.gz file is treated as single-end.

    Avoid extra characters between "2." and "fq/fastq" (e.g. 2.trimmed.fq.gz):
    the wildcard may then be fooled by other "2."s in the file name.
    Do not use dots in sample names; separate features with "_" and use "-"
    inside a feature, e.g. sample_day-hour_type_batch_rep_1.fq.gz

    Input
    ----------
    folder : root directory
    n_subfolder : subfolder depth to search

    Parameter
    ----------
    NGS_type : 'paired-end' or 'single-end'

    Output
    ----------
    sample_names : detected sample names
    files_R1 : full paths of the R1 files
    files_R2 : full paths of the R2 files

    """
    if NGS_type == 'paired-end':
        print('paired-end mode')
        files_R2 = []
        # four supported file extensions; paths include the search root
        for fastq in ['*2.fq','*2.fastq','*2.fq.gz','*2.fastq.gz']:
            fq_files = glob.glob( os.path.join(folder, n_subfolder*'*/', fastq ) )
            print(f'{len(fq_files)} {fastq[2:]} samples detected')
            files_R2.extend( fq_files )
        #
        if len(files_R2) > 0:
            files_R2 = pd.Series(files_R2).sort_values().reset_index(drop=True)
            # split the file names
            # (raw strings + escaped dots: the patterns match a literal "."
            # and no longer emit invalid-escape-sequence warnings)
            suffix = files_R2.str.extract(r'(\.fastq.*|\.fq.*)',expand=False)
            prefix = files_R2.str.extract(r'(.*)(?:\.fq|\.fastq)',expand=False)
            # split prefix further into sample_dir (the real sample name) and
            # nametype (a common suffix); five sample-name suffixes are supported
            nametype = []
            sample_dir = []
            for a_prefix in prefix:
                for a_type in ['_trimmed_2', '_2_val_2','_R2_val_2','_R2','_2']:
                    len_type = len(a_type)
                    if a_prefix[-len_type:] == a_type:
                        nametype.append(a_type)
                        sample_dir.append(a_prefix[:-len_type])
                        break
            assert len(nametype) == len(files_R2), 'The file name pattern is invalid!'
            nametype = pd.Series(nametype)
            sample_dir = pd.Series(sample_dir)
            # derive the R1 names from the R2 files and check that they exist
            files_R1 = sample_dir + nametype.str.replace('2','1') + suffix
            for i in range(len(files_R1)):
                assert os.path.exists(files_R1[i]), f'{files_R1[i]} not found!'
            sample_names = sample_dir.apply(os.path.basename)
        else:
            print('No paired-end samples detected!')
            sample_names = 'no sample'
            files_R1 = []

    elif NGS_type == 'single-end':
        print('single-end mode')
        files_R1 = []
        files_R2 = [] # placeholder
        # four supported file extensions; paths include the search root
        for fastq in ['*.fq','*.fastq','*.fq.gz','*.fastq.gz']:
            fq_files = glob.glob( os.path.join(folder, n_subfolder*'*/', fastq ) )
            print(f'{len(fq_files)} {fastq[1:]} samples detected')
            files_R1.extend( fq_files )
        # reset_index keeps positional alignment consistent with the
        # paired-end branch
        files_R1 = pd.Series(files_R1).sort_values().reset_index(drop=True)
        #
        if len(files_R1) > 0:
            # split the file names
            suffix = files_R1.str.extract(r'(\.fastq.*|\.fq.*)',expand=False)
            prefix = files_R1.str.extract(r'(.*)(?:\.fq|\.fastq)',expand=False)
            # in single-end mode every prefix is taken as a sample name
            sample_names = prefix.apply(os.path.basename)
        else:
            print('No single-end samples detected!')
            sample_names = 'no sample'
            files_R1 = []

    return sample_names, files_R1, files_R2
207
+
208
+
103
209
  def sgRNA_alignment(a_key, sgRNA, seq, frag_len, DNA_matrix=None, mismatch_score = 0.01, return_align=False):
104
210
  from Bio import pairwise2
105
211
  import numpy as np
106
212
  if DNA_matrix is None:
107
- DNA_matrix = {('A','A'): 2, ('A','T'):0.01, ('A','C'):0.01, ('A','G'):0.01, ('A','N'):0.01,
108
- ('T','T'): 2, ('T','A'):0.01, ('T','C'):0.01, ('T','G'):0.01, ('T','N'):0.01,
109
- ('G','G'): 2, ('G','A'):0.01, ('G','C'):0.01, ('G','T'):0.01, ('G','N'):0.01,
110
- ('C','C'): 2, ('C','A'):0.01, ('C','G'):0.01, ('C','T'):0.01, ('C','N'):0.01,
213
+ DNA_matrix = {('A','A'): 2, ('A','T'):0.01, ('A','C'):0.01, ('A','G'):0.01, ('A','N'):2,
214
+ ('T','T'): 2, ('T','A'):0.01, ('T','C'):0.01, ('T','G'):0.01, ('T','N'):2,
215
+ ('G','G'): 2, ('G','A'):0.01, ('G','C'):0.01, ('G','T'):0.01, ('G','N'):2,
216
+ ('C','C'): 2, ('C','A'):0.01, ('C','G'):0.01, ('C','T'):0.01, ('C','N'):2,
111
217
  ('N','N'): 2, ('N','C'):2, ('N','A'): 2, ('N','G'): 2, ('N','T'): 2}
112
218
  # a_key 是 pybedtools 得到的位置 chrA:X-Y 而 X 数字会往左多1bp
113
219
  alignments = pairwise2.align.localds( sgRNA, seq, DNA_matrix, -2, -2, penalize_extend_when_opening=False)
@@ -0,0 +1,36 @@
1
__version__ = "2.10.0"
# 2023.08.11. v1.1.0 adding a option for not normalizing the bw file
# 2023.10.26. v1.9.0 prerelease for v2.0
# 2023.10.27. v2.0.0 major update, fine-tuning still pending
# 2023.10.28. v2.1.0 bug fixes; added computation of signal length
# 2023.10.28. v2.2.0 bug fixes; changed the signal-length algorithm
# 2023.10.29. v2.3.0 added overall signal calculation
# 2023.11.01. v2.3.1 added a signal_only option
# 2023.11.02. v2.3.2 changed the order of computing sample signal and group mean
# 2023.11.04. v2.3.3 fixed a sorting error when normalizing the overall score
# 2023.11.05. v2.3.4 fixed wrong column selection when detecting one-sided signal overflow
# 2023.11.13. v2.3.5 minor tweak of the track score
# 2023.12.05. v2.3.6 candidates now include the cleavage site; fixed misalignment when the alignment contains a deletion
# 2023.12.05. v2.3.7 use cleavage site instead of midpoint # migration not finished yet
# 2023.12.07. v2.3.8 df_score gains separate df_exp / df_ctr columns; fixed a bug when df_ctr is absent; track score uses proximal
# 2023.12.09. v2.4.0 to balance proximal and overall, add an overall-signal bonus when the normalized overall signal exceeds 2
# 2023.12.09. v2.5.0 trying a new weighting position
# 2023.12.10. v2.6.0 added the trackseq v4 branch, i.e. consider positive_pct within a Region to avoid short, sharp signals
# 2023.12.10. v2.6.1 some non-specific signals are very large; a large negative value in the control group could yield a false high signal after subtraction, so negative values are clipped
# 2023.12.30. v2.7.0 added the X_offplot module for plotting
# 2023.12.31. v2.7.1 control negative-value clip changed from -5 to -1 to further reduce false positives; overall is no longer added
# 2024.01.01. v2.7.2 weights changed to proximal + pct = 1 + 1; anti-overflow false-positive threshold changed from <0 to <=0
# 2024.01.02. v2.7.3 default flank regions changed to 1000 2000 3000 5000; control clipping, previously applied to the final score, is now applied per value (default CtrClip=-0.5) before re-scoring
# 2024.01.03. v2.7.4 updated blacklist.bed
# 2024.01.04. v2.7.5 updated the hg38 blacklist.bed
# 2024.01.12. v2.7.6 minor bug fixes; output fdr changed to <0.05.
# 2024.01.23. v2.7.7 Snakefile_offtracker: add --fixedStep to bigwigCompare for not merging neighbouring bins with equal values.
# 2024.02.01. v2.7.8 gradually adding X_offplot.py features
# 2024.06.02. v2.7.9 added offtracker_plot.py
# 2024.06.03. v2.7.10 bug fixes; offtable adds a threshold = 2 boundary
# 2024.06.04. v2.7.11 readme updates
# 2024.11.19. v2.7.12 offtracker_candidates.py: new --pam_location option (upstream or downstream) for non-Cas9 cases
# 2025.04.25. v2.8.0 fixed a bug where offtracker candidates converted lower-case sequences to N
# 2025.05.22. v2.9.0 refactored part of the code structure
# 2025.06.05. v2.10.0 added a QC module; records with negative scores are kept and shown in red when plotting; added "--ignore_chr" to skip common-chromosome filtering
36
+
@@ -0,0 +1,66 @@
1
# Change log:
# 2022.05.04. v1.0: first working version, fastp + multiqc
# 2024.01.17. v2.0: refactored to match the X_NGS framework

# parameters
configfile: "config.yaml"

### config['files_R1'] and config['files_R2'] are dicts

# # fastq information
_files_R1 = config['files_R1'] # dict, keyed by sample
_files_R2 = config['files_R2'] # dict, keyed by sample
# # input/output folders
# config['input_dir']
_output_dir = config["output_dir"]
# # run parameters
_thread = config['thread']
# config['utility_dir']

import os

############################
# conditional output_files #
############################
output_HT = expand( os.path.join(_output_dir,"{sample}_fastp.html"), sample=_files_R1)
output_JS = expand( os.path.join(_output_dir,"{sample}_fastp.json"), sample=_files_R1)
output_MQC = os.path.join(_output_dir,"MultiQC_Report_Raw.html")
output_R1 = expand( os.path.join(_output_dir,"{sample}_trimmed_1.fq.gz"), sample=_files_R1) # iterating a dict yields its keys
output_R2 = expand( os.path.join(_output_dir,"{sample}_trimmed_2.fq.gz"), sample=_files_R1)

output_files = output_HT + output_JS + [output_MQC] + output_R1 + output_R2

rule all:
    input:
        output_files

#######################
## fastp and multiQC ##
#######################
rule QCtrim:
    input:
        R1=lambda w: _files_R1[w.sample],
        R2=lambda w: _files_R2[w.sample]
    threads:
        _thread
    output:
        R1=os.path.join(_output_dir,"{sample}_trimmed_1.fq.gz"),
        R2=os.path.join(_output_dir,"{sample}_trimmed_2.fq.gz"),
        HT=os.path.join(_output_dir,"{sample}_fastp.html"),
        JS=os.path.join(_output_dir,"{sample}_fastp.json")
    shell:
        # fix: write the fastp reports to the declared outputs; the previous
        # "-h {wildcards.sample}_fastp.html" wrote to the working directory,
        # so the rule failed whenever it differed from _output_dir
        """
        fastp -i {input.R1} -I {input.R2} -o {output.R1} -O {output.R2} \
        -h {output.HT} -j {output.JS} \
        --length_required 10 --thread {threads} --detect_adapter_for_pe --disable_quality_filtering
        """

rule multiqc:
    input:
        expand( os.path.join(_output_dir,"{sample}_fastp.html"), sample=_files_R1 )
    threads:
        _thread
    output:
        os.path.join(_output_dir,"MultiQC_Report_Raw.html")
    shell:
        "multiqc {_output_dir} -n MultiQC_Report_Raw --outdir {_output_dir}"
@@ -0,0 +1,249 @@
1
# 2023.08.11. adding a option for not normalizing the bw file
# 2024.01.23. add --fixedStep to bigwigCompare for not merging neighbouring bins with equal values.
# 2025.05.22. refine the structure

configfile: "config.yaml"

# # fastq information
_files_R1 = config['files_R1'] # dict, keyed by sample
_files_R2 = config['files_R2'] # dict, keyed by sample
# # run parameters
_output_dir = config["output_dir"]
# fix: every rule below references "_threads"; this was previously assigned
# to "_thread", which made Snakemake raise a NameError while parsing
_threads = config['thread']
_BinSize = str(config["binsize"])
_normalize = config["normalize"]


import os

if _normalize == "True":
    rule all:
        input:
            expand( os.path.join(_output_dir,"{sample}.fw.bed"), sample=_files_R1 ),
            expand( os.path.join(_output_dir,"{sample}.rv.bed"), sample=_files_R1 ),
            expand( os.path.join(_output_dir,"{sample}.fw.scaled.bw"), sample=_files_R1 ),
            expand( os.path.join(_output_dir,"{sample}.rv.scaled.bw"), sample=_files_R1 ),
            expand( os.path.join(_output_dir,"{sample}." + _BinSize + ".add.bdg"),sample=_files_R1 ),
elif _normalize == "False":
    rule all:
        input:
            expand( os.path.join(_output_dir,"{sample}.fw.bed"), sample=_files_R1 ),
            expand( os.path.join(_output_dir,"{sample}.rv.bed"), sample=_files_R1 ),
            expand( os.path.join(_output_dir,"{sample}.fw.raw.bw"), sample=_files_R1 ),
            expand( os.path.join(_output_dir,"{sample}.rv.raw.bw"), sample=_files_R1 ),
else:
    raise ValueError('Please provide "True" or "False" for "--normalize" when running offtracker_config.py')


rule chromap:
    input:
        R1=lambda w: _files_R1[w.sample],
        R2=lambda w: _files_R2[w.sample]
    threads:
        _threads
    params:
        index=config["index"],
        fasta=config["fasta"]
    output:
        temp(os.path.join(_output_dir,"{sample}.chromapx.bed"))
    shell:
        """
        chromap -l 3000 --low-mem --BED --remove-pcr-duplicates \
        --min-read-length 10 --allocate-multi-mappings \
        -x {params.index} -r {params.fasta} -t {threads} -1 {input.R1} -2 {input.R2} -o {output}
        """

if config["blacklist"] != 'none':
    rule remove_blacklist:
        input:
            os.path.join(_output_dir,"{sample}.chromapx.bed")
        threads:
            _threads
        params:
            blacklist=config["blacklist"]
        output:
            temp(os.path.join(_output_dir,"{sample}.filtered.bed"))
        shell:
            "bedtools intersect -a {input} -b {params.blacklist} -v > {output}"

    rule bed2fr:
        input:
            os.path.join(_output_dir,"{sample}.filtered.bed")
        threads:
            _threads
        params:
            dir_script=config["utility_dir"],
            ignore_chr=config["ignore_chr"],
        output:
            fw=os.path.join(_output_dir,"{sample}.fw.bed"),
            rv=os.path.join(_output_dir,"{sample}.rv.bed")
        shell:
            "python {params.dir_script}/1.1_bed2fr.py -b {input} {params.ignore_chr}"
else:
    rule bed2fr:
        input:
            os.path.join(_output_dir,"{sample}.chromapx.bed")
        threads:
            _threads
        params:
            dir_script=config["utility_dir"],
            ignore_chr=config["ignore_chr"],
        output:
            fw=os.path.join(_output_dir,"{sample}.fw.bed"),
            rv=os.path.join(_output_dir,"{sample}.rv.bed")
        shell:
            "python {params.dir_script}/1.1_bed2fr.py -b {input} {params.ignore_chr}"

rule bed2bdg_fw:
    input:
        os.path.join(_output_dir,"{sample}.fw.bed")
    threads:
        _threads
    params:
        gl=config["genomelen"]
    output:
        temp(os.path.join(_output_dir,"{sample}.fw.bdg"))
    shell:
        "bedtools genomecov -bg -i {input} -g {params.gl} > {output}"

rule bed2bdg_rv:
    input:
        os.path.join(_output_dir,"{sample}.rv.bed")
    threads:
        _threads
    params:
        gl=config["genomelen"]
    output:
        temp(os.path.join(_output_dir,"{sample}.rv.bdg"))
    shell:
        "bedtools genomecov -bg -i {input} -g {params.gl} > {output}"

rule bdg_sort_fw:
    input:
        fw=os.path.join(_output_dir,"{sample}.fw.bdg")
    threads:
        _threads
    output:
        temp(os.path.join(_output_dir,"{sample}.fw.sorted.bdg"))
    shell:
        "bedtools sort -i {input.fw} > {output}"

rule bdg_sort_rv:
    input:
        rv=os.path.join(_output_dir,"{sample}.rv.bdg")
    threads:
        _threads
    output:
        temp(os.path.join(_output_dir,"{sample}.rv.sorted.bdg"))
    shell:
        "bedtools sort -i {input.rv} > {output}"

if _normalize == "True":
    rule bdg_normalize_fw:
        input:
            bdg=os.path.join(_output_dir,"{sample}.fw.sorted.bdg"),
            bed=os.path.join(_output_dir,"{sample}.fw.bed")
        threads:
            _threads
        params:
            dir_script=config["utility_dir"]
        output:
            temp(os.path.join(_output_dir,"{sample}.fw.scaled.bdg"))
        shell:
            "python {params.dir_script}/1.3_bdg_normalize_v4.0.py --bdg {input.bdg} --bed {input.bed}"

    rule bdg_normalize_rv:
        input:
            bdg=os.path.join(_output_dir,"{sample}.rv.sorted.bdg"),
            bed=os.path.join(_output_dir,"{sample}.rv.bed")
        threads:
            _threads
        params:
            dir_script=config["utility_dir"]
        output:
            temp(os.path.join(_output_dir,"{sample}.rv.scaled.bdg"))
        shell:
            "python {params.dir_script}/1.3_bdg_normalize_v4.0.py --bdg {input.bdg} --bed {input.bed}"

    rule bdg2bw_fw:
        input:
            os.path.join(_output_dir,"{sample}.fw.scaled.bdg")
        threads:
            _threads
        params:
            gl=config["genomelen"],
            dir_script=config["utility_dir"]
        output:
            os.path.join(_output_dir,"{sample}.fw.scaled.bw")
        shell:
            "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"

    rule bdg2bw_rv:
        input:
            os.path.join(_output_dir,"{sample}.rv.scaled.bdg")
        threads:
            _threads
        params:
            gl=config["genomelen"],
            dir_script=config["utility_dir"]
        output:
            os.path.join(_output_dir,"{sample}.rv.scaled.bw")
        shell:
            "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"

    rule bwAdd:
        input:
            fw=os.path.join(_output_dir,"{sample}.fw.scaled.bw"),
            rv=os.path.join(_output_dir,"{sample}.rv.scaled.bw")
        threads:
            _threads
        output:
            os.path.join(_output_dir,"{sample}." + _BinSize + ".add.bdg")
        shell:
            """
            bigwigCompare --binSize {_BinSize} -p {threads} --verbose -o {output} \
            --outFileFormat bedgraph --fixedStep \
            --bigwig1 {input.fw} \
            --bigwig2 {input.rv} \
            --operation add
            """
else:
    rule bdg_reverse_rv:
        input:
            os.path.join(_output_dir,"{sample}.rv.sorted.bdg")
        threads:
            _threads
        output:
            temp(os.path.join(_output_dir,"{sample}.rv.sorted_r.bdg"))
        shell:
            "awk -F '\t' -v OFS='\t' '{{$4=-$4; print}}' {input} > {output}"

    rule bdg2bw_fw:
        input:
            os.path.join(_output_dir,"{sample}.fw.sorted.bdg")
        threads:
            _threads
        params:
            gl=config["genomelen"],
            dir_script=config["utility_dir"]
        output:
            os.path.join(_output_dir,"{sample}.fw.raw.bw")
        shell:
            "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"

    rule bdg2bw_rv:
        input:
            os.path.join(_output_dir,"{sample}.rv.sorted_r.bdg")
        threads:
            _threads
        params:
            gl=config["genomelen"],
            dir_script=config["utility_dir"]
        output:
            os.path.join(_output_dir,"{sample}.rv.raw.bw")
        shell:
            "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"
246
+
247
+
248
+
249
+
@@ -8,19 +8,21 @@ parser.description='这算一个小彩蛋'
8
8
  # 2022.10.21. v3.0: 文件名长度 chromap -> filtered
9
9
  # 2022.10.26. v4.0: f,r 改成 fw,rv
10
10
  # 2022.01.11. v4.5: 只取 common chromosomes (chr1-chr22, chrX, chrY, chrM)
11
+ # 2025.06.05. v5.0: 增加 ignore_chr 选项,默认只取 common chromosomes
11
12
 
12
13
  # 单文件处理脚本,配合snakemake使用
13
14
 
14
15
  parser.add_argument("-b", "--bed", type=str, metavar="dir_bed" , required=True, help="dir of bed file")
16
+ parser.add_argument('--ignore_chr', action='store_true', help='If not set, only chr1-chr22, chrX, chrY, chrM will be analyzed.')
15
17
 
16
18
  args = parser.parse_args()
17
19
 
18
20
  bed_file = pd.read_csv( args.bed, sep='\t', header=None)
19
21
 
20
- common_chr = pd.Series(['chr']*22).str[:] + pd.Series(range(1,23)).astype(str).str[:]
21
- common_chr = pd.concat([common_chr, pd.Series(['chrX','chrY','chrM'])]).to_numpy()
22
-
23
- bed_file = bed_file[bed_file[0].isin(common_chr)]
22
+ if not args.ignore_chr:
23
+ common_chr = pd.Series(['chr']*22).str[:] + pd.Series(range(1,23)).astype(str).str[:]
24
+ common_chr = pd.concat([common_chr, pd.Series(['chrX','chrY','chrM'])]).to_numpy()
25
+ bed_file = bed_file[bed_file[0].isin(common_chr)]
24
26
 
25
27
  bed_f = bed_file[bed_file[5]=='+']
26
28
  bed_r = bed_file[bed_file[5]=='-']