offtracker 1.0.1__zip → 2.7.7__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {offtracker-1.0.1/offtracker.egg-info → offtracker-2.7.7}/PKG-INFO +13 -6
- {offtracker-1.0.1 → offtracker-2.7.7}/README.md +12 -5
- offtracker-2.7.7/offtracker/X_offplot.py +123 -0
- offtracker-2.7.7/offtracker/X_offtracker.py +338 -0
- offtracker-1.0.1/offtracker/X_general.py → offtracker-2.7.7/offtracker/X_sequence.py +18 -5
- {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/__init__.py +1 -1
- offtracker-2.7.7/offtracker/_version.py +27 -0
- offtracker-2.7.7/offtracker/mapping/Snakefile_offtracker +245 -0
- offtracker-2.7.7/offtracker/mapping/offtracker_blacklist_hg38.merged.bed +3846 -0
- offtracker-2.7.7/offtracker/mapping/offtracker_blacklist_mm10.merged.bed +5827 -0
- {offtracker-1.0.1 → offtracker-2.7.7/offtracker.egg-info}/PKG-INFO +13 -6
- {offtracker-1.0.1 → offtracker-2.7.7}/offtracker.egg-info/SOURCES.txt +4 -3
- offtracker-2.7.7/scripts/offtracker_analysis.py +369 -0
- {offtracker-1.0.1 → offtracker-2.7.7}/scripts/offtracker_candidates.py +59 -101
- {offtracker-1.0.1 → offtracker-2.7.7}/scripts/offtracker_config.py +15 -10
- offtracker-1.0.1/offtracker/X_analysis.py +0 -332
- offtracker-1.0.1/offtracker/_version.py +0 -1
- offtracker-1.0.1/offtracker/mapping/Snakefile_Trackseq +0 -193
- offtracker-1.0.1/offtracker/mapping/offtracker_blacklist_hg38.merged.bed +0 -22228
- offtracker-1.0.1/offtracker/mapping/offtracker_blacklist_mm10.merged.bed +0 -9347
- offtracker-1.0.1/scripts/offtracker_analysis.py +0 -407
- {offtracker-1.0.1 → offtracker-2.7.7}/LICENSE.txt +0 -0
- {offtracker-1.0.1 → offtracker-2.7.7}/MANIFEST.in +0 -0
- {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/1.1_bed2fr_v4.5.py +0 -0
- {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/1.3_bdg_normalize_v4.0.py +0 -0
- {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/bedGraphToBigWig +0 -0
- {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/hg38.chrom.sizes +0 -0
- {offtracker-1.0.1 → offtracker-2.7.7}/offtracker/mapping/mm10.chrom.sizes +0 -0
- {offtracker-1.0.1 → offtracker-2.7.7}/offtracker.egg-info/dependency_links.txt +0 -0
- {offtracker-1.0.1 → offtracker-2.7.7}/offtracker.egg-info/requires.txt +0 -0
- {offtracker-1.0.1 → offtracker-2.7.7}/offtracker.egg-info/top_level.txt +0 -0
- {offtracker-1.0.1 → offtracker-2.7.7}/setup.cfg +0 -0
- {offtracker-1.0.1 → offtracker-2.7.7}/setup.py +0 -0
@@ -1,332 +0,0 @@
|
|
1
|
-
|
2
|
-
import pandas as pd
|
3
|
-
import numpy as np
|
4
|
-
import os, sys, pybedtools
|
5
|
-
sys.path.append( os.path.abspath(os.path.dirname(__file__)) )
|
6
|
-
from X_general import *
|
7
|
-
|
8
|
-
def signal_formula1(signal, nonsignal, pseudocount):
    """Return ``signal / (nonsignal + pseudocount)``.

    Original author's note: with this form the background mean ends up
    slightly below 1.

    ``pseudocount`` must be strictly positive (checked with ``assert``).
    """
    assert pseudocount>0
    denominator = nonsignal + pseudocount
    return signal / denominator
|
12
|
-
|
13
|
-
def signal_formula2(signal, nonsignal, pseudocount):
    """Return ``(signal + pseudocount) / (nonsignal + pseudocount)``.

    Original author's note: with this form the background mean is 1.

    ``pseudocount`` must be strictly positive (checked with ``assert``).
    """
    assert pseudocount>0
    numerator = signal + pseudocount
    denominator = nonsignal + pseudocount
    return numerator / denominator
|
17
|
-
|
18
|
-
def signal_formula3(signal, nonsignal, pseudocount):
    """Return ``signal - nonsignal + 1`` with negative results set to 0.

    Original author's note: this shifts the background mean to 1.

    The result is modified through ``.loc``, so the inputs are expected to be
    pandas objects. ``pseudocount`` is only validated (must be > 0); it does
    not enter the computation.
    """
    assert pseudocount>0
    shifted = (signal - nonsignal) + 1
    is_negative = shifted < 0
    shifted.loc[is_negative] = 0
    return shifted
|
24
|
-
|
25
|
-
def signal_formula4(signal, nonsignal, pseudocount):
    """Return ``signal`` unchanged.

    ``nonsignal`` is ignored and ``pseudocount`` is only validated (> 0), so
    this formula shares its signature with the other ``signal_formula*``
    variants.
    Original author's note: the background mean is 1.
    """
    assert pseudocount>0
    unchanged = signal
    return unchanged
|
29
|
-
|
30
|
-
def shape_formula1(normed_left_signal, normed_left_nonsignal, normed_right_signal, normed_right_nonsignal, bkg_noise, ratio_thresh=1.5, exp_weight = 0.3):
    """Return a bonus coefficient built from the left and right signal/nonsignal ratios.

    Every input is floored at 1 first so that very small values cannot
    distort the ratios. A row keeps its two ratios only when the smaller of
    them exceeds ``ratio_thresh``; otherwise both are reset to 1. The result
    is ``(left_ratio * right_ratio) ** exp_weight``.

    ``bkg_noise`` is accepted for signature compatibility and is not used.
    """
    floored_left_signal, floored_left_nonsignal, floored_right_signal, floored_right_nonsignal = [
        values.clip(lower=1)
        for values in (normed_left_signal, normed_left_nonsignal,
                       normed_right_signal, normed_right_nonsignal)
    ]
    ratio_left = floored_left_signal / floored_left_nonsignal
    ratio_right = floored_right_signal / floored_right_nonsignal
    # A site has a "good shape" only if BOTH sides clear the threshold.
    weaker_side = pd.concat([ratio_left, ratio_right], axis=1).min(axis=1)
    passes = weaker_side > ratio_thresh
    ratio_left = ratio_left.where(passes, 1)
    ratio_right = ratio_right.where(passes, 1)
    return np.power(ratio_left * ratio_right, exp_weight)
|
45
|
-
|
46
|
-
def shape_formula2(normed_left_signal, normed_left_nonsignal, normed_right_signal, normed_right_nonsignal, bkg_noise, ratio_thresh=1, exp_weight = 0.3):
    """Return a bonus coefficient built from the left and right signal-minus-nonsignal residuals.

    A row keeps its residuals only when BOTH the left and the right residual
    exceed ``ratio_thresh``; otherwise both are set to 0. The bonus is
    ``((left + right) / (2 * ratio_thresh)) ** exp_weight``, floored at 1.

    ``bkg_noise`` is accepted for signature compatibility and is not used.
    (The original comment mentioned taking max/min/mean of the two sides; the
    code requires both sides to pass.)
    """
    residual_left = normed_left_signal - normed_left_nonsignal
    residual_right = normed_right_signal - normed_right_nonsignal
    both_pass = (residual_left > ratio_thresh) & (residual_right > ratio_thresh)
    residual_left = residual_left.where(both_pass, 0)
    residual_right = residual_right.where(both_pass, 0)
    scaled_sum = (residual_left + residual_right) / (2 * ratio_thresh)
    return np.power(scaled_sum, exp_weight).clip(lower=1)
|
59
|
-
|
60
|
-
def score_formula1(exp, ctrl, pseudocount):
    """Return ``exp / (max(ctrl, 0) + pseudocount)``.

    Negative control values are treated as 0. ``pseudocount`` must be > 0.
    """
    assert pseudocount>0
    floored_ctrl = max(ctrl, 0)
    return exp / (floored_ctrl + pseudocount)
|
63
|
-
|
64
|
-
def score_formula2(exp, ctrl, pseudocount):
    """Return ``(exp + pseudocount) / (max(ctrl, 0) + pseudocount)``.

    Negative control values are treated as 0. ``pseudocount`` must be > 0.
    """
    assert pseudocount>0
    floored_ctrl = max(ctrl, 0)
    top = exp + pseudocount
    bottom = floored_ctrl + pseudocount
    return top / bottom
|
67
|
-
|
68
|
-
def score_formula3(exp, ctrl, pseudocount):
    """Return ``exp`` divided by the larger of ``ctrl`` and ``pseudocount``.

    ``pseudocount`` therefore acts as a lower bound on the denominator and
    must be > 0.
    """
    assert pseudocount>0
    denominator = max(ctrl, pseudocount)
    return exp / denominator
|
71
|
-
|
72
|
-
def score_formula4(exp, ctrl, pseudocount):
    """Return ``exp - ctrl``, floored at 0.

    ``pseudocount`` is only validated (> 0); it does not enter the result.
    """
    assert pseudocount>0
    difference = exp - ctrl
    return max(difference, 0)
|
75
|
-
|
76
|
-
def score_formula5(exp, ctrl, pseudocount):
    """Return ``exp`` unchanged.

    ``ctrl`` is ignored and ``pseudocount`` is only validated (> 0), so this
    formula shares its signature with the other ``score_formula*`` variants.
    """
    assert pseudocount>0
    unchanged = exp
    return unchanged
|
79
|
-
|
80
|
-
def fdr(p_vals):
    """Return Benjamini-Hochberg adjusted p-values for a 1-D set of p-values.

    Each p-value is scaled by ``n / rank`` (rank 1 = smallest p-value), then
    the step-up rule is applied: walking from the largest p-value down, every
    adjusted value is capped by the adjusted value of the next larger
    p-value, so the output is monotone in the input. Values are finally
    capped at 1.

    Parameters
    ----------
    p_vals : pandas.Series, numpy.ndarray or sequence of float
        Raw p-values. Assumed to contain no NaN.

    Returns
    -------
    pandas.Series or numpy.ndarray
        Adjusted p-values in the input order. A Series input yields a Series
        with the same index and name; anything else yields an ndarray.
    """
    p_array = np.asarray(p_vals, dtype=float)
    n_tests = p_array.size
    adjusted = np.empty(n_tests, dtype=float)
    if n_tests > 0:
        # Stable sort gives tied p-values consecutive ordinal ranks; the
        # cumulative minimum below then assigns ties the same adjusted value.
        order = np.argsort(p_array, kind='mergesort')
        scaled = p_array[order] * n_tests / np.arange(1, n_tests + 1)
        # Step-up: enforce monotonicity starting from the largest p-value.
        monotone = np.minimum.accumulate(scaled[::-1])[::-1]
        adjusted[order] = np.minimum(monotone, 1)
    if isinstance(p_vals, pd.Series):
        return pd.Series(adjusted, index=p_vals.index, name=p_vals.name)
    return adjusted
|
87
|
-
|
88
|
-
def dedup_two( df_loc, col_ID_1='ID_1', col_ID_2='ID_2'):
    """Flag the rows to keep so that no ID (from either column) is reused.

    Rows are visited in the order of ``df_loc``; the first row carrying a
    given ID wins. A row is rejected as soon as EITHER of its two IDs has
    already been claimed by a kept row, and a rejected row does not register
    its other ID. Because of that, the union of IDs over the kept rows can be
    smaller than the union over all rows.

    Parameters
    ----------
    df_loc : pandas.DataFrame
        Table holding the two ID columns.
    col_ID_1, col_ID_2 : str
        Names of the two ID columns.

    Returns
    -------
    list of bool
        One flag per row of ``df_loc``: True to keep, False for a duplicate.
    """
    seen_ids = set()
    keep_flags = []
    # zip over the two columns avoids building a Series per row (iterrows).
    for id_1, id_2 in zip(df_loc[col_ID_1], df_loc[col_ID_2]):
        if id_1 in seen_ids or id_2 in seen_ids:
            # Deliberately do not record the unseen ID of a rejected row.
            keep_flags.append(False)
        else:
            seen_ids.add(id_1)
            seen_ids.add(id_2)
            keep_flags.append(True)
    return keep_flags
|
104
|
-
|
105
|
-
def cand_count(a_pref, the_sgRNA, regions, seq_folder, dir_chrom_sizes, overwrite=False):
    """Count forward- and reverse-strand reads over the left/right candidate windows.

    For every window size in ``regions`` and for both sides (left, right),
    the candidate BED ``{seq_folder}/{the_sgRNA}_candidate_{side}_{region}.bed``
    is intersected via ``BedTool.coverage`` with ``{a_pref}.fw.bed`` and
    ``{a_pref}.rv.bed``. Results are saved as
    ``temp/{basename}_candidate_{the_sgRNA}_{side}_{f|r}strand_{region}.count``
    inside a ``temp`` folder created next to ``a_pref``.

    Parameters
    ----------
    a_pref : str
        Path prefix of the sample; ``.fw.bed`` / ``.rv.bed`` are appended.
    the_sgRNA : str
        sgRNA name used in the candidate and output file names.
    regions : iterable
        Window sizes to process.
    seq_folder : str
        Folder holding the candidate BED files.
    dir_chrom_sizes : str
        Passed to ``coverage`` as ``g`` (genome/chrom-sizes file).
    overwrite : bool
        When True, recompute even if the output already exists.

    A region is considered done when its ``right_rstrand`` file exists; that
    file is written last.
    """
    forward_bed = f'{a_pref}.fw.bed'
    reverse_bed = f'{a_pref}.rv.bed'
    # Outputs are written into a "temp" folder beside the input prefix.
    dirname = os.path.dirname(a_pref)
    basename = os.path.basename(a_pref)
    temp_dir = os.path.join(dirname, 'temp')
    # exist_ok avoids the check-then-create race when samples run in parallel.
    os.makedirs(temp_dir, exist_ok=True)
    a_pref = os.path.join(temp_dir, basename)
    # Order matters: right/rstrand must be produced last (completion marker).
    strand_beds = (('fstrand', forward_bed), ('rstrand', reverse_bed))
    for a_region in regions:
        done_marker = f'{a_pref}_candidate_{the_sgRNA}_right_rstrand_{a_region}.count'
        work = not os.path.isfile(done_marker)
        if overwrite:
            print('overwrite mode')
            work = True
        if not work:
            continue
        print(f'Working for {basename} within {a_region} bp.\n')
        for side in ('left', 'right'):
            side_region = os.path.join(seq_folder, f'{the_sgRNA}_candidate_{side}_{a_region}.bed')
            a = pybedtools.BedTool(side_region)
            for strand, reads_bed in strand_beds:
                b = pybedtools.BedTool(reads_bed)
                c = a.coverage(b,sorted=True,g=dir_chrom_sizes)
                c.saveas(f'{a_pref}_candidate_{the_sgRNA}_{side}_{strand}_{a_region}.count')
|
145
|
-
|
146
|
-
def bkg_count(a_pref, the_sgRNA, bkgs, seq_folder, dir_chrom_sizes, overwrite=False):
    """Count background reads over the left/right candidate windows.

    Only two strand/side combinations are computed per window size in
    ``bkgs``: reverse-strand reads over the left window and forward-strand
    reads over the right window (original note: left-forward and
    right-reverse are not needed). Results are saved as
    ``temp/{basename}_candidate_{the_sgRNA}_left_rstrand_{region}.count`` and
    ``..._right_fstrand_{region}.count`` inside a ``temp`` folder created
    next to ``a_pref``.

    Parameters
    ----------
    a_pref : str
        Path prefix of the sample; ``.fw.bed`` / ``.rv.bed`` are appended.
    the_sgRNA : str
        sgRNA name used in the candidate and output file names.
    bkgs : iterable
        Background window sizes to process.
    seq_folder : str
        Folder holding the candidate BED files.
    dir_chrom_sizes : str
        Passed to ``coverage`` as ``g`` (genome/chrom-sizes file).
    overwrite : bool
        When True, recompute even if the output already exists.

    A region is considered done when its ``right_fstrand`` file exists; that
    file is written last. The decision is taken per region: a finished region
    no longer causes the regions after it to be skipped.
    """
    forward_bed = f'{a_pref}.fw.bed'
    reverse_bed = f'{a_pref}.rv.bed'
    # Outputs are written into a "temp" folder beside the input prefix.
    dirname = os.path.dirname(a_pref)
    basename = os.path.basename(a_pref)
    temp_dir = os.path.join(dirname, 'temp')
    # exist_ok avoids the check-then-create race when samples run in parallel.
    os.makedirs(temp_dir, exist_ok=True)
    a_pref = os.path.join(temp_dir, basename)
    for a_region in bkgs:
        # Re-evaluated for every region (previously initialised once before
        # the loop, so one finished region disabled all following ones).
        done_marker = f'{a_pref}_candidate_{the_sgRNA}_right_fstrand_{a_region}.count'
        work = not os.path.isfile(done_marker)
        if overwrite:
            print('overwrite mode')
            work = True
        if not work:
            continue
        print(f'Working for {basename} within {a_region} bp.\n')
        # Left window, reverse-strand reads.
        left_region=os.path.join(seq_folder, f'{the_sgRNA}_candidate_left_{a_region}.bed')
        a = pybedtools.BedTool(left_region)
        b = pybedtools.BedTool(reverse_bed)
        c = a.coverage(b,sorted=True,g=dir_chrom_sizes)
        c.saveas(f'{a_pref}_candidate_{the_sgRNA}_left_rstrand_{a_region}.count')
        # Right window, forward-strand reads (written last: completion marker).
        right_region=os.path.join(seq_folder, f'{the_sgRNA}_candidate_right_{a_region}.bed')
        a = pybedtools.BedTool(right_region)
        b = pybedtools.BedTool(forward_bed)
        c = a.coverage(b,sorted=True,g=dir_chrom_sizes)
        c.saveas(f'{a_pref}_candidate_{the_sgRNA}_right_fstrand_{a_region}.count')
|
180
|
-
|
181
|
-
|
182
|
-
def load_count( a_sample, regions, bkgs, sgRNA_name, signal_formula, noise_length,
                ratio_thresh, exp_weight, shape_formula=None,
                operator='p', region_op='mean', dirname='./', pseudo_count=1
                ):
    """Load the ``.count`` files of one sample and score every candidate site.

    Steps performed:

    1. Background: for every window in ``bkgs`` the left/reverse-strand and
       right/forward-strand count files are read and their column 3 is
       collected. Values above ``mean + 2*std`` of their row are masked, the
       remaining values are averaged (``mean2``), rows below the global mean
       are raised to it, and the result is divided by ``noise_length`` to
       give a per-bp noise (``noise_bp``).
    2. Signal: for every window in ``regions`` the four strand/side count
       files are read, counts are divided by ``noise_bp * region`` and passed
       to ``signal_formula`` (and ``shape_formula`` when given).
    3. The per-window signals are combined with ``region_op`` and the two
       sides are combined into ``score`` with ``operator``.
    4. Extra columns are added (``signal_FC``, signal residuals,
       ``signal_min``, ``location``, ``ID_1``, ``ID_2``) and rows with a
       duplicated ``location`` are dropped.

    Parameters
    ----------
    a_sample : str
        Sample name used as the prefix of the ``.count`` file names.
    regions : sequence of numbers
        Signal window sizes; the largest supplies ``chr/st/ed/midpoint`` and
        the smallest supplies the extra features.
    bkgs : sequence
        Background window sizes.
    sgRNA_name : str
        sgRNA name embedded in the ``.count`` file names.
    signal_formula : callable
        Called as ``signal_formula(signal, nonsignal, pseudo_count)``.
    noise_length : number
        Divisor turning the background mean into a per-bp value.
    ratio_thresh, exp_weight :
        Forwarded to ``shape_formula``.
    shape_formula : callable or None
        Optional; when falsy the per-window shape bonus is 1.
    operator : {'p', 'm'}
        'p' averages the left and right signals, 'm' takes the square root
        of their product.
    region_op : {'mean', 'max', 'min'}
        How the per-window signals are combined.
    dirname : str
        Folder containing the ``.count`` files.
    pseudo_count : number
        Forwarded to ``signal_formula``.

    Returns
    -------
    (df_counts, df_noise)
        The scored candidate table sorted by ``score`` (descending) and the
        background table.

    Raises
    ------
    Exception
        If ``region_op`` or ``operator`` is not one of the supported values.

    Notes
    -----
    ``readbed`` and ``igvfmt`` come from the star import at the top of the
    file; presumably ``readbed`` returns a DataFrame with integer column
    labels and ``igvfmt`` builds a location string per row -- verify against
    that module.
    """
    # ---- 1. background noise -------------------------------------------
    list_noise = []
    for a_bkg in bkgs:
        left_count_rstrand = os.path.join(dirname, f'{a_sample}_candidate_{sgRNA_name}_left_rstrand_{a_bkg}.count')
        left_count_rstrand = readbed(left_count_rstrand)
        right_count_fstrand= os.path.join(dirname, f'{a_sample}_candidate_{sgRNA_name}_right_fstrand_{a_bkg}.count')
        right_count_fstrand = readbed(right_count_fstrand)
        list_noise.append(right_count_fstrand[3])
        list_noise.append(left_count_rstrand[3])
    # Two count columns (labelled 0 .. 2*len(bkgs)-1) per background window.
    n_noise_cols = len(bkgs)*2
    df_noise = pd.concat(list_noise,axis=1, ignore_index=True)
    df_noise['mean'] = df_noise.mean(axis=1)
    mean_all = df_noise['mean'].mean()
    # NOTE(review): the frame already contains the 'mean' column here, so the
    # row-wise std is taken over the raw counts plus their mean. Left as-is
    # because changing it would shift every downstream score -- confirm intent.
    df_noise['std'] = df_noise.std(axis=1)
    df_noise['outlier'] = df_noise['mean']+2*df_noise['std']
    for i in range(n_noise_cols):
        df_noise.loc[df_noise[i]>df_noise['outlier'],i] = np.nan
    # Average only the raw count columns. This used to be a hard-coded ":6",
    # which is correct only for exactly three background windows.
    df_noise['mean2'] = df_noise.iloc[:,:n_noise_cols].mean(axis=1, skipna = True)
    n_0bkg = sum(df_noise.mean2==0)
    if n_0bkg > 0:
        print(f'{n_0bkg} region(s) with 0 count in background.')
    # Some positions cannot be mapped and produce large empty stretches, which
    # makes the local noise too low; flooring at the global mean means the
    # local estimate only ever guards against locally HIGH noise.
    df_noise.loc[df_noise['mean2']<mean_all, 'mean2'] = mean_all
    df_noise['noise_bp'] = df_noise['mean2']/noise_length
    noise_5kb = df_noise['noise_bp'].mean()*5000
    print('Average noise within 5kb on a single strand: {:.2f}'.format(noise_5kb))
    if noise_5kb < 10:
        print('The sequencing depth might be too shallow')

    # ---- 2. per-window signal ------------------------------------------
    list_df_counts = []
    for a_region in regions:
        left_count_fstrand= os.path.join(dirname, f'{a_sample}_candidate_{sgRNA_name}_left_fstrand_{a_region}.count')
        left_count_fstrand = readbed(left_count_fstrand)
        left_count_fstrand.columns=[f'chr_{a_region}',f'st_left_{a_region}',f'ed_left_{a_region}',f'counts_left_F_{a_region}',f'cover_left_F_bp_{a_region}',f'length_{a_region}',f'cover_left_F_pct_{a_region}']
        right_count_fstrand=os.path.join(dirname, f'{a_sample}_candidate_{sgRNA_name}_right_fstrand_{a_region}.count')
        right_count_fstrand = readbed(right_count_fstrand)
        right_count_fstrand.columns=[f'chr_{a_region}',f'st_right_{a_region}',f'ed_right_{a_region}',f'counts_right_F_{a_region}',f'cover_right_F_bp_{a_region}',f'length_{a_region}',f'cover_right_F_pct_{a_region}']
        left_count_rstrand= os.path.join(dirname, f'{a_sample}_candidate_{sgRNA_name}_left_rstrand_{a_region}.count')
        left_count_rstrand = readbed(left_count_rstrand)
        left_count_rstrand.columns=[f'chr_{a_region}',f'st_left_{a_region}',f'ed_left_{a_region}',f'counts_left_R_{a_region}',f'cover_left_R_bp_{a_region}',f'length_{a_region}',f'cover_left_R_pct_{a_region}']
        right_count_rstrand=os.path.join(dirname, f'{a_sample}_candidate_{sgRNA_name}_right_rstrand_{a_region}.count')
        right_count_rstrand = readbed(right_count_rstrand)
        right_count_rstrand.columns=[f'chr_{a_region}',f'st_right_{a_region}',f'ed_right_{a_region}',f'counts_right_R_{a_region}',f'cover_right_R_bp_{a_region}',f'length_{a_region}',f'cover_right_R_pct_{a_region}']

        df_counts = pd.concat([left_count_fstrand[[f'chr_{a_region}',f'st_left_{a_region}', f'ed_left_{a_region}',
                                                   f'counts_left_F_{a_region}',f'cover_left_F_bp_{a_region}',f'cover_left_F_pct_{a_region}']],
                               left_count_rstrand[[f'counts_left_R_{a_region}',f'cover_left_R_bp_{a_region}',f'cover_left_R_pct_{a_region}']],
                               right_count_fstrand[[f'ed_right_{a_region}',f'counts_right_F_{a_region}',f'cover_right_F_bp_{a_region}',f'cover_right_F_pct_{a_region}']],
                               right_count_rstrand[[f'counts_right_R_{a_region}',f'cover_right_R_bp_{a_region}',f'cover_right_R_pct_{a_region}']]],axis=1)

        # Reorder so the combined interval comes first: start of the left
        # window, end of the right window, then the end of the left window,
        # which is renamed to "midpoint" just below.
        df_counts = df_counts.reindex(columns=[f'chr_{a_region}', f'st_left_{a_region}', f'ed_right_{a_region}', f'ed_left_{a_region}',
                                               f'counts_left_F_{a_region}',f'cover_left_F_bp_{a_region}',f'cover_left_F_pct_{a_region}',
                                               f'counts_left_R_{a_region}',f'cover_left_R_bp_{a_region}',f'cover_left_R_pct_{a_region}',
                                               f'counts_right_F_{a_region}',f'cover_right_F_bp_{a_region}',f'cover_right_F_pct_{a_region}',
                                               f'counts_right_R_{a_region}',f'cover_right_R_bp_{a_region}',f'cover_right_R_pct_{a_region}'])
        df_counts.columns = [f'chr_{a_region}', f'st_{a_region}', f'ed_{a_region}', f'midpoint_{a_region}',
                             f'counts_left_F_{a_region}',f'cover_left_F_bp_{a_region}',f'cover_left_F_pct_{a_region}',
                             f'counts_left_R_{a_region}',f'cover_left_R_bp_{a_region}',f'cover_left_R_pct_{a_region}',
                             f'counts_right_F_{a_region}',f'cover_right_F_bp_{a_region}',f'cover_right_F_pct_{a_region}',
                             f'counts_right_R_{a_region}',f'cover_right_R_bp_{a_region}',f'cover_right_R_pct_{a_region}']

        # signal enrichment = Cs/(Cn+B)
        # Expected background count for a window of this size.
        bkg_noise = df_noise['noise_bp']*a_region
        # Signal strands: forward on the left, reverse on the right.
        normed_left_signal = df_counts[f'counts_left_F_{a_region}']/bkg_noise
        normed_left_nonsignal = df_counts[f'counts_left_R_{a_region}']/bkg_noise
        normed_right_signal = df_counts[f'counts_right_R_{a_region}']/bkg_noise
        normed_right_nonsignal = df_counts[f'counts_right_F_{a_region}']/bkg_noise
        df_counts[f'N_LF_{a_region}'] = normed_left_signal
        df_counts[f'N_LR_{a_region}'] = normed_left_nonsignal
        df_counts[f'N_RR_{a_region}'] = normed_right_signal
        df_counts[f'N_RF_{a_region}'] = normed_right_nonsignal
        # Pluggable formula section.
        df_counts[f'left_signal_{a_region}'] = signal_formula(normed_left_signal, normed_left_nonsignal, pseudo_count)
        df_counts[f'right_signal_{a_region}'] = signal_formula(normed_right_signal, normed_right_nonsignal, pseudo_count)
        if shape_formula:
            df_counts[f'shape_bonus_{a_region}'] = shape_formula(normed_left_signal, normed_left_nonsignal,
                                                                 normed_right_signal, normed_right_nonsignal,
                                                                 bkg_noise, ratio_thresh, exp_weight)
        else:
            df_counts[f'shape_bonus_{a_region}'] = 1
        list_df_counts.append(df_counts)

    # ---- 3. combine windows and sides ----------------------------------
    df_counts = pd.concat(list_df_counts,axis=1)
    left_signal_cols = df_counts.columns[ df_counts.columns.str.contains('left_signal_') ]
    right_signal_cols = df_counts.columns[ df_counts.columns.str.contains('right_signal_') ]

    print('region_op',region_op)
    if region_op == 'mean':
        df_counts['left_signal'] = df_counts[left_signal_cols].mean(axis=1)
        df_counts['right_signal'] = df_counts[right_signal_cols].mean(axis=1)
    elif region_op == 'max':
        df_counts['left_signal'] = df_counts[left_signal_cols].max(axis=1)
        df_counts['right_signal'] = df_counts[right_signal_cols].max(axis=1)
    elif region_op == 'min':
        df_counts['left_signal'] = df_counts[left_signal_cols].min(axis=1)
        df_counts['right_signal'] = df_counts[right_signal_cols].min(axis=1)
    else:
        raise Exception('region_op should be "mean", "max", or "min" ')

    # Coordinates are taken from the largest window.
    max_region = max(regions)
    df_counts['chr'] = df_counts[f'chr_{max_region}']
    df_counts['st'] = df_counts[f'st_{max_region}']
    df_counts['ed'] = df_counts[f'ed_{max_region}']
    df_counts['midpoint'] = df_counts[f'midpoint_{max_region}']
    # Move the coordinate columns to the front, then drop the duplicated
    # labels that the reindex introduces.
    df_counts = df_counts.reindex( columns= ['chr','st','ed','midpoint'] + list(df_counts.columns) )
    df_counts = df_counts.loc[:,~df_counts.columns.duplicated()].copy()
    print('Raw regions:', len(df_counts))

    # Operator selection.
    if operator == 'p':
        df_counts['score'] = (df_counts['left_signal']+df_counts['right_signal'])/2
    elif operator == 'm':
        df_counts['score'] = np.power(df_counts['left_signal']*df_counts['right_signal'],0.5)
    else:
        # Fail here with a clear message instead of a KeyError on 'score' below.
        raise Exception('operator should be "p" or "m" ')

    min_region = min(regions)

    if shape_formula:
        shape_bonus_cols = df_counts.columns[ df_counts.columns.str.contains('shape_bonus_') ]
        df_counts['shape_bonus'] = df_counts[shape_bonus_cols].mean(axis=1)

    df_counts = df_counts.sort_values(by='score',ascending=False).reset_index(drop=True)

    # ---- 4. additional features ----------------------------------------

    # Fold change between the stronger and the weaker side in the smallest
    # window (0.001 avoids division by zero).
    max_adjacent_ratio = df_counts[[f'left_signal_{min_region}',f'right_signal_{min_region}']].max(axis=1)
    min_adjacent_ratio = df_counts[[f'left_signal_{min_region}',f'right_signal_{min_region}']].min(axis=1)
    df_counts['signal_FC'] = max_adjacent_ratio/(min_adjacent_ratio+0.001)

    # Original note: if the weaker side of the smallest window has a negative
    # residual (signal minus nonsignal), the site may be a false positive.
    df_counts['left_signal_residual'] = df_counts[f'N_LF_{min_region}'] - df_counts[f'N_LR_{min_region}']
    df_counts['right_signal_residual'] = df_counts[f'N_RR_{min_region}'] - df_counts[f'N_RF_{min_region}']
    min_adjacent_signal = df_counts[['left_signal_residual','right_signal_residual']].min(axis=1)
    df_counts['signal_min'] = min_adjacent_signal

    # De-duplicate by exact location (highest score wins: rows are sorted).
    df_counts['location'] = igvfmt(df_counts)
    df_counts = df_counts.drop_duplicates(subset='location').reset_index(drop=True).copy()

    # Second version of the unique IDs: ID_1 is the 1 kb bin holding the
    # midpoint, ID_2 is the neighbouring bin on the side the midpoint is
    # closer to (previous bin when the offset is < 500, next bin otherwise).
    point_head = (df_counts['midpoint']/1000).astype(int)
    df_counts['ID_1'] = df_counts['chr'] + ':' + point_head.astype(str)
    point_tail = df_counts['midpoint'] % 1000
    df_counts.loc[point_tail<500,'ID_2'] = df_counts['chr'] + ':' + (point_head-1).astype(str)
    df_counts.loc[point_tail>=500,'ID_2'] = df_counts['chr'] + ':' + (point_head+1).astype(str)

    return df_counts, df_noise
|
332
|
-
|
@@ -1 +0,0 @@
|
|
1
|
-
# Version string of this release.
__version__ = "1.0.1"
|
@@ -1,193 +0,0 @@
|
|
1
|
-
|
2
|
-
# Workflow settings are read from config.yaml.
configfile: "config.yaml"

# Values pulled from the config once, for use by the rules below.
_threads = config["thread"]          # thread count requested by every rule
BinSize = str(config["binsize"])     # referenced only by the commented-out bwAdd rule
output_dir = config["output_dir"]    # folder receiving all outputs
nametype = config["nametype"]        # file-name tag of the second read file
suffix = config["suffix"]            # file extension of the read files
# Read-file endings appended to each sample prefix. name1 is derived from
# nametype by replacing EVERY '2' with '1'.
name1 = nametype.replace('2','1') + '.' + suffix
name2 = nametype + '.' + suffix

import os
|
13
|
-
|
14
|
-
# List of all final outputs: per sample, the forward/reverse BED files and
# the forward/reverse scaled bigWig files.
rule all:
    input:
        expand( os.path.join(output_dir,"{sample}.fw.bed"), sample=config["sample"] ),
        expand( os.path.join(output_dir,"{sample}.rv.bed"), sample=config["sample"] ),
        expand( os.path.join(output_dir,"{sample}.fw.scaled.bw"), sample=config["sample"] ),
        expand( os.path.join(output_dir,"{sample}.rv.scaled.bw"), sample=config["sample"] ),
|
22
|
-
|
23
|
-
|
24
|
-
# Align the paired read files of a sample with chromap and write a temporary
# BED file. The read paths are the sample's configured prefix followed by
# name1 / name2. The command-line flags are passed to chromap verbatim; see
# the chromap manual for their meaning.
rule chromap:
    input:
        R1= lambda w: config["sample"][w.sample] + name1,
        R2= lambda w: config["sample"][w.sample] + name2
    threads:
        _threads
    params:
        index=config["index"],
        fasta=config["fasta"]
    output:
        temp(os.path.join(output_dir,"{sample}.chromapx.bed"))
    shell:
        """
        chromap -l 2000 --low-mem --BED --remove-pcr-duplicates \
        --min-read-length 10 --allocate-multi-mappings \
        -x {params.index} -r {params.fasta} -t {threads} -1 {input.R1} -2 {input.R2} -o {output}
        """
|
41
|
-
|
42
|
-
# Blacklist handling is optional. When a blacklist is configured, alignments
# overlapping it are removed first and bed2fr consumes the filtered BED;
# with blacklist set to 'none', bed2fr consumes the chromap output directly.
if config["blacklist"] != 'none':
    # Keep only the alignments that do NOT overlap the blacklist (-v).
    rule remove_blacklist:
        input:
            os.path.join(output_dir,"{sample}.chromapx.bed")
        threads:
            _threads
        params:
            blacklist=config["blacklist"]
        output:
            temp(os.path.join(output_dir,"{sample}.filtered.bed"))
        shell:
            "bedtools intersect -a {input} -b {params.blacklist} -v > {output}"

    # Run the bundled bed2fr script on the filtered BED. The fw/rv output
    # paths are not passed on the command line, so the script presumably
    # derives them from the input name -- verify against the script.
    rule bed2fr:
        input:
            os.path.join(output_dir,"{sample}.filtered.bed")
        threads:
            _threads
        params:
            dir_script=config["script_folder"]
        output:
            fw=os.path.join(output_dir,"{sample}.fw.bed"),
            rv=os.path.join(output_dir,"{sample}.rv.bed")
        shell:
            "python {params.dir_script}/1.1_bed2fr_v4.5.py -b {input}"
else:
    # Same as above, but reading the unfiltered chromap output.
    rule bed2fr:
        input:
            os.path.join(output_dir,"{sample}.chromapx.bed")
        threads:
            _threads
        params:
            dir_script=config["script_folder"]
        output:
            fw=os.path.join(output_dir,"{sample}.fw.bed"),
            rv=os.path.join(output_dir,"{sample}.rv.bed")
        shell:
            "python {params.dir_script}/1.1_bed2fr_v4.5.py -b {input}"
|
80
|
-
|
81
|
-
# Convert the forward-strand BED into a temporary bedGraph with
# `bedtools genomecov -bg`, using the configured genome-length file.
rule bed2bdg_fw:
    input:
        os.path.join(output_dir,"{sample}.fw.bed")
    threads:
        _threads
    params:
        gl=config["genomelen"]
    output:
        temp(os.path.join(output_dir,"{sample}.fw.bdg"))
    shell:
        "bedtools genomecov -bg -i {input} -g {params.gl} > {output}"
|
92
|
-
|
93
|
-
# Convert the reverse-strand BED into a temporary bedGraph with
# `bedtools genomecov -bg`, using the configured genome-length file.
rule bed2bdg_rv:
    input:
        os.path.join(output_dir,"{sample}.rv.bed")
    threads:
        _threads
    params:
        gl=config["genomelen"]
    output:
        temp(os.path.join(output_dir,"{sample}.rv.bdg"))
    shell:
        "bedtools genomecov -bg -i {input} -g {params.gl} > {output}"
|
104
|
-
|
105
|
-
# Sort the forward-strand bedGraph with `bedtools sort` (temporary output).
rule bdg_sort_fw:
    input:
        fw=os.path.join(output_dir,"{sample}.fw.bdg")
    threads:
        _threads
    output:
        temp(os.path.join(output_dir,"{sample}.fw.sorted.bdg"))
    shell:
        "bedtools sort -i {input.fw} > {output}"
|
114
|
-
|
115
|
-
# Sort the reverse-strand bedGraph with `bedtools sort` (temporary output).
rule bdg_sort_rv:
    input:
        rv=os.path.join(output_dir,"{sample}.rv.bdg")
    threads:
        _threads
    output:
        temp(os.path.join(output_dir,"{sample}.rv.sorted.bdg"))
    shell:
        "bedtools sort -i {input.rv} > {output}"
|
124
|
-
|
125
|
-
# Run the bundled normalisation script on the sorted forward-strand bedGraph,
# giving it the forward-strand BED as well. The scaled output path is not
# passed on the command line, so the script presumably derives it from the
# input name -- verify against the script.
rule bdg_normalize_fw:
    input:
        bdg=os.path.join(output_dir,"{sample}.fw.sorted.bdg"),
        bed=os.path.join(output_dir,"{sample}.fw.bed")
    threads:
        _threads
    params:
        dir_script=config["script_folder"]
    output:
        temp(os.path.join(output_dir,"{sample}.fw.scaled.bdg"))
    shell:
        "python {params.dir_script}/1.3_bdg_normalize_v4.0.py --bdg {input.bdg} --bed {input.bed}"
|
137
|
-
|
138
|
-
# Run the bundled normalisation script on the sorted reverse-strand bedGraph,
# giving it the reverse-strand BED as well. The scaled output path is not
# passed on the command line, so the script presumably derives it from the
# input name -- verify against the script.
rule bdg_normalize_rv:
    input:
        bdg=os.path.join(output_dir,"{sample}.rv.sorted.bdg"),
        bed=os.path.join(output_dir,"{sample}.rv.bed")
    threads:
        _threads
    params:
        dir_script=config["script_folder"]
    output:
        temp(os.path.join(output_dir,"{sample}.rv.scaled.bdg"))
    shell:
        "python {params.dir_script}/1.3_bdg_normalize_v4.0.py --bdg {input.bdg} --bed {input.bed}"
|
150
|
-
|
151
|
-
# Convert the scaled forward-strand bedGraph to bigWig with the
# bedGraphToBigWig binary located in the script folder. This is a final
# (non-temporary) output.
rule bdg2bw_fw:
    input:
        os.path.join(output_dir,"{sample}.fw.scaled.bdg")
    threads:
        _threads
    params:
        gl=config["genomelen"],
        dir_script=config["script_folder"]
    output:
        os.path.join(output_dir,"{sample}.fw.scaled.bw")
    shell:
        "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"
|
163
|
-
|
164
|
-
# Convert the scaled reverse-strand bedGraph to bigWig with the
# bedGraphToBigWig binary located in the script folder. This is a final
# (non-temporary) output.
rule bdg2bw_rv:
    input:
        os.path.join(output_dir,"{sample}.rv.scaled.bdg")
    threads:
        _threads
    params:
        gl=config["genomelen"],
        dir_script=config["script_folder"]
    output:
        os.path.join(output_dir,"{sample}.rv.scaled.bw")
    shell:
        "{params.dir_script}/bedGraphToBigWig {input} {params.gl} {output}"
|
176
|
-
|
177
|
-
#rule bwAdd:
|
178
|
-
# input:
|
179
|
-
# fw=os.path.join(output_dir,"{sample}.fw.scaled.bw"),
|
180
|
-
# rv=os.path.join(output_dir,"{sample}.rv.scaled.bw")
|
181
|
-
# threads:
|
182
|
-
# _threads
|
183
|
-
# output:
|
184
|
-
# os.path.join(output_dir,"{sample}." + BinSize + ".add.bdg")
|
185
|
-
# shell:
|
186
|
-
# """
|
187
|
-
# bigwigCompare --binSize {BinSize} -p {threads} --verbose -o {output} \
|
188
|
-
# --outFileFormat bedgraph \
|
189
|
-
# --bigwig1 {input.fw} \
|
190
|
-
# --bigwig2 {input.rv} \
|
191
|
-
# --operation add
|
192
|
-
# """
|
193
|
-
|