offtracker 1.0.2__zip → 2.7.7__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {offtracker-1.0.2/offtracker.egg-info → offtracker-2.7.7}/PKG-INFO +1 -1
- offtracker-2.7.7/offtracker/X_offplot.py +123 -0
- offtracker-2.7.7/offtracker/X_offtracker.py +338 -0
- offtracker-1.0.2/offtracker/X_general.py → offtracker-2.7.7/offtracker/X_sequence.py +18 -5
- {offtracker-1.0.2 → offtracker-2.7.7}/offtracker/__init__.py +1 -1
- offtracker-2.7.7/offtracker/_version.py +27 -0
- offtracker-2.7.7/offtracker/mapping/Snakefile_offtracker +245 -0
- offtracker-2.7.7/offtracker/mapping/offtracker_blacklist_hg38.merged.bed +3846 -0
- offtracker-2.7.7/offtracker/mapping/offtracker_blacklist_mm10.merged.bed +5827 -0
- {offtracker-1.0.2 → offtracker-2.7.7/offtracker.egg-info}/PKG-INFO +1 -1
- {offtracker-1.0.2 → offtracker-2.7.7}/offtracker.egg-info/SOURCES.txt +4 -3
- offtracker-2.7.7/scripts/offtracker_analysis.py +369 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/scripts/offtracker_candidates.py +59 -101
- {offtracker-1.0.2 → offtracker-2.7.7}/scripts/offtracker_config.py +15 -10
- offtracker-1.0.2/offtracker/X_analysis.py +0 -332
- offtracker-1.0.2/offtracker/_version.py +0 -1
- offtracker-1.0.2/offtracker/mapping/Snakefile_Trackseq +0 -175
- offtracker-1.0.2/offtracker/mapping/offtracker_blacklist_hg38.merged.bed +0 -22228
- offtracker-1.0.2/offtracker/mapping/offtracker_blacklist_mm10.merged.bed +0 -9347
- offtracker-1.0.2/scripts/offtracker_analysis.py +0 -407
- {offtracker-1.0.2 → offtracker-2.7.7}/LICENSE.txt +0 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/MANIFEST.in +0 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/README.md +0 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/offtracker/mapping/1.1_bed2fr_v4.5.py +0 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/offtracker/mapping/1.3_bdg_normalize_v4.0.py +0 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/offtracker/mapping/bedGraphToBigWig +0 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/offtracker/mapping/hg38.chrom.sizes +0 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/offtracker/mapping/mm10.chrom.sizes +0 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/offtracker.egg-info/dependency_links.txt +0 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/offtracker.egg-info/requires.txt +0 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/offtracker.egg-info/top_level.txt +0 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/setup.cfg +0 -0
- {offtracker-1.0.2 → offtracker-2.7.7}/setup.py +0 -0
@@ -0,0 +1,123 @@
|
|
1
|
+
import matplotlib.pyplot as plt
|
2
|
+
import matplotlib.patches as patches
|
3
|
+
import pandas as pd
|
4
|
+
|
5
|
+
def offtable(offtargets, target_guide,
             col_seq='best_target', col_score='track_score', col_mismatch='mismatch', col_loc='target_location',
             title=None, font='Arial', font_size=9,
             box_size_x=15, box_size_y=20, box_gap=1,
             x_offset=15, y_offset=35, dpi=100, savefig=None):
    """Draw an off-target summary table with matplotlib.

    The guide sequence is drawn as the top reference row; each off-target
    record is drawn below it, one colored box per base, with matching bases
    shown as dots. Each row is annotated with its track score and genomic
    coordinates, and a dashed vertical line marks the PAM boundary
    (3 bases from the guide's 3' end).

    Parameters
    ----------
    offtargets : pandas.DataFrame or list of dict
        Off-target records; each must provide the keys named by
        ``col_seq``, ``col_score`` and ``col_loc``.
    target_guide : str
        The on-target guide sequence used as the reference row.
    col_seq, col_score, col_mismatch, col_loc : str
        Column/key names for the aligned sequence, score, mismatch count
        and genomic location of each record (``col_mismatch`` is currently
        only used by a commented-out annotation).
    title : str or None
        Table title; defaults to "Off-targets table".
    font, font_size
        Text style for the base letters.
    box_size_x, box_size_y : int
        Width / height of a single base box, in axis units.
    box_gap : int
        Vertical gap between consecutive rows.
    x_offset, y_offset : int
        Top-left origin of the table.
    dpi : int
        Figure resolution.
    savefig : str or None
        If given, the figure is saved to this path before being shown.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the table was drawn on.
    """
    # Facecolor per base letter; gap characters are drawn in orange.
    color_dict = {
        'A': 'lightgreen',
        'T': 'lightblue',
        'C': 'lightcoral',
        'G': 'lightgoldenrodyellow',
        'N': 'lightgrey',
        '—': 'orange',
        '-': 'orange'
    }

    # If offtargets is a DataFrame, convert to list of dictionaries
    if isinstance(offtargets, pd.DataFrame):
        offtargets = offtargets.to_dict(orient='records')

    # Figure size scales with guide length (plus room for annotations)
    # and with the number of off-target rows.
    width = box_size_x * (len(target_guide) + 15)
    height = y_offset + (len(offtargets) + 2) * (box_size_y + box_gap)
    fig = plt.figure(figsize=(width / 100.0, height / 100.0), dpi=dpi)
    ax = fig.add_subplot(111)

    # Plot a title
    ax.text(x_offset, 25, "Off-targets table" if title is None else f"{title}", fontsize=14, family=font)

    # Plot the reference (guide) sequence
    for i, c in enumerate(target_guide):
        x = x_offset + i * box_size_x
        y = y_offset
        base_color = color_dict.get(c, 'purple')  # Default to purple if base is not recognized
        ax.add_patch(patches.Rectangle((x, y), box_size_x, box_size_y, facecolor=base_color))
        ax.text(x + box_size_x / 2, y + box_size_y / 2, c, ha='center', va='center', family=font, fontsize=font_size)
    # add column annotations
    ax.text(x_offset + (len(target_guide) + 2) * box_size_x, y_offset + box_size_y / 4, 'Track\nScore', ha='center', va='center', family=font, fontsize=font_size*1.1)
    #ax.text(x_offset + (len(target_guide) + 7) * box_size_x, y_offset + box_size_y / 2, 'Mismatch', ha='center', va='center', family=font, fontsize=font_size*1.1)
    ax.text(x_offset + (len(target_guide) + 4) * box_size_x, y_offset + box_size_y / 2, 'Coordinates', ha='left', va='center', family=font, fontsize=font_size*1.1)

    # Plot aligned sequences
    # Known issue: an off-target containing an insertion is longer than the
    # guide and cannot be drawn column-aligned; as a workaround the extra
    # flanking base(s) are trimmed from whichever end keeps the sequence
    # closest to the guide.
    for j, seq in enumerate(offtargets):
        y = y_offset + (j + 1) * (box_size_y + box_gap)
        # Length difference between the aligned sequence and the guide
        len_out = len(seq[col_seq]) - len(target_guide)
        if len_out > 0:
            if len_out > 1:
                print(f"Warning: {seq[col_seq]} is {len_out} longer than {target_guide}")
            # Compare trimming from the start vs. from the end, and keep
            # the variant that is closer to target_guide.
            delete_first = seq[col_seq][len_out:]
            delete_last = seq[col_seq][:-len_out]
            # Hamming distance of each trimmed variant to the guide
            hamming_first = sum([1 for i, c in enumerate(delete_first) if c != target_guide[i]])
            hamming_last = sum([1 for i, c in enumerate(delete_last) if c != target_guide[i]])
            # Keep the variant with the smaller Hamming distance
            if hamming_first < hamming_last:
                seq[col_seq] = delete_first
            else:
                seq[col_seq] = delete_last
        elif len_out < 0:
            print(f"Warning: {seq[col_seq]} is {-len_out} shorter than {target_guide}")

        for i, c in enumerate(seq[col_seq]):
            # A '-' (minus sign) renders too short for a gap, so replace it
            # with '—' (em dash) for display.
            if c == '-':
                c = '—'
            x = x_offset + i * box_size_x
            base_color = color_dict.get(c, 'purple')  # Default to purple if base is not recognized
            if c == target_guide[i]:
                ax.add_patch(patches.Rectangle((x, y), box_size_x, box_size_y, facecolor='white')) # same
            elif target_guide[i] == 'N':
                ax.add_patch(patches.Rectangle((x, y), box_size_x, box_size_y, facecolor='white')) # N in target
            else:
                ax.add_patch(patches.Rectangle((x, y), box_size_x, box_size_y, facecolor=base_color))
            ax.text(x + box_size_x / 2, y + box_size_y / 2, "." if c == target_guide[i] else c, ha='center', va='center', family=font, fontsize=font_size, weight='bold')

        # Annotations for score, mismatches, and location coordinates
        ax.text(x_offset + (len(target_guide) + 2) * box_size_x, y + box_size_y / 2, round(seq[col_score],2), ha='center', va='center', family=font, fontsize=font_size)
        #ax.text(x_offset + (len(target_guide) + 7) * box_size_x, y + box_size_y / 2, "Target" if seq[col_mismatch] == 0 else seq[col_mismatch], ha='center', va='center', family=font, fontsize=font_size, color='red' if seq[col_mismatch] == 0 else 'black')
        ax.text(x_offset + (len(target_guide) + 4) * box_size_x, y + box_size_y / 2, seq[col_loc], ha='left', va='center', family=font, fontsize=font_size)

    # add a vertical line to indicate the PAM
    x_line = x_offset + (len(target_guide) - 3) * box_size_x
    y_start = y_offset # + box_size_y / 2
    y_end = y_start + (len(offtargets)+1) * (box_size_y + box_gap)
    ax.vlines(x=x_line, ymin=y_start, ymax=y_end, color='indianred', linestyle='--')

    # Styling and save
    ax.set_xlim(0, width*1.1)  # the location strings are long, so widen the view
    ax.set_ylim(height, 0)
    ax.axis('off')

    # Expand the subplot to fill the entire figure area with no padding
    # (makes the plot bigger without changing font sizes).
    plt.subplots_adjust(left=0, right=1, top=1, bottom=0)
    if savefig is not None:
        plt.savefig(savefig, dpi=dpi)
    plt.show()
    return ax
|
119
|
+
|
120
|
+
|
121
|
+
|
122
|
+
|
123
|
+
|
@@ -0,0 +1,338 @@
|
|
1
|
+
|
2
|
+
import pandas as pd
|
3
|
+
import numpy as np
|
4
|
+
import os, sys
|
5
|
+
sys.path.append( os.path.abspath(os.path.dirname(__file__)) )
|
6
|
+
|
7
|
+
def fdr(p_vals):
    """Benjamini-Hochberg adjusted p-values.

    Uses the plain BH formula ``p * n / rank`` clipped to 1.
    NOTE(review): the cumulative-minimum monotonicity step of the full BH
    procedure is not applied here; results can be non-monotone in rank.

    Parameters
    ----------
    p_vals : array-like of float
        Raw p-values. A numpy array or pandas Series is used as-is
        (a Series keeps its index); a plain list/tuple is coerced to a
        float ndarray.

    Returns
    -------
    Same array-like kind as the (possibly coerced) input, with adjusted
    values in [0, 1].
    """
    from scipy.stats import rankdata
    if not hasattr(p_vals, 'dtype'):
        # A plain Python list would break the vectorized arithmetic below
        # (list * int replicates the list instead of scaling it).
        p_vals = np.asarray(p_vals, dtype=float)
    ranked_p_values = rankdata(p_vals)
    adjusted = p_vals * len(p_vals) / ranked_p_values
    adjusted[adjusted > 1] = 1
    return adjusted
|
14
|
+
|
15
|
+
def dedup_two( df_loc, col_ID_1='ID_1', col_ID_2='ID_2'):
    """Greedy pairwise deduplication over two ID columns.

    Walks df_loc in its current order and keeps a row only if neither of
    its two IDs has been seen in an earlier kept row, so the first
    occurrence (per df_loc's sorting) wins. A row is rejected as soon as
    either ID was seen, and in that case neither of its IDs is recorded —
    so after filtering, the union of kept IDs may be smaller than the
    union over all rows.

    Returns a list of booleans, one per row: True = keep.
    """
    keep_flags = []
    seen_ids = set()
    for id_a, id_b in zip(df_loc[col_ID_1], df_loc[col_ID_2]):
        if id_a in seen_ids or id_b in seen_ids:
            # Either ID already used by a kept row: reject, record nothing.
            keep_flags.append(False)
        else:
            seen_ids.update((id_a, id_b))
            keep_flags.append(True)
    return keep_flags
|
31
|
+
|
32
|
+
def window_smooth(sr_smooth, window_size=3, times=1):
    """Apply a centered moving-average filter `times` times.

    Each pass convolves with a uniform kernel of length `window_size`
    (mode='same', so edges are implicitly zero-padded). If the input is a
    pandas Series its index is restored on the result; otherwise the
    result is a fresh Series with a default RangeIndex (when times >= 1).
    """
    kernel = np.full(window_size, 1.0 / window_size)
    # Remember the original index so it survives the convolution passes.
    original_index = sr_smooth.index if isinstance(sr_smooth, pd.Series) else None

    for _ in range(times):
        sr_smooth = pd.Series(np.convolve(sr_smooth, kernel, mode='same'))

    if original_index is not None:
        sr_smooth.index = original_index
    return sr_smooth
|
47
|
+
|
48
|
+
# Average every n consecutive values of a sequence.
def segmental_mean(vector, n, drop='last'):
    """Compute the mean of each consecutive block of `n` values.

    Parameters
    ----------
    vector : sequence of numbers (list, ndarray, or Series)
        Values to aggregate.
    n : int
        Block size.
    drop : {'last', 'first'}
        When len(vector) is not a multiple of n, which end's leftover
        (< n values) to discard before reshaping.

    Returns
    -------
    numpy.ndarray
        One mean per full block of n values.

    Raises
    ------
    ValueError
        If there is a remainder and `drop` is neither 'last' nor 'first'.
        (The original fell through and raised a confusing NameError.)
    """
    rem = len(vector) % n
    if rem != 0:
        if drop == 'last':
            main_part = vector[:-rem]   # discard the trailing partial block
        elif drop == 'first':
            main_part = vector[rem:]    # discard the leading partial block
        else:
            raise ValueError("drop must be 'last' or 'first'")
    else:
        # No remainder: the whole vector reshapes cleanly.
        main_part = vector
    return np.array(main_part).reshape(-1, n).mean(axis=1)
|
71
|
+
|
72
|
+
# v2.1 estimated signal length from binsize alone, which was biased: after
# bigwig addition, neighbouring bins with equal values get merged, so rows
# are not strictly one-per-binsize and the length could come out too small.
# v2.6 subsets each flank region directly (no row-count guessing from
# binsize) and adds the trackseq-v4 positive/negative bin-fraction features.
def target_signal(df_bdg_chr, chrom, cleavage_site, flank_max=100000, smooth_times = 1, window_size = 3,
                  binsize=100, flank_regions=[500,1000,2000,5000],
                  length_bkg = 20000, length_binsize=1000, length_min_noise=0.2, n_std=1,
                  end='end',start='start',value='residual', pct_offset=0.0):
    """Extract signal features around one candidate cleavage site.

    For the left flank (rows ending within `flank_max` upstream) and the
    right flank (rows starting within `flank_max` downstream) this computes:
    per-`flank_regions` positive-bin fractions and positive/negative value
    sums, per-region normalized residual sums and pct scores, plus an
    overall signal and a signal length estimated from where the smoothed,
    coarsened profile drops below background mean + n_std * std.

    Parameters
    ----------
    df_bdg_chr : pandas.DataFrame
        bedGraph-like table; all rows must be on a single chromosome.
        Uses the columns named by `start`, `end` and `value`.
    chrom : str
        Chromosome name (only used to build the 'chr:site' label).
    cleavage_site : int
        Genomic coordinate of the candidate cut site.
    flank_max : int
        Maximum distance considered on each side.
    smooth_times, window_size : int
        Moving-average smoothing passes / kernel size (see window_smooth).
    binsize : int
        Nominal bin size of df_bdg_chr rows.
    flank_regions : list of int
        Proximal region widths to evaluate on each side.
    length_bkg, length_binsize, length_min_noise, n_std
        Background span, coarse bin size, noise floor for the std, and the
        threshold multiplier used by the signal-length estimate.
    end, start, value : str
        Column names in df_bdg_chr.
    pct_offset : float
        Offset subtracted from the positive-bin fraction in pct scores.

    Returns
    -------
    list
        Flattened features, in order: signal_pct L+R (3 per region),
        pct_score L+R (1 per region), residual sums L+R (1 per region),
        [L_mean, R_mean], [L_mean_pct_score, R_mean_pct_score],
        ['chrom:site'], [L_length, R_length, L_overall_signal,
        R_overall_signal, signal_length, overall_signal, proximal_signal,
        pct_score] — matching the column names in target_signal_chunk.
    """
    # All rows of df_bdg_chr must come from a single chromosome.
    # n_regions = len(flank_regions)
    ## Deriving the row count of each flank region from binsize is biased
    ## (merged equal-value bins); the old formula is kept for reference:
    ## flank_bins = [int(x/binsize) for x in flank_regions]

    assert flank_max >= max(flank_regions), 'flank_max must be larger than max(flank_regions)'
    assert length_binsize >= binsize, 'length_binsize must be larger than binsize'
    n_merge = int(length_binsize/binsize)   # rows merged into one coarse bin
    n_bkg = int(length_bkg/length_binsize)  # coarse bins treated as background

    # Left
    # list_signal_pct_L holds (pos_pct, pos_sum, neg_sum) per flank region;
    # list_pct_score_L holds one pct score per region.
    # list_signal_residual_L values are comparable to earlier versions.
    list_signal_pct_L = []
    list_pct_score_L = []
    list_signal_residual_L = []
    df_bdg_chr_L = df_bdg_chr[ (df_bdg_chr[end] >= cleavage_site-flank_max) & (df_bdg_chr[end]<=cleavage_site) ]
    if len(df_bdg_chr_L)<=window_size:
        # Too few rows to smooth: emit all-zero features for the left side.
        L_length = 0
        L_overall_signal = 0
        for flank in flank_regions:
            list_signal_pct_L.extend([0,0,0])
            list_pct_score_L.append(0)
            list_signal_residual_L.append(0)
    else:
        ############################
        ## Overall signal first   ##
        ############################
        L_length = 0
        # Smooth the signal
        if smooth_times > 0:
            signal_residual_L = window_smooth(df_bdg_chr_L[value], window_size=window_size, times=smooth_times)
        else:
            signal_residual_L = df_bdg_chr_L[value]
        # Signal length: merge up to length_binsize-sized bins to damp
        # random fluctuation ('first' so the partial block near the far
        # end, not the cut site, is dropped).
        signal_residual_L_merged = segmental_mean(signal_residual_L, n_merge, drop='first')
        # Guard against too-short input, then smooth a second time.
        if len(signal_residual_L_merged)<=window_size:
            L_length = 0
            L_overall_signal = 0
        else:
            signal_residual_L_merged = window_smooth(signal_residual_L_merged, window_size=3, times=3)
            # Background mean / std from the far-left (background) bins.
            bkg_L_mean = signal_residual_L_merged[:n_bkg].mean()
            bkg_L_std = max(length_min_noise, signal_residual_L_merged[:n_bkg].std())
            # signal_residual_L_merged = signal_residual_L_merged - bkg_L_mean
            # Last position still below the threshold = signal start.
            signal_start_index = signal_residual_L_merged.index[signal_residual_L_merged<bkg_L_mean+n_std*bkg_L_std].max()
            # Convert the above-threshold bin count into a bp length.
            L_n_bins = signal_residual_L_merged.index.max()-signal_start_index
            if L_n_bins == 0:
                L_length = 0
            else:
                df_bdg_chr_L_good = df_bdg_chr_L[-n_merge*L_n_bins:]
                L_length = df_bdg_chr_L_good[end].iloc[-1]-df_bdg_chr_L_good[start].iloc[0]
            # Overall signal strength of the smoothed left profile.
            L_overall_signal = signal_residual_L_merged.sum()

        ############################
        ## Then proximal features ##
        ############################
        # left_region_sum_norm should be roughly the single value used
        # before v2.5.
        for flank in flank_regions:
            bool_flank = (df_bdg_chr_L[end] >= cleavage_site-flank)
            df_bdg_chr_L_flank = df_bdg_chr_L[ bool_flank ]
            signal_residual_L_flank = signal_residual_L[ bool_flank ]
            if df_bdg_chr_L_flank.empty:
                list_signal_pct_L.extend( [0,0,0] )
                list_pct_score_L.append(0)
                list_signal_residual_L.append(0)
                continue
            # pos and neg
            df_bdg_chr_L_flank_pos = df_bdg_chr_L_flank[df_bdg_chr_L_flank[value] > 0]
            df_bdg_chr_L_flank_neg = df_bdg_chr_L_flank[df_bdg_chr_L_flank[value] <= 0]
            n_pos_left = len(df_bdg_chr_L_flank_pos)
            n_neg_left = len(df_bdg_chr_L_flank_neg)
            # avoid zero division
            if n_pos_left == 0:
                pos_pct_left = 0
            else:
                pos_pct_left = n_pos_left/(n_pos_left+n_neg_left)
            # pos/neg value sum
            left_pos_sum = df_bdg_chr_L_flank_pos[value].sum()
            left_neg_sum = df_bdg_chr_L_flank_neg[value].sum()
            list_signal_pct_L.extend( [pos_pct_left,left_pos_sum,left_neg_sum] )
            # Smoothed sum, normalized per kb of flank.
            left_region_sum_norm = 1000*signal_residual_L_flank.sum()/flank
            list_signal_residual_L.append(left_region_sum_norm)
            # pct_score: sum weighted by the (offset) positive-bin fraction.
            left_pct_score = left_region_sum_norm*max(0,(pos_pct_left-pct_offset))
            list_pct_score_L.append(left_pct_score)

    # Right
    list_signal_pct_R = []
    list_pct_score_R = []
    list_signal_residual_R = []
    df_bdg_chr_R = df_bdg_chr[ (df_bdg_chr[start] <= cleavage_site+flank_max) & (df_bdg_chr[start]>=cleavage_site) ].copy()
    if len(df_bdg_chr_R)<=window_size:
        # Too few rows to smooth: emit all-zero features for the right side.
        R_length = 0
        R_overall_signal = 0
        for flank in flank_regions:
            list_signal_pct_R.extend([0,0,0])
            list_pct_score_R.append(0)
            list_signal_residual_R.append(0)
    else:
        ############################
        ## Overall signal first   ##
        ############################
        R_length = 0
        # The right-side signal has the opposite sign; flip it so that
        # positive still means "signal" below.
        df_bdg_chr_R[value] = -df_bdg_chr_R[value]
        # Smooth the signal
        if smooth_times > 0:
            signal_residual_R = window_smooth(df_bdg_chr_R[value], window_size=window_size, times=smooth_times)
        else:
            signal_residual_R = df_bdg_chr_R[value]
        # Signal length: merge up to length_binsize-sized bins ('last' so
        # the partial block near the far end is dropped).
        signal_residual_R_merged = segmental_mean(signal_residual_R, n_merge, drop='last')
        # Guard against too-short input.
        if len(signal_residual_R_merged)<=window_size:
            R_length = 0
            R_overall_signal = 0
        else:
            signal_residual_R_merged = window_smooth(signal_residual_R_merged, window_size=3, times=3)
            # Background mean / std from the far-right (background) bins.
            bkg_R_mean = signal_residual_R_merged[-n_bkg:].mean()
            bkg_R_std = max(length_min_noise, signal_residual_R_merged[-n_bkg:].std())
            # signal_residual_R_merged = signal_residual_R_merged - bkg_R_mean
            # First position below the threshold = signal end.
            signal_end_index = signal_residual_R_merged.index[signal_residual_R_merged<bkg_R_mean+n_std*bkg_R_std].min()
            # Convert the above-threshold bin count into a bp length.
            R_n_bins = signal_end_index
            if R_n_bins == 0:
                R_length = 0
            else:
                df_bdg_chr_R_good = df_bdg_chr_R[:n_merge*R_n_bins]
                R_length = df_bdg_chr_R_good[end].iloc[-1]-df_bdg_chr_R_good[start].iloc[0]
            # Overall signal strength of the smoothed right profile.
            R_overall_signal = signal_residual_R_merged.sum()
        ############################
        ## Then proximal features ##
        ############################
        # Note: df_bdg_chr_R[value] was negated above, so positive values
        # still indicate signal here.
        for flank in flank_regions:
            bool_flank = (df_bdg_chr_R[start] <= cleavage_site+flank)
            df_bdg_chr_R_flank = df_bdg_chr_R[ bool_flank ]
            signal_residual_R_flank = signal_residual_R[ bool_flank ]
            if df_bdg_chr_R_flank.empty:
                list_signal_pct_R.extend( [0,0,0] )
                list_pct_score_R.append(0)
                list_signal_residual_R.append(0)
                continue
            # pos and neg
            df_bdg_chr_R_flank_pos = df_bdg_chr_R_flank[df_bdg_chr_R_flank[value] > 0]
            df_bdg_chr_R_flank_neg = df_bdg_chr_R_flank[df_bdg_chr_R_flank[value] <= 0]
            n_pos_right = len(df_bdg_chr_R_flank_pos)
            n_neg_right = len(df_bdg_chr_R_flank_neg)
            # avoid zero division
            if n_pos_right == 0:
                pos_pct_right = 0
            else:
                pos_pct_right = n_pos_right/(n_pos_right+n_neg_right)
            # pos/neg value sum
            right_pos_sum = df_bdg_chr_R_flank_pos[value].sum()
            right_neg_sum = df_bdg_chr_R_flank_neg[value].sum()
            list_signal_pct_R.extend( [pos_pct_right,right_pos_sum,right_neg_sum] )
            # Smoothed sum, normalized per kb of flank.
            right_region_sum_norm = 1000*signal_residual_R_flank.sum()/flank
            list_signal_residual_R.append(right_region_sum_norm)
            # pct_score
            right_pct_score = right_region_sum_norm*max(0,(pos_pct_right-pct_offset))
            list_pct_score_R.append(right_pct_score)


    # calculate proximal_signal
    mean_signal_residual_L = np.mean(list_signal_residual_L)
    mean_signal_residual_R = np.mean(list_signal_residual_R)
    proximal_signal = mean_signal_residual_L+mean_signal_residual_R
    # calculate pct_score
    mean_pct_score_L = np.mean(list_pct_score_L)
    mean_pct_score_R = np.mean(list_pct_score_R)
    pct_score = mean_pct_score_L+mean_pct_score_R
    # calculate length and overall_signal
    signal_length = L_length + R_length
    #pct_signal_length = L_pct_length + R_pct_length
    # A genuine edit site or a large noise peak far away can corrupt the
    # overall-signal background, so cap overall against the proximal scores.
    if L_overall_signal > 2*(mean_pct_score_L+mean_signal_residual_L):
        L_overall_signal = (mean_pct_score_L+mean_signal_residual_L)/2
    if R_overall_signal > 2*(mean_pct_score_R+mean_signal_residual_R):
        R_overall_signal = (mean_pct_score_R+mean_signal_residual_R)/2
    overall_signal = L_overall_signal + R_overall_signal
    list_return = list_signal_pct_L + list_signal_pct_R + \
                  list_pct_score_L + list_pct_score_R + \
                  list_signal_residual_L + list_signal_residual_R + \
                  [mean_signal_residual_L, mean_signal_residual_R] + \
                  [mean_pct_score_L, mean_pct_score_R] + \
                  [chrom+':'+str(cleavage_site)] + \
                  [L_length, R_length, L_overall_signal, R_overall_signal, signal_length, overall_signal, proximal_signal, pct_score]
    # [L_pct_length, R_pct_length, pct_signal_length] are not included here;
    # a separate function handles them.
    # Element counts: 2*3*n_regions
    #                 2*n_regions
    #                 2*n_regions
    #                 2+2+1+8

    return list_return
|
288
|
+
|
289
|
+
def target_signal_chunk(df_bdg_chr, df_alignment_chr, flank_max=100000, smooth_times = 1, window_size = 3, binsize=100, flank_regions=[500,1000,2000,5000],
                        length_bkg = 20000, length_binsize=1000, length_min_noise=0.2, n_std=1, pct_offset=0.0):
    """Run target_signal for every candidate site of one chromosome.

    Parameters
    ----------
    df_bdg_chr : pandas.DataFrame
        bedGraph-like signal table for a single chromosome (see
        target_signal).
    df_alignment_chr : pandas.DataFrame
        Candidate sites: each row must unpack into exactly two values,
        (chromosome, cleavage_site), on the same chromosome as df_bdg_chr.
    Remaining parameters are forwarded unchanged to target_signal.

    Returns
    -------
    pandas.DataFrame
        One row per candidate site; columns are named after the feature
        order returned by target_signal (per-region pct features and
        scores, summary means, the 'chr:site' label, lengths and signals).
    """
    # All rows must come from a single chromosome.
    list_target_all = []
    for a_row in df_alignment_chr.iterrows():
        chrom, cleavage_site = a_row[1]
        list_target = target_signal(df_bdg_chr, chrom, cleavage_site, flank_max, smooth_times = smooth_times, window_size = window_size, binsize=binsize, flank_regions=flank_regions,
                                    length_bkg = length_bkg, length_binsize=length_binsize, length_min_noise=length_min_noise, n_std=n_std, pct_offset=pct_offset)
        list_target_all.append(list_target)
    df_result = pd.DataFrame(list_target_all)
    # Build per-region column names: (pos_pct, pos_sum, neg_sum) per side,
    # then flatten the nested lists.
    pct_features_L = [['L_pos_pct_'+x,'L_pos_'+x,'L_neg_'+x] for x in pd.Series(flank_regions).astype(str)]
    pct_features_L = [item for sublist in pct_features_L for item in sublist]
    pct_features_R = [['R_pos_pct_'+x,'R_pos_'+x,'R_neg_'+x] for x in pd.Series(flank_regions).astype(str)]
    pct_features_R = [item for sublist in pct_features_R for item in sublist]
    # Column order must match the list returned by target_signal.
    df_result.columns = pct_features_L + pct_features_R + \
        list('L_pct_score_' + pd.Series(flank_regions).astype(str)) + list('R_pct_score_' + pd.Series(flank_regions).astype(str)) + \
        list('L_' + pd.Series(flank_regions).astype(str)) + list('R_' + pd.Series(flank_regions).astype(str)) + \
        ['L_mean', 'R_mean','L_mean_pct_score','R_mean_pct_score','chr_cleavage',
         'L_length', 'R_length', 'L_overall_signal', 'R_overall_signal', 'signal_length', 'overall_signal','proximal_signal','pct_score']
    return df_result
|
309
|
+
|
310
|
+
|
311
|
+
|
312
|
+
|
313
|
+
# 2024.01.22. Additional signal-length routine: meant to estimate a length
# ("signal_length") based on the positive-bin fraction (pos_pct) rather than
# the smoothed overall signal.
# NOTE(review): this function is unfinished and cannot run as written —
# see the inline notes below. Kept verbatim pending completion.
def signal_length(df_bdg_chr, chrom, cleavage_site, end='end',start='start',value='residual',
                  flank_max=100000, binsize=100, pct_threshold=0.6):
    """(Work in progress) pos_pct-based signal length around a cleavage site.

    NOTE(review): `df_bdg_chr_L_flank` and `list_return` are referenced but
    never defined, so calling this raises NameError. `pct_threshold` and the
    right-flank subset are never used yet.
    """
    # All rows of df_bdg_chr must come from a single chromosome.
    # Left flank: rows whose end lies within flank_max upstream of the site.
    df_bdg_chr_L = df_bdg_chr[ (df_bdg_chr[end] >= cleavage_site-flank_max) & (df_bdg_chr[end]<=cleavage_site) ].copy()

    # pos and neg
    # NOTE(review): `df_bdg_chr_L_flank` is undefined — presumably intended
    # to be `df_bdg_chr_L` (or a flank subset of it); NameError as written.
    df_bdg_chr_L_flank_pos = df_bdg_chr_L_flank[df_bdg_chr_L_flank[value] > 0]
    df_bdg_chr_L_flank_neg = df_bdg_chr_L_flank[df_bdg_chr_L_flank[value] <= 0]
    n_pos_left = len(df_bdg_chr_L_flank_pos)
    n_neg_left = len(df_bdg_chr_L_flank_neg)
    # avoid zero division
    if n_pos_left == 0:
        pos_pct_left = 0
    else:
        pos_pct_left = n_pos_left/(n_pos_left+n_neg_left)


    # Right flank: rows whose start lies within flank_max downstream.
    df_bdg_chr_R = df_bdg_chr[ (df_bdg_chr[start] <= cleavage_site+flank_max) & (df_bdg_chr[start]>=cleavage_site) ].copy()
    # list_signal_residual_L values are comparable to earlier versions
    list_signal_pct_L = []
    list_pct_score_L = []
    list_signal_residual_L = []

    # NOTE(review): `list_return` is never assigned — NameError if reached.
    return list_return
|
@@ -67,10 +67,10 @@ def write_fasta(df, output, name_col = 'ID', sequence_col='sequence', line_len =
|
|
67
67
|
f.write( df[sequence_col].iloc[i][ j*line_len : (j+1)*line_len ] + '\n')
|
68
68
|
return 'fasta is written.'
|
69
69
|
|
70
|
-
def
|
70
|
+
def write_bed(df, bed_dir):
    """Write `df` to `bed_dir` as a tab-separated, headerless BED file.

    Returns None (the return value of DataFrame.to_csv given a path).
    """
    return df.to_csv(bed_dir, sep='\t', header=None, index=False)
|
72
72
|
|
73
|
-
def
|
73
|
+
def read_bed(bed_dir):
    """Read a tab-separated, headerless BED file into a DataFrame.

    Columns are numbered 0..N-1 since the file has no header row.
    """
    return pd.read_csv(bed_dir,sep='\t',header=None)
|
75
75
|
|
76
76
|
def X_readbed(bed_dir):
|
@@ -90,7 +90,7 @@ def bedfmt(igv):
|
|
90
90
|
igv['ed'] = igv['ed'].astype(int)
|
91
91
|
return igv
|
92
92
|
|
93
|
-
def add_ID(df, chr_col=0, midpoint='midpoint'):
|
93
|
+
def add_ID(df, chr_col=0, midpoint='cleavage_site'):#, midpoint='midpoint'):
|
94
94
|
chr_col_name = df.columns[chr_col]
|
95
95
|
print(f'chromosome col = {chr_col_name}')
|
96
96
|
point_head = (df[midpoint]/1000).astype(int)
|
@@ -130,7 +130,7 @@ def sgRNA_alignment(a_key, sgRNA, seq, frag_len, DNA_matrix=None, mismatch_score
|
|
130
130
|
pos_ed = int(a_key.split('-')[1])
|
131
131
|
chr_name = a_key.split(':')[0]
|
132
132
|
target_st = pos_st + best_alignment.start
|
133
|
-
target_ed = pos_st + best_alignment.end - 1 #
|
133
|
+
target_ed = pos_st + best_alignment.end - 1 - deletion # 2023.12.05 修正 deletion 错位
|
134
134
|
target_location = chr_name + ':' + str(target_st) + '-' + str(target_ed)
|
135
135
|
if return_align:
|
136
136
|
return [best_alignment.score, position_pct, target, target_location, deletion, insertion, mismatch, best_alignment.seqB]
|
@@ -138,4 +138,17 @@ def sgRNA_alignment(a_key, sgRNA, seq, frag_len, DNA_matrix=None, mismatch_score
|
|
138
138
|
return [best_alignment.score, position_pct, target, target_location, deletion, insertion, mismatch]
|
139
139
|
|
140
140
|
|
141
|
-
|
141
|
+
def combine_df(list_df, op = 'mean'):
    """Combine several identically structured DataFrames element-wise.

    All frames must share the same index, columns and non-numeric content;
    only the numeric cells may differ. The non-numeric columns are taken
    from the first frame and the numeric columns are aggregated row-wise
    across the frames.

    Parameters
    ----------
    list_df : list of pandas.DataFrame
        Frames to combine (at least one).
    op : {'mean', 'max', 'min'}
        Aggregation applied to the numeric cells.

    Returns
    -------
    pandas.DataFrame
        Non-numeric columns of list_df[0] followed by the aggregated
        numeric columns.

    Raises
    ------
    ValueError
        If `op` is not one of 'mean', 'max' or 'min'. (The original
        printed a message and then crashed with NameError.)
    """
    # Non-numeric columns are assumed identical across frames; keep the
    # first frame's copy.
    df_nondigit = list_df[0].select_dtypes(exclude=[float, int])
    # groupby(level=0) aligns rows with the same index label across the
    # concatenated frames.
    if op=='mean':
        df_combined = pd.concat(list_df).groupby(level=0).mean(numeric_only=True)
    elif op=='max':
        df_combined = pd.concat(list_df).groupby(level=0).max(numeric_only=True)
    elif op=='min':
        df_combined = pd.concat(list_df).groupby(level=0).min(numeric_only=True)
    else:
        raise ValueError("op must be 'mean', 'max' or 'min'")
    df_combined = pd.concat([df_nondigit, df_combined], axis=1)
    return df_combined
|
@@ -1,2 +1,2 @@
|
|
1
1
|
from ._version import __version__
|
2
|
-
from . import
|
2
|
+
from .X_offtracker import *
|
@@ -0,0 +1,27 @@
|
|
1
|
+
__version__ = "2.7.7"
|
2
|
+
# 2023.08.11. v1.1.0 adding a option for not normalizing the bw file
|
3
|
+
# 2023.10.26. v1.9.0 prerelease for v2.0
|
4
|
+
# 2023.10.27. v2.0.0 大更新,还没微调
|
5
|
+
# 2023.10.28. v2.1.0 修复bug,增加计算信号长度的功能
|
6
|
+
# 2023.10.28. v2.2.0 修复bug,改变计算信号长度的算法
|
7
|
+
# 2023.10.29. v2.3.0 增加 overall signal 计算
|
8
|
+
# 2023.11.01. v2.3.1 增加 signal_only 选项
|
9
|
+
# 2023.11.02. v2.3.2 修改 sample signal 和 group mean 的计算顺序
|
10
|
+
# 2023.11.04. v2.3.3 修复 overall score 标准化时排序错误的问题
|
11
|
+
# 2023.11.05. v2.3.4 修复判断单边溢出信号时的列名选取错误
|
12
|
+
# 2023.11.13. v2.3.5 微调 track score
|
13
|
+
# 2023.12.05. v2.3.6 candidates 增加 cleavage site,修正 alignment 有 deletion 会错位的 bug
|
14
|
+
# 2023.12.05. v2.3.7 用 cleavage site 代替 midpoint # 还没改完
|
15
|
+
# 2023.12.07. v2.3.8 df_score 增加 df_exp, df_ctr 各自列。修复没 df_ctr 时的 bug。track score 用 proximal
|
16
|
+
# 2023.12.09. v2.4.0 为了兼顾 proximal 和 overall,当 normalized overall signal 高于 2 时,增加 overall signal 的加分
|
17
|
+
# 2023.12.09. v2.5.0 尝试新的加权位置
|
18
|
+
# 2023.12.10. v2.6.0 加入 trackseq v4 的计算分支,即考虑 Region 内的 positive_pct,避免短而尖锐的信号
|
19
|
+
# 2023.12.10. v2.6.1 有些非特异信号数值很大,如果在 control 组是大负数,可能导致减 control 后假高信号,因此给负数一个 clip
|
20
|
+
# 2023.12.30. v2.7.0 增加 X_offplot 模块,用于绘图
|
21
|
+
# 2023.12.31. v2.7.1 control 的负数值 clip 由 -5 改为 -1,进一步减少假阳性。另外不加 overall 了
|
22
|
+
# 2024.01.01. v2.7.2 权重改为 proximal + pct = 1 + 1. 防信号外溢假阳性标准由<0改为<=0
|
23
|
+
# 2024.01.02. v2.7.3 flank regions 默认值改为 1000 2000 3000 5000。之前 control 的负数值 clip 相当于直接在 final score,现在改为每个单独 clip 后重新算 score,默认值为 CtrClip=-0.5
|
24
|
+
# 2024.01.03. v2.7.4 更新了 blacklist.bed
|
25
|
+
# 2024.01.04. v2.7.5 更新了 hg38 blacklist.bed
|
26
|
+
# 2024.01.12. v2.7.6 修复小bug,输出 fdr 改为 <0.05。
|
27
|
+
# 2024.01.23. v2.7.7 Snakefile_offtracker: add --fixedStep to bigwigCompare for not merging neighbouring bins with equal values.
|