offtracker 2.12.3__zip → 2.13.1__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {offtracker-2.12.3/offtracker.egg-info → offtracker-2.13.1}/PKG-INFO +1 -1
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/X_offtracker.py +314 -1
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/X_sequence.py +133 -1
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/_version.py +12 -10
- {offtracker-2.12.3 → offtracker-2.13.1/offtracker.egg-info}/PKG-INFO +1 -1
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker.egg-info/SOURCES.txt +1 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/scripts/offtracker_analysis.py +47 -32
- offtracker-2.13.1/scripts/offtracker_correction.py +282 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/setup.py +1 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/LICENSE.txt +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/MANIFEST.in +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/README.md +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/X_offplot.py +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/__init__.py +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/snakefile/Snakefile_QC.smk +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/snakefile/Snakefile_offtracker.smk +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/utility/1.1_bed2fr.py +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/utility/1.3_bdg_normalize_v4.0.py +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/utility/bedGraphToBigWig +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/utility/hg38.chrom.sizes +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/utility/mm10.chrom.sizes +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/utility/offtracker_blacklist_hg38.merged.bed +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker/utility/offtracker_blacklist_mm10.merged.bed +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker.egg-info/dependency_links.txt +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker.egg-info/requires.txt +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/offtracker.egg-info/top_level.txt +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/scripts/offtracker_candidates.py +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/scripts/offtracker_config.py +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/scripts/offtracker_init.py +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/scripts/offtracker_plot.py +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/scripts/offtracker_qc.py +0 -0
- {offtracker-2.12.3 → offtracker-2.13.1}/setup.cfg +0 -0
|
@@ -2,7 +2,8 @@
|
|
|
2
2
|
import pandas as pd
|
|
3
3
|
import polars as pl
|
|
4
4
|
import numpy as np
|
|
5
|
-
import os, sys
|
|
5
|
+
import os, sys, re
|
|
6
|
+
import offtracker.X_sequence as xseq
|
|
6
7
|
sys.path.append( os.path.abspath(os.path.dirname(__file__)) )
|
|
7
8
|
|
|
8
9
|
def fdr(p_vals):
|
|
@@ -115,6 +116,29 @@ def target_signal(df_bdg_chr, chrom, cleavage_site, flank_max=100000, smooth_tim
|
|
|
115
116
|
binsize=100, flank_regions=[500,1000,2000,5000],
|
|
116
117
|
length_bkg = 20000, length_binsize=1000, length_min_noise=0.2, n_std=1,
|
|
117
118
|
end='end',start='start',value='residual', pct_offset=0.0):
|
|
119
|
+
"""_summary_
|
|
120
|
+
|
|
121
|
+
Args:
|
|
122
|
+
df_bdg_chr (_type_): .bdg table with the same chromosome
|
|
123
|
+
chrom (_type_): chr name
|
|
124
|
+
cleavage_site (_type_): cleavage site
|
|
125
|
+
flank_max (int, optional): _description_. Defaults to 100000.
|
|
126
|
+
smooth_times (int, optional): _description_. Defaults to 1.
|
|
127
|
+
window_size (int, optional): _description_. Defaults to 3.
|
|
128
|
+
binsize (int, optional): _description_. Defaults to 100.
|
|
129
|
+
flank_regions (list, optional): _description_. Defaults to [500,1000,2000,5000].
|
|
130
|
+
length_bkg (int, optional): _description_. Defaults to 20000.
|
|
131
|
+
length_binsize (int, optional): _description_. Defaults to 1000.
|
|
132
|
+
length_min_noise (float, optional): _description_. Defaults to 0.2.
|
|
133
|
+
n_std (int, optional): _description_. Defaults to 1.
|
|
134
|
+
end (str, optional): _description_. Defaults to 'end'.
|
|
135
|
+
start (str, optional): _description_. Defaults to 'start'.
|
|
136
|
+
value (str, optional): _description_. Defaults to 'residual'.
|
|
137
|
+
pct_offset (float, optional): _description_. Defaults to 0.0.
|
|
138
|
+
|
|
139
|
+
Returns:
|
|
140
|
+
_type_: _description_
|
|
141
|
+
"""
|
|
118
142
|
# 输入数据必须是同一条染色体内的
|
|
119
143
|
# 统计 flank regions 的个数
|
|
120
144
|
# n_regions = len(flank_regions)
|
|
@@ -328,6 +352,25 @@ def target_signal(df_bdg_chr, chrom, cleavage_site, flank_max=100000, smooth_tim
|
|
|
328
352
|
|
|
329
353
|
def target_signal_chunk(df_bdg_chr, df_alignment_chr, flank_max=100000, smooth_times = 1, window_size = 3, binsize=100, flank_regions=[500,1000,2000,5000],
|
|
330
354
|
length_bkg = 20000, length_binsize=1000, length_min_noise=0.2, n_std=1, pct_offset=0.0):
|
|
355
|
+
"""
|
|
356
|
+
|
|
357
|
+
Args:
|
|
358
|
+
df_bdg_chr (_type_): .bdg table with the same chromosome
|
|
359
|
+
df_alignment_chr (_type_): candidate sites
|
|
360
|
+
flank_max (int, optional): _description_. Defaults to 100000.
|
|
361
|
+
smooth_times (int, optional): _description_. Defaults to 1.
|
|
362
|
+
window_size (int, optional): _description_. Defaults to 3.
|
|
363
|
+
binsize (int, optional): _description_. Defaults to 100.
|
|
364
|
+
flank_regions (list, optional): _description_. Defaults to [500,1000,2000,5000].
|
|
365
|
+
length_bkg (int, optional): _description_. Defaults to 20000.
|
|
366
|
+
length_binsize (int, optional): _description_. Defaults to 1000.
|
|
367
|
+
length_min_noise (float, optional): _description_. Defaults to 0.2.
|
|
368
|
+
n_std (int, optional): _description_. Defaults to 1.
|
|
369
|
+
pct_offset (float, optional): _description_. Defaults to 0.0.
|
|
370
|
+
|
|
371
|
+
Returns:
|
|
372
|
+
_type_: _description_
|
|
373
|
+
"""
|
|
331
374
|
# 输入数据必须是同一条染色体内的
|
|
332
375
|
list_target_all = []
|
|
333
376
|
for a_row in df_alignment_chr.iterrows():
|
|
@@ -348,3 +391,273 @@ def target_signal_chunk(df_bdg_chr, df_alignment_chr, flank_max=100000, smooth_t
|
|
|
348
391
|
return df_result
|
|
349
392
|
|
|
350
393
|
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
############################################################################
|
|
404
|
+
# 2025.08.08 新写的基于 Bio.Align 的 local realign,用于局部修正脱靶位点坐标 #
|
|
405
|
+
############################################################################
|
|
406
|
+
def create_substitution_matrix(mismatch_score=0.01):
    """
    Build the DNA scoring table used by the Bio.Align based aligner.

    Args:
        mismatch_score (float, optional): Score assigned to a pair of
            different concrete bases. Defaults to 0.01.

    Returns:
        tuple: ``(matrix, alphabet)``. ``alphabet`` is the string 'ATGCN'
        and ``matrix`` is a 5x5 numpy array indexed in that order. A base
        paired with itself scores 2.0, and every pairing that involves 'N'
        also scores 2.0.
    """
    alphabet = 'ATGCN'
    size = len(alphabet)
    matrix = np.full((size, size), mismatch_score)

    # Diagonal: a base aligned with itself gets the match score.
    np.fill_diagonal(matrix, 2.0)

    # 'N' acts as a wildcard, so its whole row and column get the match score.
    wildcard = alphabet.index('N')
    matrix[wildcard, :] = 2.0
    matrix[:, wildcard] = 2.0

    return matrix, alphabet
|
|
423
|
+
|
|
424
|
+
def sgRNA_alignment_new(a_key, sgRNA, seq, substitution_matrix=None, alphabet=None,
                        mismatch_score=0.01):
    """
    Locally align an sgRNA(+PAM) query to one genomic sequence with Bio.Align.

    Replaces the alignment that relied on the deprecated ``pairwise2`` module.

    Args:
        a_key (str): Key of ``seq`` in "chr:start-end" form; it is used to
            turn alignment offsets into genomic coordinates.
        sgRNA (str): Query sequence.
        seq (str): Genomic sequence that is searched for the query.
        substitution_matrix (optional): Score matrix matching ``alphabet``.
            When this or ``alphabet`` is None, both are built with
            ``create_substitution_matrix(mismatch_score)``.
        alphabet (str, optional): Alphabet of ``substitution_matrix``.
        mismatch_score (float, optional): Per-mismatch score, also the unit
            used to read the mismatch count back from the alignment score.
            Defaults to 0.01.

    Returns:
        list: ``[score, aligned_genome_seq, target_location, deletion,
        insertion, mismatch]``. If nothing aligns, or the alignment cannot
        be interpreted, a placeholder with the same six fields is returned:
        ``[0, '', '<chr>:0-0', 0, 0, len(sgRNA)]``.
    """
    from Bio import Align

    chr_name = a_key.split(':')[0]
    # The placeholder must have the same six fields as a successful result,
    # because callers load the results into six-column tables.
    no_hit = [0, '', f"{chr_name}:0-0", 0, 0, len(sgRNA)]

    if substitution_matrix is None or alphabet is None:
        substitution_matrix, alphabet = create_substitution_matrix(mismatch_score)

    # Create aligner
    aligner = Align.PairwiseAligner()
    aligner.substitution_matrix = Align.substitution_matrices.Array(
        alphabet=alphabet, dims=2, data=substitution_matrix
    )
    aligner.open_gap_score = -2
    aligner.extend_gap_score = -2
    aligner.mode = 'local'

    try:
        # Only the first optimal alignment is used, so take it lazily instead
        # of materialising every co-optimal alignment.
        best = next(iter(aligner.align(sgRNA, seq)), None)
        if best is None:
            # No alignment found
            return no_hit

        coords = best.coordinates
        start_target = int(coords[1][0])
        end_target = int(coords[1][-1])

        # The printed alignment has the query on its first line and the
        # genomic sequence on its third; each line reads
        # "<label> <start> <aligned sequence> <end>".
        alignment_lines = str(best).split('\n')
        if len(alignment_lines) < 3:
            raise ValueError("Unexpected alignment format")
        aligned_sgrna = alignment_lines[0].split()
        aligned_genome = alignment_lines[2].split()

        # Only alignments that run to the last base of the query are accepted.
        if int(aligned_sgrna[-1]) != len(sgRNA):
            raise ValueError("alignment does not reach the end of the sgRNA")

        # deletion = RNA bulge (gap in the genomic row)
        # insertion = DNA bulge (gap in the query row)
        aligned_sgrna_seq = aligned_sgrna[-2]
        aligned_genome_seq = aligned_genome[-2]
        insertion = aligned_sgrna_seq.count('-')
        deletion = aligned_genome_seq.count('-')

        # With the default matrix, matches (2.0) and gap penalties (-2) are
        # whole numbers, so the fractional part of the score is
        # mismatch_score times the number of mismatches.
        # NOTE(review): this presumably breaks down for a custom matrix with
        # non-integer match scores - confirm before passing one in.
        score = best.score
        mismatch = round((score % 1) / mismatch_score)

        # Calculate target location from the start coordinate in a_key
        pos_st = int(a_key.split('-')[0].split(':')[1]) + 1
        target_st = pos_st + start_target
        target_ed = pos_st + end_target - 1
        target_location = f"{chr_name}:{target_st}-{target_ed}"

        return [score, aligned_genome_seq, target_location, deletion, insertion, mismatch]

    except Exception as e:
        # Best effort by design: report the problem and let the caller carry
        # on with a placeholder row.
        print(f"Alignment error for {a_key}: {e}")
        return no_hit
|
|
503
|
+
|
|
504
|
+
def local_realign(sgRNA_seq, fasta, PAM='NGG', PAM_loc='downstream'):
    """
    Align sgRNA+PAM to every sequence in ``fasta`` on both strands and keep
    the better-scoring hit per sequence.

    Args:
        sgRNA_seq (str): sgRNA sequence without PAM.
        fasta (dict): Maps a location key to its sequence. Sequences are
            upper-cased and every character other than A/T/C/G becomes 'N'
            before alignment.
        PAM (str, optional): PAM sequence. Defaults to 'NGG'.
        PAM_loc (str, optional): 'downstream' appends the PAM to the sgRNA;
            any other value prepends it. Defaults to 'downstream'.

    Returns:
        pd.DataFrame: One row per entry of ``fasta``. It holds the output of
        ``xseq.bedfmt`` for the chosen location, the fw_*/rv_* alignment
        columns, 'location', 'alignment_score', 'best_seq_score',
        'best_strand', 'deletion', 'insertion', 'mismatch', 'GG',
        'best_target', 'target_location' and 'cleavage_site'.
    """
    # Attach the PAM, then derive the reverse-strand query.
    if PAM_loc == 'downstream':
        query_fw = sgRNA_seq + PAM
    else:
        query_fw = PAM + sgRNA_seq
    query_rv = xseq.reverse_complement(query_fw)

    # 2025.04.25: normalise case and mask everything that is not A/T/C/G.
    cleaned = [(key, re.sub('[^ATCG]','N',raw.upper())) for key, raw in fasta.items()]
    hits_fw = [sgRNA_alignment_new(key, query_fw, region) for key, region in cleaned]
    hits_rv = [sgRNA_alignment_new(key, query_rv, region) for key, region in cleaned]

    fields = ['score', 'target', 'location', 'deletion', 'insertion', 'mismatch']
    df_fw = pd.DataFrame(hits_fw, columns=['fw_' + name for name in fields])
    df_rv = pd.DataFrame(hits_rv, columns=['rv_' + name for name in fields])
    # Reverse-strand targets are reported in the orientation of the sgRNA.
    df_rv['rv_target'] = df_rv['rv_target'].apply(xseq.reverse_complement)

    df_candidate = pd.concat([df_fw, df_rv], axis=1)
    df_candidate['location'] = fasta.keys()
    strand_scores = df_candidate[['fw_score', 'rv_score']]
    df_candidate['alignment_score'] = strand_scores.max(axis=1)
    df_candidate['best_seq_score'] = strand_scores.max(axis=1)
    df_candidate['best_strand'] = strand_scores.idxmax(axis='columns').replace({'fw_score':'+', 'rv_score':'-'})
    tied = df_candidate['fw_score']==df_candidate['rv_score']
    df_candidate.loc[tied,'best_strand']='equal_score'

    # GG check plus cleavage-site estimate (added 2023.12.05).
    # NOTE(review): the 'GG' suffix test and the -6 / +5 offsets look tailored
    # to a 3-nt downstream NGG PAM - confirm before using other PAM settings.
    chosen = {'deletion': [], 'insertion': [], 'mismatch': [], 'GG': [],
              'best_target': [], 'target_location': [], 'cleavage_site': []}
    for _, row in df_candidate.iterrows():
        strand = row['best_strand']
        if strand == '+':
            side = 'fw'
            label = 'OK' if row['fw_target'][-2:]=='GG' else 'NO'
        elif strand == '-':
            side = 'rv'
            label = 'OK' if row['rv_target'][-2:]=='GG' else 'NO'
        elif row['fw_target'][-2:]=='GG':
            # Tie: prefer the forward hit when it ends in GG ...
            side, label = 'fw', 'OK_same_score'
        elif row['rv_target'][-2:]=='GG':
            # ... otherwise look at the reverse complement ...
            side, label = 'rv', 'OK_same_score'
        else:
            # ... and fall back to the forward hit when neither ends in GG.
            side, label = 'fw', 'NO_same_score'

        hit_location = row[side + '_location']
        if side == 'fw':
            site = int(hit_location.split('-')[1]) - 6
        else:
            site = int(hit_location.split('-')[0].split(':')[1]) + 5

        chosen['best_target'].append(row[side + '_target'])
        chosen['target_location'].append(hit_location)
        chosen['cleavage_site'].append(site)
        chosen['deletion'].append(row[side + '_deletion'])
        chosen['insertion'].append(row[side + '_insertion'])
        chosen['mismatch'].append(row[side + '_mismatch'])
        chosen['GG'].append(label)

    # Record the per-row choice in df_candidate.
    for column, values in chosen.items():
        df_candidate[column] = values

    return pd.concat([xseq.bedfmt(df_candidate['target_location']), df_candidate], axis=1)
|
|
600
|
+
|
|
601
|
+
def left_realign(dp_bdg_chr, loc_shift_left, ref_fasta, sgRNA_seq, PAM, PAM_loc, n_iter):
    """
    Re-align a candidate inside a window moved to the left, repeating while
    the left-side signal stays negative.

    Args:
        dp_bdg_chr: Table exposing ``to_pandas()``; the converted table is
            passed to ``target_signal``.
        loc_shift_left (str): Window to search first, as "chr:start-end".
        ref_fasta: Reference FASTA handed to ``xseq.get_seq``.
        sgRNA_seq (str): sgRNA sequence without PAM.
        PAM (str): PAM sequence.
        PAM_loc (str): PAM position, forwarded to ``local_realign``.
        n_iter (int): Number of shifts already performed; another search is
            only started while this counter is below 10.

    Returns:
        pd.Series: First row of the realignment result with an extra
        'realign' entry that is either 'success' or 'fail'.
    """
    window = loc_shift_left
    attempts = n_iter
    while True:
        region_seqs = xseq.get_seq(window, ref_fasta)
        hits = local_realign(sgRNA_seq, region_seqs, PAM, PAM_loc)
        sr_candidate = hits.iloc[0].copy()
        chrom = sr_candidate['chr']
        site = sr_candidate['cleavage_site']
        signals = target_signal(dp_bdg_chr.to_pandas(), chrom, site, flank_regions=[500])
        # NOTE(review): entries 2 and 5 are read as the left- and right-side
        # signals; their exact meaning depends on target_signal - confirm.
        left_signal = signals[2]
        right_signal = signals[5]

        # A negative right side means the window was moved too far.
        if right_signal < 0:
            sr_candidate.loc['realign'] = 'fail'
            return sr_candidate

        if left_signal < 0:
            # Still negative on the left: move further left and try again,
            # at most until the counter reaches 10.
            st = sr_candidate['st']
            ed = sr_candidate['ed']
            window = f'{chrom}:{int(st)-1000}-{int(ed)-20}'
            attempts += 1
            if attempts < 10:
                continue
            sr_candidate.loc['realign'] = 'fail'
            return sr_candidate

        sr_candidate.loc['realign'] = 'success'
        return sr_candidate
|
|
631
|
+
|
|
632
|
+
def right_realign(dp_bdg_chr, loc_shift_right, ref_fasta, sgRNA_seq, PAM, PAM_loc, n_iter):
    """
    Re-align a candidate inside a window moved to the right, repeating while
    the right-side signal stays negative.

    Args:
        dp_bdg_chr: Table exposing ``to_pandas()``; the converted table is
            passed to ``target_signal``.
        loc_shift_right (str): Window to search first, as "chr:start-end".
        ref_fasta: Reference FASTA handed to ``xseq.get_seq``.
        sgRNA_seq (str): sgRNA sequence without PAM.
        PAM (str): PAM sequence.
        PAM_loc (str): PAM position, forwarded to ``local_realign``.
        n_iter (int): Number of shifts already performed; another search is
            only started while this counter is below 10.

    Returns:
        pd.Series: First row of the realignment result with an extra
        'realign' entry that is either 'success' or 'fail'.
    """
    window = loc_shift_right
    attempts = n_iter
    while True:
        region_seqs = xseq.get_seq(window, ref_fasta)
        hits = local_realign(sgRNA_seq, region_seqs, PAM, PAM_loc)
        sr_candidate = hits.iloc[0].copy()
        chrom = sr_candidate['chr']
        site = sr_candidate['cleavage_site']
        signals = target_signal(dp_bdg_chr.to_pandas(), chrom, site, flank_regions=[500])
        # NOTE(review): entries 2 and 5 are read as the left- and right-side
        # signals; their exact meaning depends on target_signal - confirm.
        left_signal = signals[2]
        right_signal = signals[5]

        # A negative left side means the window was moved too far.
        if left_signal < 0:
            sr_candidate.loc['realign'] = 'fail'
            return sr_candidate

        if right_signal < 0:
            # Still negative on the right: move further right and try again,
            # at most until the counter reaches 10.
            st = sr_candidate['st']
            ed = sr_candidate['ed']
            window = f'{chrom}:{int(st)+20}-{int(ed)+1000}'
            attempts += 1
            if attempts < 10:
                continue
            sr_candidate.loc['realign'] = 'fail'
            return sr_candidate

        sr_candidate.loc['realign'] = 'success'
        return sr_candidate
|
|
662
|
+
|
|
663
|
+
|
|
@@ -3,7 +3,8 @@ import math
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
from itertools import product
|
|
5
5
|
import numpy as np
|
|
6
|
-
import os, glob
|
|
6
|
+
import os, glob, sys
|
|
7
|
+
import polars as pl
|
|
7
8
|
|
|
8
9
|
ambiguous_nt = {'A': ['A'],
|
|
9
10
|
'T': ['T'],
|
|
@@ -246,6 +247,137 @@ def sgRNA_alignment(a_key, sgRNA, seq, frag_len, DNA_matrix=None, mismatch_score
|
|
|
246
247
|
else:
|
|
247
248
|
return [best_alignment.score, position_pct, target, target_location, deletion, insertion, mismatch]
|
|
248
249
|
|
|
250
|
+
def sgRNA_alignment_new(a_key, sgRNA, seq, substitution_matrix=None, alphabet=None,
                        mismatch_score=0.01):
    """
    Locally align an sgRNA(+PAM) query to one genomic sequence with Bio.Align.

    Replaces the alignment that relied on the deprecated ``pairwise2`` module.

    Args:
        a_key (str): Key of ``seq`` in "chr:start-end" form; it is used to
            turn alignment offsets into genomic coordinates.
        sgRNA (str): Query sequence.
        seq (str): Genomic sequence that is searched for the query.
        substitution_matrix (optional): Score matrix matching ``alphabet``.
            When this or ``alphabet`` is None, both are built with
            ``create_substitution_matrix(mismatch_score)``.
        alphabet (str, optional): Alphabet of ``substitution_matrix``.
        mismatch_score (float, optional): Per-mismatch score, also the unit
            used to read the mismatch count back from the alignment score.
            Defaults to 0.01.

    Returns:
        list: ``[score, aligned_genome_seq, target_location, deletion,
        insertion, mismatch]``. If nothing aligns, or the alignment cannot
        be interpreted, a placeholder with the same six fields is returned:
        ``[0, '', '<chr>:0-0', 0, 0, len(sgRNA)]``.
    """
    # Imported locally so the function does not depend on a module-level
    # import of Bio.Align being present.
    from Bio import Align

    chr_name = a_key.split(':')[0]
    # The placeholder must have the same six fields as a successful result,
    # so failed and successful rows can share one table layout.
    no_hit = [0, '', f"{chr_name}:0-0", 0, 0, len(sgRNA)]

    if substitution_matrix is None or alphabet is None:
        # NOTE(review): create_substitution_matrix is not defined in the
        # visible part of this module - confirm it is in scope here.
        substitution_matrix, alphabet = create_substitution_matrix(mismatch_score)

    # Create aligner
    aligner = Align.PairwiseAligner()
    aligner.substitution_matrix = Align.substitution_matrices.Array(
        alphabet=alphabet, dims=2, data=substitution_matrix
    )
    aligner.open_gap_score = -2
    aligner.extend_gap_score = -2
    aligner.mode = 'local'

    try:
        # Only the first optimal alignment is used, so take it lazily instead
        # of materialising every co-optimal alignment.
        best = next(iter(aligner.align(sgRNA, seq)), None)
        if best is None:
            # No alignment found
            return no_hit

        coords = best.coordinates
        start_target = int(coords[1][0])
        end_target = int(coords[1][-1])

        # The printed alignment has the query on its first line and the
        # genomic sequence on its third; each line reads
        # "<label> <start> <aligned sequence> <end>".
        alignment_lines = str(best).split('\n')
        if len(alignment_lines) < 3:
            raise ValueError("Unexpected alignment format")
        aligned_sgrna = alignment_lines[0].split()
        aligned_genome = alignment_lines[2].split()

        # Only alignments that run to the last base of the query are accepted.
        if int(aligned_sgrna[-1]) != len(sgRNA):
            raise ValueError("alignment does not reach the end of the sgRNA")

        # deletion = RNA bulge (gap in the genomic row)
        # insertion = DNA bulge (gap in the query row)
        aligned_sgrna_seq = aligned_sgrna[-2]
        aligned_genome_seq = aligned_genome[-2]
        insertion = aligned_sgrna_seq.count('-')
        deletion = aligned_genome_seq.count('-')

        # With the default matrix, matches (2.0) and gap penalties (-2) are
        # whole numbers, so the fractional part of the score is
        # mismatch_score times the number of mismatches.
        # NOTE(review): this presumably breaks down for a custom matrix with
        # non-integer match scores - confirm before passing one in.
        score = best.score
        mismatch = round((score % 1) / mismatch_score)

        # Calculate target location from the start coordinate in a_key
        pos_st = int(a_key.split('-')[0].split(':')[1]) + 1
        target_st = pos_st + start_target
        target_ed = pos_st + end_target - 1
        target_location = f"{chr_name}:{target_st}-{target_ed}"

        return [score, aligned_genome_seq, target_location, deletion, insertion, mismatch]

    except Exception as e:
        # Best effort by design: report the problem and let the caller carry
        # on with a placeholder row.
        print(f"Alignment error for {a_key}: {e}")
        return no_hit
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def get_seq(location, ref_fasta, return_df=False) -> dict:
    """
    Fetch reference sequences for genome locations via pybedtools.

    Note on coordinates: the sequence returned by pybedtools does not include
    the base at the ``start`` coordinate (same as twoBitToFa), whereas
    IGV/UCSC and blast results do include it. Keep this in mind downstream.

    Args:
        location: Either a string such as "chr1:123456-123458", a list of
            such strings, or a pd.DataFrame / pl.DataFrame whose first three
            columns are taken as BED columns.
        ref_fasta (str): Path to the reference genome FASTA file.
        return_df (bool, optional): Return a pl.DataFrame instead of a dict.
            Defaults to False.

    Returns:
        dict | pl.DataFrame: By default a dict mapping each FASTA header of
        the extracted records to its sequence. With ``return_df=True`` a
        pl.DataFrame with the columns 'location' and 'sequence'.

    Raises:
        ValueError: On Windows, for an unsupported ``location`` type, or if
            the extracted FASTA does not start with a header line.
        FileNotFoundError: If ``ref_fasta`` does not exist.
    """
    if sys.platform[:3]=='win':
        # pybedtools does not seem to be installable on Windows
        raise ValueError('windows 似乎装不了 pybedtools')
    import tempfile
    import pybedtools

    # Normalise the input to a three-column BED-like table.
    if isinstance(location,(list,str)):
        bed_loc = bedfmt(location)
    elif isinstance(location,pd.DataFrame):
        bed_loc = location.iloc[:,:3]
    elif isinstance(location,pl.DataFrame):
        bed_loc = location[:,:3]
    else:
        raise ValueError('location must be a list, str, pd.DataFrame or pl.DataFrame')

    # Use the caller's path as given. pybedtools.example_filename is meant for
    # pybedtools' bundled example data and is not a general path resolver.
    if not os.path.exists(ref_fasta):
        raise FileNotFoundError(f'{ref_fasta} does not exist')

    # A unique temporary BED file avoids clashes between concurrent runs and
    # does not require a writable working directory.
    fd, temp_bed = tempfile.mkstemp(prefix='temp_amp_loc_', suffix='.bed')
    os.close(fd)
    dict_seq = {}
    try:
        write_bed(bed_loc, temp_bed)
        extracted = pybedtools.BedTool(temp_bed).sequence(fi=ref_fasta)
        header = None
        with open(extracted.seqfn, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines carry no information.
                    continue
                if line.startswith('>'):
                    header = line[1:]
                    # Start a fresh record so a repeated location is not
                    # concatenated onto its earlier copy.
                    dict_seq[header] = ''
                elif header is None:
                    raise ValueError('sequence data found before the first FASTA header')
                else:
                    dict_seq[header] += line
    finally:
        # Always remove the temporary BED file, also when extraction fails.
        os.remove(temp_bed)

    if return_df:
        return pl.DataFrame(list(dict_seq.items()), orient='row', schema={'location':pl.String,'sequence':pl.String})
    return dict_seq
|
|
249
381
|
|
|
250
382
|
def combine_df(list_df, op = 'mean'):
|
|
251
383
|
# df 行列、结构必须一模一样,非数字部分也一模一样,只有数字不同
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
__version__ = "2.
|
|
1
|
+
__version__ = "2.13.1"
|
|
2
2
|
# 2023.08.11. v1.1.0 adding a option for not normalizing the bw file
|
|
3
3
|
# 2023.10.26. v1.9.0 prerelease for v2.0
|
|
4
4
|
# 2023.10.27. v2.0.0 大更新,还没微调
|
|
@@ -33,12 +33,14 @@ __version__ = "2.12.3"
|
|
|
33
33
|
# 2025.04.25. v2.8.0 修复了 offtracker candidates 会把小写序列转换成 N 的 bug
|
|
34
34
|
# 2025.05.22. v2.9.0 翻新部分代码结构
|
|
35
35
|
# 2025.06.05. v2.10.0 增加了QC模块。保留了负数score的记录,并在plot时显示为红字。增加了 "--ignore_chr" 用于跳过common chr过滤。
|
|
36
|
-
# 2025.06.17. v2.10.7
|
|
37
|
-
# 2025.06.27. v2.10.8
|
|
38
|
-
# 2025.06.28. v2.10.9
|
|
39
|
-
# 2025.06.28. v2.10.10
|
|
40
|
-
# 2025.06.28. v2.10.11
|
|
41
|
-
# 2025.07.02. v2.11.4
|
|
42
|
-
# 2025.07.04. v2.11.5
|
|
43
|
-
# 2025.07.04. v2.12.2
|
|
44
|
-
# 2025.07.18. v2.12.3
|
|
36
|
+
# 2025.06.17. v2.10.7 修复翻新代码结构导致的bug
|
|
37
|
+
# 2025.06.27. v2.10.8 将 chmod 放在了 setup.py 里
|
|
38
|
+
# 2025.06.28. v2.10.9 现在 pip 都是从 wheel 安装,不再运行 setup.py,所以增加一个 offtracker_init.py
|
|
39
|
+
# 2025.06.28. v2.10.10 直接塞 script 里试试
|
|
40
|
+
# 2025.06.28. v2.10.11 回滚到2.10.9外加修正
|
|
41
|
+
# 2025.07.02. v2.11.4 基于 blast 的缺陷更新 candidates,去除 quick mode
|
|
42
|
+
# 2025.07.04. v2.11.5 offtracker_analysis 提前 skip 已有结果的样本
|
|
43
|
+
# 2025.07.04. v2.12.2 新增 region_index 标记区域,用于更好的去重
|
|
44
|
+
# 2025.07.18. v2.12.3 新增QC自动避免重复读取 trimmed fastq files
|
|
45
|
+
# 2025.08.08. v2.13.0 测试 local realign 功能
|
|
46
|
+
# 2025.08.09. v2.13.1 测试 correction 功能
|
|
@@ -24,6 +24,7 @@ offtracker/utility/offtracker_blacklist_mm10.merged.bed
|
|
|
24
24
|
scripts/offtracker_analysis.py
|
|
25
25
|
scripts/offtracker_candidates.py
|
|
26
26
|
scripts/offtracker_config.py
|
|
27
|
+
scripts/offtracker_correction.py
|
|
27
28
|
scripts/offtracker_init.py
|
|
28
29
|
scripts/offtracker_plot.py
|
|
29
30
|
scripts/offtracker_qc.py
|
|
@@ -22,24 +22,29 @@ def main():
|
|
|
22
22
|
parser = argparse.ArgumentParser()
|
|
23
23
|
parser.description='Analyze the Tracking-seq data.'
|
|
24
24
|
parser.add_argument('-f','--folder' , type=str, required=True, nargs='+', help='Directory of the data folder.' )
|
|
25
|
-
parser.add_argument('--seqfolder' , type=str,
|
|
25
|
+
parser.add_argument('--seqfolder' , type=str, default ='none', help='folder containing df_candidate created by offtracker_cadidates.py.')
|
|
26
26
|
parser.add_argument('--name' , type=str, required=True, help='custom name of the sgRNA' )
|
|
27
27
|
parser.add_argument('--exp' , type=str, default='all', nargs='+', help='A substring mark in the name of experimental samples. The default is to use all samples other than control' )
|
|
28
28
|
parser.add_argument('--control' , type=str, default='none', nargs='+', help='A substring mark in the name of control samples. The default is no control. "others" for all samples other than --exp.' )
|
|
29
|
-
parser.add_argument('--fdr' , type=float, default=0.05,
|
|
30
|
-
parser.add_argument('--score' , type=float, default=1.9,
|
|
29
|
+
parser.add_argument('--fdr' , type=float, default=0.05, help='FDR threshold for the final result. Default is 0.05.')
|
|
30
|
+
parser.add_argument('--score' , type=float, default=1.9, help='Track score threshold for the final result. Default is 1.9.')
|
|
31
31
|
parser.add_argument('--smooth' , type=int, default=1, help='Smooth strength for the signal.')
|
|
32
32
|
parser.add_argument('--window' , type=int, default=3, help='Window size for smoothing the signal.')
|
|
33
33
|
parser.add_argument('--binsize' , type=int, default=100, help='Window size for smoothing the signal.')
|
|
34
34
|
parser.add_argument('--flank_max' , type=int, default=100000, help='Maximun flanking distance from the candidate site.')
|
|
35
35
|
parser.add_argument('--flank_regions', type=int, default=[1000,2000,3000,5000], nargs='+',help='flanking regions for calculating signal.')
|
|
36
36
|
parser.add_argument('--SeqScorePower', type=float, default=4, help='The seq score power' )
|
|
37
|
-
parser.add_argument('--CtrClip' , type=float, default=-0.5,
|
|
37
|
+
parser.add_argument('--CtrClip' , type=float, default=-0.5, help='The lower clip for control samples' )
|
|
38
38
|
parser.add_argument('-t','--thread' , type=int, default=4, help='Number of threads for parallel computing')
|
|
39
39
|
parser.add_argument('-g','--genome' , type=str, default='hg38', help='File of chromosome sizes, or "hg38", "mm10" ')
|
|
40
40
|
parser.add_argument('-o','--outdir' , type=str, default='first', help='The output folder. Default is the first folder of --folder' )
|
|
41
41
|
parser.add_argument('--outname' , type=str, default='same', help='The suffix of output files. Default is the same --exp' )
|
|
42
42
|
parser.add_argument('--signal_only' , action='store_true', help='A developer option: stop before group analysis. ' )
|
|
43
|
+
# for offtracker_correction
|
|
44
|
+
parser.add_argument('--check_loc' , action='store_true', help='New in v2.13, for other scripts. Do not use this option. ' )
|
|
45
|
+
parser.add_argument('--seqfile' , type=str, default='none', help='Assign a specific df_candidate file.')
|
|
46
|
+
|
|
47
|
+
# other parameters
|
|
43
48
|
# parser.add_argument('--individual_results', action='store_true', help='When multiple samples meet the exp pattern, only one merged result is generated.\n' \
|
|
44
49
|
# 'Set --individual_results to additionally output the individual result of each exp sample. ' )
|
|
45
50
|
parser.add_argument('--overwrite' , action='store_true', help='Whether to overwrite existed dataframes.' )
|
|
@@ -79,11 +84,16 @@ def main():
|
|
|
79
84
|
|
|
80
85
|
# load df_candidate
|
|
81
86
|
try:
|
|
82
|
-
|
|
87
|
+
if args.seqfile != 'none':
|
|
88
|
+
df_candidate = pl.read_csv(args.seqfile).to_pandas()
|
|
89
|
+
elif args.seqfolder != 'none':
|
|
90
|
+
df_candidate = pl.read_csv(os.path.join(args.seqfolder,f'df_candidate_{sgRNA_name}.csv')).to_pandas()
|
|
91
|
+
else:
|
|
92
|
+
raise ValueError('Please provide --seqfolder or --seqfile')
|
|
83
93
|
df_candidate.index = df_candidate['target_location']
|
|
84
94
|
df_candidate_brief = df_candidate[['chr','st','ed','best_strand','best_target','best_seq_score',
|
|
85
95
|
'deletion', 'insertion','mismatch', 'GG',
|
|
86
|
-
'target_location', 'cleavage_site', '
|
|
96
|
+
'target_location', 'cleavage_site', 'region_index']] # 2025.07.06 添加 region_index, 去除 'ID_1','ID_2',
|
|
87
97
|
df_candidate_sub = df_candidate[['chr','cleavage_site']]
|
|
88
98
|
except FileNotFoundError:
|
|
89
99
|
return 'Please run offtracker_candidates.py first and provide the correct directory with --seqfolder'
|
|
@@ -160,8 +170,11 @@ def main():
|
|
|
160
170
|
if (os.path.isfile(output))&(not args.overwrite):
|
|
161
171
|
print(output, 'exists, skipped')
|
|
162
172
|
continue
|
|
163
|
-
|
|
164
|
-
df_bdg
|
|
173
|
+
# 2025.08.09. 改用 pl 读取加速
|
|
174
|
+
df_bdg = pl.read_csv(a_file, separator='\t', has_header=False,
|
|
175
|
+
schema_overrides={'chr':pl.String,'start':pl.Int32,
|
|
176
|
+
'end':pl.Int32,'residual':pl.Float32}).to_pandas() # xseq.read_bed(a_file)
|
|
177
|
+
# df_bdg.columns = ['chr','start','end','residual']
|
|
165
178
|
# 将 df_bdg 按照染色体分组
|
|
166
179
|
sample_groups = df_bdg.groupby('chr')
|
|
167
180
|
# 2024.06.03. fix a bug that df_bdg has less chr than df_candidate
|
|
@@ -308,7 +321,7 @@ def main():
|
|
|
308
321
|
# 2025.07.06 更新去重方式
|
|
309
322
|
df_result = df_score.drop_duplicates(subset=['region_index'], keep='first').copy()
|
|
310
323
|
|
|
311
|
-
#
|
|
324
|
+
# 标准化分布,2025.08.09
|
|
312
325
|
target_std=0.15
|
|
313
326
|
n_outliers = int(np.ceil(len(df_result)*0.01))
|
|
314
327
|
score_bkg = df_result['raw_score'][n_outliers:-n_outliers]
|
|
@@ -317,7 +330,7 @@ def main():
|
|
|
317
330
|
df_result['track_score'] = (df_result['raw_score'] - mean_score_bkg) / std_score_bkg
|
|
318
331
|
df_result['track_score'] = df_result['track_score']*target_std + 1
|
|
319
332
|
df_result = df_result.sort_values(by='track_score', ascending=False)
|
|
320
|
-
df_result['log2_track_score'] = np.log2(df_result['track_score'].clip(lower=0.5))
|
|
333
|
+
df_result['log2_track_score'] = np.log2(df_result['track_score'].clip(lower=0.5))
|
|
321
334
|
|
|
322
335
|
# 单边信号周围有更高分的,去掉
|
|
323
336
|
# v2.1 后 cols_L, cols_R 要手动
|
|
@@ -362,28 +375,30 @@ def main():
|
|
|
362
375
|
df_result['rank'] = range(1,len(df_result)+1)
|
|
363
376
|
df_result.to_csv(output)
|
|
364
377
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
378
|
+
if not args.check_loc:
|
|
379
|
+
output = f'Offtracker_result_{outname}.csv'
|
|
380
|
+
# 2024.06.03. 以防 fdr<=fdr_thresh 滤掉了 track_score>=2 的位点
|
|
381
|
+
bool_fdr = df_result['fdr']<=fdr_thresh
|
|
382
|
+
bool_score = df_result['track_score']>=score_thresh
|
|
383
|
+
# 2025.06.05. BE可能会形成单边信号,但是很少见,如果 control 用的是别的 sgRNA 的样本,对应脱靶位置附近一般就是负数
|
|
384
|
+
# bool_neg_score = df_result['track_score']< -1
|
|
385
|
+
df_output = df_result[bool_fdr|bool_score].copy()
|
|
386
|
+
if pattern_ctr != 'none':
|
|
387
|
+
df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
|
|
388
|
+
'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
|
|
389
|
+
'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
|
|
390
|
+
df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
|
|
391
|
+
'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
|
|
392
|
+
'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
|
|
393
|
+
else:
|
|
394
|
+
df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
|
|
395
|
+
'L_length', 'R_length','signal_length',
|
|
396
|
+
'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
|
|
397
|
+
df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
|
|
398
|
+
'L_length', 'R_length','signal_length',
|
|
399
|
+
'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
|
|
400
|
+
|
|
401
|
+
df_output.to_csv(f'Offtracker_result_{outname}.csv', index=False)
|
|
387
402
|
|
|
388
403
|
if args.clean:
|
|
389
404
|
shutil.rmtree('./temp')
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
import polars as pl
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import numpy as np
|
|
6
|
+
import offtracker
|
|
7
|
+
import argparse
|
|
8
|
+
import os, glob
|
|
9
|
+
import shlex, subprocess
|
|
10
|
+
from scipy.stats import norm
|
|
11
|
+
|
|
12
|
+
def _match_samples(sample_names, marks):
    """Return the names in *sample_names* that contain any of the substring *marks*.

    Matching uses ``Series.str.contains`` once per mark, so a name matched by
    several marks appears several times.
    """
    matched = []
    for a_mark in marks:
        matched.extend(list(sample_names[sample_names.str.contains(a_mark)]))
    return matched


def _select_samples(all_sample_names, pattern_exp, pattern_ctr):
    """Split *all_sample_names* into (experimental, control) name lists.

    ``pattern_exp`` is either the default string ``'all'`` or a list of marks;
    ``pattern_ctr`` is either the default string ``'none'``, the string
    ``'others'``, or a list of marks.
    """
    ctr_samples = []
    if pattern_ctr in ('none', 'others'):
        if pattern_exp == 'all':
            exp_samples = list(all_sample_names)
        else:
            exp_samples = _match_samples(all_sample_names, pattern_exp)
        if pattern_ctr == 'others':
            ctr_samples = list(all_sample_names[~all_sample_names.isin(exp_samples)])
    else:
        ctr_samples = _match_samples(all_sample_names, pattern_ctr)
        if pattern_exp == 'all':
            exp_samples = list(all_sample_names[~all_sample_names.isin(ctr_samples)])
        else:
            exp_samples = _match_samples(all_sample_names, pattern_exp)
    return exp_samples, ctr_samples


def _build_analysis_command(args, seqfile, sgRNA_name, outname, folders, outdir):
    """Build the argv list that re-runs offtracker_analysis.py in --check_loc mode.

    The command is an argument list (not a shell string) so that multi-value
    options are forwarded as separate tokens instead of as a Python list repr.
    """
    command = [
        'offtracker_analysis.py',
        '-t', str(args.thread),
        '-g', args.genome,
        '--seqfile', seqfile,
        '--name', sgRNA_name,
        '--outname', f'{outname}_loc_correction',
        '-o', outdir,
        '--fdr', str(args.fdr),
        '--window', str(args.window),
        '--smooth', str(args.smooth),
        '--SeqScorePower', str(args.SeqScorePower),
        '--score', str(args.score),
        '--binsize', str(args.binsize),
        '--flank_max', str(args.flank_max),
        # "=" form keeps a negative value from being mistaken for an option.
        f'--CtrClip={args.CtrClip}',
        '--check_loc',
    ]
    command += ['-f', *folders]
    command += ['--flank_regions', *[str(a_region) for a_region in args.flank_regions]]
    # The string defaults ('all' / 'none') are not forwarded: passed through
    # nargs='+' they would arrive as one-element lists and be treated as
    # substring marks. NOTE(review): this assumes offtracker_analysis.py uses
    # the same defaults for --exp and --control -- confirm.
    if isinstance(args.exp, list):
        command += ['--exp', *args.exp]
    if isinstance(args.control, list):
        command += ['--control', *args.control]
    return command


def main():
    """Check and correct potentially wrong target locations of positive sites.

    Steps: parse the command line, locate the ``*.add.bdg`` sample files, flag
    positive sites whose signal is one-sided, realign the flagged sites, re-run
    ``offtracker_analysis.py --check_loc`` on the realigned candidates, then
    merge the results, recompute track_score / p-value / FDR / rank and rewrite
    ``./temp/df_result_{outname}.csv`` and ``Offtracker_result_{outname}.csv``.

    Returns:
        A status message string.

    Raises:
        AssertionError: if no experimental sample matches ``--exp``.
    """
    parser = argparse.ArgumentParser()
    parser.description='New function in 2026. Check and correct potential incorrect target locations.'
    parser.add_argument('-f','--folder' , type=str, required=True, nargs='+', help='Directory of the data folder.' )
    parser.add_argument('--name' , type=str, required=True, help='custom name of the sgRNA' )
    parser.add_argument('--exp' , type=str, default='all', nargs='+', help='A substring mark in the name of experimental samples. The default is to use all samples other than control' )
    parser.add_argument('--control' , type=str, default='none', nargs='+', help='A substring mark in the name of control samples. The default is no control. "others" for all samples other than --exp.' )
    parser.add_argument('--fdr' , type=float, default=0.05, help='FDR threshold for the final result. Default is 0.05.')
    parser.add_argument('--score' , type=float, default=1.9, help='Track score threshold for the final result. Default is 1.9.')
    parser.add_argument('--smooth' , type=int, default=1, help='Smooth strength for the signal.')
    parser.add_argument('--window' , type=int, default=3, help='Window size for smoothing the signal.')
    parser.add_argument('--binsize' , type=int, default=100, help='Window size for smoothing the signal.')
    parser.add_argument('--flank_max' , type=int, default=100000, help='Maximun flanking distance from the candidate site.')
    parser.add_argument('--flank_regions', type=int, default=[1000,2000,3000,5000], nargs='+',help='flanking regions for calculating signal.')
    parser.add_argument('--SeqScorePower', type=float, default=4, help='The seq score power' )
    parser.add_argument('--CtrClip' , type=float, default=-0.5, help='The lower clip for control samples' )
    parser.add_argument('-t','--thread' , type=int, default=4, help='Number of threads for parallel computing')
    parser.add_argument('-g','--genome' , type=str, default='hg38', help='File of chromosome sizes, or "hg38", "mm10" ')
    parser.add_argument('-o','--outdir' , type=str, default='first', help='The output folder. Default is the first folder of --folder' )
    parser.add_argument('--outname' , type=str, default='same', help='The suffix of output files. Default is the same --exp' )
    # new argument
    parser.add_argument('-r','--ref' , type=str, required=True, help='The fasta file of reference genome')
    parser.add_argument('--sgrna' , type=str, required=True, help='One sgRNA sequence without PAM' )
    parser.add_argument('--pam' , type=str, required=True, help='The protospacer adjacent motif' )
    parser.add_argument('--pam_location', type=str, default='downstream', help='Upstream or downstream, default is downstream (Cas9)' )
    # not used: accepted only so an existing command line still parses, hence optional
    parser.add_argument('--seqfolder' , type=str, default='none', help='Actually not used in this script.Only in case you forget to remove this argument.')

    args = parser.parse_args()
    # 2025.08.08. Re-align target_location of positive sites, because the
    # realignment after blast can be inaccurate over a larger range.
    # Experimental: when --exp matches several samples, only the first bdg is analysed.

    ##########################
    ## parameter initiation ##
    ##########################

    # Absolute paths, resolved before os.chdir() so relative inputs keep working.
    folders = [os.path.abspath(a_folder) for a_folder in args.folder]
    sgRNA_name = args.name + '_loc_correction'
    pattern_exp = args.exp
    pattern_ctr = args.control
    fdr_thresh = args.fdr
    score_thresh = args.score

    if args.outname == 'same':
        if isinstance(pattern_exp, list):
            outname = '_'.join(pattern_exp)
        else:
            outname = pattern_exp
    else:
        outname = args.outname

    outdir = folders[0] if args.outdir == 'first' else os.path.abspath(args.outdir)
    os.chdir(outdir)
    # out temp folder
    os.makedirs(os.path.join(outdir, 'temp'), exist_ok=True)
    # data temp folder
    for a_folder in folders:
        os.makedirs(os.path.join(a_folder, 'temp'), exist_ok=True)

    ##################
    ## glob samples ##
    ##################
    all_sample_names = []
    all_sample_files = []
    for a_folder in folders:
        bdg_files = pd.Series(glob.glob(os.path.join( a_folder, '*.add.bdg' ))).sort_values().reset_index(drop=True)
        sample_names = bdg_files.apply(os.path.basename).str.extract(r'(.*)\.\d+\.add\.bdg',expand=False)
        all_sample_names.extend( sample_names )
        all_sample_files.extend( bdg_files )
    all_sample_files = pd.Series(all_sample_files)
    all_sample_names = pd.Series(all_sample_names)
    print('all sample names in the folders:')
    print(all_sample_names)
    print('your string pattern for experimental groups: ', pattern_exp)

    exp_samples, ctr_samples = _select_samples(all_sample_names, pattern_exp, pattern_ctr)
    n_exp = len(exp_samples)
    n_ctr = len(ctr_samples)
    print(f'Experimental group has {n_exp} samples:\n{exp_samples}')
    print(f'Control group has {n_ctr} samples:\n{ctr_samples}')

    # wrong name marks
    assert n_exp > 0, 'No experimental sample is found. Please check the name pattern.'
    if (n_ctr==0)&(pattern_ctr != 'none'):
        print('Name pattern for control sample(s) was given, but no file meet the pattern.')
        return 'Program terminated'

    # summarize samples (experimental files first, then control files)
    exp_sample_files = all_sample_files[all_sample_names.isin(exp_samples)]
    ctr_sample_files = all_sample_files[all_sample_names.isin(ctr_samples)]
    selected_sample_files = pd.concat([exp_sample_files,ctr_sample_files])

    ####################
    ## run correction ##
    ####################

    # new parameters
    ref_fasta = args.ref
    sgRNA_seq = args.sgrna
    PAM = args.pam
    PAM_loc = args.pam_location

    # read result
    # The full per-site table is used instead of Offtracker_result_{outname}.csv:
    # the final table written by offtracker_analysis.py keeps only a renamed
    # subset of columns, while the steps below need chr/st/ed, region_index and
    # raw_score. Positive sites are re-selected with the thresholds used for the
    # final table. NOTE(review): assumes the temp table carries the
    # exp_L_neg_1000 / exp_R_neg_1000 columns -- confirm.
    result_file = f'./temp/df_result_{outname}.csv'
    dp_result_all = pl.read_csv(result_file)
    bool_positive = (dp_result_all['fdr']<=fdr_thresh)|(dp_result_all['track_score']>=score_thresh)
    dp_result = dp_result_all.filter(bool_positive)
    bool_fdr_bkg = dp_result_all['fdr']>fdr_thresh
    bool_score_bkg = dp_result_all['track_score']<score_thresh
    dp_result_bkg = dp_result_all.filter(bool_fdr_bkg & bool_score_bkg)

    # .iloc[0]: the Series keeps the labels of all_sample_files, so label 0 may
    # be missing or may not be the first selected file. The bdg is tab-separated
    # text, read the same way as in offtracker_analysis.py.
    dp_bdg = pl.read_csv(selected_sample_files.iloc[0], separator='\t', has_header=False,
                         schema_overrides={'chr':pl.String,'start':pl.Int32,'end':pl.Int32,'residual':pl.Float32})

    # check and realign
    # A site is flagged when one side has a value below -5 while the other side
    # is exactly 0; the two masks are mutually exclusive by construction.
    bool_left_neg = ((dp_result['exp_L_neg_1000']<-5)&(dp_result['exp_R_neg_1000']==0)).fill_null(False)
    bool_right_neg = ((dp_result['exp_R_neg_1000']<-5)&(dp_result['exp_L_neg_1000']==0)).fill_null(False)
    list_bad_left = []
    list_bad_right = []
    for a_left_bool, a_right_bool, a_row in zip(bool_left_neg, bool_right_neg, dp_result.iter_rows(named=True)):
        if a_left_bool:
            print('left')
            # search window shifted to the left of the current location
            loc_shift_left = a_row['chr'] + ':' + str(a_row['st']-1000) + '-' + str(a_row['ed']-20)
            dp_bdg_chr = dp_bdg.filter(pl.col('chr') == a_row['chr'])
            sr_candidate = offtracker.left_realign(dp_bdg_chr, loc_shift_left, ref_fasta, sgRNA_seq, PAM, PAM_loc, n_iter=0)
            # keep the original region_index so the site can be traced back
            sr_candidate.loc['region_index'] = a_row['region_index']
            list_bad_left.append(sr_candidate)
        elif a_right_bool:
            print('right')
            # search window shifted to the right of the current location
            loc_shift_right = a_row['chr'] + ':' + str(a_row['st']+20) + '-' + str(a_row['ed']+1000)
            dp_bdg_chr = dp_bdg.filter(pl.col('chr') == a_row['chr'])
            sr_candidate = offtracker.right_realign(dp_bdg_chr, loc_shift_right, ref_fasta, sgRNA_seq, PAM, PAM_loc, n_iter=0)
            sr_candidate.loc['region_index'] = a_row['region_index']
            list_bad_right.append(sr_candidate)

    if not list_bad_left and not list_bad_right:
        # Nothing to realign: an empty candidate file would only make the
        # re-analysis fail, and the existing results are already final.
        return 'No target location needs correction'

    # Unflagged positive rows, taken by mask so dtypes match the other tables.
    dp_result_good = dp_result.filter(~(bool_left_neg | bool_right_neg))
    df_cand_left = pd.DataFrame(list_bad_left)
    df_cand_right = pd.DataFrame(list_bad_right)
    df_cand_realign = pd.concat([df_cand_left, df_cand_right])

    seqfile = os.path.abspath(rf'correction_df_candidate_{outname}_realign.csv')
    df_cand_realign.to_csv(seqfile)

    # run offtracker_analysis with check_loc mode
    running_log = rf'correction_analysis_{outname}.log'
    command = _build_analysis_command(args, seqfile, sgRNA_name, outname, folders, outdir)
    with open(running_log, "w+") as log_handle:
        process_1 = subprocess.Popen(command, stdout=log_handle, stderr=subprocess.STDOUT )
        process_1.wait(timeout=100000)
        retc = process_1.returncode
    if retc==0:
        print((f'correction_analysis {outname} is done!'))
    else:
        print((f'correction_analysis {outname} is failed!'))
        # Without the re-analysis output the merge below cannot be trusted.
        return 'Program terminated'

    #######################
    ## recalculate score ##
    #######################
    dp_result_realign = pl.read_csv(f'./temp/df_result_{outname}_loc_correction.csv')

    # compatible with the column names of older outputs: the last five columns
    # are dropped here and recomputed below
    list_col = dp_result_realign.columns[:-5]
    dp_result_new = pl.concat([dp_result_realign[list_col], dp_result_good[list_col], dp_result_bkg[list_col]])

    # standardise the score distribution (polars version)
    target_std=0.15
    n_outliers = int(np.ceil(len(dp_result_new)*0.01))
    # the first and last 1% of rows are excluded from the background estimate
    score_bkg = dp_result_new['raw_score'][n_outliers:-n_outliers]
    mean_score_bkg = score_bkg.mean()
    std_score_bkg = score_bkg.std()
    dp_result_new = dp_result_new.with_columns(
        (pl.col('raw_score').sub(mean_score_bkg)/std_score_bkg).alias('track_score')
    )
    dp_result_new = dp_result_new.with_columns(
        pl.col('track_score').mul(target_std).add(1).alias('track_score')
    )
    dp_result_new = dp_result_new.with_columns(
        pl.col('track_score').clip(lower_bound=0.5).log(base=2).alias('log2_track_score')
    )
    dp_result_new = dp_result_new.sort('track_score', descending=True)

    # pv and fdr
    score_for_fitting = dp_result_new['log2_track_score'][n_outliers:-n_outliers]
    mu, std = norm.fit(score_for_fitting)
    print('mean_score:{:.3f};std:{:.3f}'.format(mu,std))
    dp_result_new = dp_result_new.with_columns(
        pl.col('log2_track_score').map_elements( lambda x: norm.sf(x,loc=mu,scale=std), return_dtype=pl.Float64 ).clip(lower_bound=1e-320).alias('pv')
    )
    dp_result_new = dp_result_new.with_columns(
        fdr=offtracker.fdr(dp_result_new['pv']).alias('fdr'),
        rank=pl.Series(range(1,len(dp_result_new)+1))
    ) #.with_row_index(name='rank',offset=1)
    dp_result_new.write_csv(result_file) # overwrite the original result

    # ouput Offtracker result
    bool_fdr = pl.col('fdr')<=fdr_thresh
    bool_score = pl.col('track_score')>=score_thresh
    # filter() already returns a new frame (polars DataFrame has no .copy()).
    dp_output = dp_result_new.filter(bool_fdr|bool_score)
    if pattern_ctr != 'none':
        dp_output = dp_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
                               'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
                               'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
        dp_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
                             'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
                             'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
    else:
        dp_output = dp_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
                               'L_length', 'R_length','signal_length',
                               'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
        dp_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
                             'L_length', 'R_length','signal_length',
                             'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
    dp_output.write_csv(f'Offtracker_result_{outname}.csv')

    return 'correction finished'
|
|
277
|
+
|
|
278
|
+
if __name__ == '__main__':
    # Script entry point: run the correction and echo its status message.
    print(main())
|
|
281
|
+
|
|
282
|
+
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{offtracker-2.12.3 → offtracker-2.13.1}/offtracker/utility/offtracker_blacklist_hg38.merged.bed
RENAMED
|
File without changes
|
{offtracker-2.12.3 → offtracker-2.13.1}/offtracker/utility/offtracker_blacklist_mm10.merged.bed
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|