offtracker 2.13.0__zip → 2.13.2__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {offtracker-2.13.0/offtracker.egg-info → offtracker-2.13.2}/PKG-INFO +1 -1
  2. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/X_offtracker.py +12 -0
  3. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/_version.py +3 -1
  4. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/snakefile/Snakefile_offtracker.smk +1 -1
  5. {offtracker-2.13.0 → offtracker-2.13.2/offtracker.egg-info}/PKG-INFO +1 -1
  6. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker.egg-info/SOURCES.txt +1 -0
  7. {offtracker-2.13.0 → offtracker-2.13.2}/scripts/offtracker_analysis.py +45 -37
  8. offtracker-2.13.2/scripts/offtracker_correction.py +324 -0
  9. {offtracker-2.13.0 → offtracker-2.13.2}/setup.py +1 -0
  10. {offtracker-2.13.0 → offtracker-2.13.2}/LICENSE.txt +0 -0
  11. {offtracker-2.13.0 → offtracker-2.13.2}/MANIFEST.in +0 -0
  12. {offtracker-2.13.0 → offtracker-2.13.2}/README.md +0 -0
  13. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/X_offplot.py +0 -0
  14. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/X_sequence.py +0 -0
  15. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/__init__.py +0 -0
  16. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/snakefile/Snakefile_QC.smk +0 -0
  17. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/utility/1.1_bed2fr.py +0 -0
  18. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/utility/1.3_bdg_normalize_v4.0.py +0 -0
  19. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/utility/bedGraphToBigWig +0 -0
  20. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/utility/hg38.chrom.sizes +0 -0
  21. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/utility/mm10.chrom.sizes +0 -0
  22. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/utility/offtracker_blacklist_hg38.merged.bed +0 -0
  23. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker/utility/offtracker_blacklist_mm10.merged.bed +0 -0
  24. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker.egg-info/dependency_links.txt +0 -0
  25. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker.egg-info/requires.txt +0 -0
  26. {offtracker-2.13.0 → offtracker-2.13.2}/offtracker.egg-info/top_level.txt +0 -0
  27. {offtracker-2.13.0 → offtracker-2.13.2}/scripts/offtracker_candidates.py +0 -0
  28. {offtracker-2.13.0 → offtracker-2.13.2}/scripts/offtracker_config.py +0 -0
  29. {offtracker-2.13.0 → offtracker-2.13.2}/scripts/offtracker_init.py +0 -0
  30. {offtracker-2.13.0 → offtracker-2.13.2}/scripts/offtracker_plot.py +0 -0
  31. {offtracker-2.13.0 → offtracker-2.13.2}/scripts/offtracker_qc.py +0 -0
  32. {offtracker-2.13.0 → offtracker-2.13.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: offtracker
- Version: 2.13.0
+ Version: 2.13.2
  Summary: Tracking-seq data analysis
  Home-page: https://github.com/Lan-lab/offtracker
  Author: Runda Xu
@@ -608,6 +608,12 @@ def left_realign(dp_bdg_chr, loc_shift_left, ref_fasta, sgRNA_seq, PAM, PAM_loc,
      flank_regions = [500]
      signals = target_signal(dp_bdg_chr.to_pandas(), chrom, cleavage_site, flank_regions=flank_regions)
      L_neg_1000 = signals[2]
+     R_neg_1000 = signals[5]
+     # if the right-side window turns negative, the shift has overshot
+     if R_neg_1000 < 0:
+         sr_candidate.loc['realign'] = 'fail'
+         return sr_candidate
+
      # compute L_neg_1000 after the left shift; if it is still negative, iterate (at most 10 times)
      if L_neg_1000 < 0:
          st = sr_candidate['st']
@@ -632,7 +638,13 @@ def right_realign(dp_bdg_chr, loc_shift_right, ref_fasta, sgRNA_seq, PAM, PAM_lo
      cleavage_site = sr_candidate['cleavage_site']
      flank_regions = [500]
      signals = target_signal(dp_bdg_chr.to_pandas(), chrom, cleavage_site, flank_regions=flank_regions)
+     L_neg_1000 = signals[2]
      R_neg_1000 = signals[5]
+     # if the left-side window turns negative, the shift has overshot
+     if L_neg_1000 < 0:
+         sr_candidate.loc['realign'] = 'fail'
+         return sr_candidate
+
      # compute R_neg_1000 after the right shift; if it is still negative, iterate (at most 10 times)
      if R_neg_1000 < 0:
          st = sr_candidate['st']
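Both hunks add the same symmetric guard: a realign that shifts the window in one direction should never drive the opposite flank's signal negative; if it does, the shift overshot and the candidate is marked as a failed realign. The sketch below illustrates only that guard logic; target_signal_stub, the tuple positions (2 and 5), and the 'retry'/'success' labels are illustrative assumptions, not the package API.

import pandas as pd

def target_signal_stub(left_sum, right_sum):
    # stand-in for offtracker's target_signal(); only indices 2 and 5 matter here
    return (0.0, 0.0, left_sum, 0.0, 0.0, right_sum)

def guarded_left_realign(sr_candidate, left_sum, right_sum):
    signals = target_signal_stub(left_sum, right_sum)
    L_neg_1000, R_neg_1000 = signals[2], signals[5]
    if R_neg_1000 < 0:
        # shifting left pushed the *right* flank negative: overshot, bail out
        sr_candidate.loc['realign'] = 'fail'
        return sr_candidate
    # otherwise the caller keeps iterating (at most 10 times) while L_neg_1000 < 0
    sr_candidate.loc['realign'] = 'retry' if L_neg_1000 < 0 else 'success'
    return sr_candidate

print(guarded_left_realign(pd.Series({'st': 100}), left_sum=-2.0, right_sum=-6.0)['realign'])  # fail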
@@ -1,4 +1,4 @@
- __version__ = "2.13.0"
+ __version__ = "2.13.2"
  # 2023.08.11. v1.1.0 adding an option for not normalizing the bw file
  # 2023.10.26. v1.9.0 prerelease for v2.0
  # 2023.10.27. v2.0.0 major update, not yet fine-tuned
@@ -43,3 +43,5 @@ __version__ = "2.13.0"
  # 2025.07.04. v2.12.2 added region_index region labels for better deduplication
  # 2025.07.18. v2.12.3 QC now automatically avoids re-reading trimmed fastq files
  # 2025.08.08. v2.13.0 testing the local realign feature
+ # 2025.08.09. v2.13.1 testing the correction feature
+ # 2025.08.09. v2.13.2 chromap + trim parameters
@@ -48,7 +48,7 @@ rule chromap:
          temp(os.path.join(_output_dir,"{sample}.chromapx.bed"))
      shell:
          """
-         chromap -l 3000 --low-mem --BED --remove-pcr-duplicates \
+         chromap -l 3000 --low-mem --BED --remove-pcr-duplicates --trim-adapters \
          --min-read-length 10 --allocate-multi-mappings \
          -x {params.index} -r {params.fasta} -t {threads} -1 {input.R1} -2 {input.R2} -o {output}
          """
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: offtracker
- Version: 2.13.0
+ Version: 2.13.2
  Summary: Tracking-seq data analysis
  Home-page: https://github.com/Lan-lab/offtracker
  Author: Runda Xu
@@ -24,6 +24,7 @@ offtracker/utility/offtracker_blacklist_mm10.merged.bed
  scripts/offtracker_analysis.py
  scripts/offtracker_candidates.py
  scripts/offtracker_config.py
+ scripts/offtracker_correction.py
  scripts/offtracker_init.py
  scripts/offtracker_plot.py
  scripts/offtracker_qc.py
@@ -22,24 +22,29 @@ def main():
      parser = argparse.ArgumentParser()
      parser.description='Analyze the Tracking-seq data.'
      parser.add_argument('-f','--folder' , type=str, required=True, nargs='+', help='Directory of the data folder.' )
-     parser.add_argument('--seqfolder' , type=str, required=True, help='folder containing df_candidate created by offtracker_candidates.py.')
+     parser.add_argument('--seqfolder' , type=str, default='none', help='folder containing df_candidate created by offtracker_candidates.py.')
      parser.add_argument('--name' , type=str, required=True, help='custom name of the sgRNA' )
      parser.add_argument('--exp' , type=str, default='all', nargs='+', help='A substring mark in the name of experimental samples. The default is to use all samples other than control' )
      parser.add_argument('--control' , type=str, default='none', nargs='+', help='A substring mark in the name of control samples. The default is no control. "others" for all samples other than --exp.' )
-     parser.add_argument('--fdr' , type=float, default=0.05, help='FDR threshold for the final result. Default is 0.05.')
-     parser.add_argument('--score' , type=float, default=1.9, help='Track score threshold for the final result. Default is 1.9.')
+     parser.add_argument('--fdr' , type=float, default=0.05, help='FDR threshold for the final result. Default is 0.05.')
+     parser.add_argument('--score' , type=float, default=1.9, help='Track score threshold for the final result. Default is 1.9.')
      parser.add_argument('--smooth' , type=int, default=1, help='Smooth strength for the signal.')
      parser.add_argument('--window' , type=int, default=3, help='Window size for smoothing the signal.')
      parser.add_argument('--binsize' , type=int, default=100, help='Bin size for the signal.')
      parser.add_argument('--flank_max' , type=int, default=100000, help='Maximum flanking distance from the candidate site.')
      parser.add_argument('--flank_regions', type=int, default=[1000,2000,3000,5000], nargs='+', help='flanking regions for calculating signal.')
      parser.add_argument('--SeqScorePower', type=float, default=4, help='The seq score power' )
-     parser.add_argument('--CtrClip' , type=float, default=-0.5, help='The lower clip for control samples' )
+     parser.add_argument('--CtrClip' , type=float, default=-0.5, help='The lower clip for control samples' )
      parser.add_argument('-t','--thread' , type=int, default=4, help='Number of threads for parallel computing')
      parser.add_argument('-g','--genome' , type=str, default='hg38', help='File of chromosome sizes, or "hg38", "mm10" ')
      parser.add_argument('-o','--outdir' , type=str, default='first', help='The output folder. Default is the first folder of --folder' )
      parser.add_argument('--outname' , type=str, default='same', help='The suffix of output files. Default is the same --exp' )
      parser.add_argument('--signal_only' , action='store_true', help='A developer option: stop before group analysis. ' )
+     # for offtracker_correction
+     parser.add_argument('--check_loc' , action='store_true', help='New in v2.13, for other scripts. Do not use this option. ' )
+     parser.add_argument('--seqfile' , type=str, default='none', help='Assign a specific df_candidate file.')
+
+     # other parameters
      # parser.add_argument('--individual_results', action='store_true', help='When multiple samples meet the exp pattern, only one merged result is generated.\n' \
      #                     'Set --individual_results to additionally output the individual result of each exp sample. ' )
      parser.add_argument('--overwrite' , action='store_true', help='Whether to overwrite existing dataframes.' )
@@ -79,7 +84,12 @@ def main():
 
      # load df_candidate
      try:
-         df_candidate = pl.read_csv(os.path.join(args.seqfolder,f'df_candidate_{sgRNA_name}.csv')).to_pandas()
+         if args.seqfile != 'none':
+             df_candidate = pl.read_csv(args.seqfile).to_pandas()
+         elif args.seqfolder != 'none':
+             df_candidate = pl.read_csv(os.path.join(args.seqfolder,f'df_candidate_{sgRNA_name}.csv')).to_pandas()
+         else:
+             raise ValueError('Please provide --seqfolder or --seqfile')
          df_candidate.index = df_candidate['target_location']
          df_candidate_brief = df_candidate[['chr','st','ed','best_strand','best_target','best_seq_score',
                                             'deletion', 'insertion','mismatch', 'GG',
@@ -160,8 +170,11 @@ def main():
      if (os.path.isfile(output))&(not args.overwrite):
          print(output, 'exists, skipped')
          continue
-     df_bdg = xseq.read_bed(a_file)
-     df_bdg.columns = ['chr','start','end','residual']
+     # 2025.08.09. switched to polars reading for speed
+     df_bdg = pl.read_csv(a_file, separator='\t', has_header=False,
+                          schema_overrides={'chr':pl.String,'start':pl.Int32,
+                                            'end':pl.Int32,'residual':pl.Float32}).to_pandas() # xseq.read_bed(a_file)
+     # df_bdg.columns = ['chr','start','end','residual']
      # group df_bdg by chromosome
      sample_groups = df_bdg.groupby('chr')
      # 2024.06.03. fix a bug that df_bdg has less chr than df_candidate
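For context, the speedup comes from polars' typed CSV reader: the explicit dtypes skip type inference, and with has_header=False this call relies on the schema_overrides keys doubling as column names. A minimal sketch mirroring the call above, assuming a headerless tab-separated bedGraph-like file (the path is hypothetical):

import polars as pl

def read_bdg(path):
    # named, typed columns for a headerless bedGraph; no inference pass needed
    return pl.read_csv(
        path, separator='\t', has_header=False,
        schema_overrides={'chr': pl.String, 'start': pl.Int32,
                          'end': pl.Int32, 'residual': pl.Float32},
    )

# df_bdg = read_bdg('sample.1.add.bdg').to_pandas()  # downstream code stays in pandas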
@@ -308,7 +321,7 @@ def main():
      # 2025.07.06 updated the deduplication method
      df_result = df_score.drop_duplicates(subset=['region_index'], keep='first').copy()
 
-     # standardize the distribution
+     # standardize the distribution, 2025.08.09
      target_std=0.15
      n_outliers = int(np.ceil(len(df_result)*0.01))
      score_bkg = df_result['raw_score'][n_outliers:-n_outliers]
@@ -317,7 +330,7 @@ def main():
      df_result['track_score'] = (df_result['raw_score'] - mean_score_bkg) / std_score_bkg
      df_result['track_score'] = df_result['track_score']*target_std + 1
      df_result = df_result.sort_values(by='track_score', ascending=False)
-     df_result['log2_track_score'] = np.log2(df_result['track_score'].clip(lower=0.5))
+     df_result['log2_track_score'] = np.log2(df_result['track_score'].clip(lower=0.5))
 
      # if a one-sided signal has a higher-scoring site nearby, drop it
      # since v2.1, cols_L and cols_R must be set manually
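The two hunks above implement the score standardization: trim the top and bottom 1% as outliers, z-score against the trimmed background, then rescale so the background lands at mean 1 and standard deviation target_std=0.15; the log2 with a 0.5 floor keeps scores finite. A self-contained sketch on synthetic data (the random scores are illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df_result = pd.DataFrame({'raw_score': np.sort(rng.normal(0, 5, 1000))[::-1]})

target_std = 0.15
n_outliers = int(np.ceil(len(df_result) * 0.01))            # 1% trimmed on each side
score_bkg = df_result['raw_score'][n_outliers:-n_outliers]  # background distribution
z = (df_result['raw_score'] - score_bkg.mean()) / score_bkg.std()
df_result['track_score'] = z * target_std + 1               # background ~ N(1, 0.15)
df_result['log2_track_score'] = np.log2(df_result['track_score'].clip(lower=0.5))
print(df_result['track_score'].agg(['mean', 'std']).round(3))  # mean near 1, std near 0.15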
@@ -362,35 +375,30 @@ def main():
      df_result['rank'] = range(1,len(df_result)+1)
      df_result.to_csv(output)
 
-     output = f'Offtracker_result_{outname}.csv'
-     # 2024.06.03. in case fdr<=fdr_thresh filters out sites with track_score>=2
-     bool_fdr = df_result['fdr']<=fdr_thresh
-     bool_score = df_result['track_score']>=score_thresh
-     # 2025.06.05. BE may produce one-sided signals, but rarely; if the control uses samples of a different sgRNA, the signal around the corresponding off-target site is usually negative
-     # bool_neg_score = df_result['track_score']< -1
-     df_output = df_result[bool_fdr|bool_score].copy()
-     if pattern_ctr != 'none':
-         df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
-                                'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
-                                'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
-         df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
-                              'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
-                              'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
-     else:
-         df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
-                                'L_length', 'R_length','signal_length',
-                                'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
-         df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
-                              'L_length', 'R_length','signal_length',
-                              'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
-
-
-     # 2025.08.08. added target_location realignment for positive sites, to avoid cases where the post-blast realign is inaccurate over a larger range
-
-
-
+     if not args.check_loc:
+         output = f'Offtracker_result_{outname}.csv'
+         # 2024.06.03. in case fdr<=fdr_thresh filters out sites with track_score>=2
+         bool_fdr = df_result['fdr']<=fdr_thresh
+         bool_score = df_result['track_score']>=score_thresh
+         # 2025.06.05. BE may produce one-sided signals, but rarely; if the control uses samples of a different sgRNA, the signal around the corresponding off-target site is usually negative
+         # bool_neg_score = df_result['track_score']< -1
+         df_output = df_result[bool_fdr|bool_score].copy()
+         if pattern_ctr != 'none':
+             df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
+                                    'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
+                                    'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
+             df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
+                                  'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
+                                  'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
+         else:
+             df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
+                                    'L_length', 'R_length','signal_length',
+                                    'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
+             df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
+                                  'L_length', 'R_length','signal_length',
+                                  'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
 
-     df_output.to_csv(f'Offtracker_result_{outname}.csv', index=False)
+         df_output.to_csv(f'Offtracker_result_{outname}.csv', index=False)
 
      if args.clean:
          shutil.rmtree('./temp')
@@ -0,0 +1,324 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ import polars as pl
+ import pandas as pd
+ import numpy as np
+ import offtracker
+ import argparse
+ import os, glob
+ import shlex, subprocess
+ from scipy.stats import norm
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.description='New function in 2026. Check and correct potentially incorrect target locations.'
+     parser.add_argument('-f','--folder' , type=str, required=True, nargs='+', help='Directory of the data folder.' )
+     parser.add_argument('--name' , type=str, required=True, help='custom name of the sgRNA' )
+     parser.add_argument('--exp' , type=str, default='all', nargs='+', help='A substring mark in the name of experimental samples. The default is to use all samples other than control' )
+     parser.add_argument('--control' , type=str, default='none', nargs='+', help='A substring mark in the name of control samples. The default is no control. "others" for all samples other than --exp.' )
+     parser.add_argument('--fdr' , type=float, default=0.05, help='FDR threshold for the final result. Default is 0.05.')
+     parser.add_argument('--score' , type=float, default=1.9, help='Track score threshold for the final result. Default is 1.9.')
+     parser.add_argument('--smooth' , type=int, default=1, help='Smooth strength for the signal.')
+     parser.add_argument('--window' , type=int, default=3, help='Window size for smoothing the signal.')
+     parser.add_argument('--binsize' , type=int, default=100, help='Bin size for the signal.')
+     parser.add_argument('--flank_max' , type=int, default=100000, help='Maximum flanking distance from the candidate site.')
+     parser.add_argument('--flank_regions', type=int, default=[1000,2000,3000,5000], nargs='+', help='flanking regions for calculating signal.')
+     parser.add_argument('--SeqScorePower', type=float, default=4, help='The seq score power' )
+     parser.add_argument('--CtrClip' , type=float, default=-0.5, help='The lower clip for control samples' )
+     parser.add_argument('-t','--thread' , type=int, default=4, help='Number of threads for parallel computing')
+     parser.add_argument('-g','--genome' , type=str, default='hg38', help='File of chromosome sizes, or "hg38", "mm10" ')
+     parser.add_argument('-o','--outdir' , type=str, default='first', help='The output folder. Default is the first folder of --folder' )
+     parser.add_argument('--outname' , type=str, default='same', help='The suffix of output files. Default is the same --exp' )
+     # new argument
+     parser.add_argument('-r','--ref' , type=str, required=True, help='The fasta file of reference genome')
+     parser.add_argument('--sgrna' , type=str, required=True, help='One sgRNA sequence without PAM' )
+     parser.add_argument('--pam' , type=str, required=True, help='The protospacer adjacent motif' )
+     parser.add_argument('--pam_location', type=str, default='downstream', help='Upstream or downstream, default is downstream (Cas9)' )
+     # not used
+     parser.add_argument('--seqfolder' , type=str, default='none', help='Actually not used in this script. Only in case you forget to remove this argument.')
+
+     args = parser.parse_args()
+     # 2025.08.08. added target_location realignment for positive sites, to avoid cases where the post-blast realign is inaccurate over a larger range
+     # experimental feature: if --exp matches multiple samples, only the first bdg is analyzed for now
+
+     ##########################
+     ## parameter initiation ##
+     ##########################
+
+     folders = args.folder
+     sgRNA_name = args.name + '_loc_correction'
+     pattern_exp = args.exp
+     pattern_ctr = args.control
+     fdr_thresh = args.fdr
+     score_thresh = args.score
+     binsize = args.binsize
+     flank_max = args.flank_max
+     flank_regions = args.flank_regions # this breaks if the analysis run changed this parameter without including 1000; leaving it for now
+     smooth_times = args.smooth
+     window_size = args.window
+     seq_score_power = args.SeqScorePower
+     ctr_clip = args.CtrClip
+
+
+     if args.outname == 'same':
+         if isinstance(pattern_exp, list):
+             outname = '_'.join(pattern_exp)
+         else:
+             outname = pattern_exp
+     else:
+         outname = args.outname
+
+     outdir = args.outdir
+     if outdir == 'first':
+         outdir = folders[0]
+     os.chdir(outdir)
+     # out temp folder
+     if not os.path.exists( os.path.join(outdir,'temp') ):
+         os.makedirs(os.path.join(outdir,'temp'))
+     # data temp folder
+     for a_folder in folders:
+         temp_dir = os.path.join(a_folder, 'temp')
+         if not os.path.exists( temp_dir ):
+             os.makedirs(temp_dir)
+
+     ##################
+     ## glob samples ##
+     ##################
+     all_sample_names = []
+     all_sample_files = []
+     for a_folder in folders:
+         bdg_files = pd.Series(glob.glob(os.path.join( a_folder, '*.add.bdg' ))).sort_values().reset_index(drop=True)
+         sample_names = bdg_files.apply(os.path.basename).str.extract(r'(.*)\.\d+\.add\.bdg',expand=False)
+         all_sample_names.extend( sample_names )
+         all_sample_files.extend( bdg_files )
+     all_sample_files = pd.Series(all_sample_files)
+     all_sample_names = pd.Series(all_sample_names)
+     print('all sample names in the folders:')
+     print(all_sample_names)
+     print('your string pattern for experimental groups: ', pattern_exp)
+     ctr_samples = []
+     if pattern_ctr == 'none':
+         if pattern_exp == 'all':
+             exp_samples = list( all_sample_names )
+         else:
+             exp_samples = []
+             for a_mark in pattern_exp:
+                 exp_samples.extend( list( all_sample_names[all_sample_names.str.contains(a_mark)] ) )
+     elif pattern_ctr == 'others':
+         if pattern_exp == 'all':
+             exp_samples = list( all_sample_names )
+         else:
+             exp_samples = []
+             for a_mark in pattern_exp:
+                 exp_samples.extend( list( all_sample_names[all_sample_names.str.contains(a_mark)] ) )
+         ctr_samples = list( all_sample_names[~all_sample_names.isin(exp_samples)] )
+     else:
+         for a_mark in pattern_ctr:
+             ctr_samples.extend( list( all_sample_names[all_sample_names.str.contains(a_mark)] ) )
+         if pattern_exp == 'all':
+             exp_samples = list( all_sample_names[~all_sample_names.isin(ctr_samples)] )
+         else:
+             exp_samples = []
+             for a_mark in pattern_exp:
+                 exp_samples.extend( list( all_sample_names[all_sample_names.str.contains(a_mark)] ) )
+     n_exp = len(exp_samples)
+     n_ctr = len(ctr_samples)
+     print(f'Experimental group has {n_exp} samples:\n{exp_samples}')
+     print(f'Control group has {n_ctr} samples:\n{ctr_samples}')
+
+     # when the name marks are wrong
+     assert n_exp > 0, 'No experimental sample is found. Please check the name pattern.'
+     if (n_ctr==0)&(pattern_ctr != 'none'):
+         print('Name pattern for control sample(s) was given, but no file meets the pattern.')
+         return 'Program terminated'
+
+     # summarize samples
+     bool_exp = all_sample_names.isin(exp_samples)
+     bool_ctr = all_sample_names.isin(ctr_samples)
+     exp_sample_files = all_sample_files[bool_exp]
+     ctr_sample_files = all_sample_files[bool_ctr]
+     exp_sample_names = all_sample_names[bool_exp]
+     ctr_sample_names = all_sample_names[bool_ctr]
+     # selected_sample_files = pd.concat([exp_sample_files,ctr_sample_files])
+     # selected_sample_names = pd.concat([exp_sample_names,ctr_sample_names]) # no use
+
+
+
+     ####################
+     ## run correction ##
+     ####################
+
+     # new parameters
+     ref_fasta = args.ref
+     sgRNA_seq = args.sgrna
+     PAM = args.pam
+     PAM_loc = args.pam_location
+     # read result
+     dp_result = pl.read_csv(f'./temp/df_result_{outname}.csv')
+     # negatives for the next section
+     bool_fdr_bkg = dp_result['fdr']>fdr_thresh
+     bool_score_bkg = dp_result['track_score']<score_thresh
+     dp_result_bkg = dp_result.filter(bool_fdr_bkg & bool_score_bkg)
+     # positives
+     bool_fdr = pl.col('fdr')<=fdr_thresh
+     bool_score = pl.col('track_score')>=score_thresh
+     dp_result = dp_result.filter(bool_fdr & bool_score)
+     # bdg
+     dp_bdg = pl.read_csv(exp_sample_files.iloc[0], separator='\t', has_header=False,
+                          schema_overrides={'chr':pl.String,'start':pl.Int32,'end':pl.Int32,'residual':pl.Float32})
+     # check and realign
+     bool_left_neg=(dp_result['exp_L_neg_1000']<-5)&(dp_result['exp_R_neg_1000']==0)
+     bool_right_neg=(dp_result['exp_R_neg_1000']<-5)&(dp_result['exp_L_neg_1000']==0)
+     list_good_result = []
+     list_bad_left = []
+     list_bad_right = []
+     n_left_for_correct = 0
+     n_right_for_correct = 0
+     for a_left_bool, a_right_bool, a_row in zip(bool_left_neg, bool_right_neg, dp_result.iter_rows(named=True)):
+         if a_left_bool & a_right_bool:
+             raise ValueError('abnormal on both left and right')
+         if a_left_bool:
+             n_left_for_correct += 1
+             loc_shift_left = a_row['chr'] + ':' + str(a_row['st']-1000) + '-' + str(a_row['ed']-20)
+             region_index = a_row['region_index']
+             dp_bdg_chr = dp_bdg.filter(pl.col('chr') == a_row['chr'])
+             sr_candidate = offtracker.left_realign(dp_bdg_chr, loc_shift_left, ref_fasta, sgRNA_seq, PAM, PAM_loc, n_iter=0)
+             sr_candidate.loc['region_index'] = region_index
+             list_bad_left.append(sr_candidate)
+         elif a_right_bool:
+             n_right_for_correct += 1
+             loc_shift_right = a_row['chr'] + ':' + str(a_row['st']+20) + '-' + str(a_row['ed']+1000)
+             region_index = a_row['region_index']
+             dp_bdg_chr = dp_bdg.filter(pl.col('chr') == a_row['chr'])
+             sr_candidate = offtracker.right_realign(dp_bdg_chr, loc_shift_right, ref_fasta, sgRNA_seq, PAM, PAM_loc, n_iter=0)
+             sr_candidate.loc['region_index'] = region_index
+             list_bad_right.append(sr_candidate)
+         else:
+             list_good_result.append(a_row)
+     dp_result_good = pl.DataFrame(list_good_result)
+     df_cand_left = pd.DataFrame(list_bad_left)
+     df_cand_right = pd.DataFrame(list_bad_right)
+     df_cand_realign = pd.concat([df_cand_left, df_cand_right])
+     if len(df_cand_realign) == 0:
+         print('No candidate is found for realignment.')
+         return 'finished'
+
+     # case handling
+     n_success_realign = sum(df_cand_realign['realign']=='success')
+     n_fail_realign = sum(df_cand_realign['realign']!='success')
+     if (n_success_realign == 0) and (n_fail_realign > 0):
+         print(f'{n_fail_realign} candidates are found for realignment, but all failed.')
+         return 'finished'
+     elif (n_success_realign > 0) and (n_fail_realign > 0):
+         print(f'{n_success_realign} candidates succeeded, and {n_fail_realign} candidates failed.')
+     else:
+         print(f'{n_success_realign} candidates succeeded.')
+
+     df_cand_realign = df_cand_realign[df_cand_realign['realign']=='success']
+     seqfile = rf'correction_df_candidate_{outname}_realign.csv'
+     df_cand_realign.to_csv(seqfile)
+
+     # run offtracker_analysis in check_loc mode
+     running_log = rf'correction_analysis_{outname}.log'
+     # join lists into space-separated CLI parameters
+     if isinstance(pattern_exp, list):
+         param_pattern_exp = ' '.join(pattern_exp)
+     else:
+         param_pattern_exp = pattern_exp
+     if isinstance(pattern_ctr, list):
+         param_pattern_ctr = ' '.join(pattern_ctr)
+     else:
+         param_pattern_ctr = pattern_ctr
+     if isinstance(flank_regions, list):
+         param_flank_regions = ' '.join([str(x) for x in flank_regions])
+     else:
+         param_flank_regions = flank_regions
+     if isinstance(folders, list):
+         param_folders = ' '.join([str(x) for x in folders])
+     else:
+         param_folders = folders
+
+     with open(running_log, "w+") as running_log:
+         command = f'offtracker_analysis.py -t {args.thread} -g {args.genome} --seqfile {seqfile} --name {sgRNA_name} \
+         --exp {param_pattern_exp} --control {param_pattern_ctr} --outname {outname}_loc_correction -f {param_folders} -o {outdir} \
+         --fdr {fdr_thresh} --window {window_size} --smooth {smooth_times} --SeqScorePower {seq_score_power} \
+         --score {score_thresh} --binsize {binsize} --flank_max {flank_max} --flank_regions {param_flank_regions} --CtrClip {ctr_clip} \
+         --check_loc'
+         command2 = shlex.split('bash -c "{}"'.format(command))
+         process_1 = subprocess.Popen(command2, stdout=running_log, stderr=subprocess.STDOUT )
+         process_1.wait(timeout=100000)
+         retc = process_1.returncode
+         if retc==0:
+             print(f'correction_analysis {outname} is done!')
+         else:
+             print(f'correction_analysis {outname} failed!')
+
+
+     #######################
+     ## recalculate score ##
+     #######################
+
+     dp_result_realign = pl.read_csv(f'./temp/df_result_{outname}_loc_correction.csv')
+
+     # keep compatibility with the old output column names
+     list_col = dp_result_realign.columns[:-5]
+     dp_result_new = pl.concat([dp_result_realign[list_col], dp_result_good[list_col], dp_result_bkg[list_col]])
+
+     # standardize the distribution, polars version
+     target_std=0.15
+     n_outliers = int(np.ceil(len(dp_result_new)*0.01))
+     score_bkg = dp_result_new['raw_score'][n_outliers:-n_outliers]
+     mean_score_bkg = score_bkg.mean()
+     std_score_bkg = score_bkg.std()
+     dp_result_new = dp_result_new.with_columns(
+         (pl.col('raw_score').sub(mean_score_bkg)/std_score_bkg).alias('track_score')
+     )
+     dp_result_new = dp_result_new.with_columns(
+         pl.col('track_score').mul(target_std).add(1).alias('track_score')
+     )
+     dp_result_new = dp_result_new.with_columns(
+         pl.col('track_score').clip(lower_bound=0.5).log(base=2).alias('log2_track_score')
+     )
+     dp_result_new = dp_result_new.sort('track_score', descending=True)
+
+     # pv and fdr
+     score_for_fitting = dp_result_new['log2_track_score'][n_outliers:-n_outliers]
+     mu, std = norm.fit(score_for_fitting)
+     print('mean_score:{:.3f};std:{:.3f}'.format(mu,std))
+     dp_result_new = dp_result_new.with_columns(
+         pl.col('log2_track_score').map_elements( lambda x: norm.sf(x,loc=mu,scale=std), return_dtype=pl.Float64 ).clip(lower_bound=1e-320).alias('pv')
+     )
+     dp_result_new = dp_result_new.with_columns(
+         fdr=offtracker.fdr(dp_result_new['pv']).alias('fdr'),
+         rank=pl.Series(range(1,len(dp_result_new)+1))
+     ) #.with_row_index(name='rank',offset=1)
+     dp_result_new.write_csv(f'./temp/df_result_{outname}.csv') # overwrite the original result
+
+     # output Offtracker result
+     bool_fdr = pl.col('fdr')<=fdr_thresh
+     bool_score = pl.col('track_score')>=score_thresh
+     dp_output = dp_result_new.filter(bool_fdr|bool_score)
+     if pattern_ctr != 'none':
+         dp_output = dp_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
+                                'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
+                                'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
+         dp_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
+                              'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
+                              'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
+     else:
+         dp_output = dp_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
+                                'L_length', 'R_length','signal_length',
+                                'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
+         dp_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
+                              'L_length', 'R_length','signal_length',
+                              'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
+     dp_output.write_csv(f'Offtracker_result_{outname}.csv')
+
+     return 'correction finished'
+
+ if __name__ == '__main__' :
+     result = main()
+     print(result)
+
+
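The recalculation above merges the realigned, unchanged-positive, and background rows, rescores them with the same standardization, then fits a normal distribution to the trimmed log2 track scores, converts each score to a one-sided p-value with norm.sf, and adjusts for multiple testing via offtracker.fdr. The sketch below reproduces that chain on synthetic data; since offtracker.fdr's internals are not shown in this diff, the Benjamini-Hochberg step-up here is an assumed stand-in.

import numpy as np
import polars as pl
from scipy.stats import norm

def bh_fdr(pv):
    # Benjamini-Hochberg step-up; assumed equivalent in spirit to offtracker.fdr
    order = np.argsort(pv)
    ranked = pv[order] * len(pv) / (np.arange(len(pv)) + 1)
    ranked = np.minimum.accumulate(ranked[::-1])[::-1].clip(max=1.0)
    out = np.empty_like(ranked)
    out[order] = ranked
    return out

rng = np.random.default_rng(1)
dp = pl.DataFrame({'log2_track_score': np.sort(rng.normal(0.0, 0.2, 500))[::-1]})
n_outliers = int(np.ceil(len(dp) * 0.01))
fit_scores = dp['log2_track_score'][n_outliers:-n_outliers].to_numpy()
mu, std = norm.fit(fit_scores)  # null model fitted to the trimmed bulk
pv = norm.sf(dp['log2_track_score'].to_numpy(), loc=mu, scale=std).clip(1e-320)
dp = dp.with_columns(pl.Series('pv', pv), pl.Series('fdr', bh_fdr(pv)))
print(dp.head())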
@@ -54,6 +54,7 @@ setup(
      'scripts/offtracker_config.py',
      'scripts/offtracker_candidates.py',
      'scripts/offtracker_analysis.py',
+     'scripts/offtracker_correction.py',
      'scripts/offtracker_plot.py'],
  install_requires=REQUIRED,
  include_package_data=True