offtracker 2.11.3__zip → 2.12.0__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {offtracker-2.11.3/offtracker.egg-info → offtracker-2.12.0}/PKG-INFO +1 -1
  2. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/X_offtracker.py +58 -18
  3. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/_version.py +4 -2
  4. {offtracker-2.11.3 → offtracker-2.12.0/offtracker.egg-info}/PKG-INFO +1 -1
  5. {offtracker-2.11.3 → offtracker-2.12.0}/scripts/offtracker_analysis.py +37 -31
  6. {offtracker-2.11.3 → offtracker-2.12.0}/scripts/offtracker_candidates.py +15 -10
  7. {offtracker-2.11.3 → offtracker-2.12.0}/LICENSE.txt +0 -0
  8. {offtracker-2.11.3 → offtracker-2.12.0}/MANIFEST.in +0 -0
  9. {offtracker-2.11.3 → offtracker-2.12.0}/README.md +0 -0
  10. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/X_offplot.py +0 -0
  11. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/X_sequence.py +0 -0
  12. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/__init__.py +0 -0
  13. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/snakefile/Snakefile_QC.smk +0 -0
  14. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/snakefile/Snakefile_offtracker.smk +0 -0
  15. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/utility/1.1_bed2fr.py +0 -0
  16. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/utility/1.3_bdg_normalize_v4.0.py +0 -0
  17. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/utility/bedGraphToBigWig +0 -0
  18. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/utility/hg38.chrom.sizes +0 -0
  19. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/utility/mm10.chrom.sizes +0 -0
  20. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/utility/offtracker_blacklist_hg38.merged.bed +0 -0
  21. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker/utility/offtracker_blacklist_mm10.merged.bed +0 -0
  22. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker.egg-info/SOURCES.txt +0 -0
  23. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker.egg-info/dependency_links.txt +0 -0
  24. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker.egg-info/requires.txt +0 -0
  25. {offtracker-2.11.3 → offtracker-2.12.0}/offtracker.egg-info/top_level.txt +0 -0
  26. {offtracker-2.11.3 → offtracker-2.12.0}/scripts/offtracker_config.py +0 -0
  27. {offtracker-2.11.3 → offtracker-2.12.0}/scripts/offtracker_init.py +0 -0
  28. {offtracker-2.11.3 → offtracker-2.12.0}/scripts/offtracker_plot.py +0 -0
  29. {offtracker-2.11.3 → offtracker-2.12.0}/scripts/offtracker_qc.py +0 -0
  30. {offtracker-2.11.3 → offtracker-2.12.0}/setup.cfg +0 -0
  31. {offtracker-2.11.3 → offtracker-2.12.0}/setup.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: offtracker
- Version: 2.11.3
+ Version: 2.12.0
  Summary: Tracking-seq data analysis
  Home-page: https://github.com/Lan-lab/offtracker
  Author: Runda Xu
offtracker/X_offtracker.py
@@ -1,5 +1,6 @@
 
  import pandas as pd
+ import polars as pl
  import numpy as np
  import os, sys
  sys.path.append( os.path.abspath(os.path.dirname(__file__)) )
@@ -8,26 +9,65 @@ def fdr(p_vals):
      # Benjamini-Hochberg
      from scipy.stats import rankdata
      ranked_p_values = rankdata(p_vals)
-     fdr = p_vals * len(p_vals) / ranked_p_values
-     fdr[fdr > 1] = 1
-     return fdr
+     fdr_value = p_vals * len(p_vals) / ranked_p_values
+     fdr_value[fdr_value > 1] = 1
+     return fdr_value
 
- def dedup_two( df_loc, col_ID_1='ID_1', col_ID_2='ID_2'):
-     # keeps the first location according to the sort order of df_loc
-     # after dedup, the union of the remaining ID_1 + ID_2 may be smaller than the union before dedup
-     list_nondup = []
-     set_IDs = set()
-     df_IDs = df_loc[[col_ID_1,col_ID_2]]
-     for a_row in df_IDs.iterrows():
-         temp = a_row[1]
-         if (temp[col_ID_1] in set_IDs) or (temp[col_ID_2] in set_IDs):
-             # if either ID was seen before, set_IDs is not updated even when the other ID is new
-             list_nondup.append(False)
+
+ def mark_regions_single_chr(dp, min_distance=1000):
+     unique_chr = dp['chr'].unique()
+     assert len(unique_chr) == 1
+     unique_chr = unique_chr[0]
+
+     # Initialize variables for marking regions
+     region_id = 1
+     current_start = None
+     current_end = None
+     marked_regions = []
+
+     for row in dp.iter_rows(named=True):
+         start, end = row['st'], row['ed']
+
+         if current_start is None:
+             # First region
+             current_start = start
+             current_end = end
+             marked_regions.append(f'{unique_chr}_region_{region_id}')
          else:
-             set_IDs.add(temp[col_ID_1])
-             set_IDs.add(temp[col_ID_2])
-             list_nondup.append(True)
-     return list_nondup
+             if start <= current_end + min_distance:
+                 # Mark as the same region
+                 marked_regions.append(f'{unique_chr}_region_{region_id}')
+             else:
+                 # New region
+                 region_id += 1
+                 marked_regions.append(f'{unique_chr}_region_{region_id}')
+                 current_start = start
+                 current_end = end
+
+         current_end = max(current_end, end)
+
+     return dp.with_columns(region_index=pl.Series(marked_regions))
+
+
+
+
+
+ # def dedup_two( df_loc, col_ID_1='ID_1', col_ID_2='ID_2'):
+ #     # keeps the first location according to the sort order of df_loc
+ #     # after dedup, the union of the remaining ID_1 + ID_2 may be smaller than the union before dedup
+ #     list_nondup = []
+ #     set_IDs = set()
+ #     df_IDs = df_loc[[col_ID_1,col_ID_2]]
+ #     for a_row in df_IDs.iterrows():
+ #         temp = a_row[1]
+ #         if (temp[col_ID_1] in set_IDs) or (temp[col_ID_2] in set_IDs):
+ #             # if either ID was seen before, set_IDs is not updated even when the other ID is new
+ #             list_nondup.append(False)
+ #         else:
+ #             set_IDs.add(temp[col_ID_1])
+ #             set_IDs.add(temp[col_ID_2])
+ #             list_nondup.append(True)
+ #     return list_nondup
 
 
  def window_smooth(sr_smooth, window_size=3, times=1):
      window = np.ones(window_size) / window_size
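The new helper merges nearby candidate intervals on one chromosome into shared region labels: a site whose start lies within min_distance of the running region end joins the current region_index. A minimal usage sketch on hypothetical coordinates, accessing the function through the offtracker module the way scripts/offtracker_candidates.py does below:

import polars as pl
import offtracker

# three hypothetical sites on chr1, already sorted by start:
# the first two lie within 1 kb of each other, the third is far away
dp = pl.DataFrame({
    'chr': ['chr1', 'chr1', 'chr1'],
    'st':  [1000, 1800, 50000],
    'ed':  [1500, 2300, 50500],
})
dp_marked = offtracker.mark_regions_single_chr(dp, min_distance=1000)
print(dp_marked['region_index'].to_list())
# ['chr1_region_1', 'chr1_region_1', 'chr1_region_2']

Note the merge assumes the frame is sorted by st; current_start is tracked but only current_end actually drives the distance test.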
offtracker/_version.py
@@ -1,4 +1,4 @@
- __version__ = "2.11.3"
+ __version__ = "2.12.0"
  # 2023.08.11. v1.1.0 adding an option for not normalizing the bw file
  # 2023.10.26. v1.9.0 prerelease for v2.0
  # 2023.10.27. v2.0.0 major update, not yet fine-tuned
@@ -38,4 +38,6 @@ __version__ = "2.11.3"
  # 2025.06.28. v2.10.9 pip now installs from wheels and no longer runs setup.py, so add an offtracker_init.py
  # 2025.06.28. v2.10.10 try putting it directly into scripts
  # 2025.06.28. v2.10.11 roll back to 2.10.9 plus a fix
- # 2025.07.02. v2.11.3 update candidates to work around a blast flaw
+ # 2025.07.02. v2.11.4 update candidates to work around a blast flaw; remove quick mode
+ # 2025.07.04. v2.11.5 offtracker_analysis now skips samples whose results already exist
+ # 2025.07.04. v2.12.0 add region_index region marking for better deduplication
offtracker.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: offtracker
- Version: 2.11.3
+ Version: 2.12.0
  Summary: Tracking-seq data analysis
  Home-page: https://github.com/Lan-lab/offtracker
  Author: Runda Xu
scripts/offtracker_analysis.py
@@ -82,7 +82,7 @@ def main():
          df_candidate.index = df_candidate['target_location']
          df_candidate_brief = df_candidate[['chr','st','ed','best_strand','best_target','best_seq_score',
                                             'deletion', 'insertion','mismatch', 'GG',
-                                            'target_location', 'cleavage_site', 'ID_1','ID_2']]
+                                            'target_location', 'cleavage_site', 'ID_1','ID_2', 'region_index']] # 2025.07.06 added region_index
          df_candidate_sub = df_candidate[['chr','cleavage_site']]
      except FileNotFoundError:
          return 'Please run offtracker_candidates.py first and provide the correct directory with --seqfolder'
@@ -192,7 +192,8 @@ def main():
 
      if args.signal_only:
          return 'signal_only is on, stop here.'
-
+
+
      ####################
      ## group analysis ##
      ####################
@@ -204,6 +205,11 @@ def main():
      else:
          outname = args.outname
 
+     # skip finished
+     output = f'Offtracker_result_{outname}.csv'
+     if (os.path.isfile(output))&(not args.overwrite):
+         return f'skip {output} as the result exists!'
+
      output = f'./temp/df_score_{outname}.csv'
      if (os.path.isfile(output))&(not args.overwrite):
          print(f'skip {output}')
@@ -294,9 +300,12 @@ def main():
      df_score['raw_score'] = df_score['final_score_1'] + df_score['final_score_2']
      df_score = df_score.sort_values('raw_score', ascending=False)
 
-     # local dedup
-     list_nondup = offtracker.dedup_two(df_score,'ID_1','ID_2')
-     df_result = df_score[list_nondup].copy()
+     # # local dedup
+     # list_nondup = offtracker.dedup_two(df_score,'ID_1','ID_2')
+     # df_result = df_score[list_nondup].copy()
+
+     # 2025.07.06 updated the deduplication method
+     df_result = df_score.drop_duplicates(subset=['region_index'], keep='first')
 
      # normalize the distribution
      target_std=0.15
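Because df_score is sorted by raw_score in descending order just above, drop_duplicates(subset=['region_index'], keep='first') retains exactly the highest-scoring site of each region. A toy illustration with made-up scores:

import pandas as pd

# hypothetical scored candidates; the first two share a region_index
df_score = pd.DataFrame({
    'target_location': ['chr1:1000-1023', 'chr1:1800-1823', 'chr2:500-523'],
    'region_index':    ['chr1_region_1', 'chr1_region_1', 'chr2_region_1'],
    'raw_score':       [9.5, 7.2, 4.1],
}).sort_values('raw_score', ascending=False)

# one row per region survives: the first (highest-scoring) occurrence
df_result = df_score.drop_duplicates(subset=['region_index'], keep='first')
print(df_result['target_location'].tolist())
# ['chr1:1000-1023', 'chr2:500-523']

Unlike the retired dedup_two, the outcome no longer depends on how ID pairs overlap row by row, only on the precomputed region_index.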
@@ -353,33 +362,30 @@ def main():
      df_result.to_csv(output)
 
      output = f'Offtracker_result_{outname}.csv'
-     if (os.path.isfile(output))&(not args.overwrite):
-         print(f'skip {output} as the result exists')
+     # 2024.06.03. in case fdr<=fdr_thresh filters out sites with track_score>=2
+     bool_fdr = df_result['fdr']<=fdr_thresh
+     bool_score = df_result['track_score']>=score_thresh
+     # 2025.06.05. BE may produce one-sided signals with negative track_score; keep these too
+     bool_neg_score = df_result['track_score']< -1
+     df_output = df_result[bool_fdr|bool_score|bool_neg_score].copy()
+     if pattern_ctr != 'none':
+         df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
+                                'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
+                                'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
+         df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
+                              'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
+                              'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
      else:
-         # 2024.06.03. in case fdr<=fdr_thresh filters out sites with track_score>=2
-         bool_fdr = df_result['fdr']<=fdr_thresh
-         bool_score = df_result['track_score']>=score_thresh
-         # 2025.06.05. BE may produce one-sided signals with negative track_score; keep these too
-         bool_neg_score = df_result['track_score']< -1
-         df_output = df_result[bool_fdr|bool_score|bool_neg_score].copy()
-         if pattern_ctr != 'none':
-             df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
-                                    'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
-                                    'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
-             df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
-                                  'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
-                                  'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
-         else:
-             df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
-                                    'L_length', 'R_length','signal_length',
-                                    'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
-             df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
-                                  'L_length', 'R_length','signal_length',
-                                  'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
-         df_output.to_csv(f'Offtracker_result_{outname}.csv', index=False)
-
-         if args.clean:
-             shutil.rmtree('./temp')
+         df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
+                                'L_length', 'R_length','signal_length',
+                                'norm_best_seq_score','track_score', 'log2_track_score','fdr','rank']]
+         df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
+                              'L_length', 'R_length','signal_length',
+                              'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
+     df_output.to_csv(f'Offtracker_result_{outname}.csv', index=False)
+
+     if args.clean:
+         shutil.rmtree('./temp')
 
      return 'Done!'
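The output filter keeps a row if any of the three masks holds, so a site failing the FDR cut can still survive on track_score, and strongly negative scores (one-sided base-editing signals) are retained as well. A toy check with hypothetical thresholds:

import pandas as pd

fdr_thresh, score_thresh = 0.05, 2   # hypothetical thresholds

df_result = pd.DataFrame({
    'fdr':         [0.01, 0.20, 0.20],
    'track_score': [3.0,  0.5, -3.0],
})
bool_fdr = df_result['fdr'] <= fdr_thresh
bool_score = df_result['track_score'] >= score_thresh
bool_neg_score = df_result['track_score'] < -1   # one-sided BE signals
df_output = df_result[bool_fdr | bool_score | bool_neg_score].copy()
print(df_output.index.tolist())   # [0, 2] -- only the middle row is dropped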
scripts/offtracker_candidates.py
@@ -37,7 +37,7 @@ def main():
      parser.add_argument('-o','--outdir' , type=str, required=True, help='The output folder')
      parser.add_argument('-g','--genome' , type=str, default='hg38', help='File of chromosome sizes, or "hg38", "mm10" ')
      parser.add_argument('-t','--thread' , type=int, default=4, help='Number of threads for parallel computing')
-     parser.add_argument('--quick_mode' , action='store_true', help='BLAST faster but less candidates.')
+     # parser.add_argument('--quick_mode' , action='store_true', help='Quick mode is deprecated due to blast flaw.')
 
      args = parser.parse_args()
 
@@ -57,7 +57,7 @@ def main():
          os.makedirs(dir_output)
      dir_ref_fa = args.ref
      blast_db = args.blastdb
-     quick_mode = args.quick_mode
+     # quick_mode = args.quick_mode
 
      # parameters for alignment
      half_width = 100
@@ -95,13 +95,7 @@ def main():
      if os.path.isfile(dir_sgRNA_blast):
          print(f'{dir_sgRNA_blast} exists, skipped.')
      else:
-         if quick_mode:
-             print('Using quick mode for BLAST')
-             blastx_cline = NcbiblastnCommandline(query=dir_sgRNA_fasta, task='blastn-short',out=dir_sgRNA_blast,
-                                                  db=blast_db, evalue=100000,outfmt=6, num_threads=n_threads,
-                                                  gapopen=4, gapextend=2, reward=2, word_size=5, dust='no', soft_masking=False)
-         else:
-             blastx_cline = NcbiblastnCommandline(query=dir_sgRNA_fasta, task='blastn-short',out=dir_sgRNA_blast,
+         blastx_cline = NcbiblastnCommandline(query=dir_sgRNA_fasta, task='blastn-short',out=dir_sgRNA_blast,
                                               db=blast_db, evalue=100000,outfmt=6, num_threads=n_threads,
                                               gapopen=4, gapextend=2, reward=2, word_size=4, dust='no', soft_masking=False)
      print(f'BLAST for candidate off-target sites of {sgRNA_name}.')
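The surviving call is the former non-quick branch (word_size=4 rather than 5, admitting shorter seeds). For reference, printing a Biopython NcbiblastnCommandline object shows the blastn command it would execute; a sketch with placeholder paths (running it requires BLAST+ on PATH):

from Bio.Blast.Applications import NcbiblastnCommandline

# placeholder query/db/output paths; the huge evalue and small word_size
# keep short, imperfect sgRNA matches from being discarded
blastx_cline = NcbiblastnCommandline(
    query='sgRNA.fasta', task='blastn-short', out='sgRNA.blast',
    db='hg38_db', evalue=100000, outfmt=6, num_threads=4,
    gapopen=4, gapextend=2, reward=2, word_size=4,
    dust='no', soft_masking=False)
print(blastx_cline)              # shows the assembled blastn command line
# stdout, stderr = blastx_cline()   # would run blastn (needs BLAST+ installed)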
@@ -316,7 +310,18 @@ def main():
      df_candidate['mis_all'] = df_candidate[['mismatch','deletion','insertion']].sum(axis=1)
      df_candidate = df_candidate[df_candidate['mis_all']<8]
 
-     df_candidate.to_csv(dir_df_candidate)
+     # 2025.07.06 add region marks for deduplication
+     # group df_candidate by chromosome
+     candidate_groups = df_candidate.groupby('chr')
+     # an empty list to hold each chromosome's data
+     list_dp = []
+     for chr_name, chr_candidate in candidate_groups:
+         dp_marked = offtracker.mark_regions_single_chr(pl.DataFrame(chr_candidate))
+         list_dp.append(dp_marked)
+     df_candidate = pl.concat(list_dp)
+
+     # switch the output to polars
+     df_candidate.write_csv(dir_df_candidate)
      print(f'Output df_candidate_{sgRNA_name}.csv')
      os.remove(temp_bed)
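Region numbering restarts on every chromosome, which is why the marking runs per pandas group before the polars frames are concatenated. A condensed sketch of the same pattern on hypothetical data:

import pandas as pd
import polars as pl
import offtracker

# hypothetical candidates on two chromosomes
df_candidate = pd.DataFrame({
    'chr': ['chr1', 'chr1', 'chr2'],
    'st':  [1000, 1800, 300],
    'ed':  [1500, 2300, 800],
})

# mark regions chromosome by chromosome, then stitch the groups back together
list_dp = [
    offtracker.mark_regions_single_chr(pl.DataFrame(chr_candidate))
    for _, chr_candidate in df_candidate.groupby('chr')
]
df_candidate = pl.concat(list_dp)

# polars' write_csv, unlike pandas' to_csv, emits no index column
df_candidate.write_csv('df_candidate_demo.csv')

A side effect of the switch worth noting: the df_candidate_*.csv file no longer carries the pandas index column.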