offtracker 2.12.2__zip → 2.13.0__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {offtracker-2.12.2/offtracker.egg-info → offtracker-2.13.0}/PKG-INFO +1 -1
  2. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/X_offtracker.py +302 -1
  3. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/X_sequence.py +137 -2
  4. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/_version.py +11 -9
  5. {offtracker-2.12.2 → offtracker-2.13.0/offtracker.egg-info}/PKG-INFO +1 -1
  6. {offtracker-2.12.2 → offtracker-2.13.0}/scripts/offtracker_analysis.py +11 -4
  7. {offtracker-2.12.2 → offtracker-2.13.0}/scripts/offtracker_qc.py +6 -2
  8. {offtracker-2.12.2 → offtracker-2.13.0}/LICENSE.txt +0 -0
  9. {offtracker-2.12.2 → offtracker-2.13.0}/MANIFEST.in +0 -0
  10. {offtracker-2.12.2 → offtracker-2.13.0}/README.md +0 -0
  11. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/X_offplot.py +0 -0
  12. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/__init__.py +0 -0
  13. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/snakefile/Snakefile_QC.smk +0 -0
  14. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/snakefile/Snakefile_offtracker.smk +0 -0
  15. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/utility/1.1_bed2fr.py +0 -0
  16. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/utility/1.3_bdg_normalize_v4.0.py +0 -0
  17. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/utility/bedGraphToBigWig +0 -0
  18. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/utility/hg38.chrom.sizes +0 -0
  19. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/utility/mm10.chrom.sizes +0 -0
  20. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/utility/offtracker_blacklist_hg38.merged.bed +0 -0
  21. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker/utility/offtracker_blacklist_mm10.merged.bed +0 -0
  22. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker.egg-info/SOURCES.txt +0 -0
  23. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker.egg-info/dependency_links.txt +0 -0
  24. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker.egg-info/requires.txt +0 -0
  25. {offtracker-2.12.2 → offtracker-2.13.0}/offtracker.egg-info/top_level.txt +0 -0
  26. {offtracker-2.12.2 → offtracker-2.13.0}/scripts/offtracker_candidates.py +0 -0
  27. {offtracker-2.12.2 → offtracker-2.13.0}/scripts/offtracker_config.py +0 -0
  28. {offtracker-2.12.2 → offtracker-2.13.0}/scripts/offtracker_init.py +0 -0
  29. {offtracker-2.12.2 → offtracker-2.13.0}/scripts/offtracker_plot.py +0 -0
  30. {offtracker-2.12.2 → offtracker-2.13.0}/setup.cfg +0 -0
  31. {offtracker-2.12.2 → offtracker-2.13.0}/setup.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: offtracker
- Version: 2.12.2
+ Version: 2.13.0
  Summary: Tracking-seq data analysis
  Home-page: https://github.com/Lan-lab/offtracker
  Author: Runda Xu
offtracker/X_offtracker.py

@@ -2,7 +2,8 @@
  import pandas as pd
  import polars as pl
  import numpy as np
- import os, sys
+ import os, sys, re
+ import offtracker.X_sequence as xseq
  sys.path.append( os.path.abspath(os.path.dirname(__file__)) )

  def fdr(p_vals):
@@ -115,6 +116,29 @@ def target_signal(df_bdg_chr, chrom, cleavage_site, flank_max=100000, smooth_tim
                    binsize=100, flank_regions=[500,1000,2000,5000],
                    length_bkg = 20000, length_binsize=1000, length_min_noise=0.2, n_std=1,
                    end='end',start='start',value='residual', pct_offset=0.0):
+     """_summary_
+ 
+     Args:
+         df_bdg_chr (_type_): .bdg table restricted to a single chromosome
+         chrom (_type_): chromosome name
+         cleavage_site (_type_): cleavage site
+         flank_max (int, optional): _description_. Defaults to 100000.
+         smooth_times (int, optional): _description_. Defaults to 1.
+         window_size (int, optional): _description_. Defaults to 3.
+         binsize (int, optional): _description_. Defaults to 100.
+         flank_regions (list, optional): _description_. Defaults to [500,1000,2000,5000].
+         length_bkg (int, optional): _description_. Defaults to 20000.
+         length_binsize (int, optional): _description_. Defaults to 1000.
+         length_min_noise (float, optional): _description_. Defaults to 0.2.
+         n_std (int, optional): _description_. Defaults to 1.
+         end (str, optional): _description_. Defaults to 'end'.
+         start (str, optional): _description_. Defaults to 'start'.
+         value (str, optional): _description_. Defaults to 'residual'.
+         pct_offset (float, optional): _description_. Defaults to 0.0.
+ 
+     Returns:
+         _type_: _description_
+     """
      # The input data must all come from a single chromosome
      # Count the flank regions
      # n_regions = len(flank_regions)
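For orientation, a minimal hypothetical sketch of how target_signal could be called given the signature documented above; the .bdg column layout ('chr', 'start', 'end', 'residual') and the file name are assumptions, not something this release specifies.

    import pandas as pd
    from offtracker.X_offtracker import target_signal

    # Hypothetical input: a residual bedGraph with columns chr / start / end / residual
    df_bdg = pd.read_csv('sample.residual.bdg', sep='\t', names=['chr', 'start', 'end', 'residual'])
    df_bdg_chr1 = df_bdg[df_bdg['chr'] == 'chr1']     # the function expects one chromosome at a time
    signals = target_signal(df_bdg_chr1, chrom='chr1', cleavage_site=1_000_000,
                            flank_regions=[500, 1000, 2000, 5000])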
@@ -328,6 +352,25 @@ def target_signal_chunk(df_bdg_chr, df_alignment_chr, flank_max=100000, smooth_tim

  def target_signal_chunk(df_bdg_chr, df_alignment_chr, flank_max=100000, smooth_times = 1, window_size = 3, binsize=100, flank_regions=[500,1000,2000,5000],
                          length_bkg = 20000, length_binsize=1000, length_min_noise=0.2, n_std=1, pct_offset=0.0):
+     """
+ 
+     Args:
+         df_bdg_chr (_type_): .bdg table restricted to a single chromosome
+         df_alignment_chr (_type_): candidate sites
+         flank_max (int, optional): _description_. Defaults to 100000.
+         smooth_times (int, optional): _description_. Defaults to 1.
+         window_size (int, optional): _description_. Defaults to 3.
+         binsize (int, optional): _description_. Defaults to 100.
+         flank_regions (list, optional): _description_. Defaults to [500,1000,2000,5000].
+         length_bkg (int, optional): _description_. Defaults to 20000.
+         length_binsize (int, optional): _description_. Defaults to 1000.
+         length_min_noise (float, optional): _description_. Defaults to 0.2.
+         n_std (int, optional): _description_. Defaults to 1.
+         pct_offset (float, optional): _description_. Defaults to 0.0.
+ 
+     Returns:
+         _type_: _description_
+     """
      # The input data must all come from a single chromosome
      list_target_all = []
      for a_row in df_alignment_chr.iterrows():
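Continuing the previous sketch, target_signal_chunk just iterates candidate sites on one chromosome, so a hypothetical call might look like the following; the candidate table is assumed to carry 'chr' and 'cleavage_site' columns, which is how offtracker_analysis.py prepares it further down in this diff.

    df_alignment_chr1 = df_candidate_sub[df_candidate_sub['chr'] == 'chr1']
    df_result_chr1 = target_signal_chunk(df_bdg_chr1, df_alignment_chr1,
                                         flank_regions=[500, 1000, 2000, 5000])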
@@ -348,3 +391,261 @@ def target_signal_chunk(df_bdg_chr, df_alignment_chr, flank_max=100000, smooth_t
      return df_result


+
+
+
+
+
+
+
+
+
+ ############################################################################
+ # 2025.08.08 New Bio.Align-based local realign, used to locally correct off-target site coordinates #
+ ############################################################################
+ def create_substitution_matrix(mismatch_score=0.01):
+     """
+     Create a substitution matrix for DNA alignment in the Bio.Align format.
+     """
+     alphabet = 'ATGCN'
+     matrix = np.full((len(alphabet), len(alphabet)), mismatch_score)
+
+     # Set match scores
+     for i in range(len(alphabet)):
+         matrix[i][i] = 2.0
+
+     # N matches with everything
+     n_idx = alphabet.index('N')
+     matrix[n_idx, :] = 2.0
+     matrix[:, n_idx] = 2.0
+
+     return matrix, alphabet
+
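A hedged sketch of what this helper yields: matches, and anything paired with N, score 2.0, while mismatches get a small fractional score so they can later be counted back out of the total alignment score.

    # Illustrative only: inspect the scoring scheme produced above.
    matrix, alphabet = create_substitution_matrix(mismatch_score=0.01)
    idx = {base: i for i, base in enumerate(alphabet)}   # 'ATGCN' -> 0..4
    print(matrix[idx['A'], idx['A']])   # 2.0   match
    print(matrix[idx['A'], idx['G']])   # 0.01  mismatch (fractional, so mismatches can be recovered from score % 1)
    print(matrix[idx['A'], idx['N']])   # 2.0   N is treated as matching everything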
+ def sgRNA_alignment_new(a_key, sgRNA, seq, substitution_matrix=None, alphabet=None,
+                         mismatch_score=0.01):
+     """
+     Perform local alignment using Bio.Align instead of the deprecated pairwise2.
+     """
+     from Bio import Align
+     if substitution_matrix is None or alphabet is None:
+         substitution_matrix, alphabet = create_substitution_matrix(mismatch_score)
+
+     # Create aligner
+     aligner = Align.PairwiseAligner()
+     aligner.substitution_matrix = Align.substitution_matrices.Array(
+         alphabet=alphabet, dims=2, data=substitution_matrix
+     )
+     aligner.open_gap_score = -2
+     aligner.extend_gap_score = -2
+     aligner.mode = 'local'
+
+     try:
+         # Perform alignment
+         alignments = aligner.align(sgRNA, seq)
+
+         if not alignments:
+             # No alignment found; return default values (same 6-field layout as the success path)
+             return [0, '', f"{a_key.split(':')[0]}:0-0", 0, 0, len(sgRNA)]
+
+         # Convert to list for indexing
+         alignments = list(alignments)
+
+         # Extract alignment information
+         coords = alignments[0].coordinates
+         start_target = coords[1][0]
+         end_target = coords[1][-1]
+
+         # Extract target sequence directly from coordinates
+         # target = seq[start_target:end_target]
+
+         # Get aligned sequences for detailed analysis
+         alignment_str = str(alignments[0])
+         alignment_lines = alignment_str.split('\n')
+         if len(alignment_lines) >= 3:
+             aligned_sgrna = [x for x in alignment_lines[0].split(' ') if x != '']
+             aligned_genome = [x for x in alignment_lines[2].split(' ') if x != '']
+         else:
+             raise ValueError("Unexpected alignment format")
+
+         assert int(aligned_sgrna[-1]) == len(sgRNA)
+
+         # Calculate indels and mismatches
+         # deletion = RNA bulge
+         # insertion = DNA bulge
+         aligned_sgrna_seq = aligned_sgrna[-2]
+         aligned_genome_seq = aligned_genome[-2]
+         insertion = aligned_sgrna_seq.count('-') if '-' in aligned_sgrna_seq else 0
+         deletion = aligned_genome_seq.count('-') if '-' in aligned_genome_seq else 0
+
+         # Count mismatches by comparing sequences directly
+         # mismatch = 0
+         # assert len(aligned_sgrna_seq) == len(aligned_genome_seq)
+         # for i in range(len(aligned_sgrna_seq)):
+         #     if (aligned_sgrna_seq[i] != aligned_genome_seq[i]) & (aligned_sgrna_seq[i] != 'N') & (aligned_genome_seq[i] != 'N'):
+         #         mismatch += 1
+
+         mismatch = round((alignments[0].score % 1)/mismatch_score)
+
+         # Calculate target location
+         pos_st = int(a_key.split('-')[0].split(':')[1]) + 1
+         chr_name = a_key.split(':')[0]
+         target_st = pos_st + start_target
+         target_ed = pos_st + end_target - 1
+         target_location = f"{chr_name}:{target_st}-{target_ed}"
+
+         score = alignments[0].score
+
+         return [score, aligned_genome_seq, target_location, deletion, insertion, mismatch]
+
+     except Exception as e:
+         print(f"Alignment error for {a_key}: {e}")
+         # Same 6-field fallback as above
+         return [0, '', f"{a_key.split(':')[0]}:0-0", 0, 0, len(sgRNA)]
+
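A hedged usage sketch for the helper above; the spacer, the window sequence, and the region key are toy values chosen only to match the 'chr:start-end' key format that local_realign passes in.

    # Toy example: align a 20-nt spacer plus NGG PAM against a short genomic window.
    sgRNA_PAM = 'GACCCCCTCCACCCCGCCTC' + 'NGG'              # 'N' matches any base in the matrix above
    window = 'TTTGACCCCCTCCACCCCGCCTCAGGATCA'                # hypothetical sequence for chr1:1000000-1000030
    score, target_seq, target_location, deletion, insertion, mismatch = \
        sgRNA_alignment_new('chr1:1000000-1000030', sgRNA_PAM, window)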
+ def local_realign(sgRNA_seq, fasta, PAM='NGG', PAM_loc='downstream'):
+     # Append the PAM
+     if PAM_loc == 'downstream':
+         sgRNA_PAM_fw = sgRNA_seq + PAM
+     else:
+         sgRNA_PAM_fw = PAM + sgRNA_seq
+     sgRNA_PAM_rv = xseq.reverse_complement(sgRNA_PAM_fw)
+     list_args_fw=[]
+     list_args_rv=[]
+     for a_key, a_seq in fasta.items():
+         # 2025.04.25 fix upper/lower-case handling
+         a_seq = re.sub('[^ATCG]','N',a_seq.upper())
+         list_args_fw.append( [a_key, sgRNA_PAM_fw, a_seq])
+         list_args_rv.append( [a_key, sgRNA_PAM_rv, a_seq])
+     list_align_forward = [sgRNA_alignment_new(*args) for args in list_args_fw]
+     list_align_reverse = [sgRNA_alignment_new(*args) for args in list_args_rv]
+     #
+     df_align_forward = pd.DataFrame(list_align_forward, columns= ['fw_score', 'fw_target','fw_location','fw_deletion','fw_insertion','fw_mismatch'])
+     df_align_reverse = pd.DataFrame(list_align_reverse, columns= ['rv_score', 'rv_target','rv_location','rv_deletion','rv_insertion','rv_mismatch'])
+     df_align_reverse['rv_target'] = df_align_reverse['rv_target'].apply(xseq.reverse_complement)
+     df_candidate = pd.concat([df_align_forward,df_align_reverse],axis=1)
+     df_candidate['location'] = fasta.keys()
+     df_candidate['alignment_score'] = df_candidate[['fw_score','rv_score']].max(axis=1)
+     df_candidate['best_seq_score'] = df_candidate[['fw_score', 'rv_score']].max(axis=1)
+     df_candidate['best_strand'] = df_candidate[['fw_score', 'rv_score']].idxmax(axis='columns').replace({'fw_score':'+', 'rv_score':'-'})
+     df_candidate.loc[df_candidate['fw_score']==df_candidate['rv_score'],'best_strand']='equal_score'
+
+     # GG check
+     # 2023.12.05 add cleavage_site inference
+     list_best_target = []
+     list_best_location = []
+     list_cleavage_site = []
+     list_delete = []
+     list_insert = []
+     list_mismat = []
+     list_GG = []
+     for a_row in df_candidate.iterrows():
+         if a_row[1]['best_strand']=='+':
+             list_best_target.append(a_row[1]['fw_target'])
+             list_best_location.append(a_row[1]['fw_location'])
+             list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
+             list_delete.append(a_row[1]['fw_deletion'])
+             list_insert.append(a_row[1]['fw_insertion'])
+             list_mismat.append(a_row[1]['fw_mismatch'])
+             if a_row[1]['fw_target'][-2:]=='GG':
+                 list_GG.append('OK')
+             else:
+                 list_GG.append('NO')
+         elif a_row[1]['best_strand']=='-':
+             list_best_target.append(a_row[1]['rv_target'])
+             list_best_location.append(a_row[1]['rv_location'])
+             list_cleavage_site.append(int(a_row[1]['rv_location'].split('-')[0].split(':')[1]) + 5)
+             list_delete.append(a_row[1]['rv_deletion'])
+             list_insert.append(a_row[1]['rv_insertion'])
+             list_mismat.append(a_row[1]['rv_mismatch'])
+             if a_row[1]['rv_target'][-2:]=='GG':
+                 list_GG.append('OK')
+             else:
+                 list_GG.append('NO')
+         else:
+             if a_row[1]['fw_target'][-2:]=='GG':
+                 list_best_target.append(a_row[1]['fw_target'])
+                 list_best_location.append(a_row[1]['fw_location'])
+                 list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
+                 list_delete.append(a_row[1]['fw_deletion'])
+                 list_insert.append(a_row[1]['fw_insertion'])
+                 list_mismat.append(a_row[1]['fw_mismatch'])
+                 list_GG.append('OK_same_score')
+             # If the forward hit has no GG, check the reverse complement
+             elif a_row[1]['rv_target'][-2:]=='GG':
+                 list_best_target.append(a_row[1]['rv_target'])
+                 list_best_location.append(a_row[1]['rv_location'])
+                 list_cleavage_site.append(int(a_row[1]['rv_location'].split('-')[0].split(':')[1]) + 5)
+                 list_delete.append(a_row[1]['rv_deletion'])
+                 list_insert.append(a_row[1]['rv_insertion'])
+                 list_mismat.append(a_row[1]['rv_mismatch'])
+                 list_GG.append('OK_same_score')
+             else:
+                 list_best_target.append(a_row[1]['fw_target'])
+                 list_best_location.append(a_row[1]['fw_location'])
+                 list_cleavage_site.append(int(a_row[1]['fw_location'].split('-')[1]) - 6)
+                 list_delete.append(a_row[1]['fw_deletion'])
+                 list_insert.append(a_row[1]['fw_insertion'])
+                 list_mismat.append(a_row[1]['fw_mismatch'])
+                 list_GG.append('NO_same_score')
+     # Write the results back into df_candidate
+     df_candidate['deletion'] = list_delete
+     df_candidate['insertion'] = list_insert
+     df_candidate['mismatch'] = list_mismat
+     df_candidate['GG'] = list_GG
+     df_candidate['best_target'] = list_best_target
+     df_candidate['target_location'] = list_best_location
+     df_candidate['cleavage_site'] = list_cleavage_site
+     df_candidate = pd.concat([xseq.bedfmt(df_candidate['target_location']), df_candidate], axis=1)
+
+     return df_candidate
+
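A brief hedged sketch of driving local_realign directly; the fasta dict mirrors the region-key to sequence mapping that xseq.get_seq returns, and the spacer and sequence are placeholders.

    # Toy call: realign a spacer against one candidate window fetched earlier with xseq.get_seq().
    fasta = {'chr1:999970-1000030': 'acgttttgaccccctccaccccgcctcaggatcaacgtacgtacgtacgtacgtacgt'}
    df_candidate = local_realign('GACCCCCTCCACCCCGCCTC', fasta, PAM='NGG', PAM_loc='downstream')
    print(df_candidate[['target_location', 'best_strand', 'mismatch', 'GG', 'cleavage_site']])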
+ def left_realign(dp_bdg_chr, loc_shift_left, ref_fasta, sgRNA_seq, PAM, PAM_loc, n_iter):
+     # print(loc_shift_left)
+     fasta = xseq.get_seq(loc_shift_left, ref_fasta)
+     df_candidate = local_realign(sgRNA_seq, fasta, PAM, PAM_loc)
+     sr_candidate = df_candidate.iloc[0].copy()
+     chrom = sr_candidate['chr']
+     cleavage_site = sr_candidate['cleavage_site']
+     flank_regions = [500]
+     signals = target_signal(dp_bdg_chr.to_pandas(), chrom, cleavage_site, flank_regions=flank_regions)
+     L_neg_1000 = signals[2]
+     # Recompute L_neg_1000 after the left shift; if it is still negative, iterate, at most 10 times
+     if L_neg_1000 < 0:
+         st = sr_candidate['st']
+         ed = sr_candidate['ed']
+         loc_shift_left = f'{chrom}:{int(st)-1000}-{int(ed)-20}'
+         n_iter += 1
+         if n_iter < 10:
+             return left_realign(dp_bdg_chr, loc_shift_left, ref_fasta, sgRNA_seq, PAM, PAM_loc, n_iter)
+         else:
+             sr_candidate.loc['realign'] = 'fail'
+             return sr_candidate
+     else:
+         sr_candidate.loc['realign'] = 'success'
+         return sr_candidate
+
+ def right_realign(dp_bdg_chr, loc_shift_right, ref_fasta, sgRNA_seq, PAM, PAM_loc, n_iter):
+     # print(loc_shift_right)
+     fasta = xseq.get_seq(loc_shift_right, ref_fasta)
+     df_candidate = local_realign(sgRNA_seq, fasta, PAM, PAM_loc)
+     sr_candidate = df_candidate.iloc[0].copy()
+     chrom = sr_candidate['chr']
+     cleavage_site = sr_candidate['cleavage_site']
+     flank_regions = [500]
+     signals = target_signal(dp_bdg_chr.to_pandas(), chrom, cleavage_site, flank_regions=flank_regions)
+     R_neg_1000 = signals[5]
+     # Recompute R_neg_1000 after the right shift; if it is still negative, iterate, at most 10 times
+     if R_neg_1000 < 0:
+         st = sr_candidate['st']
+         ed = sr_candidate['ed']
+         loc_shift_right = f'{chrom}:{int(st)+20}-{int(ed)+1000}'
+         n_iter += 1
+         if n_iter < 10:
+             return right_realign(dp_bdg_chr, loc_shift_right, ref_fasta, sgRNA_seq, PAM, PAM_loc, n_iter)
+         else:
+             sr_candidate.loc['realign'] = 'fail'
+             return sr_candidate
+     else:
+         sr_candidate.loc['realign'] = 'success'
+         return sr_candidate
+
+
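To make the recursion concrete, a hedged sketch of one driving call; the polars bedGraph frame, the reference fasta path, the spacer, and the seed window are all assumptions.

    import polars as pl

    # Hypothetical driver: try to rescue a site whose left-side signal length came out negative.
    dp_bdg_chr1 = pl.read_csv('sample.chr1.bdg', separator='\t', has_header=False,
                              new_columns=['chr', 'start', 'end', 'residual'])
    sr = left_realign(dp_bdg_chr1, 'chr1:999000-1000000', 'hg38.fa',
                      sgRNA_seq='GACCCCCTCCACCCCGCCTC', PAM='NGG', PAM_loc='downstream', n_iter=0)
    print(sr['realign'], sr['target_location'], sr['cleavage_site'])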
offtracker/X_sequence.py

@@ -3,7 +3,8 @@ import math
  import pandas as pd
  from itertools import product
  import numpy as np
- import os, glob
+ import os, glob, sys
+ import polars as pl

  ambiguous_nt = {'A': ['A'],
                  'T': ['T'],
@@ -115,7 +116,7 @@ def add_ID(df, chr_col=0, midpoint='cleavage_site'):#, midpoint='midpoint'):



- def detect_fastq(folder, n_subfolder, NGS_type='paired-end'):
+ def detect_fastq(folder, n_subfolder, NGS_type='paired-end', skip_trimmed=False):
      """
      Search all fastq/fastq.gz/fq/fq.gz files in the n-th level subdirectories of folder
      paired-end mode: treat 2.fq/2.fastq as the R2 files of paired-end data and verify the matching R1 files
@@ -151,6 +152,9 @@ def detect_fastq(folder, n_subfolder, NGS_type='paired-end'):
          fq_files = glob.glob( os.path.join(folder, n_subfolder*'*/', fastq ) )
          print(f'{len(fq_files)} {fastq[2:]} samples detected')
          files_R2.extend( fq_files )
+
+     if skip_trimmed:
+         files_R2 = [f for f in files_R2 if '_trimmed_2.fq.gz' not in f]
      #
      if len(files_R2) > 0:
          files_R2 = pd.Series(files_R2).sort_values().reset_index(drop=True)
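To illustrate the new skip_trimmed filter in isolation (the file names below are made up):

    # Illustrative only: previously trimmed R2 files are dropped so they are not processed twice.
    files_R2 = ['run1/sampleA_2.fq.gz', 'run1/sampleA_trimmed_2.fq.gz', 'run1/sampleB_2.fq.gz']
    files_R2 = [f for f in files_R2 if '_trimmed_2.fq.gz' not in f]
    print(files_R2)   # ['run1/sampleA_2.fq.gz', 'run1/sampleB_2.fq.gz']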
@@ -243,6 +247,137 @@ def sgRNA_alignment(a_key, sgRNA, seq, frag_len, DNA_matrix=None, mismatch_score
      else:
          return [best_alignment.score, position_pct, target, target_location, deletion, insertion, mismatch]

+ def sgRNA_alignment_new(a_key, sgRNA, seq, substitution_matrix=None, alphabet=None,
+                         mismatch_score=0.01):
+     """
+     Perform local alignment using Bio.Align instead of the deprecated pairwise2.
+     """
+     from Bio import Align  # Align is referenced below; imported locally as in the X_offtracker.py copy
+     if substitution_matrix is None or alphabet is None:
+         substitution_matrix, alphabet = create_substitution_matrix(mismatch_score)
+
+     # Create aligner
+     aligner = Align.PairwiseAligner()
+     aligner.substitution_matrix = Align.substitution_matrices.Array(
+         alphabet=alphabet, dims=2, data=substitution_matrix
+     )
+     aligner.open_gap_score = -2
+     aligner.extend_gap_score = -2
+     aligner.mode = 'local'
+
+     try:
+         # Perform alignment
+         alignments = aligner.align(sgRNA, seq)
+
+         if not alignments:
+             # No alignment found; return default values (same 6-field layout as the success path)
+             return [0, '', f"{a_key.split(':')[0]}:0-0", 0, 0, len(sgRNA)]
+
+         # Convert to list for indexing
+         alignments = list(alignments)
+
+         # Extract alignment information
+         coords = alignments[0].coordinates
+         start_target = coords[1][0]
+         end_target = coords[1][-1]
+
+         # Extract target sequence directly from coordinates
+         # target = seq[start_target:end_target]
+
+         # Get aligned sequences for detailed analysis
+         alignment_str = str(alignments[0])
+         alignment_lines = alignment_str.split('\n')
+         if len(alignment_lines) >= 3:
+             aligned_sgrna = [x for x in alignment_lines[0].split(' ') if x != '']
+             aligned_genome = [x for x in alignment_lines[2].split(' ') if x != '']
+         else:
+             raise ValueError("Unexpected alignment format")
+
+         assert int(aligned_sgrna[-1]) == len(sgRNA)
+
+         # Calculate indels and mismatches
+         # deletion = RNA bulge
+         # insertion = DNA bulge
+         aligned_sgrna_seq = aligned_sgrna[-2]
+         aligned_genome_seq = aligned_genome[-2]
+         insertion = aligned_sgrna_seq.count('-') if '-' in aligned_sgrna_seq else 0
+         deletion = aligned_genome_seq.count('-') if '-' in aligned_genome_seq else 0
+
+         # Count mismatches by comparing sequences directly
+         # mismatch = 0
+         # assert len(aligned_sgrna_seq) == len(aligned_genome_seq)
+         # for i in range(len(aligned_sgrna_seq)):
+         #     if (aligned_sgrna_seq[i] != aligned_genome_seq[i]) & (aligned_sgrna_seq[i] != 'N') & (aligned_genome_seq[i] != 'N'):
+         #         mismatch += 1
+
+         mismatch = round((alignments[0].score % 1)/mismatch_score)
+
+         # Calculate target location
+         pos_st = int(a_key.split('-')[0].split(':')[1]) + 1
+         chr_name = a_key.split(':')[0]
+         target_st = pos_st + start_target
+         target_ed = pos_st + end_target - 1
+         target_location = f"{chr_name}:{target_st}-{target_ed}"
+
+         score = alignments[0].score
+
+         return [score, aligned_genome_seq, target_location, deletion, insertion, mismatch]
+
+     except Exception as e:
+         print(f"Alignment error for {a_key}: {e}")
+         # Same 6-field fallback as above
+         return [0, '', f"{a_key.split(':')[0]}:0-0", 0, 0, len(sgRNA)]
+
+
+ def get_seq(location, ref_fasta, return_df=False) -> dict:
+     """
+     Fetch sequences for genome locations.
+     If location is a list or str, it takes the form "chr1:123456-123458" or ["chr1:123456-123458", "chr2:123456-123458"]
+     If location is a pd.DataFrame or pl.DataFrame, the first three columns are assumed to be in bed format
+     ref_fasta is the path to the reference genome fasta file
+     By default a dict is returned, with locations as keys and sequences as values
+     If return_df is True, a pl.DataFrame is returned, with the location in the first column and the sequence in the second
+
+     The sequences returned by pybedtools do not include the base at the start coordinate, just like twoBitToFa,
+     whereas sequences from IGV/UCSC do include the start base, and blast results include it as well.
+     Keep this in mind in downstream analyses.
+     """
+     if sys.platform[:3]=='win':
+         # pybedtools does not seem to be installable on Windows
+         raise ValueError('pybedtools does not seem to be installable on Windows')
+     else:
+         import pybedtools
+     #########
+     # Fetch sequences for the genome locations
+     #########
+     if isinstance(location,(list,str)):
+         bed_loc = bedfmt(location)
+     elif isinstance(location,pd.DataFrame):
+         bed_loc = location.iloc[:,:3]
+     elif isinstance(location,pl.DataFrame):
+         bed_loc = location[:,:3]
+     else:
+         raise ValueError('location must be a list, str, pd.DataFrame or pl.DataFrame')
+
+     fasta = pybedtools.example_filename(ref_fasta)
+     temp_bed = './temp_amp_loc.bed'
+     write_bed(bed_loc, temp_bed)
+     a = pybedtools.BedTool(temp_bed)
+     a = a.sequence(fi=fasta)
+     with open(a.seqfn, encoding='utf-8') as f:
+         dict_seq = {} # initialize an empty dict
+         for line in f:
+             line = line.strip() # strip the trailing newline
+             if line[0] == '>':
+                 header = line[1:]
+             else:
+                 sequence = line
+                 dict_seq[header] = dict_seq.get(header,'') + sequence
+
+     # remove temp_amp_loc.bed
+     os.remove(temp_bed)
+     if return_df:
+         return pl.DataFrame(list(dict_seq.items()), orient='row', schema={'location':pl.String,'sequence':pl.String})
+     else:
+         return dict_seq

  def combine_df(list_df, op = 'mean'):
      # The dfs must have exactly the same rows, columns and structure; the non-numeric parts identical as well, only the numbers differ
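A hedged usage sketch for get_seq; the fasta path and the regions are placeholders, and note the caveat from the docstring above that the base at the start coordinate itself is not included.

    from offtracker.X_sequence import get_seq

    # Hypothetical call: fetch two windows from a local reference fasta.
    seqs = get_seq(['chr1:999500-1000030', 'chr7:5528000-5528530'], 'hg38.fa')
    for loc, seq in seqs.items():
        print(loc, len(seq))

    df_seqs = get_seq('chr1:999500-1000030', 'hg38.fa', return_df=True)   # polars DataFrame: location, sequence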
offtracker/_version.py

@@ -1,4 +1,4 @@
- __version__ = "2.12.2"
+ __version__ = "2.13.0"
  # 2023.08.11. v1.1.0 added an option for not normalizing the bw file
  # 2023.10.26. v1.9.0 prerelease for v2.0
  # 2023.10.27. v2.0.0 Major update, not yet fine-tuned
@@ -33,11 +33,13 @@ __version__ = "2.12.2"
  # 2025.04.25. v2.8.0 Fixed the bug where offtracker candidates converted lowercase sequences to N
  # 2025.05.22. v2.9.0 Refactored part of the code structure
  # 2025.06.05. v2.10.0 Added a QC module. Records with negative scores are kept and shown in red in plots. Added "--ignore_chr" to skip the common-chromosome filter.
- # 2025.06.17. v2.10.7 Fixed bugs introduced by the code-structure refactor
- # 2025.06.27. v2.10.8 Moved the chmod step into setup.py
- # 2025.06.28. v2.10.9 pip now installs from wheels and no longer runs setup.py, so an offtracker_init.py was added
- # 2025.06.28. v2.10.10 Try putting it directly into the scripts
- # 2025.06.28. v2.10.11 Rolled back to 2.10.9 plus fixes
- # 2025.07.02. v2.11.4 Updated candidates to work around blast's shortcomings; removed quick mode
- # 2025.07.04. v2.11.5 offtracker_analysis now skips samples with existing results up front
- # 2025.07.04. v2.12.2 Added region_index to label regions for better deduplication
+ # 2025.06.17. v2.10.7 Fixed bugs introduced by the code-structure refactor
+ # 2025.06.27. v2.10.8 Moved the chmod step into setup.py
+ # 2025.06.28. v2.10.9 pip now installs from wheels and no longer runs setup.py, so an offtracker_init.py was added
+ # 2025.06.28. v2.10.10 Try putting it directly into the scripts
+ # 2025.06.28. v2.10.11 Rolled back to 2.10.9 plus fixes
+ # 2025.07.02. v2.11.4 Updated candidates to work around blast's shortcomings; removed quick mode
+ # 2025.07.04. v2.11.5 offtracker_analysis now skips samples with existing results up front
+ # 2025.07.04. v2.12.2 Added region_index to label regions for better deduplication
+ # 2025.07.18. v2.12.3 QC now automatically avoids re-reading trimmed fastq files
+ # 2025.08.08. v2.13.0 Testing the local realign feature
offtracker.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: offtracker
- Version: 2.12.2
+ Version: 2.13.0
  Summary: Tracking-seq data analysis
  Home-page: https://github.com/Lan-lab/offtracker
  Author: Runda Xu
scripts/offtracker_analysis.py

@@ -83,7 +83,7 @@ def main():
          df_candidate.index = df_candidate['target_location']
          df_candidate_brief = df_candidate[['chr','st','ed','best_strand','best_target','best_seq_score',
                                             'deletion', 'insertion','mismatch', 'GG',
-                                            'target_location', 'cleavage_site', 'ID_1','ID_2', 'region_index']] # 2025.07.06 added region_index
+                                            'target_location', 'cleavage_site', 'region_index']] # 2025.07.06 added region_index, removed 'ID_1','ID_2'
          df_candidate_sub = df_candidate[['chr','cleavage_site']]
      except FileNotFoundError:
          return 'Please run offtracker_candidates.py first and provide the correct directory with --seqfolder'
@@ -366,9 +366,9 @@ def main():
      # 2024.06.03. In case fdr<=fdr_thresh filters out sites with track_score>=2
      bool_fdr = df_result['fdr']<=fdr_thresh
      bool_score = df_result['track_score']>=score_thresh
-     # 2025.06.05. BE may produce one-sided signals that make track_score negative; keep those records as well
-     bool_neg_score = df_result['track_score']< -1
-     df_output = df_result[bool_fdr|bool_score|bool_neg_score].copy()
+     # 2025.06.05. BE may produce one-sided signals, but this is rare; if the control comes from a sample with a different sgRNA, the region around the corresponding off-target site is usually negative anyway
+     # bool_neg_score = df_result['track_score']< -1
+     df_output = df_result[bool_fdr|bool_score].copy()
      if pattern_ctr != 'none':
          df_output = df_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
                                 'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
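In other words, output rows now pass on FDR or on track score alone, and the negative-score rescue is gone. A toy illustration (column names as above, thresholds hypothetical):

    import pandas as pd

    df_result = pd.DataFrame({'fdr': [0.01, 0.20, 0.50], 'track_score': [3.5, 2.2, -1.5]})
    fdr_thresh, score_thresh = 0.05, 2
    bool_fdr = df_result['fdr'] <= fdr_thresh
    bool_score = df_result['track_score'] >= score_thresh
    df_output = df_result[bool_fdr | bool_score].copy()   # the negative-score row is now dropped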
@@ -383,6 +383,13 @@ def main():
          df_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
                               'L_length', 'R_length','signal_length',
                               'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
+
+
+     # 2025.08.08. Add target_location re-alignment for positive sites, to avoid inaccuracies when the post-blast realign is performed over a wider window
+
+
+
+
      df_output.to_csv(f'Offtracker_result_{outname}.csv', index=False)

      if args.clean:
scripts/offtracker_qc.py

@@ -29,7 +29,8 @@ def main():
      parser.add_argument('-o','--outdir', type=str, default='same', help='The output folder')
      parser.add_argument('--subfolder' , type=int, default=0, help='subfolder level')
      parser.add_argument('-t','--thread', type=int, default=8, help='Number of threads to be used')
-
+     parser.add_argument('--include_trimmed', action='store_true', help='Do not skip trimmed fastq files')
+
      args = parser.parse_args()

      # Automated argument adjustment and error reporting
@@ -42,7 +43,10 @@
          os.makedirs(args.outdir)

      # Search for all fastq/fastq.gz/fq/fq.gz files in the n-th level subdirectories of folder
-     sample_names, files_R1, files_R2 = xseq.detect_fastq(args.folder, n_subfolder=args.subfolder)
+     if args.include_trimmed:
+         sample_names, files_R1, files_R2 = xseq.detect_fastq(args.folder, n_subfolder=args.subfolder)
+     else:
+         sample_names, files_R1, files_R2 = xseq.detect_fastq(args.folder, n_subfolder=args.subfolder, skip_trimmed=True)

      assert not isinstance(sample_names, str), 'No fastq file is detected!'

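Finally, a small self-contained illustration of what the new flag does at the argparse level; the flag itself is confirmed in the hunk above, everything else is a toy harness.

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--include_trimmed', action='store_true', help='Do not skip trimmed fastq files')
    print(parser.parse_args([]).include_trimmed)                     # False -> detect_fastq(..., skip_trimmed=True)
    print(parser.parse_args(['--include_trimmed']).include_trimmed)  # True  -> trimmed fastq files are read again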