offtracker 2.13.1__zip → 2.14.0__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {offtracker-2.13.1 → offtracker-2.14.0}/PKG-INFO +18 -4
  2. offtracker-2.13.1/offtracker.egg-info/PKG-INFO → offtracker-2.14.0/README.md +261 -259
  3. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/_version.py +4 -2
  4. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/snakefile/Snakefile_offtracker.smk +1 -1
  5. offtracker-2.13.1/README.md → offtracker-2.14.0/offtracker.egg-info/PKG-INFO +273 -247
  6. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker.egg-info/requires.txt +1 -0
  7. {offtracker-2.13.1 → offtracker-2.14.0}/scripts/offtracker_analysis.py +18 -0
  8. {offtracker-2.13.1 → offtracker-2.14.0}/scripts/offtracker_candidates.py +14 -0
  9. {offtracker-2.13.1 → offtracker-2.14.0}/scripts/offtracker_config.py +37 -3
  10. {offtracker-2.13.1 → offtracker-2.14.0}/scripts/offtracker_correction.py +59 -17
  11. {offtracker-2.13.1 → offtracker-2.14.0}/scripts/offtracker_qc.py +15 -2
  12. {offtracker-2.13.1 → offtracker-2.14.0}/setup.py +1 -1
  13. {offtracker-2.13.1 → offtracker-2.14.0}/LICENSE.txt +0 -0
  14. {offtracker-2.13.1 → offtracker-2.14.0}/MANIFEST.in +0 -0
  15. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/X_offplot.py +0 -0
  16. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/X_offtracker.py +0 -0
  17. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/X_sequence.py +0 -0
  18. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/__init__.py +0 -0
  19. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/snakefile/Snakefile_QC.smk +0 -0
  20. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/utility/1.1_bed2fr.py +0 -0
  21. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/utility/1.3_bdg_normalize_v4.0.py +0 -0
  22. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/utility/bedGraphToBigWig +0 -0
  23. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/utility/hg38.chrom.sizes +0 -0
  24. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/utility/mm10.chrom.sizes +0 -0
  25. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/utility/offtracker_blacklist_hg38.merged.bed +0 -0
  26. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker/utility/offtracker_blacklist_mm10.merged.bed +0 -0
  27. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker.egg-info/SOURCES.txt +0 -0
  28. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker.egg-info/dependency_links.txt +0 -0
  29. {offtracker-2.13.1 → offtracker-2.14.0}/offtracker.egg-info/top_level.txt +0 -0
  30. {offtracker-2.13.1 → offtracker-2.14.0}/scripts/offtracker_init.py +0 -0
  31. {offtracker-2.13.1 → offtracker-2.14.0}/scripts/offtracker_plot.py +0 -0
  32. {offtracker-2.13.1 → offtracker-2.14.0}/setup.cfg +0 -0
@@ -4,6 +4,7 @@
4
4
  # 2023.08.11. adding a option for not normalizing the bw file
5
5
  # 2025.05.22. refine the structure
6
6
  # 2025.06.05. 增加 ignore_chr 选项,默认只取 common chromosomes,用于 1.1_bed2fr.py
7
+ # 2025.10.05. 添加 threads 监测,并添加互动模式 --cpu_help
7
8
 
8
9
  import argparse
9
10
  import os, glob, yaml
@@ -36,8 +37,10 @@ def main():
36
37
  parser.add_argument('--blacklist' , type=str, default='same', help='Blacklist of genome regions in bed format. "none" for no filter')
37
38
  parser.add_argument('--binsize' , type=str, default=100, help='Bin size for calculating bw residue')
38
39
  parser.add_argument('--normalize' , type=str, default='True', help='Whether to normalize the BigWig file. "True" or "False"')
39
- parser.add_argument('--ignore_chr' , action='store_true', help='If not set, only chr1-chr22, chrX, chrY, chrM will be analyzed.')
40
-
40
+ parser.add_argument('--ignore_chr' , action='store_true', help='If not set, only chr1-chr22, chrX, chrY, chrM will be analyzed.')
41
+ parser.add_argument('--cpu_help' , action='store_true', help='Interactive mode to recommend the number of threads and cores according to available memory and CPUs.'
42
+ '-t/--thread will be reset to the recommended value in this mode.'
43
+ )
41
44
 
42
45
  args = parser.parse_args()
43
46
 
@@ -74,6 +77,37 @@ def main():
74
77
 
75
78
  assert not isinstance(sample_names, str), 'No fastq file is detected!'
76
79
 
80
+
81
+ #####################
82
+ # threads 监测和推荐 #
83
+ #####################
84
+ import psutil
85
+ if args.cpu_help:
86
+ cpu_count_total = psutil.cpu_count(logical=True) # 逻辑 CPU 总数(包括超线程)
87
+ memory = psutil.virtual_memory()
88
+ memory_available = round(memory.available/1024/1024/1024, 2) # 可用内存 GB
89
+ print('Total available memory:', memory_available, 'GB')
90
+ print('Total CPU threads:', cpu_count_total)
91
+ n_sample = len(sample_names)
92
+ print('Total samples:', n_sample)
93
+ # 用户输入分配的最大内存和CPU线程数
94
+ max_memory_gb = float(input(f"Please input the maximum memory for the program (GB): 25 - {memory_available}"))
95
+ max_cpu_threads = int(input(f"Please input the maximum CPU threads for the program: 1 - {cpu_count_total}"))
96
+ assert (max_memory_gb < memory_available)&(max_memory_gb >= 25), f'max memory must be < available memory ({memory_available} GB) and >= 25 GB, current input: {max_memory_gb} GB'
97
+ assert (max_cpu_threads <= cpu_count_total)&(max_cpu_threads >= 1), f'max cpu threads must be <= total cpu threads ({cpu_count_total}) and >= 1, current input: {max_cpu_threads}'
98
+ # 计算推荐的 cpu 参数
99
+ max_task = min(int(max(max_memory_gb,30)/30), n_sample)
100
+ max_cpu_per_task = int(max_cpu_threads/max_task)
101
+ total_cpu = max_task*max_cpu_per_task
102
+
103
+ print('Assigning', max_cpu_per_task, f'CPU threads to each task. (i.e., -t {max_cpu_per_task})')
104
+ print('Number of parallel tasks:', max_task)
105
+ print(f'Please specify "--cores {total_cpu}" when formally running snakemake.')
106
+
107
+ n_threads = max_cpu_per_task
108
+ else:
109
+ n_threads = args.thread
110
+
77
111
  dict_yaml = {
78
112
  # fastq 信息
79
113
  'files_R1':dict(zip(sample_names,files_R1)),
@@ -82,7 +116,7 @@ def main():
82
116
  'input_dir':args.folder,
83
117
  'output_dir':args.outdir,
84
118
  # 运行参数
85
- 'thread':args.thread,
119
+ 'thread':n_threads,
86
120
  'index':args.index,
87
121
  'fasta':args.ref,
88
122
  'binsize':args.binsize,
@@ -1,4 +1,5 @@
1
-
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
2
3
 
3
4
  import polars as pl
4
5
  import pandas as pd
@@ -35,7 +36,7 @@ def main():
35
36
  parser.add_argument('--pam' , type=str, required=True, help='The protospacer adjacent motif' )
36
37
  parser.add_argument('--pam_location', type=str, default='downstream', help='Upstream or downstream, default is downstream (Cas9)' )
37
38
  # not used
38
- parser.add_argument('--seqfolder' , type=str, required=True, help='Actually not used in this script.Only in case you forget to remove this argument.')
39
+ parser.add_argument('--seqfolder' , type=str, default='none', help='Actually not used in this script.Only in case you forget to remove this argument.')
39
40
 
40
41
  args = parser.parse_args()
41
42
  # 2025.08.08. 增加对阳性位点的 target_location 重比对功能,避免 blast 比对后的 realign 在更大范围内的存在不准确的情况
@@ -53,7 +54,7 @@ def main():
53
54
  score_thresh = args.score
54
55
  binsize = args.binsize
55
56
  flank_max = args.flank_max
56
- flank_regions = args.flank_regions
57
+ flank_regions = args.flank_regions # 如果 analysis 时修改了这个参数没有写 1000 的话会出bug,暂时懒得改了
57
58
  smooth_times = args.smooth
58
59
  window_size = args.window
59
60
  seq_score_power = args.SeqScorePower
@@ -139,8 +140,8 @@ def main():
139
140
  ctr_sample_files = all_sample_files[bool_ctr]
140
141
  exp_sample_names = all_sample_names[bool_exp]
141
142
  ctr_sample_names = all_sample_names[bool_ctr]
142
- selected_sample_files = pd.concat([exp_sample_files,ctr_sample_files])
143
- selected_sample_names = pd.concat([exp_sample_names,ctr_sample_names]) # no use
143
+ # selected_sample_files = pd.concat([exp_sample_files,ctr_sample_files])
144
+ # selected_sample_names = pd.concat([exp_sample_names,ctr_sample_names]) # no use
144
145
 
145
146
 
146
147
 
@@ -154,8 +155,17 @@ def main():
154
155
  PAM = args.pam
155
156
  PAM_loc = args.pam_location
156
157
  # read result
157
- dp_result = pl.read_csv(f'Offtracker_result_{outname}.csv')
158
- dp_bdg = pl.read_parquet(selected_sample_files[0], separator='\t', has_header=False,
158
+ dp_result = pl.read_csv(f'./temp/df_result_{outname}.csv')
159
+ # negative for next section
160
+ bool_fdr_bkg = dp_result['fdr']>fdr_thresh
161
+ bool_score_bkg = dp_result['track_score']<score_thresh
162
+ dp_result_bkg = dp_result.filter(bool_fdr_bkg & bool_score_bkg)
163
+ # positive
164
+ bool_fdr = pl.col('fdr')<=fdr_thresh
165
+ bool_score = pl.col('track_score')>=score_thresh
166
+ dp_result = dp_result.filter(bool_fdr & bool_score)
167
+ # bdg
168
+ dp_bdg = pl.read_csv(exp_sample_files.iloc[0], separator='\t', has_header=False,
159
169
  schema_overrides={'chr':pl.String,'start':pl.Int32,'end':pl.Int32,'residual':pl.Float32})
160
170
  # check and realign
161
171
  bool_left_neg=(dp_result['exp_L_neg_1000']<-5)&(dp_result['exp_R_neg_1000']==0)
@@ -163,11 +173,13 @@ def main():
163
173
  list_good_result = []
164
174
  list_bad_left = []
165
175
  list_bad_right = []
176
+ n_left_for_correct = 0
177
+ n_right_for_correct = 0
166
178
  for a_left_bool, a_right_bool, a_row in zip(bool_left_neg, bool_right_neg, dp_result.iter_rows(named=True)):
167
179
  if a_left_bool & a_right_bool:
168
180
  raise ValueError('abnormal on both left and right')
169
181
  if a_left_bool:
170
- print('left')
182
+ n_left_for_correct += 1
171
183
  loc_shift_left = a_row['chr'] + ':' + str(a_row['st']-1000) + '-' + str(a_row['ed']-20)
172
184
  region_index = a_row['region_index']
173
185
  dp_bdg_chr = dp_bdg.filter(pl.col('chr') == a_row['chr'])
@@ -175,7 +187,7 @@ def main():
175
187
  sr_candidate.loc['region_index'] = region_index
176
188
  list_bad_left.append(sr_candidate)
177
189
  elif a_right_bool:
178
- print('right')
190
+ n_right_for_correct += 1
179
191
  loc_shift_right = a_row['chr'] + ':' + str(a_row['st']+20) + '-' + str(a_row['ed']+1000)
180
192
  region_index = a_row['region_index']
181
193
  dp_bdg_chr = dp_bdg.filter(pl.col('chr') == a_row['chr'])
@@ -188,17 +200,50 @@ def main():
188
200
  df_cand_left = pd.DataFrame(list_bad_left)
189
201
  df_cand_right = pd.DataFrame(list_bad_right)
190
202
  df_cand_realign = pd.concat([df_cand_left, df_cand_right])
203
+ if len(df_cand_realign) == 0:
204
+ print('No candidate is found for realignment.')
205
+ return 'finished'
191
206
 
207
+ # 情况判断
208
+ n_success_realign = sum(df_cand_realign['realign']=='success')
209
+ n_fail_realign = sum(df_cand_realign['realign']!='success')
210
+ if (n_success_realign == 0) and (n_fail_realign > 0):
211
+ print(f'{n_fail_realign} candidates are found for realignment, but all failed.')
212
+ return 'finished'
213
+ elif (n_success_realign > 0) and (n_fail_realign > 0):
214
+ print(f'{n_success_realign} candidates succeeded, and {n_fail_realign} candidates failed.')
215
+ else:
216
+ print(f'{n_success_realign} candidates succeeded.')
217
+
218
+ df_cand_realign = df_cand_realign[df_cand_realign['realign']=='success']
192
219
  seqfile = rf'correction_df_candidate_{outname}_realign.csv'
193
220
  df_cand_realign.to_csv(seqfile)
194
221
 
195
222
  # run offtracker_analysis with check_loc mode
196
223
  running_log = rf'correction_analysis_{outname}.log'
224
+ # list 转空格分割参数
225
+ if isinstance(pattern_exp, list):
226
+ param_pattern_exp = ' '.join(pattern_exp)
227
+ else:
228
+ param_pattern_exp = pattern_exp
229
+ if isinstance(pattern_ctr, list):
230
+ param_pattern_ctr = ' '.join(pattern_ctr)
231
+ else:
232
+ param_pattern_ctr = pattern_ctr
233
+ if isinstance(flank_regions, list):
234
+ param_flank_regions = ' '.join([str(x) for x in flank_regions])
235
+ else:
236
+ param_flank_regions = flank_regions
237
+ if isinstance(folders, list):
238
+ param_folders = ' '.join([str(x) for x in folders])
239
+ else:
240
+ param_folders = folders
241
+
197
242
  with open(running_log, "w+") as running_log:
198
243
  command = f'offtracker_analysis.py -t {args.thread} -g {args.genome} --seqfile {seqfile} --name {sgRNA_name} \
199
- --exp {pattern_exp} --control {pattern_ctr} --outname {outname}_loc_correction -f {folders} -o {outdir} \
244
+ --exp {param_pattern_exp} --control {param_pattern_ctr} --outname {outname}_loc_correction -f {param_folders} -o {outdir} \
200
245
  --fdr {fdr_thresh} --window {window_size} --smooth {smooth_times} --SeqScorePower {seq_score_power} \
201
- --score {score_thresh} --binsize {binsize} --flank_max {flank_max} --flank_regions {flank_regions} --CtrClip {ctr_clip} \
246
+ --score {score_thresh} --binsize {binsize} --flank_max {flank_max} --flank_regions {param_flank_regions} --CtrClip {ctr_clip} \
202
247
  --check_loc'
203
248
  command2 = shlex.split('bash -c "{}"'.format(command))
204
249
  process_1 = subprocess.Popen(command2, stdout=running_log, stderr=subprocess.STDOUT )
@@ -213,10 +258,7 @@ def main():
213
258
  #######################
214
259
  ## recalculate score ##
215
260
  #######################
216
- dp_result_bkg = pl.read_csv(f'./temp/df_result_{outname}.csv')
217
- bool_fdr_bkg = dp_result_bkg['fdr']>fdr_thresh
218
- bool_score_bkg = dp_result_bkg['track_score']<score_thresh
219
- dp_result_bkg = dp_result_bkg.filter(bool_fdr_bkg & bool_score_bkg)
261
+
220
262
  dp_result_realign = pl.read_csv(f'./temp/df_result_{outname}_loc_correction.csv')
221
263
 
222
264
  # 兼容旧版输出列名
@@ -256,7 +298,7 @@ def main():
256
298
  # ouput Offtracker result
257
299
  bool_fdr = pl.col('fdr')<=fdr_thresh
258
300
  bool_score = pl.col('track_score')>=score_thresh
259
- dp_output = dp_result_new.filter(bool_fdr|bool_score).copy()
301
+ dp_output = dp_result_new.filter(bool_fdr|bool_score)
260
302
  if pattern_ctr != 'none':
261
303
  dp_output = dp_output[['target_location', 'best_strand','best_target','deletion','insertion','mismatch',
262
304
  'exp_L_length', 'exp_R_length','ctr_L_length','ctr_R_length','L_length','R_length','signal_length',
@@ -271,7 +313,7 @@ def main():
271
313
  dp_output.columns = ['target_location', 'strand', 'target', 'deletion', 'insertion', 'mismatch',
272
314
  'L_length', 'R_length','signal_length',
273
315
  'seq_score', 'track_score', 'log2_track_score','FDR', 'rank']
274
- dp_output.write_csv(f'Offtracker_result_{outname}.csv')
316
+ dp_output.write_csv(f'Offtracker_result_{outname}.csv')
275
317
 
276
318
  return 'correction finished'
277
319
 
@@ -1,7 +1,9 @@
1
1
  #!/usr/bin/env python
2
2
  # -*- coding: utf-8 -*-
3
3
 
4
- THIS_VERSION = '0.4.1'
4
+ THIS_VERSION = '0.4.2'
5
+
6
+ # 2025.10.05. 0.4.2. 添加 threads 监测
5
7
 
6
8
  import argparse
7
9
  import os, glob, yaml
@@ -50,6 +52,17 @@ def main():
50
52
 
51
53
  assert not isinstance(sample_names, str), 'No fastq file is detected!'
52
54
 
55
+ ################
56
+ # threads 监测 #
57
+ ################
58
+ import psutil
59
+ n_threads = args.thread
60
+ assert n_threads > 0, f'n_threads should be greater than 0, while {n_threads} is given.'
61
+ cpu_count_total = psutil.cpu_count(logical=True) # 逻辑 CPU 总数(包括超线程)
62
+ if n_threads > cpu_count_total:
63
+ n_threads = cpu_count_total-1
64
+ print(f'n_threads is reset to {n_threads} due to the total number of threads ({cpu_count_total}).')
65
+
53
66
  dict_yaml = {
54
67
  # fastq 信息
55
68
  'files_R1':dict(zip(sample_names,files_R1)),
@@ -58,7 +71,7 @@ def main():
58
71
  'input_dir':args.folder,
59
72
  'output_dir':args.outdir,
60
73
  # 运行参数
61
- 'thread':args.thread,
74
+ 'thread':n_threads,
62
75
  'utility_dir':utility_dir
63
76
  }
64
77
 
@@ -26,7 +26,7 @@ with open(os.path.join(here, package_folder, '_version.py'),'r',encoding='utf-8'
26
26
 
27
27
  # requirements
28
28
  REQUIRED = [
29
- 'pandas', 'polars>=1.19.0', 'numpy', 'biopython<=1.85', 'pybedtools', 'pyyaml',
29
+ 'pandas', 'polars>=1.19.0', 'numpy', 'biopython<=1.85', 'pybedtools', 'pyyaml', 'psutil'
30
30
  ]
31
31
  ## pybedtools may be not supported in Windows
32
32
 
File without changes
File without changes
File without changes