PyPI - rdrpcatch - Versions diffs - 0.0.6__tar.gz → 0.0.7__tar.gz - Mend

rdrpcatch 0.0.6tar.gz → 0.0.7tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89) hide show

rdrpcatch-0.0.7/.idea/shelf/Uncommitted_changes_before_Update_at_19_05_2025_13_00_[Changes]/shelved.patch ADDED Viewed

@@ -0,0 +1,145 @@
+Index: rdrpcatch/rdrpcatch_wrapper.py
+IDEA additional info:
+Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP
+<+>\"\"\"\r\nWrapper for the RdRpCATCH package.\r\n\r\n\"\"\"\r\nimport os\r\nfrom pathlib import Path\r\nfrom rich.console import Console\r\nimport warnings\r\nwarnings.filterwarnings(\"ignore\", category=UserWarning, module=\"numpy\") # see https://moyix.blogspot.com/2022/09/someones-been-messing-with-my-subnormals.html\r\n\r\ndef main():\r\n    pass\r\n\r\n\r\n# def run_gui():\r\n#\r\n#     gui_runner = gui.colabscanner_gui()\r\n#     gui_runner.run()\r\n\r\n\r\ndef bundle_results(output_dir, prefix):\r\n    \"\"\"\r\n    Bundle the results into a tar.gz file.\r\n\r\n    :param output_dir: Path to the output directory.\r\n    :type output_dir: str\r\n    :param prefix: Prefix for the output files.\r\n    :type prefix: str\r\n    :return: Path to the bundled file\r\n    :rtype: str\r\n    \"\"\"\r\n    import tarfile\r\n    import datetime\r\n    \r\n    # Create timestamp for the archive name\r\n    timestamp = datetime.datetime.now().strftime(\"%Y%m%d_%H%M%S\")\r\n    archive_name = f\"{prefix}_rdrpcatch_results_{timestamp}.tar.gz\"\r\n    archive_path = os.path.join(output_dir, archive_name)\r\n    \r\n    # Create tar.gz archive\r\n    with tarfile.open(archive_path, \"w:gz\") as tar:\r\n        # Add all relevant directories\r\n        for dir_name in [f\"{prefix}_rdrpcatch_fasta\", f\"{prefix}_rdrpcatch_plots\", \r\n                        f\"{prefix}_gff_files\", \"tmp\"]:\r\n            dir_path = os.path.join(output_dir, dir_name)\r\n            if os.path.exists(dir_path):\r\n                tar.add(dir_path, arcname=dir_name)\r\n        \r\n        # Add the main output file\r\n        output_file = os.path.join(output_dir, f\"{prefix}_rdrpcatch_output_annotated.tsv\")\r\n        if os.path.exists(output_file):\r\n            tar.add(output_file, arcname=os.path.basename(output_file))\r\n    \r\n    return archive_path\r\n\r\ndef run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,incdomE,domE,incE,z, cpus, length_thr, gen_code, 
bundle, keep_tmp, overwrite):\r\n    \"\"\"\r\n    Run RdRpCATCH scan.\r\n\r\n    :param input_file: Path to the input FASTA file.\r\n    :type input_file: str\r\n    :param output_dir: Path to the output directory.\r\n    :type output_dir: str\r\n    :param db_options: List of databases to search against.\r\n    :type db_options: list\r\n    :param db_dir: Path to the directory containing RdRpCATCH databases.\r\n    :type db_dir: str\r\n    :param seq_type: Type of sequence (prot or nuc).\r\n    :type seq_type: str\r\n    :param verbose: Whether to print verbose output.\r\n    :type verbose: bool\r\n    :param e: E-value threshold for HMMsearch.\r\n    :type e: float\r\n    :param incdomE: Inclusion domain E-value threshold for HMMsearch.\r\n    :type incdomE: float\r\n    :param domE: Domain E-value threshold for HMMsearch.\r\n    :type domE: float\r\n    :param incE: Inclusion E-value threshold for HMMsearch.\r\n    :type incE: float\r\n    :param z: Number of sequences to search against.\r\n    :type z: int\r\n    :param cpus: Number of CPUs to use for HMMsearch.\r\n    :type cpus: int\r\n    :param length_thr: Minimum length threshold for seqkit seq.\r\n    :type length_thr: int\r\n    :param gen_code: Genetic code to use for translation.\r\n    :type gen_code: int\r\n    :return: None\r\n    \"\"\"\r\n    from .rdrpcatch_scripts import utils\r\n    from .rdrpcatch_scripts import paths\r\n    from .rdrpcatch_scripts import run_pyhmmer\r\n    from .rdrpcatch_scripts import fetch_dbs\r\n    from .rdrpcatch_scripts import format_pyhmmer_out\r\n    from .rdrpcatch_scripts import run_seqkit\r\n    from .rdrpcatch_scripts import plot\r\n    import polars as pl\r\n    from .rdrpcatch_scripts import mmseqs_tax\r\n    import datetime\r\n    \r\n    ## Ignore warnings\r\n    warnings.filterwarnings(\"ignore\", category=FutureWarning)\r\n    warnings.filterwarnings(\"ignore\", category=UserWarning)\r\n\r\n    ## Set output directories\r\n    prefix = 
Path(input_file).stem\r\n    outputs = paths.rdrpcatch_output(prefix, Path(output_dir))\r\n\r\n    ## Set up logger\r\n    log_file = outputs.log_file\r\n    if not os.path.exists(outputs.output_dir):\r\n        os.makedirs(outputs.output_dir)\r\n    elif os.path.exists(outputs.output_dir) and overwrite:\r\n        # If the output directory already exists and force_overwrite is True, remove the existing directory\r\n        import shutil\r\n        shutil.rmtree(outputs.output_dir)\r\n        os.makedirs(outputs.output_dir)\r\n        outputs = paths.rdrpcatch_output(prefix, Path(output_dir))\r\n    else:\r\n        raise FileExistsError(f\"Output directory already exists: {outputs.output_dir}, Please choose a different directory\"\r\n                              f\" or activate the -overwrite flag to overwrite the contents of the directory.\")\r\n\r\n    if not os.path.exists(outputs.log_dir):\r\n        os.makedirs(outputs.log_dir)\r\n\r\n    logger = utils.Logger(log_file)\r\n\r\n    logger.silent_log(f\"Input File: {input_file}\")\r\n    logger.silent_log(f\"Output Directory: {output_dir}\")\r\n    logger.silent_log(f\"Databases: {db_options}\")\r\n    logger.silent_log(f\"Database Directory: {db_dir}\")\r\n    logger.silent_log(f\"Sequence Type: {seq_type}\")\r\n    logger.silent_log(f\"Verbose Mode: {'ON' if verbose else 'OFF'}\")\r\n    logger.silent_log(f\"E-value: {e}\")\r\n    logger.silent_log(f\"Inclusion E-value: {incE}\")\r\n    logger.silent_log(f\"Domain E-value: {domE}\")\r\n    logger.silent_log(f\"Inclusion Domain E-value: {incdomE}\")\r\n    logger.silent_log(f\"Z-value: {z}\")\r\n    logger.silent_log(f\"CPUs: {cpus}\")\r\n    logger.silent_log(f\"Length Threshold: {length_thr}\")\r\n    logger.silent_log(f\"Genetic Code: {gen_code}\")\r\n    logger.silent_log(f\"Bundle Results: {'ON' if bundle else 'OFF'}\")\r\n    logger.silent_log(f\"Save Temporary Files: {'ON' if keep_tmp else 'OFF'}\")\r\n\r\n    ## Start time\r\n    start_time = 
logger.start_timer()\r\n\r\n    ## Check fasta validity\r\n    if not utils.fasta_checker(input_file, logger).check_fasta_validity():\r\n        raise Exception(\"Invalid fasta file.\")\r\n    else:\r\n        if verbose:\r\n            logger.loud_log(f\"Valid fasta file: {input_file}\")\r\n        else:\r\n            logger.silent_log(f\"Valid fasta file: {input_file}\")\r\n\r\n    ## Check sequence type\r\n    if not seq_type:\r\n        seq_type = utils.fasta_checker(input_file, logger).check_seq_type()\r\n    if verbose:\r\n        logger.loud_log(f\"Sequence type: {seq_type}\")\r\n    else:\r\n        logger.silent_log(f\"Sequence type: {seq_type}\")\r\n\r\n    ## Check sequence length in .fasta files, if >100000, pyHMMER breaks\r\n    if seq_type == 'nuc':\r\n        utils.fasta_checker(input_file, logger).check_seq_length(300000)\r\n    if seq_type == 'prot':\r\n        utils.fasta_checker(input_file, logger).check_seq_length(100000)\r\n\r\n    logger.loud_log(\"Fetching HMM databases...\")\r\n\r\n    ## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot\r\n    rvmt_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"RVMT\")\r\n    if verbose:\r\n        logger.loud_log(f\"RVMT HMM database fetched from: {rvmt_hmm_db}\")\r\n    else:\r\n        logger.silent_log(f\"RVMT HMM database fetched from: {rvmt_hmm_db}\")\r\n    neordrp_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"NeoRdRp\")\r\n    if verbose:\r\n        logger.loud_log(f\"NeoRdRp HMM database fetched from: {neordrp_hmm_db}\")\r\n    else:\r\n        logger.silent_log(f\"NeoRdRp HMM database fetched from: {neordrp_hmm_db}\")\r\n    neordrp_2_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"NeoRdRp.2.1\")\r\n    if verbose:\r\n        logger.loud_log(f\"NeoRdRp.2.1 HMM database fetched from: {neordrp_2_hmm_db}\")\r\n    else:\r\n        logger.silent_log(f\"NeoRdRp.2.1 HMM database fetched from: {neordrp_2_hmm_db}\")\r\n    
tsa_olen_fam_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"TSA_Olendraite_fam\")\r\n    if verbose:\r\n        logger.loud_log(f\"TSA_Olendraite_fam HMM database fetched from: {tsa_olen_fam_hmm_db}\")\r\n    else:\r\n        logger.silent_log(f\"TSA_Olendraite_fam HMM database fetched from: {tsa_olen_fam_hmm_db}\")\r\n\r\n    tsa_olen_gen_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"TSA_Olendraite_gen\")\r\n    if verbose:\r\n        logger.loud_log(f\"TSA_Olendraite HMM database fetched from: {tsa_olen_gen_hmm_db}\")\r\n    else:\r\n        logger.silent_log(f\"TSA_Olendraite HMM database fetched from: {tsa_olen_gen_hmm_db}\")\r\n    rdrpscan_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"RDRP-scan\")\r\n    if verbose:\r\n        logger.loud_log(f\"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}\")\r\n    else:\r\n        logger.silent_log(f\"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}\")\r\n    lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"Lucaprot\")\r\n    if verbose:\r\n        logger.loud_log(f\"Lucaprot HMM database fetched from: {lucaprot_hmm_db}\")\r\n    else:\r\n        logger.silent_log(f\"Lucaprot HMM database fetched from: {lucaprot_hmm_db}\")\r\n\r\n    db_name_list = []\r\n    db_path_list = []\r\n\r\n    ## Set up HMM databases\r\n    if db_options == ['all']:\r\n        db_name_list = [\"RVMT\", \"NeoRdRp\", \"NeoRdRp.2.1\", \"TSA_Olendraite_fam\",\"TSA_Olendraite_gen\", \"RDRP-scan\", \"Lucaprot\"]\r\n        db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db]\r\n\r\n    else:\r\n        for db in db_options:\r\n            if db == \"RVMT\".lower():\r\n                db_name_list.append(\"RVMT\")\r\n                db_path_list.append(rvmt_hmm_db)\r\n            elif db == \"NeoRdRp\".lower():\r\n                db_name_list.append(\"NeoRdRp\")\r\n                
db_path_list.append(neordrp_hmm_db)\r\n            elif db == \"NeoRdRp.2.1\":\r\n                db_name_list.append(\"NeoRdRp.2.1\".lower())\r\n                db_path_list.append(neordrp_2_hmm_db)\r\n            elif db == \"TSA_Olendraite_fam\".lower():\r\n                db_name_list.append(\"TSA_Olendraite_fam\")\r\n                db_path_list.append(tsa_olen_fam_hmm_db)\r\n            elif db == \"TSA_Olendraite_gen\".lower():\r\n                db_name_list.append(\"TSA_Olendraite_gen\")\r\n                db_path_list.append(tsa_olen_gen_hmm_db)\r\n            elif db == \"RDRP-scan\".lower():\r\n                db_name_list.append(\"RDRP-scan\")\r\n                db_path_list.append(rdrpscan_hmm_db)\r\n            elif db == \"Lucaprot\".lower():\r\n                db_name_list.append(\"Lucaprot\")\r\n                db_path_list.append(lucaprot_hmm_db)\r\n            else:\r\n                raise Exception(f\"Invalid database option: {db}\")\r\n\r\n    # Fetch mmseqs database\r\n\r\n\r\n    logger.loud_log(\"Fetching Mmseqs2 databases...\")\r\n\r\n    mmseqs_db_path = fetch_dbs.db_fetcher(db_dir).fetch_mmseqs_db_path(\"mmseqs_refseq_riboviria_20250211\")\r\n\r\n    if verbose:\r\n        logger.loud_log(f\"mmseqs database fetched from: {mmseqs_db_path}\")\r\n    else:\r\n        logger.silent_log(f\"mmseqs database fetched from: {mmseqs_db_path}\")\r\n\r\n    if not os.path.exists(outputs.hmm_output_dir):\r\n        outputs.hmm_output_dir.mkdir(parents=True)\r\n\r\n    if not os.path.exists(outputs.formatted_hmm_output_dir):\r\n        outputs.formatted_hmm_output_dir.mkdir(parents=True)\r\n\r\n    if not os.path.exists(outputs.tsv_outdir):\r\n        outputs.tsv_outdir.mkdir(parents=True)\r\n\r\n    if not os.path.exists(outputs.plot_outdir):\r\n        outputs.plot_outdir.mkdir(parents=True)\r\n\r\n    if not os.path.exists(outputs.tmp_dir):\r\n        outputs.tmp_dir.mkdir(parents=True)\r\n\r\n    logger.loud_log(\"Databases fetched 
successfully.\")\r\n\r\n    if seq_type == 'nuc':\r\n        logger.loud_log(\"Nucleotide sequence detected.\")\r\n\r\n        set_dict = {}\r\n        translated_set_dict = {}\r\n        df_list = []\r\n\r\n        ## Filter out sequences with length less than 400 bp with seqkit\r\n        logger.loud_log(\"Filtering out sequences with length less than 400 bp.\")\r\n\r\n        if not os.path.exists(outputs.seqkit_seq_output_dir):\r\n            outputs.seqkit_seq_output_dir.mkdir(parents=True)\r\n\r\n        run_seqkit.seqkit(input_file, outputs.seqkit_seq_output_path, log_file, threads=cpus, logger=logger).run_seqkit_seq(length_thr)\r\n        if verbose:\r\n            logger.loud_log(f\"Filtered sequence written to: { outputs.seqkit_seq_output_path}\")\r\n        else:\r\n            logger.silent_log(f\"Filtered sequence written to: { outputs.seqkit_seq_output_path}\")\r\n\r\n        ## Translate nucleotide sequences to protein sequences with seqkit\r\n        logger.loud_log(\"Translating nucleotide sequences to protein sequences.\")\r\n\r\n        if not os.path.exists(outputs.seqkit_translate_output_dir):\r\n            outputs.seqkit_translate_output_dir.mkdir(parents=True)\r\n\r\n        run_seqkit.seqkit(outputs.seqkit_seq_output_path, outputs.seqkit_translate_output_path, log_file, threads=cpus, logger=logger).run_seqkit_translate(gen_code, 6)\r\n\r\n        if verbose:\r\n            logger.loud_log(f\"Translated sequence written to: {outputs.seqkit_translate_output_path}\")\r\n        else:\r\n            logger.silent_log(f\"Translated sequence written to: {outputs.seqkit_translate_output_path}\")\r\n\r\n        for db_name,db_path in zip(db_name_list, db_path_list):\r\n            logger.loud_log(f\"Running HMMsearch for {db_name} database.\")\r\n\r\n            if verbose:\r\n                logger.loud_log(f\"HMM output path: {outputs.hmm_output_path(db_name)}\")\r\n            else:\r\n                logger.silent_log(f\"HMM output path: 
{outputs.hmm_output_path(db_name)}\")\r\n\r\n            start_hmmsearch_time = logger.start_timer()\r\n            run_pyhmmer.pyhmmsearch(outputs.hmm_output_path(db_name), outputs.seqkit_translate_output_path, db_path, cpus, e, incdomE, domE, incE,\r\n                                              z).run_pyhmmsearch()\r\n            end_hmmsearch_time = logger.stop_timer(start_hmmsearch_time, verbose)\r\n            if verbose:\r\n                logger.loud_log(f\"{db_name} HMMsearch Runtime: {end_hmmsearch_time}\")\r\n            else:\r\n                logger.silent_log(f\"{db_name} HMMsearch Runtime: {end_hmmsearch_time}\")\r\n\r\n            if verbose:\r\n                logger.loud_log(f\"Pyhmmer output written to: {outputs.hmm_output_path(db_name)}\")\r\n            else:\r\n                logger.silent_log(f\"Pyhmmer output written to: {outputs.hmm_output_path(db_name)}\")\r\n\r\n            if not os.path.exists(outputs.formatted_hmm_output_dir):\r\n                outputs.formatted_hmm_output_dir.mkdir(parents=True)\r\n\r\n            format_pyhmmer_out.hmmsearch_formatter(outputs.hmm_output_path(db_name), outputs.formatted_hmm_output_path(db_name), seq_type)\r\n\r\n            if verbose:\r\n                logger.loud_log(f\"Formatted Pyhmmer output written to: {outputs.formatted_hmm_output_path(db_name)}\")\r\n            else:\r\n                logger.silent_log(f\"Formatted Pyhmmer output written to: {outputs.formatted_hmm_output_path(db_name)}\")\r\n            if not os.path.exists(outputs.best_hit_dir):\r\n                outputs.best_hit_dir.mkdir(parents=True)\r\n\r\n            format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name), seq_type, logger).highest_bitscore_hits(\r\n                outputs.best_hit_path(db_name))\r\n            if verbose:\r\n                logger.loud_log(f\"Highest Bitscore hits written to: {outputs.best_hit_path(db_name)}\")\r\n            else:\r\n                
logger.silent_log(f\"Highest Bitscore hits written to: {outputs.best_hit_path(db_name)}\")\r\n\r\n            set_dict[db_name] = format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name),\r\n                                                                            seq_type, logger).hmm_to_contig_set()\r\n            translated_set_dict[db_name] = format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name),\r\n                                                                                       'prot', logger).hmm_to_contig_set()\r\n\r\n            # Convert to dataframe, add db_name column and append to df_list\r\n            df = pl.read_csv(outputs.best_hit_path(db_name), separator='\\t')\r\n            df = df.with_columns([\r\n                pl.lit(db_name).alias('db_name')\r\n            ])\r\n            df_list.append(df)\r\n\r\n            logger.loud_log(f\"HMMsearch for {db_name} completed.\")\r\n\r\n        logger.loud_log(\"HMMsearch completed.\")\r\n\r\n        if not os.path.exists(outputs.plot_outdir):\r\n            outputs.plot_outdir.mkdir(parents=True)\r\n\r\n        if not os.path.exists(outputs.tsv_outdir):\r\n            outputs.tsv_outdir.mkdir(parents=True)\r\n\r\n        logger.loud_log(\"Consolidating results.\")\r\n\r\n        # Combine all the dataframes in the list\r\n        combined_df = pl.concat(df_list, how='vertical_relaxed')\r\n        # Write the combined dataframe to a tsv file\r\n        for col in ['E-value', 'score', 'norm_bitscore_profile', 'norm_bitscore_contig',\r\n                    'ID_score', 'profile_coverage', 'contig_coverage']:\r\n            combined_df = combined_df.with_columns([\r\n                pl.col(col).cast(pl.Float64)\r\n            ])\r\n\r\n\r\n        combined_df.write_csv(outputs.combined_tsv_path, separator=\"\\t\")\r\n\r\n        # Check if the combined dataframe is empty\r\n        if combined_df.is_empty():\r\n            
logger.loud_log(\"No hits found by RdRpCATCH. Exiting.\")\r\n            return None\r\n\r\n        # Generate upset plot\r\n        logger.loud_log(\"Generating plots.\")\r\n\r\n        if len(db_name_list) > 1:\r\n            if verbose:\r\n                logger.loud_log(\"Generating upset plot.\")\r\n            else:\r\n                logger.silent_log(\"Generating upset plot.\")\r\n\r\n            plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).upset_plotter(set_dict)\r\n\r\n\r\n        if verbose:\r\n            logger.loud_log(f\"Combined dataframe written to: {outputs.combined_tsv_path}\")\r\n        else:\r\n            logger.silent_log(f\"Combined dataframe written to: {outputs.combined_tsv_path}\")\r\n        # Generate e-value plot\r\n        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_evalue(combined_df)\r\n        # Generate score plot\r\n        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_score(combined_df)\r\n        # Generate normalized bitscore plot\r\n        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_norm_bitscore_profile(combined_df)\r\n        # Generate normalized bitscore contig plot\r\n        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_norm_bitscore_contig(combined_df)\r\n        # Generate ID score plot\r\n        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_ID_score(combined_df)\r\n        # Generate Profile coverage plot\r\n        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_profile_coverage(combined_df)\r\n        # Generate contig coverage plot\r\n        plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_contig_coverage(combined_df)\r\n        # Extract all the contigs\r\n        combined_set = set.union(*[value for value in set_dict.values()])\r\n        translated_combined_set = set.union(*[value for value in translated_set_dict.values()])\r\n\r\n        
logger.loud_log(\"Extracting RdRp contigs from the input file.\")\r\n\r\n        # Write a fasta file with all the contigs\r\n        if not os.path.exists(outputs.fasta_output_dir):\r\n            outputs.fasta_output_dir.mkdir(parents=True)\r\n\r\n        utils.fasta(input_file).write_fasta(utils.fasta(input_file).extract_contigs(combined_set), outputs.fasta_nuc_out_path)\r\n\r\n        utils.fasta(outputs.seqkit_translate_output_path).write_fasta(utils.fasta(outputs.seqkit_translate_output_path).extract_contigs(translated_combined_set),\r\n                                            outputs.fasta_prot_out_path)\r\n\r\n        if not os.path.exists(outputs.gff_output_dir):\r\n            outputs.gff_output_dir.mkdir(parents=True)\r\n        hmm_writer = format_pyhmmer_out.hmmsearch_output_writter(logger)\r\n        hmm_writer.write_hmmsearch_hits(outputs.combined_tsv_path, seq_type, outputs.rdrpcatch_output_tsv, outputs.gff_output_path)\r\n        rdrp_coords_list = hmm_writer.get_rdrp_coords(outputs.rdrpcatch_output_tsv,seq_type)\r\n        utils.fasta(outputs.seqkit_translate_output_path, logger).write_fasta_coords(rdrp_coords_list,outputs.fasta_trimmed_out_path, seq_type)\r\n\r\n        if verbose:\r\n            logger.loud_log(f\"Contigs written to: {outputs.fasta_nuc_out_path}\")\r\n            logger.loud_log(f\"Translated contigs written to: {outputs.fasta_prot_out_path}\")\r\n            logger.loud_log(f\"Trimmed contigs written to: {outputs.fasta_trimmed_out_path}\")\r\n        else:\r\n            logger.silent_log(f\"Contigs written to: {outputs.fasta_nuc_out_path}\")\r\n            logger.silent_log(f\"Translated contigs written to: {outputs.fasta_prot_out_path}\")\r\n            logger.silent_log(f\"Trimmed contigs written to: {outputs.fasta_trimmed_out_path}\")\r\n\r\n        if not os.path.exists(outputs.mmseqs_tax_output_dir):\r\n            outputs.mmseqs_tax_output_dir.mkdir(parents=True)\r\n\r\n        logger.loud_log(\"Running mmseqs 
easy-taxonomy for taxonomic annotation.\")\r\n\r\n        mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_tax_output_prefix,\r\n                          outputs.mmseqs_tax_output_dir, 7, cpus, outputs.mmseqs_tax_log_path).run_mmseqs_easy_tax_lca()\r\n\r\n        logger.loud_log(\"Running mmseqs easy-search for taxonomic annotation.\")\r\n\r\n        if not os.path.exists(outputs.mmseqs_e_search_output_dir):\r\n            outputs.mmseqs_e_search_output_dir.mkdir(parents=True)\r\n\r\n\r\n        mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_e_search_output_dir,\r\n                          outputs.mmseqs_e_search_output_path, 7, cpus, outputs.mmseqs_e_search_log_path).run_mmseqs_e_search()\r\n\r\n        utils.mmseqs_parser(outputs.mmseqs_tax_output_lca_path, outputs.mmseqs_e_search_output_path).tax_to_rdrpcatch(\r\n            outputs.rdrpcatch_output_tsv, outputs.extended_rdrpcatch_output, seq_type)\r\n\r\n        logger.loud_log(\"Taxonomic annotation completed.\")\r\n\r\n    elif seq_type == 'prot':\r\n\r\n        logger.loud_log(\"Protein sequence detected.\")\r\n\r\n        set_dict = {}\r\n        df_list = []\r\n\r\n        for db_name,db_path in zip (db_name_list, db_path_list):\r\n            logger.loud_log(f\"Running HMMsearch for {db_name} database.\")\r\n\r\n            if verbose:\r\n                logger.loud_log(f\"HMM output path: {outputs.hmm_output_path(db_name)}\")\r\n            else:\r\n                logger.silent_log(f\"HMM output path: {outputs.hmm_output_path(db_name)}\")\r\n            start_hmmsearch_time = logger.start_timer()\r\n            hmm_out = run_pyhmmer.pyhmmsearch(outputs.hmm_output_path(db_name), input_file, db_path, cpus, e, incdomE, domE, incE, z).run_pyhmmsearch()\r\n            end_hmmsearch_time = logger.stop_timer(start_hmmsearch_time,verbose)\r\n            if verbose:\r\n                logger.loud_log(f\"{db_name} HMMsearch Runtime: 
{end_hmmsearch_time}\")\r\n            else:\r\n                logger.silent_log(f\"{db_name} HMMsearch Runtime: {end_hmmsearch_time}\")\r\n\r\n            if verbose:\r\n                logger.loud_log(f\"Pyhmmer output written to: {hmm_out}\")\r\n            else:\r\n                logger.silent_log(f\"Pyhmmer output written to: {hmm_out}\")\r\n            if not os.path.exists(outputs.formatted_hmm_output_dir):\r\n                outputs.formatted_hmm_output_dir.mkdir(parents=True)\r\n\r\n            format_pyhmmer_out.hmmsearch_formatter(hmm_out, outputs.formatted_hmm_output_path(db_name), seq_type)\r\n            if verbose:\r\n                logger.loud_log(f\"Formatted Pyhmmer output written to: {outputs.formatted_hmm_output_path(db_name)}\")\r\n            else:\r\n                logger.silent_log(f\"Formatted Pyhmmer output written to: {outputs.formatted_hmm_output_path(db_name)}\")\r\n\r\n            # Extract Highest Bitscore hits from the formatted hmm output\r\n\r\n            if not os.path.exists(outputs.best_hit_dir):\r\n                outputs.best_hit_dir.mkdir(parents=True)\r\n\r\n            format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name),seq_type, logger).highest_bitscore_hits(outputs.best_hit_path(db_name))\r\n\r\n            if verbose:\r\n                logger.loud_log(f\"Highest Bitscore hits written to: {outputs.best_hit_path(db_name)}\")\r\n            else:\r\n                logger.silent_log(f\"Highest Bitscore hits written to: {outputs.best_hit_path(db_name)}\")\r\n            # Here I overwrite prot to nuc, because I need the contig name to extract the contigs\r\n            set_dict[db_name] = format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name),\"nuc\", logger).hmm_to_contig_set()\r\n\r\n            # Convert to  dataframe, add db_name column and append to df_list\r\n            df = pl.read_csv(outputs.best_hit_path(db_name), separator='\\t')\r\n         
   df = df.with_columns([\r\n                pl.lit(db_name).alias('db_name')\r\n            ])\r\n            df_list.append(df)\r\n\r\n            logger.loud_log(f\"HMMsearch for {db_name} completed.\")\r\n\r\n        logger.loud_log(\"HMMsearch completed.\")\r\n\r\n        if not os.path.exists(outputs.plot_outdir):\r\n            outputs.plot_outdir.mkdir(parents=True)\r\n\r\n        if not os.path.exists(outputs.tsv_outdir):\r\n            outputs.tsv_outdir.mkdir(parents=True)\r\n\r\n        logger.loud_log(\"Consolidating results.\")\r\n\r\n        # Combine all the dataframes in the list\r\n        combined_df = pl.concat(df_list, how='vertical_relaxed')\r\n        # Write the combined dataframe to a tsv file\r\n        for col in ['E-value', 'score', 'norm_bitscore_profile', 'norm_bitscore_contig',\r\n                    'ID_score', 'profile_coverage', 'contig_coverage']:\r\n            combined_df = combined_df.with_columns([\r\n                pl.col(col).cast(pl.Float64)\r\n            ])\r\n\r\n        combined_df.write_csv(outputs.combined_tsv_path, separator=\"\\t\")\r\n\r\n        # Check if the combined dataframe is empty\r\n        if combined_df.is_empty():\r\n            logger.loud_log(\"No hits found by RdRpCATCH. 
Exiting.\")\r\n            return None\r\n\r\n        # Generate upset plot\r\n        logger.loud_log(\"Generating plots.\")\r\n\r\n        if len(db_name_list) > 1:\r\n            if verbose:\r\n                logger.loud_log(\"Generating upset plot.\")\r\n            else:\r\n                logger.silent_log(\"Generating upset plot.\")\r\n\r\n            plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).upset_plotter(set_dict)\r\n\r\n\r\n        if verbose:\r\n            logger.loud_log(f\"Combined dataframe written to: {outputs.combined_tsv_path}\")\r\n        else:\r\n            logger.silent_log(f\"Combined dataframe written to: {outputs.combined_tsv_path}\")\r\n\r\n        # Generate e-value plot\r\n        plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_evalue(combined_df)\r\n        # Generate score plot\r\n        plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_score(combined_df)\r\n        # Generate normalized bitscore plot\r\n        plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_norm_bitscore_profile(combined_df)\r\n        # Generate normalized bitscore contig plot\r\n        plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_norm_bitscore_contig(combined_df)\r\n        # Generate ID score plot\r\n        plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_ID_score(combined_df)\r\n        # Generate Profile coverage plot\r\n        plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_profile_coverage(combined_df)\r\n        # Generate contig coverage plot\r\n        plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_contig_coverage(combined_df)\r\n\r\n        # Extract all the contigs\r\n        combined_set = set.union(*[value for value in set_dict.values()])\r\n        # Write a fasta file with all the contigs\r\n\r\n        logger.loud_log(\"Extracting RdRp contigs from the input file.\")\r\n\r\n        if not 
os.path.exists(outputs.fasta_output_dir):\r\n            outputs.fasta_output_dir.mkdir(parents=True)\r\n\r\n        utils.fasta(input_file).write_fasta(utils.fasta(input_file).extract_contigs(combined_set), outputs.fasta_prot_out_path)\r\n\r\n        if verbose:\r\n            logger.loud_log(f\"Full aminoacid contigs written to: {outputs.fasta_prot_out_path}\")\r\n        else:\r\n            logger.silent_log(f\" Full aminoacid contigs written to: {outputs.fasta_prot_out_path}\")\r\n\r\n        if not os.path.exists(outputs.gff_output_dir):\r\n            outputs.gff_output_dir.mkdir(parents=True)\r\n\r\n        hmm_writer = format_pyhmmer_out.hmmsearch_output_writter(logger)\r\n        hmm_writer.write_hmmsearch_hits(outputs.combined_tsv_path, seq_type, outputs.rdrpcatch_output_tsv, outputs.gff_output_path)\r\n        rdrp_coords_list = hmm_writer.get_rdrp_coords(outputs.rdrpcatch_output_tsv,seq_type)\r\n        utils.fasta(input_file, logger).write_fasta_coords(rdrp_coords_list,outputs.fasta_trimmed_out_path, seq_type)\r\n\r\n        if verbose:\r\n            logger.loud_log(f\"Trimmed contigs written to: {outputs.fasta_trimmed_out_path}\")\r\n        else:\r\n            logger.silent_log(f\"Trimmed contigs written to: {outputs.fasta_trimmed_out_path}\")\r\n\r\n        if not os.path.exists(outputs.mmseqs_tax_output_dir):\r\n            outputs.mmseqs_tax_output_dir.mkdir(parents=True)\r\n\r\n        logger.loud_log(\"Running mmseqs easy-taxonomy for taxonomic annotation.\")\r\n\r\n        mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_tax_output_prefix,\r\n                          outputs.mmseqs_tax_output_dir, 7, cpus, outputs.mmseqs_tax_log_path).run_mmseqs_easy_tax_lca()\r\n\r\n        if not os.path.exists(outputs.mmseqs_e_search_output_dir):\r\n            outputs.mmseqs_e_search_output_dir.mkdir(parents=True)\r\n\r\n        logger.loud_log(\"Running mmseqs easy-search for taxonomic annotation.\")\r\n\r\n        
mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_e_search_output_dir,\r\n                          outputs.mmseqs_e_search_output_path, 7, cpus, outputs.mmseqs_e_search_log_path).run_mmseqs_e_search()\r\n\r\n        utils.mmseqs_parser(outputs.mmseqs_tax_output_lca_path, outputs.mmseqs_e_search_output_path).tax_to_rdrpcatch(\r\n            outputs.rdrpcatch_output_tsv, outputs.extended_rdrpcatch_output, seq_type)\r\n\r\n\r\n\r\n\r\n\r\n\r\n    if not keep_tmp:\r\n        if verbose:\r\n            logger.loud_log(\"Deleting temporary files.\")\r\n        else:\r\n            logger.silent_log(\"Deleting temporary files.\")\r\n\r\n        try:\r\n            import shutil\r\n            shutil.rmtree(outputs.tmp_dir)\r\n            logger.silent_log(f\"Temporary files deleted.\")\r\n        except FileNotFoundError:\r\n            print(f\"Directory '{outputs.tmp_dir}' does not exist.\")\r\n        except PermissionError:\r\n            print(f\"Permission denied while trying to delete '{outputs.tmp_dir}'.\")\r\n        except Exception as e:\r\n            print(f\"An error occurred: {e}\")\r\n\r\n    # Bundle results\r\n    if bundle:\r\n        archive_path = bundle_results(output_dir, prefix)\r\n        if verbose:\r\n            logger.loud_log(f\"Results bundled into: {archive_path}\")\r\n        else:\r\n            logger.silent_log(f\"Results bundled into: {archive_path}\")\r\n\r\n    end_time = logger.stop_timer(start_time, verbose)\r\n\r\n    logger.loud_log(f\"Total Runtime: {end_time}\")\r\n\r\n    logger.loud_log(\"RdRpCATCH completed successfully.\")\r\n\r\n\r\n    return outputs.extended_rdrpcatch_output\r\n\r\nif __name__ == \"__main__\":\r\n    main()\r\n
+Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
+<+>UTF-8
+===================================================================
+diff --git a/rdrpcatch/rdrpcatch_wrapper.py b/rdrpcatch/rdrpcatch_wrapper.py
+--- a/rdrpcatch/rdrpcatch_wrapper.py	(revision 2110790421475da92fd4f5e5dbf44f1191829a02)
++++ b/rdrpcatch/rdrpcatch_wrapper.py	(date 1747134242424)
+@@ -170,7 +170,7 @@
+     logger.loud_log("Fetching HMM databases...")
+-    ## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot
++    ## Fetch HMM databases: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot_HMM, Zayed_HMM
+     rvmt_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("RVMT")
+     if verbose:
+         logger.loud_log(f"RVMT HMM database fetched from: {rvmt_hmm_db}")
+@@ -202,19 +202,24 @@
+         logger.loud_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
+     else:
+         logger.silent_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
+-    lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot")
++    lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot_HMM")
+     if verbose:
+         logger.loud_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
+     else:
+         logger.silent_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
++    zayed_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Zayed_HMM")
++    if verbose:
++        logger.loud_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
++    else:
++        logger.silent_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
+     db_name_list = []
+     db_path_list = []
+     ## Set up HMM databases
+     if db_options == ['all']:
+-        db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "Lucaprot"]
+-        db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db]
++        db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "Lucaprot_HMM", "Zayed_HMM"]
++        db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db, zayed_hmm_db]
+     else:
+         for db in db_options:
+@@ -236,9 +241,12 @@
+             elif db == "RDRP-scan".lower():
+                 db_name_list.append("RDRP-scan")
+                 db_path_list.append(rdrpscan_hmm_db)
+-            elif db == "Lucaprot".lower():
+-                db_name_list.append("Lucaprot")
++            elif db == "Lucaprot_HMM".lower():
++                db_name_list.append("Lucaprot_HMM")
+                 db_path_list.append(lucaprot_hmm_db)
++            elif db == "Zayed_HMM".lower():
++                db_name_list.append("Zayed_HMM")
++                db_path_list.append(zayed_hmm_db)
+             else:
+                 raise Exception(f"Invalid database option: {db}")
+Index: rdrpcatch/cli/args.py
+IDEA additional info:
+Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP
+<+>import warnings\r\n# Filter numpy warnings before any imports that might trigger them\r\nwarnings.filterwarnings(\"ignore\", category=UserWarning, module=\"numpy\")\r\nwarnings.filterwarnings(\"ignore\", category=RuntimeWarning, module=\"numpy\")\r\nwarnings.filterwarnings(\"ignore\", message=\".*subnormal.*\")\r\n\r\nimport rich_click as click\r\nfrom rich.console import Console\r\nfrom rich.table import Table\r\nfrom rich.panel import Panel\r\nfrom rich.syntax import Syntax\r\nfrom rich.progress import Progress, BarColumn, TextColumn, DownloadColumn, TimeRemainingColumn\r\nfrom pathlib import Path\r\nimport datetime\r\nfrom ..rdrpcatch_wrapper import run_scan\r\nfrom ..rdrpcatch_scripts.fetch_dbs import ZenodoDownloader, db_fetcher\r\nimport os\r\nimport shutil\r\nimport requests\r\n\r\nconsole = Console()\r\n\r\n## FUNCTIONS\r\ndef parse_comma_separated_options(ctx, param, value):\r\n    if not value:\r\n        return ['all']\r\n\r\n    allowed_choices = ['RVMT', 'NeoRdRp', 'NeoRdRp.2.1', 'TSA_Olendraite_fam', 'TSA_Olendraite_gen', 'RDRP-scan',\r\n                       'Lucaprot', 'all']\r\n    lower_choices = [choice.lower() for choice in allowed_choices]\r\n    options = value.split(',')\r\n    lower_options = [option.lower() for option in options]\r\n\r\n    for option in options:\r\n        if option.lower() not in lower_choices:\r\n            raise click.BadParameter(f\"Invalid choice: '{option}' (choose from {', '.join(allowed_choices)})\")\r\n\r\n    return lower_options\r\n\r\n\r\ndef format_size(bytes_size: int) -> str:\r\n    \"\"\"Convert bytes to human-readable format without external dependencies\"\"\"\r\n    units = [\"B\", \"KB\", \"MB\", \"GB\", \"TB\"]\r\n    unit_idx = 0\r\n    size = float(bytes_size)\r\n\r\n    while size >= 1024 and unit_idx < len(units) - 1:\r\n        size /= 1024\r\n        unit_idx += 1\r\n\r\n    return f\"{size:.2f} {units[unit_idx]}\"\r\n\r\n\r\n\r\n## CLI ENTRY POINT\r\n\r\n@click.group()\r\ndef cli():\r\n    
\"\"\"RdRpCATCH - RNA-dependent RNA polymerase Collaborative Analysis Tool with Collections of pHMMs\"\"\"\r\n    pass\r\n\r\n@cli.command(\"scan\", help=\"Scan sequences for RdRps.\")\r\n@click.option(\"-i\", \"--input\",\r\n              help=\"Path to the input FASTA file.\",\r\n              type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path), required=True)\r\n@click.option(\"-o\", \"--output\",\r\n              help=\"Path to the output directory.\",\r\n              type=click.Path(exists=False, file_okay=False, writable=True, path_type=Path), required=True)\r\n@click.option(\"-db_dir\", \"--db_dir\",\r\n              help=\"Path to the directory containing RdRpCATCH databases.\",\r\n              type=click.Path(exists=True, dir_okay=True, readable=True, path_type=Path),required=True)\r\n@click.option(\"-dbs\", \"--db_options\",\r\n              callback=parse_comma_separated_options,\r\n              default=\"all\",\r\n              help=\"Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1,\"\r\n                   \" TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot, all\")\r\n@click.option(\"--custom-dbs\",\r\n              help=\"Path to directory containing custom MSAs/pHMM files to use as additional databases\",\r\n              type=click.Path(exists=True, path_type=Path))\r\n@click.option(\"-seq_type\", \"--seq_type\",\r\n              type=click.STRING,\r\n              default=None,\r\n              help=\"Type of sequence to search against: (prot,nuc) Default: unknown\")\r\n@click.option(\"-v\", \"--verbose\",\r\n              is_flag=True,\r\n              help=\"Print verbose output.\")\r\n@click.option('-e', '--evalue',\r\n              type=click.FLOAT,\r\n              default=1e-5,\r\n              help=\"E-value threshold for HMMsearch. 
(default: 1e-5)\")\r\n@click.option('-incE', '--incevalue',\r\n              type=click.FLOAT,\r\n              default=1e-5,\r\n              help=\"Inclusion E-value threshold for HMMsearch. (default: 1e-5)\")\r\n@click.option('-domE', '--domevalue',\r\n              type=click.FLOAT,\r\n              default=1e-5,\r\n              help=\"Domain E-value threshold for HMMsearch. (default: 1e-5)\")\r\n@click.option('-incdomE', '--incdomevalue',\r\n              type=click.FLOAT,\r\n              default=1e-5,\r\n              help=\"Inclusion domain E-value threshold for HMMsearch. (default: 1e-5)\")\r\n@click.option('-z', '--zvalue',\r\n              type=click.INT,\r\n              default=1000000,\r\n              help=\"Number of sequences to search against. (default: 1000000)\")\r\n@click.option('-cpus', '--cpus',\r\n              type=click.INT,\r\n              default=1,\r\n              help=\"Number of CPUs to use for HMMsearch. (default: 1)\")\r\n@click.option('-length_thr', '--length_thr',\r\n              type=click.INT,\r\n              default=400,\r\n              help=\"Minimum length threshold for seqkit seq. (default: 400)\")\r\n@click.option('-gen_code', '--gen_code',\r\n              type=click.INT,\r\n              default=1,\r\n              help='Genetic code to use for translation. 
(default: 1) Possible genetic codes (supported by seqkit translate) : 1: The Standard Code, '\r\n                     '2: The Vertebrate Mitochondrial Code, '\r\n                     '3: The Yeast Mitochondrial Code, '\r\n                     '4: The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code, '\r\n                     '5: The Invertebrate Mitochondrial Code, '\r\n                     '6: The Ciliate, Dasycladacean and Hexamita Nuclear Code, '\r\n                     '9: The Echinoderm and Flatworm Mitochondrial Code, '\r\n                    '10: The Euplotid Nuclear Code, '\r\n                    '11: The Bacterial, Archaeal and Plant Plastid Code, '\r\n                    '12: The Alternative Yeast Nuclear Code, '\r\n                    '13: The Ascidian Mitochondrial Code, '\r\n                    '14: The Alternative Flatworm Mitochondrial Code, '\r\n                    '16: Chlorophycean Mitochondrial Code, '\r\n                    '21: Trematode Mitochondrial Code, '\r\n                    '22: Scenedesmus obliquus Mitochondrial Code, '\r\n                    '23: Thraustochytrium Mitochondrial Code, '\r\n                    '24: Pterobranchia Mitochondrial Code, '\r\n                    '25: Candidate Division SR1 and Gracilibacteria Code, '\r\n                    '26: Pachysolen tannophilus Nuclear Code, '\r\n                    '27: Karyorelict Nuclear, '\r\n                    '28: Condylostoma Nuclear, '\r\n                    '29: Mesodinium Nuclear, '\r\n                    '30: Peritrich Nuclear, '\r\n                    '31: Blastocrithidia Nuclear, ')\r\n@click.option('-bundle', '--bundle',\r\n              is_flag=True,\r\n              default=False,\r\n              help=\"Bundle the output files into a single archive. 
(default: False)\")\r\n@click.option('-keep_tmp', '--keep_tmp',\r\n              is_flag=True,\r\n              default=False,\r\n              help=\"Keep temporary files (Expert users) (default: False)\")\r\n@click.option('-overwrite', '--overwrite',\r\n              is_flag=True,\r\n              default=False,\r\n              help=\"Force overwrite of existing output directory. (default: False)\")\r\n\r\n@click.pass_context\r\ndef scan(ctx, input, output, db_options, db_dir, custom_dbs, seq_type, verbose, evalue,\r\n         incevalue, domevalue, incdomevalue, zvalue, cpus, length_thr, gen_code, bundle, keep_tmp, overwrite):\r\n    \"\"\"Scan sequences for RdRps.\"\"\"\r\n\r\n    # Create a rich table for displaying parameters\r\n    table = Table(title=\"Scan Parameters\")\r\n    table.add_column(\"Parameter\", style=\"cyan\")\r\n    table.add_column(\"Value\", style=\"green\")\r\n\r\n    table.add_row(\"Input File\", str(input))\r\n    table.add_row(\"Output Directory\", str(output))\r\n    table.add_row(\"Databases\", \", \".join(db_options))\r\n    table.add_row(\"Database Directory\", str(db_dir))\r\n    if custom_dbs:\r\n        table.add_row(\"Custom Databases\", str(custom_dbs))\r\n    table.add_row(\"Sequence Type\", seq_type or \"unknown\")\r\n    table.add_row(\"Verbose Mode\", \"ON\" if verbose else \"OFF\")\r\n    table.add_row(\"E-value\", str(evalue))\r\n    table.add_row(\"Inclusion E-value\", str(incevalue))\r\n    table.add_row(\"Domain E-value\", str(domevalue))\r\n    table.add_row(\"Inclusion Domain E-value\", str(incdomevalue))\r\n    table.add_row(\"Z-value\", str(zvalue))\r\n    table.add_row(\"CPUs\", str(cpus))\r\n    table.add_row(\"Length Threshold\", str(length_thr))\r\n    table.add_row(\"Genetic Code\", str(gen_code))\r\n    table.add_row(\"Bundle Output\", \"ON\" if bundle else \"OFF\")\r\n    table.add_row(\"Save Temporary Files\", \"ON\" if keep_tmp else \"OFF\")\r\n    table.add_row(\"Force Overwrite\", \"ON\" if overwrite 
else \"OFF\")\r\n\r\n    console.print(Panel(table, title=\"Scan Configuration\"))\r\n\r\n    # Add custom databases if provided\r\n    if custom_dbs:\r\n        db = db_fetcher(db_dir)\r\n        if os.path.isfile(custom_dbs):\r\n            db.add_custom_db(custom_dbs)\r\n        else:\r\n            for item in os.listdir(custom_dbs):\r\n                item_path = os.path.join(custom_dbs, item)\r\n                if os.path.isfile(item_path) and item_path.endswith(('.hmm', '.h3m', '.msa', '.sto', '.fasta', '.fa')):\r\n                    db.add_custom_db(item_path)\r\n                elif os.path.isdir(item_path):\r\n                    db.add_custom_db(item_path, item)\r\n\r\n    run_scan(\r\n        input_file=input,\r\n        output_dir=output,\r\n        db_options=db_options,\r\n        db_dir=db_dir,\r\n        seq_type=seq_type,\r\n        verbose=verbose,\r\n        e=evalue,\r\n        incE=incevalue,\r\n        domE=domevalue,\r\n        incdomE=incdomevalue,\r\n        z=zvalue,\r\n        cpus=cpus,\r\n        length_thr=length_thr,\r\n        gen_code=gen_code,\r\n        bundle=bundle,\r\n        keep_tmp=keep_tmp,\r\n        overwrite=overwrite\r\n    )\r\n\r\n# @cli.command(\"download\", help=\"Download RdRpCATCH databases.\")\r\n# @click.option(\"--destination_dir\", \"-dest\",\r\n#               help=\"Path to the directory to download HMM databases.\",\r\n#               type=click.Path(exists=False, file_okay=False, writable=True, path_type=Path), required=True)\r\n# @click.option(\"--check-updates\", \"-u\",\r\n#               is_flag=True,\r\n#               help=\"Check for database updates\")\r\n# @click.pass_context\r\n# def download(ctx, destination_dir, check_updates):\r\n#     \"\"\"Download RdRpCATCH databases.\"\"\"\r\n#\r\n#     # if check_updates:\r\n#     #     db = db_fetcher(destination_dir)\r\n#     #     version_info = db.check_db_updates()\r\n#     #     if version_info:\r\n#     #         console.print(\"Current database 
versions:\")\r\n#     #         for db_name, info in version_info.items():\r\n#     #             console.print(f\"- {db_name}: {info}\")\r\n#     #     else:\r\n#     #         console.print(\"No version information available\")\r\n#     #     return\r\n#\r\n#     run_download(destination_dir)\r\n#\r\n# # @cli.command(\"gui\", help=\"Launch the GUI.\")\r\n# # @click.pass_context\r\n# # def gui(ctx):\r\n# #     \"\"\"Launch the GUI.\"\"\"\r\n# #\r\n# #     console.print(Panel(\"Starting ColabScan GUI...\", title=\"GUI Launch\"))\r\n# #     run_gui()\r\n\r\n\r\n\r\n@cli.command(\"download\", help=\"Download &  update RdRpCATCH databases. If databases are already installed in the \"\r\n                              \"specified directory,\"\r\n                              \" it will check for updates and download the latest version if available.\")\r\n@click.option(\"--destination_dir\", \"-dest\",\r\n              help=\"Path to directory to download databases\",\r\n              type=click.Path(path_type=Path, file_okay=False, writable=True),\r\n              required=True)\r\n@click.option(\"--concept-doi\", default=\"10.5281/zenodo.14358348\",\r\n              help=\"Zenodo Concept DOI for database repository\")\r\ndef download(destination_dir: Path, concept_doi: str):\r\n    \"\"\"Handle database download/update workflow\"\"\"\r\n    downloader = ZenodoDownloader(concept_doi, destination_dir)\r\n\r\n    try:\r\n\r\n        current_version = downloader.get_current_version()\r\n        if downloader.lock_file.exists():\r\n            console.print(\"[red]× Another download is already in progress[/red]\")\r\n            raise click.Abort()\r\n\r\n        if downloader.needs_update() or not current_version:\r\n            downloader.lock_file.touch(exist_ok=False)\r\n            with Progress(\r\n                    TextColumn(\"[progress.description]{task.description}\"),\r\n                    BarColumn(),\r\n                    
TextColumn(\"{task.completed:.2f}/{task.total:.2f} MB\"),\r\n                    TimeRemainingColumn(),\r\n                    transient=True\r\n            ) as progress:\r\n                # Setup main download task\r\n                main_task = progress.add_task(\"[cyan]Database Manager\", total=4)\r\n\r\n                # Phase 1: Metadata fetching\r\n                progress.update(main_task, description=\"Fetching Zenodo metadata...\")\r\n                metadata = downloader._fetch_latest_metadata()\r\n                progress.advance(main_task)\r\n\r\n                # Phase 2: Prepare download\r\n                progress.update(main_task, description=\"Analyzing package...\")\r\n                tarball_info = downloader._get_tarball_info()\r\n                file_size_mb = tarball_info[\"size\"] / (1024 * 1024)\r\n                progress.advance(main_task)\r\n\r\n                # Phase 3: Download with progress\r\n                progress.update(main_task,\r\n                                description=\"Downloading RdRpCATCH databases...\",\r\n                                total=file_size_mb)\r\n\r\n                if not downloader.temp_dir.exists():\r\n                    downloader.temp_dir.mkdir(parents=True, exist_ok=True)\r\n\r\n                temp_tar = downloader.temp_dir / \"download.tmp\"\r\n\r\n                with requests.get(tarball_info[\"url\"], stream=True) as response:\r\n                    response.raise_for_status()\r\n                    with open(temp_tar, \"wb\") as f:\r\n                        downloaded = 0\r\n                        for chunk in response.iter_content(chunk_size=8192):\r\n                            f.write(chunk)\r\n                            downloaded += len(chunk)\r\n                            progress.update(main_task, advance=len(chunk) / (1024 * 1024))\r\n\r\n                # Phase 4: Verification & installation\r\n                progress.update(main_task, description=\"Verifying 
checksum...\")\r\n                if not downloader._verify_checksum(temp_tar, tarball_info[\"checksum\"]):\r\n                    raise ValueError(\"Checksum verification failed\")\r\n\r\n                progress.update(main_task, description=\"Installing databases...\")\r\n                downloader.extract_and_verify(temp_tar)\r\n                version_info = downloader.get_latest_version_info()\r\n                downloader.atomic_write_version(version_info)\r\n                progress.advance(main_task)\r\n\r\n            # Success message\r\n            size_str = format_size(tarball_info[\"size\"])\r\n            console.print(\r\n                f\"\\n[bold green]✓ Successfully downloaded version {version_info['record_id']}[/bold green]\",\r\n                f\"Release date: {version_info['created']}\",\r\n                f\"Size: {size_str}\",\r\n                sep=\"\\n\"\r\n            )\r\n\r\n        else:\r\n            installed_date = current_version[\"downloaded\"]\r\n            console.print(\r\n                f\"[green]✓ Databases are current[/green]\",\r\n                f\"Version ID: {current_version['record_id']}\",\r\n                f\"Installed: {installed_date}\",\r\n                sep=\"\\n\"\r\n            )\r\n    except FileExistsError:\r\n        console.print(\"[red]× Another download is already in progress![/red]\")\r\n        console.print(f\"Lock file exists: {downloader.lock_file}\")\r\n        raise click.Abort()\r\n\r\n    except Exception as e:\r\n        console.print(f\"\\n[red]× Download failed: {str(e)}[/red]\")\r\n        if downloader.temp_dir.exists():\r\n            shutil.rmtree(downloader.temp_dir)\r\n        raise click.Abort()\r\n\r\n    finally:\r\n        # Cleanup operations\r\n        if downloader.lock_file.exists():\r\n            downloader.lock_file.unlink()\r\n        if downloader.temp_dir.exists():\r\n            shutil.rmtree(downloader.temp_dir)\r\n\r\n\r\nif __name__ == '__main__':\r\n    
cli(obj={})\r\n\r\n
+Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
+<+>UTF-8
+===================================================================
+diff --git a/rdrpcatch/cli/args.py b/rdrpcatch/cli/args.py
+--- a/rdrpcatch/cli/args.py	(revision 2110790421475da92fd4f5e5dbf44f1191829a02)
++++ b/rdrpcatch/cli/args.py	(date 1747651046007)
+@@ -26,7 +26,7 @@
+         return ['all']
+     allowed_choices = ['RVMT', 'NeoRdRp', 'NeoRdRp.2.1', 'TSA_Olendraite_fam', 'TSA_Olendraite_gen', 'RDRP-scan',
+-                       'Lucaprot', 'all']
++                       'Lucaprot_HMM', 'Zayed_HMM', 'all']
+     lower_choices = [choice.lower() for choice in allowed_choices]
+     options = value.split(',')
+     lower_options = [option.lower() for option in options]
+@@ -73,7 +73,7 @@
+               callback=parse_comma_separated_options,
+               default="all",
+               help="Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1,"
+-                   " TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot, all")
++                   " TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot_HMM, Zayed_HMM, all")
+ @click.option("--custom-dbs",
+               help="Path to directory containing custom MSAs/pHMM files to use as additional databases",
+               type=click.Path(exists=True, path_type=Path))
+Index: README.md
+IDEA additional info:
+Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP
+<+># RdRpCATCH\r\n## RNA-dependent RNA polymerase Collaborative Analysis Tool with Collections of pHMMs\r\n\r\n\r\n\r\nRdRpCATCH is collaborative effort to combine various publicly available RNA virus RNA-dependent RNA polymerase pHMM databases in one tool\r\nto facilitate their detection  in (meta-)transcriptomics data.\r\n\r\n\r\nRdRpCATCH  is written in Python and uses the pyHMMER3\r\nlibrary to perform pHMM searches.  In addition, the tool scans each sequence (aa or nt) in the input file with the selected databases and provides the best hit (hit with the highest bitscore across all databases) as output.\r\nIn addition, RdRpCATCH provides information about the number of profiles\r\nthat were positive for each sequence across all pHMM databases, and taxonomic information based on the MMseqs2 easy-taxonomy and search modules against a custom RefSeq Riboviria database.\r\n\r\n** The tool has been modified to use [rolypoly](https://code.jgi.doe.gov/UNeri/rolypoly) code/approaches **\r\n\r\n![rdrpcatch_flowchart_v0.png](images%2Frdrpcatch_flowchart_v0.png)\r\n\r\n### Supported databases\r\n- NeoRdRp <sup>1</sup> : 1182 pHMMs \r\n- NeoRdRp2 <sup>2</sup>: 19394 pHMMs  \r\n- RVMT <sup>3</sup>: 710 pHMMs  \r\n- RdRp-Scan <sup>4</sup> : 68 pHMMs\r\n- TSA_Oleandrite_fam <sup>5</sup>: 77 pHMMs \r\n- TSA_Oleandrite_gen <sup>6</sup> : 341 pHMMs\r\n- LucaProt_pHMM<sup>7 </sup> : 754 pHMMs \r\n\r\n1. Sakaguchi, S. et al. (2022) 'NeoRdRp: A comprehensive dataset for identifying RNA-dependent RNA polymerases of various RNA viruses from metatranscriptomic data', *Microbes and Environments*, 37(3). [doi:10.1264/jsme2.me22001](https://doi.org/10.1264/jsme2.me22001)\r\n2. Sakaguchi, S., Nakano, T. and Nakagawa, S. (2024) 'Neordrp2 with improved seed data, annotations, and scoring', *Frontiers in Virology*, 4. [doi:10.3389/fviro.2024.1378695](https://doi.org/10.3389/fviro.2024.1378695)\r\n3. Neri, U. et al. 
(2022) 'Expansion of the global RNA virome reveals diverse clades of bacteriophages', *Cell*, 185(21). [doi:10.1016/j.cell.2022.08.023](https://doi.org/10.1016/j.cell.2022.08.023)\r\n4. Charon, J. et al. (2022) 'RDRP-Scan: A bioinformatic resource to identify and annotate divergent RNA viruses in metagenomic sequence data', *Virus Evolution*, 8(2). [doi:10.1093/ve/veac082](https://doi.org/10.1093/ve/veac082)\r\n5. Olendraite, I., Brown, K. and Firth, A.E. (2023) 'Identification of RNA virus–derived rdrp sequences in publicly available transcriptomic data sets', *Molecular Biology and Evolution*, 40(4). [doi:10.1093/molbev/msad060](https://doi.org/10.1093/molbev/msad060)\r\n6. Olendraite, I. (2021) 'Mining diverse and novel RNA viruses in transcriptomic datasets', Apollo. Available at: [https://www.repository.cam.ac.uk/items/1fabebd2-429b-45c9-b6eb-41d27d0a90c2](https://www.repository.cam.ac.uk/items/1fabebd2-429b-45c9-b6eb-41d27d0a90c2)\r\n7. Hou, X. et al. (2024) 'Using artificial intelligence to document the hidden RNA virosphere', *Cell*, 187(24). [doi:10.1016/j.cell.2024.09.027](https://doi.org/10.1016/j.cell.2024.09.027)\r\n\r\n\r\n## Installation\r\n\r\n\r\n#### Prerequisites\r\nFor the installation process, conda is required. If you don't have conda installed, you can find instructions on how to\r\nhttps://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html  \r\nMamba is a faster alternative to conda. If you have it installed, you can use it instead of conda.  \r\n\r\n#### Installation steps\r\n\r\nThe package is available as a bioconda package. You can install it using the following command:\r\n\r\n```bash\r\nconda env create rdrpcatch -c bioconda rdrpcatch\r\n```\r\n\r\nAlternatively, you can install RdRpCATCH from python package index (PyPI) using pip. This requires the installation of the dependencies\r\nmanually. The dependencies are:\r\n- mmseqs2\r\n- seqkit\r\n\r\nThe dependencies can be installed using conda or mamba. 
Follow these steps:\r\n\r\nCreate a new conda environment and install the dependencies:\r\n```bash\r\nconda env create -n rdrpcatch python=3.12\r\nconda activate rdrpcatch\r\nconda install -c bioconda mmseqs2==17.b804f seqkit==2.10.0\r\n```\r\nInstall the tool from pip:\r\n```bash\r\npip install rdrpcatch\r\n```\r\n\r\nActivate the environment and download the RdRpCATCH databases:\r\n\r\n```bash \r\nconda activate rdrpcatch\r\nrdrpcatch download --destination_dir path/to/store/databases\r\n```\r\n\r\n* Note 1: The databases are large files and may take some time to download (~ 3 GB).\r\n* Note 2: The databases are stored in the specified directory, and the path is required to run RdRpCATCH.\r\n* Note 3: If you encounter an SSL error while downloading, please try again. The error seems to appear sporadically during testing, and a simple re-initiation of the downloading process seems to fix it. \r\n\r\n## Usage\r\nRdRpCATCH can be used as a CLI tool as follows:\r\n\r\n```bash \r\n# make sure the conda environment is activated\r\n# conda activate rdrpcatch\r\n\r\n# scan the input fasta file with the selected databases\r\nrdrpcatch scan -i path/to/input.fasta -o path/to/output_dir -db_dir path/to/database\r\n```\r\n### input: \r\nThe input file can be one or more nucleotide or protein sequences in multi-fasta format. \r\nThe output directory is where the results will be stored. We recommend specifying the type of the sequence in the command line,\r\nAn optional argument `--seq_type` (nuc or prot) can be used to specify if the input fasta file sequences are nucleotide or amino acid.\r\n\r\n## Commands\r\nThe following two commands are available in RdRpCATCH:  \r\n* [`rdrpcatch scan`](#rdrpcatch-scan)  \r\n* [`rdrpcatch download`](#rdrpcatch-download)\r\n\r\n### rdrpcatch download:\r\nCommand to download pre-compiled databases from Zenodo. 
If the databases are already downloaded in the specified directory\r\n, the command will check for updates and download the latest version if available.\r\n\r\n| Argument | Short Flag | Type | Description                                                 |\r\n|----------|------------|------|-------------------------------------------------------------|\r\n| `--destination_dir` | `-dest` | PATH | Path to the directory to download HMM databases. [required] |\r\n| `--concept-doi` | `` | TEXT | Zenodo Concept DOI for database repository                  |\r\n| `--help` | `` |  | Show help message and exit                                  |\r\n### rdrpcatch scan:\r\nSearch a given input using selected RdRp databases.  \r\n\r\n| Argument | Short Flag | Type | Description |\r\n|----------|------------|------|-------------|\r\n| `--input` | `-i` | FILE | Path to the input FASTA file. [required] |\r\n| `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |\r\n| `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |\r\n| `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot, all |\r\n| `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |\r\n| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |\r\n| `--verbose` | `-v` | FLAG | Print verbose output. |\r\n| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |\r\n| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |\r\n| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |\r\n| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. 
(default: 1e-5) |\r\n| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |\r\n| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |\r\n| `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |\r\n| `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |\r\n| `--bundle` | `-bundle` |  |  Bundle the output files into a single archive. (default: False) |\r\n| `--keep_tmp` | `-keep_tmp` |  | Keep the temporary files generated during the analysis. (default: False) |\r\n\r\n\r\n\r\n#### Output files  \r\nrdrpcatch scan will create a folder with the following structure:\r\n\r\n| Output | Description                                                                  |\r\n|--------|------------------------------------------------------------------------------|\r\n| `{prefix}_rdrpcatch_output_annotated.tsv` | A tab-separated file containing the results of the RdRpCATCH analysis.       |\r\n| `{prefix}_rdrpcatch_fasta` | A directory containing the sequences that were identified as RdRp sequences. |\r\n| `{prefix}_rdrpcatch_plots` | A directory containing the plots generated during the analysis.              |\r\n| `{prefix}_gff_files` | A directory containing the GFF files generated during the analysis. (For now only based on protein sequences) |\r\n| `tmp` | A directory containing temporary files generated during the analysis. 
(Only available if the -keep_tmp flag is used )|\r\n\r\n#### Output table fields\r\nA summary of the results is stored in the `{prefix}_rdrpcatch_output_annotated.tsv` file, which contains the following fields:\r\n| Field | Description                                                                                                         |\r\n|-------|---------------------------------------------------------------------------------------------------------------------|\r\n| `Contig_name` | The name of the contig.                                                                                             |\r\n| `Translated_contig_name (frame)` | The name of the translated contig and the frame of the RdRp sequence.                                               |\r\n| `Sequence_length(AA)` | The length of the RdRp sequence in amino acids.                                                                     |\r\n| `Total_databases_that_the_contig_was_detected(No_of_Profiles)` | The name of databases and the number of profiles that the RdRp sequence was detected by.                            |\r\n| `Best_hit_Database` | The database with the best hit.                                                                                     |\r\n| `Best_hit_profile_name` | The name of the profile with the best hit.                                                                          |\r\n| `Best_hit_profile_length` | The length of the profile with the best hit.                                                                        |\r\n| `Best_hit_e-value` | The e-value of the best hit.                                                                                        |\r\n| `Best_hit_bitscore` | The bitscore of the best hit.                                                                                       |\r\n| `RdRp_from(AA)` | The start position of the RdRp sequence, in relation to the amino acid sequence.                                    
|\r\n| `RdRp_to(AA)` | The end position of the RdRp sequence, in relation to the amino acid sequence.                                      |\r\n| `Best_hit_profile_coverage` | The fraction of the profile that was covered by the RdRp sequence.                                                  |\r\n| `Best_hit_contig_coverage` | The fraction of the contig that was covered by the RdRp sequence. (Based on aminoacid sequence)                     |\r\n| `MMseqs_Taxonomy_2bLCA` | The taxonomy of the RdRp sequence based on MMseqs2 easy-taxonomy module against a custom RefSeq Riboviria database. |\r\n| `MMseqs_TopHit_accession` | The accession of the top hit in the RefSeq Riboviria database.                                                      |\r\n| `MMseqs_TopHit_fident` | The fraction of identical matches of the top hit in the RefSeq Riboviria database.                                  |\r\n| `MMseqs_TopHit_alnlen` | The alignment length of the top hit in the RefSeq Riboviria database.                                               |\r\n| `MMseqs_TopHit_eval` | The e-value of the top hit in the RefSeq Riboviria database.                                                        |\r\n| `MMseqs_TopHit_bitscore` | The bitscore of the top hit in the RefSeq Riboviria database.                                                       |\r\n| `MMseqs_TopHit_qcov` | The query coverage of the top hit in the RefSeq Riboviria database.                                                 |\r\n| `MMseqs_TopHit_lineage` | The lineage of the top hit in the RefSeq Riboviria database.                                                        |\r\n\r\n## Citations\r\nManuscript still in preparation. If you use RdRpCATCH, please cite this GitHub repository \r\nA precompiled version of the used databases is available at Zenodo DOI: [10.5281/zenodo.14358348](https://doi.org/10.5281/zenodo.14358348).  
\r\nIf you use RdRpCATCH, please cite the [underlying third party databases](#supported-databases) :\r\n\r\n## Acknowledgements\r\nRdRpCATCH is a collaborative effort and we would like to thank all the authors and developers of the underlying databases. \r\n\r\n## Contact\r\nDimitris Karapliafis (dimitris.karapliafis@wur.nl), potentially via slack/teams or an issue in the main repo.\r\n\r\n##TODO:\r\n- [ ] loud logging is linking to the utils.py file, not the actual line of code causing the error.\r\n- [ ] drop `db_dir` argument and use global/environment/config variable that is set after running the `download` command\r\n\r\n\r\n## Contributing\r\nTBD up to Dimitris and Anne\r\n\r\n## Licence\r\n[MIT](LICENSE)\r\n
+Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
+<+>UTF-8
+===================================================================
+diff --git a/README.md b/README.md
+--- a/README.md	(revision 2110790421475da92fd4f5e5dbf44f1191829a02)
++++ b/README.md	(date 1747651045997)
+@@ -111,25 +111,25 @@
+ ### rdrpcatch scan:
+ Search a given input using selected RdRp databases.
+-| Argument | Short Flag | Type | Description |
+-|----------|------------|------|-------------|
+-| `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
+-| `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
+-| `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
+-| `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot, all |
+-| `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |
+-| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
+-| `--verbose` | `-v` | FLAG | Print verbose output. |
+-| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
+-| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
+-| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
+-| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
+-| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
+-| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
+-| `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
+-| `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |
+-| `--bundle` | `-bundle` |  |  Bundle the output files into a single archive. (default: False) |
+-| `--keep_tmp` | `-keep_tmp` |  | Keep the temporary files generated during the analysis. (default: False) |
++| Argument | Short Flag | Type | Description                                                                                                                                                                    |
++|----------|------------|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
++| `--input` | `-i` | FILE | Path to the input FASTA file. [required]                                                                                                                                       |
++| `--output` | `-o` | DIRECTORY | Path to the output directory. [required]                                                                                                                                       |
++| `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required]                                                                                                               |
++| `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot_HMM, Zayed_HMM, all |
++| `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases                                                                                             |
++| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown                                                                                                                |
++| `--verbose` | `-v` | FLAG | Print verbose output.                                                                                                                                                          |
++| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5)                                                                                                                               |
++| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5)                                                                                                                     |
++| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5)                                                                                                                        |
++| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5)                                                                                                              |
++| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000)                                                                                                                      |
++| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1)                                                                                                                              |
++| `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400)                                                                                                                        |
++| `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1)                                                                                                                              |
++| `--bundle` | `-bundle` |  | Bundle the output files into a single archive. (default: False)                                                                                                                |
++| `--keep_tmp` | `-keep_tmp` |  | Keep the temporary files generated during the analysis. (default: False)                                                                                                       |

{rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/.idea/workspace.xml RENAMED Viewed

@@ -4,8 +4,8 @@
     <option name="autoReloadType" value="SELECTIVE" />
   </component>
   <component name="ChangeListManager">
-    <list default="true" id="d849e6fa-87f9-4e92-9c33-abef7cc975d3" name="Changes" comment="Updates:&#10;Fixed bug that crushed the script when at least one  pHMM DB does not have a match against the sequence database also for nuc branch">
-      <change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
+    <list default="true" id="d849e6fa-87f9-4e92-9c33-abef7cc975d3" name="Changes" comment="Updates:&#10;Add -overwrite as a flag&#10;Add informative progress statements in cli">
+      <change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/rdrpcatch/cli/args.py" beforeDir="false" afterPath="$PROJECT_DIR$/rdrpcatch/cli/args.py" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/rdrpcatch/rdrpcatch_wrapper.py" beforeDir="false" afterPath="$PROJECT_DIR$/rdrpcatch/rdrpcatch_wrapper.py" afterDir="false" />
     </list>
@@ -40,22 +40,22 @@
     <option name="hideEmptyMiddlePackages" value="true" />
     <option name="showLibraryContents" value="true" />
   </component>
-  <component name="PropertiesComponent">{
-  &quot;keyToString&quot;: {
-    &quot;ASKED_ADD_EXTERNAL_FILES&quot;: &quot;true&quot;,
-    &quot;RunOnceActivity.OpenProjectViewOnStart&quot;: &quot;true&quot;,
-    &quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
-    &quot;ignore.virus.scanning.warn.message&quot;: &quot;true&quot;,
-    &quot;last_opened_file_path&quot;: &quot;C:/Users/karso/PycharmProjects/rdrpcatch_benchmarks&quot;,
-    &quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
-    &quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
-    &quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
-    &quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
-    &quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
-    &quot;settings.editor.selected.configurable&quot;: &quot;preferences.pluginManager&quot;,
-    &quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "ASKED_ADD_EXTERNAL_FILES": "true",
+    "RunOnceActivity.OpenProjectViewOnStart": "true",
+    "RunOnceActivity.ShowReadmeOnStart": "true",
+    "ignore.virus.scanning.warn.message": "true",
+    "last_opened_file_path": "C:/Users/karso/PycharmProjects/testing_approaches",
+    "node.js.detected.package.eslint": "true",
+    "node.js.detected.package.tslint": "true",
+    "node.js.selected.package.eslint": "(autodetect)",
+    "node.js.selected.package.tslint": "(autodetect)",
+    "nodejs_package_manager_path": "npm",
+    "settings.editor.selected.configurable": "preferences.pluginManager",
+    "vue.rearranger.settings.migration": "true"
   }
-}</component>
+}]]></component>
   <component name="RecentsManager">
     <key name="CopyFile.RECENT_KEYS">
       <recent name="C:\Users\karso\PycharmProjects\ColaB-Scan\testing" />
@@ -121,7 +121,10 @@
       <workItem from="1743714892367" duration="21775000" />
       <workItem from="1744200654491" duration="635000" />
       <workItem from="1744241097621" duration="28847000" />
-      <workItem from="1745576502650" duration="11360000" />
+      <workItem from="1745576502650" duration="11691000" />
+      <workItem from="1746005454102" duration="1271000" />
+      <workItem from="1746359600096" duration="3517000" />
+      <workItem from="1747128382581" duration="8962000" />
     </task>
     <task id="LOCAL-00001" summary="First commit: Script for benchmark">
       <option name="closed" value="true" />
@@ -363,7 +366,23 @@
       <option name="project" value="LOCAL" />
       <updated>1744796108058</updated>
     </task>
-    <option name="localTasksCounter" value="31" />
+    <task id="LOCAL-00031" summary="Updates:&#10;Add -overwrite as a flag&#10;Add informative progress statements in cli">
+      <option name="closed" value="true" />
+      <created>1745863439863</created>
+      <option name="number" value="00031" />
+      <option name="presentableId" value="LOCAL-00031" />
+      <option name="project" value="LOCAL" />
+      <updated>1745863439863</updated>
+    </task>
+    <task id="LOCAL-00032" summary="Updates:&#10;Add -overwrite as a flag&#10;Add informative progress statements in cli">
+      <option name="closed" value="true" />
+      <created>1745863445358</created>
+      <option name="number" value="00032" />
+      <option name="presentableId" value="LOCAL-00032" />
+      <option name="project" value="LOCAL" />
+      <updated>1745863445358</updated>
+    </task>
+    <option name="localTasksCounter" value="33" />
     <servers />
   </component>
   <component name="TypeScriptGeneratedFilesManager">
@@ -381,7 +400,6 @@
     </option>
   </component>
   <component name="VcsManagerConfiguration">
-    <MESSAGE value="Commit: Plots and result summary" />
     <MESSAGE value="Commit: File name change" />
     <MESSAGE value="Commit: Upload script and results" />
     <MESSAGE value="Upload Jupyter notebooks and their respective documentation" />
@@ -406,6 +424,7 @@
     <MESSAGE value="Updates:&#10;Optimize fasta writer from O(n*m) to O(n+m)" />
     <MESSAGE value="Updates:&#10;Polishing ReadME&#10;Fixed bug that crushed the script when at least one  pHMM DB does not have a match against the sequence database" />
     <MESSAGE value="Updates:&#10;Fixed bug that crushed the script when at least one  pHMM DB does not have a match against the sequence database also for nuc branch" />
-    <option name="LAST_COMMIT_MESSAGE" value="Updates:&#10;Fixed bug that crushed the script when at least one  pHMM DB does not have a match against the sequence database also for nuc branch" />
+    <MESSAGE value="Updates:&#10;Add -overwrite as a flag&#10;Add informative progress statements in cli" />
+    <option name="LAST_COMMIT_MESSAGE" value="Updates:&#10;Add -overwrite as a flag&#10;Add informative progress statements in cli" />
   </component>
 </project>

{rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rdrpcatch
-Version: 0.0.6
+Version: 0.0.7
 Dynamic: Summary
 Project-URL: Home, https://github.com/dimitris-karapliafis/RdRpCATCH
 Project-URL: Source, https://github.com/dimitris-karapliafis/RdRpCATCH
@@ -36,7 +36,7 @@ that were positive for each sequence across all pHMM databases, and taxonomic in
 ** The tool has been modified to use [rolypoly](https://code.jgi.doe.gov/UNeri/rolypoly) code/approaches **
-![rdrpcatch_flowchart_v0.png](images%2Frdrpcatch_flowchart_v0.png)
+![RdRpCATCH illustration](images%2Frdrpcatch_illustration.png)
 ### Supported databases
 - NeoRdRp <sup>1</sup> : 1182 pHMMs
@@ -133,25 +133,25 @@ Command to download pre-compiled databases from Zenodo. If the databases are alr
 ### rdrpcatch scan:
 Search a given input using selected RdRp databases.
-| Argument | Short Flag | Type | Description |
-|----------|------------|------|-------------|
-| `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
-| `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
-| `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
-| `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot, all |
-| `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |
-| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
-| `--verbose` | `-v` | FLAG | Print verbose output. |
-| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
-| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
-| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
-| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
-| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
-| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
-| `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
-| `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |
-| `--bundle` | `-bundle` |  |  Bundle the output files into a single archive. (default: False) |
-| `--keep_tmp` | `-keep_tmp` |  | Keep the temporary files generated during the analysis. (default: False) |
+| Argument | Short Flag | Type | Description                                                                                                                                                                    |
+|----------|------------|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `--input` | `-i` | FILE | Path to the input FASTA file. [required]                                                                                                                                       |
+| `--output` | `-o` | DIRECTORY | Path to the output directory. [required]                                                                                                                                       |
+| `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required]                                                                                                               |
+| `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot_HMM, Zayed_HMM, all |
+| `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases                                                                                             |
+| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown                                                                                                                |
+| `--verbose` | `-v` | FLAG | Print verbose output.                                                                                                                                                          |
+| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5)                                                                                                                               |
+| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5)                                                                                                                     |
+| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5)                                                                                                                        |
+| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5)                                                                                                              |
+| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000)                                                                                                                      |
+| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1)                                                                                                                              |
+| `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400)                                                                                                                        |
+| `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1)                                                                                                                              |
+| `--bundle` | `-bundle` |  | Bundle the output files into a single archive. (default: False)                                                                                                                |
+| `--keep_tmp` | `-keep_tmp` |  | Keep the temporary files generated during the analysis. (default: False)                                                                                                       |

{rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/README.md RENAMED Viewed

@@ -14,7 +14,7 @@ that were positive for each sequence across all pHMM databases, and taxonomic in
 ** The tool has been modified to use [rolypoly](https://code.jgi.doe.gov/UNeri/rolypoly) code/approaches **
-![rdrpcatch_flowchart_v0.png](images%2Frdrpcatch_flowchart_v0.png)
+![RdRpCATCH illustration](images%2Frdrpcatch_illustration.png)
 ### Supported databases
 - NeoRdRp <sup>1</sup> : 1182 pHMMs
@@ -111,25 +111,25 @@ Command to download pre-compiled databases from Zenodo. If the databases are alr
 ### rdrpcatch scan:
 Search a given input using selected RdRp databases.
-| Argument | Short Flag | Type | Description |
-|----------|------------|------|-------------|
-| `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
-| `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
-| `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
-| `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot, all |
-| `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |
-| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
-| `--verbose` | `-v` | FLAG | Print verbose output. |
-| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
-| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
-| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
-| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
-| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
-| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
-| `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
-| `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |
-| `--bundle` | `-bundle` |  |  Bundle the output files into a single archive. (default: False) |
-| `--keep_tmp` | `-keep_tmp` |  | Keep the temporary files generated during the analysis. (default: False) |
+| Argument | Short Flag | Type | Description                                                                                                                                                                    |
+|----------|------------|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `--input` | `-i` | FILE | Path to the input FASTA file. [required]                                                                                                                                       |
+| `--output` | `-o` | DIRECTORY | Path to the output directory. [required]                                                                                                                                       |
+| `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required]                                                                                                               |
+| `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot_HMM, Zayed_HMM, all |
+| `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases                                                                                             |
+| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown                                                                                                                |
+| `--verbose` | `-v` | FLAG | Print verbose output.                                                                                                                                                          |
+| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5)                                                                                                                               |
+| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5)                                                                                                                     |
+| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5)                                                                                                                        |
+| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5)                                                                                                              |
+| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000)                                                                                                                      |
+| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1)                                                                                                                              |
+| `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400)                                                                                                                        |
+| `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1)                                                                                                                              |
+| `--bundle` | `-bundle` |  | Bundle the output files into a single archive. (default: False)                                                                                                                |
+| `--keep_tmp` | `-keep_tmp` |  | Keep the temporary files generated during the analysis. (default: False)                                                                                                       |

rdrpcatch-0.0.7/images/rdrpcatch_illustration.png ADDED Viewed

Binary file

{rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "rdrpcatch"
-version = "0.0.6"
+version = "0.0.7"
 authors = [
     {name = "Dimitris Karapliafis", email = "dimitris.karapliafis@wur.nl"},
     {name = "Uri Neri", email = "uneri@lbl.gov"},

{rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/cli/args.py RENAMED Viewed

@@ -26,7 +26,7 @@ def parse_comma_separated_options(ctx, param, value):
         return ['all']
     allowed_choices = ['RVMT', 'NeoRdRp', 'NeoRdRp.2.1', 'TSA_Olendraite_fam', 'TSA_Olendraite_gen', 'RDRP-scan',
-                       'Lucaprot', 'all']
+                       'Lucaprot_HMM, Zayed_HMM', 'all']
     lower_choices = [choice.lower() for choice in allowed_choices]
     options = value.split(',')
     lower_options = [option.lower() for option in options]
@@ -73,7 +73,7 @@ def cli():
               callback=parse_comma_separated_options,
               default="all",
               help="Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1,"
-                   " TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot, all")
+                   " TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot_HMM, Zayed_HMM, all")
 @click.option("--custom-dbs",
               help="Path to directory containing custom MSAs/pHMM files to use as additional databases",
               type=click.Path(exists=True, path_type=Path))

{rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/rdrpcatch_wrapper.py RENAMED Viewed

@@ -170,7 +170,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
     logger.loud_log("Fetching HMM databases...")
-    ## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot
+    ## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot_HMM,Zayed_HMM
     rvmt_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("RVMT")
     if verbose:
         logger.loud_log(f"RVMT HMM database fetched from: {rvmt_hmm_db}")
@@ -202,19 +202,24 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
         logger.loud_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
     else:
         logger.silent_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
-    lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot")
+    lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot_HMM")
     if verbose:
         logger.loud_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
     else:
         logger.silent_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
+    zayed_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Zayed_HMM")
+    if verbose:
+        logger.loud_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
+    else:
+        logger.silent_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
     db_name_list = []
     db_path_list = []
     ## Set up HMM databases
     if db_options == ['all']:
-        db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "Lucaprot"]
-        db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db]
+        db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "Lucaprot_HMM", "Zayed_HMM"]
+        db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db, zayed_hmm_db]
     else:
         for db in db_options:
@@ -236,9 +241,12 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
             elif db == "RDRP-scan".lower():
                 db_name_list.append("RDRP-scan")
                 db_path_list.append(rdrpscan_hmm_db)
-            elif db == "Lucaprot".lower():
-                db_name_list.append("Lucaprot")
+            elif db == "Lucaprot_HMM".lower():
+                db_name_list.append("Lucaprot_HMM")
                 db_path_list.append(lucaprot_hmm_db)
+            elif db == "Zayed_HMM".lower():
+                db_name_list.append("Zayed_HMM")
+                db_path_list.append(zayed_hmm_db)
             else:
                 raise Exception(f"Invalid database option: {db}")