rdrpcatch 0.0.6__tar.gz → 0.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. rdrpcatch-0.0.7/.idea/shelf/Uncommitted_changes_before_Update_at_19_05_2025_13_00_[Changes]/shelved.patch +145 -0
  2. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/.idea/workspace.xml +40 -21
  3. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/PKG-INFO +21 -21
  4. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/README.md +20 -20
  5. rdrpcatch-0.0.7/images/rdrpcatch_illustration.png +0 -0
  6. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/pyproject.toml +1 -1
  7. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/cli/args.py +2 -2
  8. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/rdrpcatch_wrapper.py +14 -6
  9. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/.gitignore +0 -0
  10. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/.idea/.gitignore +0 -0
  11. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/.idea/.name +0 -0
  12. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/.idea/ColaB-Scan.iml +0 -0
  13. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/.idea/inspectionProfiles/profiles_settings.xml +0 -0
  14. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/.idea/misc.xml +0 -0
  15. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/.idea/modules.xml +0 -0
  16. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/.idea/vcs.xml +0 -0
  17. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/LICENSE +0 -0
  18. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/dependencies/rdrpcatch_test_env.yaml +0 -0
  19. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/dependencies/requirements.txt +0 -0
  20. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/images/rdrpcatch_flowchart_v0.png +0 -0
  21. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/meta.yaml +0 -0
  22. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/__init__.py +0 -0
  23. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/cli/__init__.py +0 -0
  24. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/rdrpcatch_scripts/__init__.py +0 -0
  25. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/rdrpcatch_scripts/fetch_dbs.py +0 -0
  26. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/rdrpcatch_scripts/format_pyhmmer_out.py +0 -0
  27. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/rdrpcatch_scripts/gui.py +0 -0
  28. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/rdrpcatch_scripts/mmseqs_tax.py +0 -0
  29. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/rdrpcatch_scripts/paths.py +0 -0
  30. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/rdrpcatch_scripts/plot.py +0 -0
  31. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/rdrpcatch_scripts/run_pyhmmer.py +0 -0
  32. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/rdrpcatch_scripts/run_seqkit.py +0 -0
  33. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/rdrpcatch/rdrpcatch_scripts/utils.py +0 -0
  34. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/meta_4test.yaml +0 -0
  35. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch-1.0.0-py312_2.tar.bz2 +0 -0
  36. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_env.yaml +0 -0
  37. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/test_translate_gff_files/test_translate_full_aminoacid_rdrpcatch.gff3 +0 -0
  38. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/test_translate_rdrpcatch_fasta/test_translate_full_aminoacid_contigs.fasta +0 -0
  39. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/test_translate_rdrpcatch_fasta/test_translate_trimmed_aminoacid_contigs.fasta +0 -0
  40. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/test_translate_rdrpcatch_output_annotated.tsv +0 -0
  41. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/test_translate_rdrpcatch_plots/test_translate_ID_score_plot.html +0 -0
  42. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/test_translate_rdrpcatch_plots/test_translate_contig_coverage_plot.html +0 -0
  43. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/test_translate_rdrpcatch_plots/test_translate_evalue_plot.html +0 -0
  44. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/test_translate_rdrpcatch_plots/test_translate_norm_bitscore_contig_plot.html +0 -0
  45. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/test_translate_rdrpcatch_plots/test_translate_norm_bitscore_plot_profile.html +0 -0
  46. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/test_translate_rdrpcatch_plots/test_translate_profile_coverage_plot.html +0 -0
  47. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/test_translate_rdrpcatch_plots/test_translate_score_plot.html +0 -0
  48. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/test_translate_rdrpcatch_plots/test_translate_upset_plot.png +0 -0
  49. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/best_hit_hmm_output/test_translate_Lucaprot_hmm_output_best_hit.txt +0 -0
  50. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/best_hit_hmm_output/test_translate_NeoRdRp.2.1_hmm_output_best_hit.txt +0 -0
  51. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/best_hit_hmm_output/test_translate_NeoRdRp_hmm_output_best_hit.txt +0 -0
  52. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/best_hit_hmm_output/test_translate_RDRP-scan_hmm_output_best_hit.txt +0 -0
  53. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/best_hit_hmm_output/test_translate_RVMT_hmm_output_best_hit.txt +0 -0
  54. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/best_hit_hmm_output/test_translate_TSA_Olendraite_fam_hmm_output_best_hit.txt +0 -0
  55. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/best_hit_hmm_output/test_translate_TSA_Olendraite_gen_hmm_output_best_hit.txt +0 -0
  56. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/formatted_hmm_output/test_translate_Lucaprot_hmm_output_formatted.txt +0 -0
  57. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/formatted_hmm_output/test_translate_NeoRdRp.2.1_hmm_output_formatted.txt +0 -0
  58. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/formatted_hmm_output/test_translate_NeoRdRp_hmm_output_formatted.txt +0 -0
  59. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/formatted_hmm_output/test_translate_RDRP-scan_hmm_output_formatted.txt +0 -0
  60. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/formatted_hmm_output/test_translate_RVMT_hmm_output_formatted.txt +0 -0
  61. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/formatted_hmm_output/test_translate_TSA_Olendraite_fam_hmm_output_formatted.txt +0 -0
  62. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/formatted_hmm_output/test_translate_TSA_Olendraite_gen_hmm_output_formatted.txt +0 -0
  63. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_Lucaprot_hmmsearch_output.custom.tsv +0 -0
  64. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_Lucaprot_hmmsearch_output.txt +0 -0
  65. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_NeoRdRp.2.1_hmmsearch_output.custom.tsv +0 -0
  66. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_NeoRdRp.2.1_hmmsearch_output.txt +0 -0
  67. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_NeoRdRp_hmmsearch_output.custom.tsv +0 -0
  68. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_NeoRdRp_hmmsearch_output.txt +0 -0
  69. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_RDRP-scan_hmmsearch_output.custom.tsv +0 -0
  70. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_RDRP-scan_hmmsearch_output.txt +0 -0
  71. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_RVMT_hmmsearch_output.custom.tsv +0 -0
  72. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_RVMT_hmmsearch_output.txt +0 -0
  73. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_TSA_Olendraite_fam_hmmsearch_output.custom.tsv +0 -0
  74. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_TSA_Olendraite_fam_hmmsearch_output.txt +0 -0
  75. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_TSA_Olendraite_gen_hmmsearch_output.custom.tsv +0 -0
  76. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/hmm_output/test_translate_TSA_Olendraite_gen_hmmsearch_output.txt +0 -0
  77. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/mmseqs_e_search_output/test_translate_mmseqs_e_search.tsv +0 -0
  78. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/mmseqs_tax_output/test_translate_mmseqs_tax_lca.tsv +0 -0
  79. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/mmseqs_tax_output/test_translate_mmseqs_tax_report +0 -0
  80. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/mmseqs_tax_output/test_translate_mmseqs_tax_tophit_aln +0 -0
  81. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/mmseqs_tax_output/test_translate_mmseqs_tax_tophit_report +0 -0
  82. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/mmseqs_tax_output/tmp/16608414482057878997/easy-taxonomy.sh +0 -0
  83. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/test_translate_logs/test_translate_mmseqs_e_search.log +0 -0
  84. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/test_translate_logs/test_translate_mmseqs_tax.log +0 -0
  85. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/test_translate_logs/test_translate_rdrpcatch.log +0 -0
  86. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/tsv_files/test_translate_combined.tsv +0 -0
  87. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/tsv_files/test_translate_rdrpcatch_output.tsv +0 -0
  88. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/rdrpcatch_test_translate/tmp/tsv_files/test_translate_upset_data.tsv +0 -0
  89. {rdrpcatch-0.0.6 → rdrpcatch-0.0.7}/testing/test_translate.fasta +0 -0
@@ -0,0 +1,145 @@
1
+ Index: rdrpcatch/rdrpcatch_wrapper.py
2
+ IDEA additional info:
3
+ Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP
4
+ <+>\"\"\"\r\nWrapper for the RdRpCATCH package.\r\n\r\n\"\"\"\r\nimport os\r\nfrom pathlib import Path\r\nfrom rich.console import Console\r\nimport warnings\r\nwarnings.filterwarnings(\"ignore\", category=UserWarning, module=\"numpy\") # see https://moyix.blogspot.com/2022/09/someones-been-messing-with-my-subnormals.html\r\n\r\ndef main():\r\n pass\r\n\r\n\r\n# def run_gui():\r\n#\r\n# gui_runner = gui.colabscanner_gui()\r\n# gui_runner.run()\r\n\r\n\r\ndef bundle_results(output_dir, prefix):\r\n \"\"\"\r\n Bundle the results into a tar.gz file.\r\n\r\n :param output_dir: Path to the output directory.\r\n :type output_dir: str\r\n :param prefix: Prefix for the output files.\r\n :type prefix: str\r\n :return: Path to the bundled file\r\n :rtype: str\r\n \"\"\"\r\n import tarfile\r\n import datetime\r\n \r\n # Create timestamp for the archive name\r\n timestamp = datetime.datetime.now().strftime(\"%Y%m%d_%H%M%S\")\r\n archive_name = f\"{prefix}_rdrpcatch_results_{timestamp}.tar.gz\"\r\n archive_path = os.path.join(output_dir, archive_name)\r\n \r\n # Create tar.gz archive\r\n with tarfile.open(archive_path, \"w:gz\") as tar:\r\n # Add all relevant directories\r\n for dir_name in [f\"{prefix}_rdrpcatch_fasta\", f\"{prefix}_rdrpcatch_plots\", \r\n f\"{prefix}_gff_files\", \"tmp\"]:\r\n dir_path = os.path.join(output_dir, dir_name)\r\n if os.path.exists(dir_path):\r\n tar.add(dir_path, arcname=dir_name)\r\n \r\n # Add the main output file\r\n output_file = os.path.join(output_dir, f\"{prefix}_rdrpcatch_output_annotated.tsv\")\r\n if os.path.exists(output_file):\r\n tar.add(output_file, arcname=os.path.basename(output_file))\r\n \r\n return archive_path\r\n\r\ndef run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,incdomE,domE,incE,z, cpus, length_thr, gen_code, bundle, keep_tmp, overwrite):\r\n \"\"\"\r\n Run RdRpCATCH scan.\r\n\r\n :param input_file: Path to the input FASTA file.\r\n :type input_file: str\r\n :param output_dir: Path to the 
output directory.\r\n :type output_dir: str\r\n :param db_options: List of databases to search against.\r\n :type db_options: list\r\n :param db_dir: Path to the directory containing RdRpCATCH databases.\r\n :type db_dir: str\r\n :param seq_type: Type of sequence (prot or nuc).\r\n :type seq_type: str\r\n :param verbose: Whether to print verbose output.\r\n :type verbose: bool\r\n :param e: E-value threshold for HMMsearch.\r\n :type e: float\r\n :param incdomE: Inclusion domain E-value threshold for HMMsearch.\r\n :type incdomE: float\r\n :param domE: Domain E-value threshold for HMMsearch.\r\n :type domE: float\r\n :param incE: Inclusion E-value threshold for HMMsearch.\r\n :type incE: float\r\n :param z: Number of sequences to search against.\r\n :type z: int\r\n :param cpus: Number of CPUs to use for HMMsearch.\r\n :type cpus: int\r\n :param length_thr: Minimum length threshold for seqkit seq.\r\n :type length_thr: int\r\n :param gen_code: Genetic code to use for translation.\r\n :type gen_code: int\r\n :return: None\r\n \"\"\"\r\n from .rdrpcatch_scripts import utils\r\n from .rdrpcatch_scripts import paths\r\n from .rdrpcatch_scripts import run_pyhmmer\r\n from .rdrpcatch_scripts import fetch_dbs\r\n from .rdrpcatch_scripts import format_pyhmmer_out\r\n from .rdrpcatch_scripts import run_seqkit\r\n from .rdrpcatch_scripts import plot\r\n import polars as pl\r\n from .rdrpcatch_scripts import mmseqs_tax\r\n import datetime\r\n \r\n ## Ignore warnings\r\n warnings.filterwarnings(\"ignore\", category=FutureWarning)\r\n warnings.filterwarnings(\"ignore\", category=UserWarning)\r\n\r\n ## Set output directories\r\n prefix = Path(input_file).stem\r\n outputs = paths.rdrpcatch_output(prefix, Path(output_dir))\r\n\r\n ## Set up logger\r\n log_file = outputs.log_file\r\n if not os.path.exists(outputs.output_dir):\r\n os.makedirs(outputs.output_dir)\r\n elif os.path.exists(outputs.output_dir) and overwrite:\r\n # If the output directory already exists and 
force_overwrite is True, remove the existing directory\r\n import shutil\r\n shutil.rmtree(outputs.output_dir)\r\n os.makedirs(outputs.output_dir)\r\n outputs = paths.rdrpcatch_output(prefix, Path(output_dir))\r\n else:\r\n raise FileExistsError(f\"Output directory already exists: {outputs.output_dir}, Please choose a different directory\"\r\n f\" or activate the -overwrite flag to overwrite the contents of the directory.\")\r\n\r\n if not os.path.exists(outputs.log_dir):\r\n os.makedirs(outputs.log_dir)\r\n\r\n logger = utils.Logger(log_file)\r\n\r\n logger.silent_log(f\"Input File: {input_file}\")\r\n logger.silent_log(f\"Output Directory: {output_dir}\")\r\n logger.silent_log(f\"Databases: {db_options}\")\r\n logger.silent_log(f\"Database Directory: {db_dir}\")\r\n logger.silent_log(f\"Sequence Type: {seq_type}\")\r\n logger.silent_log(f\"Verbose Mode: {'ON' if verbose else 'OFF'}\")\r\n logger.silent_log(f\"E-value: {e}\")\r\n logger.silent_log(f\"Inclusion E-value: {incE}\")\r\n logger.silent_log(f\"Domain E-value: {domE}\")\r\n logger.silent_log(f\"Inclusion Domain E-value: {incdomE}\")\r\n logger.silent_log(f\"Z-value: {z}\")\r\n logger.silent_log(f\"CPUs: {cpus}\")\r\n logger.silent_log(f\"Length Threshold: {length_thr}\")\r\n logger.silent_log(f\"Genetic Code: {gen_code}\")\r\n logger.silent_log(f\"Bundle Results: {'ON' if bundle else 'OFF'}\")\r\n logger.silent_log(f\"Save Temporary Files: {'ON' if keep_tmp else 'OFF'}\")\r\n\r\n ## Start time\r\n start_time = logger.start_timer()\r\n\r\n ## Check fasta validity\r\n if not utils.fasta_checker(input_file, logger).check_fasta_validity():\r\n raise Exception(\"Invalid fasta file.\")\r\n else:\r\n if verbose:\r\n logger.loud_log(f\"Valid fasta file: {input_file}\")\r\n else:\r\n logger.silent_log(f\"Valid fasta file: {input_file}\")\r\n\r\n ## Check sequence type\r\n if not seq_type:\r\n seq_type = utils.fasta_checker(input_file, logger).check_seq_type()\r\n if verbose:\r\n logger.loud_log(f\"Sequence type: 
{seq_type}\")\r\n else:\r\n logger.silent_log(f\"Sequence type: {seq_type}\")\r\n\r\n ## Check sequence length in .fasta files, if >100000, pyHMMER breaks\r\n if seq_type == 'nuc':\r\n utils.fasta_checker(input_file, logger).check_seq_length(300000)\r\n if seq_type == 'prot':\r\n utils.fasta_checker(input_file, logger).check_seq_length(100000)\r\n\r\n logger.loud_log(\"Fetching HMM databases...\")\r\n\r\n ## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot\r\n rvmt_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"RVMT\")\r\n if verbose:\r\n logger.loud_log(f\"RVMT HMM database fetched from: {rvmt_hmm_db}\")\r\n else:\r\n logger.silent_log(f\"RVMT HMM database fetched from: {rvmt_hmm_db}\")\r\n neordrp_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"NeoRdRp\")\r\n if verbose:\r\n logger.loud_log(f\"NeoRdRp HMM database fetched from: {neordrp_hmm_db}\")\r\n else:\r\n logger.silent_log(f\"NeoRdRp HMM database fetched from: {neordrp_hmm_db}\")\r\n neordrp_2_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"NeoRdRp.2.1\")\r\n if verbose:\r\n logger.loud_log(f\"NeoRdRp.2.1 HMM database fetched from: {neordrp_2_hmm_db}\")\r\n else:\r\n logger.silent_log(f\"NeoRdRp.2.1 HMM database fetched from: {neordrp_2_hmm_db}\")\r\n tsa_olen_fam_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"TSA_Olendraite_fam\")\r\n if verbose:\r\n logger.loud_log(f\"TSA_Olendraite_fam HMM database fetched from: {tsa_olen_fam_hmm_db}\")\r\n else:\r\n logger.silent_log(f\"TSA_Olendraite_fam HMM database fetched from: {tsa_olen_fam_hmm_db}\")\r\n\r\n tsa_olen_gen_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"TSA_Olendraite_gen\")\r\n if verbose:\r\n logger.loud_log(f\"TSA_Olendraite HMM database fetched from: {tsa_olen_gen_hmm_db}\")\r\n else:\r\n logger.silent_log(f\"TSA_Olendraite HMM database fetched from: {tsa_olen_gen_hmm_db}\")\r\n rdrpscan_hmm_db = 
fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"RDRP-scan\")\r\n if verbose:\r\n logger.loud_log(f\"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}\")\r\n else:\r\n logger.silent_log(f\"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}\")\r\n lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path(\"Lucaprot\")\r\n if verbose:\r\n logger.loud_log(f\"Lucaprot HMM database fetched from: {lucaprot_hmm_db}\")\r\n else:\r\n logger.silent_log(f\"Lucaprot HMM database fetched from: {lucaprot_hmm_db}\")\r\n\r\n db_name_list = []\r\n db_path_list = []\r\n\r\n ## Set up HMM databases\r\n if db_options == ['all']:\r\n db_name_list = [\"RVMT\", \"NeoRdRp\", \"NeoRdRp.2.1\", \"TSA_Olendraite_fam\",\"TSA_Olendraite_gen\", \"RDRP-scan\", \"Lucaprot\"]\r\n db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db]\r\n\r\n else:\r\n for db in db_options:\r\n if db == \"RVMT\".lower():\r\n db_name_list.append(\"RVMT\")\r\n db_path_list.append(rvmt_hmm_db)\r\n elif db == \"NeoRdRp\".lower():\r\n db_name_list.append(\"NeoRdRp\")\r\n db_path_list.append(neordrp_hmm_db)\r\n elif db == \"NeoRdRp.2.1\":\r\n db_name_list.append(\"NeoRdRp.2.1\".lower())\r\n db_path_list.append(neordrp_2_hmm_db)\r\n elif db == \"TSA_Olendraite_fam\".lower():\r\n db_name_list.append(\"TSA_Olendraite_fam\")\r\n db_path_list.append(tsa_olen_fam_hmm_db)\r\n elif db == \"TSA_Olendraite_gen\".lower():\r\n db_name_list.append(\"TSA_Olendraite_gen\")\r\n db_path_list.append(tsa_olen_gen_hmm_db)\r\n elif db == \"RDRP-scan\".lower():\r\n db_name_list.append(\"RDRP-scan\")\r\n db_path_list.append(rdrpscan_hmm_db)\r\n elif db == \"Lucaprot\".lower():\r\n db_name_list.append(\"Lucaprot\")\r\n db_path_list.append(lucaprot_hmm_db)\r\n else:\r\n raise Exception(f\"Invalid database option: {db}\")\r\n\r\n # Fetch mmseqs database\r\n\r\n\r\n logger.loud_log(\"Fetching Mmseqs2 databases...\")\r\n\r\n mmseqs_db_path = 
fetch_dbs.db_fetcher(db_dir).fetch_mmseqs_db_path(\"mmseqs_refseq_riboviria_20250211\")\r\n\r\n if verbose:\r\n logger.loud_log(f\"mmseqs database fetched from: {mmseqs_db_path}\")\r\n else:\r\n logger.silent_log(f\"mmseqs database fetched from: {mmseqs_db_path}\")\r\n\r\n if not os.path.exists(outputs.hmm_output_dir):\r\n outputs.hmm_output_dir.mkdir(parents=True)\r\n\r\n if not os.path.exists(outputs.formatted_hmm_output_dir):\r\n outputs.formatted_hmm_output_dir.mkdir(parents=True)\r\n\r\n if not os.path.exists(outputs.tsv_outdir):\r\n outputs.tsv_outdir.mkdir(parents=True)\r\n\r\n if not os.path.exists(outputs.plot_outdir):\r\n outputs.plot_outdir.mkdir(parents=True)\r\n\r\n if not os.path.exists(outputs.tmp_dir):\r\n outputs.tmp_dir.mkdir(parents=True)\r\n\r\n logger.loud_log(\"Databases fetched successfully.\")\r\n\r\n if seq_type == 'nuc':\r\n logger.loud_log(\"Nucleotide sequence detected.\")\r\n\r\n set_dict = {}\r\n translated_set_dict = {}\r\n df_list = []\r\n\r\n ## Filter out sequences with length less than 400 bp with seqkit\r\n logger.loud_log(\"Filtering out sequences with length less than 400 bp.\")\r\n\r\n if not os.path.exists(outputs.seqkit_seq_output_dir):\r\n outputs.seqkit_seq_output_dir.mkdir(parents=True)\r\n\r\n run_seqkit.seqkit(input_file, outputs.seqkit_seq_output_path, log_file, threads=cpus, logger=logger).run_seqkit_seq(length_thr)\r\n if verbose:\r\n logger.loud_log(f\"Filtered sequence written to: { outputs.seqkit_seq_output_path}\")\r\n else:\r\n logger.silent_log(f\"Filtered sequence written to: { outputs.seqkit_seq_output_path}\")\r\n\r\n ## Translate nucleotide sequences to protein sequences with seqkit\r\n logger.loud_log(\"Translating nucleotide sequences to protein sequences.\")\r\n\r\n if not os.path.exists(outputs.seqkit_translate_output_dir):\r\n outputs.seqkit_translate_output_dir.mkdir(parents=True)\r\n\r\n run_seqkit.seqkit(outputs.seqkit_seq_output_path, outputs.seqkit_translate_output_path, log_file, threads=cpus, 
logger=logger).run_seqkit_translate(gen_code, 6)\r\n\r\n if verbose:\r\n logger.loud_log(f\"Translated sequence written to: {outputs.seqkit_translate_output_path}\")\r\n else:\r\n logger.silent_log(f\"Translated sequence written to: {outputs.seqkit_translate_output_path}\")\r\n\r\n for db_name,db_path in zip(db_name_list, db_path_list):\r\n logger.loud_log(f\"Running HMMsearch for {db_name} database.\")\r\n\r\n if verbose:\r\n logger.loud_log(f\"HMM output path: {outputs.hmm_output_path(db_name)}\")\r\n else:\r\n logger.silent_log(f\"HMM output path: {outputs.hmm_output_path(db_name)}\")\r\n\r\n start_hmmsearch_time = logger.start_timer()\r\n run_pyhmmer.pyhmmsearch(outputs.hmm_output_path(db_name), outputs.seqkit_translate_output_path, db_path, cpus, e, incdomE, domE, incE,\r\n z).run_pyhmmsearch()\r\n end_hmmsearch_time = logger.stop_timer(start_hmmsearch_time, verbose)\r\n if verbose:\r\n logger.loud_log(f\"{db_name} HMMsearch Runtime: {end_hmmsearch_time}\")\r\n else:\r\n logger.silent_log(f\"{db_name} HMMsearch Runtime: {end_hmmsearch_time}\")\r\n\r\n if verbose:\r\n logger.loud_log(f\"Pyhmmer output written to: {outputs.hmm_output_path(db_name)}\")\r\n else:\r\n logger.silent_log(f\"Pyhmmer output written to: {outputs.hmm_output_path(db_name)}\")\r\n\r\n if not os.path.exists(outputs.formatted_hmm_output_dir):\r\n outputs.formatted_hmm_output_dir.mkdir(parents=True)\r\n\r\n format_pyhmmer_out.hmmsearch_formatter(outputs.hmm_output_path(db_name), outputs.formatted_hmm_output_path(db_name), seq_type)\r\n\r\n if verbose:\r\n logger.loud_log(f\"Formatted Pyhmmer output written to: {outputs.formatted_hmm_output_path(db_name)}\")\r\n else:\r\n logger.silent_log(f\"Formatted Pyhmmer output written to: {outputs.formatted_hmm_output_path(db_name)}\")\r\n if not os.path.exists(outputs.best_hit_dir):\r\n outputs.best_hit_dir.mkdir(parents=True)\r\n\r\n format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name), seq_type, 
logger).highest_bitscore_hits(\r\n outputs.best_hit_path(db_name))\r\n if verbose:\r\n logger.loud_log(f\"Highest Bitscore hits written to: {outputs.best_hit_path(db_name)}\")\r\n else:\r\n logger.silent_log(f\"Highest Bitscore hits written to: {outputs.best_hit_path(db_name)}\")\r\n\r\n set_dict[db_name] = format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name),\r\n seq_type, logger).hmm_to_contig_set()\r\n translated_set_dict[db_name] = format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name),\r\n 'prot', logger).hmm_to_contig_set()\r\n\r\n # Convert to dataframe, add db_name column and append to df_list\r\n df = pl.read_csv(outputs.best_hit_path(db_name), separator='\\t')\r\n df = df.with_columns([\r\n pl.lit(db_name).alias('db_name')\r\n ])\r\n df_list.append(df)\r\n\r\n logger.loud_log(f\"HMMsearch for {db_name} completed.\")\r\n\r\n logger.loud_log(\"HMMsearch completed.\")\r\n\r\n if not os.path.exists(outputs.plot_outdir):\r\n outputs.plot_outdir.mkdir(parents=True)\r\n\r\n if not os.path.exists(outputs.tsv_outdir):\r\n outputs.tsv_outdir.mkdir(parents=True)\r\n\r\n logger.loud_log(\"Consolidating results.\")\r\n\r\n # Combine all the dataframes in the list\r\n combined_df = pl.concat(df_list, how='vertical_relaxed')\r\n # Write the combined dataframe to a tsv file\r\n for col in ['E-value', 'score', 'norm_bitscore_profile', 'norm_bitscore_contig',\r\n 'ID_score', 'profile_coverage', 'contig_coverage']:\r\n combined_df = combined_df.with_columns([\r\n pl.col(col).cast(pl.Float64)\r\n ])\r\n\r\n\r\n combined_df.write_csv(outputs.combined_tsv_path, separator=\"\\t\")\r\n\r\n # Check if the combined dataframe is empty\r\n if combined_df.is_empty():\r\n logger.loud_log(\"No hits found by RdRpCATCH. 
Exiting.\")\r\n return None\r\n\r\n # Generate upset plot\r\n logger.loud_log(\"Generating plots.\")\r\n\r\n if len(db_name_list) > 1:\r\n if verbose:\r\n logger.loud_log(\"Generating upset plot.\")\r\n else:\r\n logger.silent_log(\"Generating upset plot.\")\r\n\r\n plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).upset_plotter(set_dict)\r\n\r\n\r\n if verbose:\r\n logger.loud_log(f\"Combined dataframe written to: {outputs.combined_tsv_path}\")\r\n else:\r\n logger.silent_log(f\"Combined dataframe written to: {outputs.combined_tsv_path}\")\r\n # Generate e-value plot\r\n plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_evalue(combined_df)\r\n # Generate score plot\r\n plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_score(combined_df)\r\n # Generate normalized bitscore plot\r\n plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_norm_bitscore_profile(combined_df)\r\n # Generate normalized bitscore contig plot\r\n plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_norm_bitscore_contig(combined_df)\r\n # Generate ID score plot\r\n plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_ID_score(combined_df)\r\n # Generate Profile coverage plot\r\n plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_profile_coverage(combined_df)\r\n # Generate contig coverage plot\r\n plot.Plotter(outputs.plot_outdir, outputs.tsv_outdir, prefix).plot_contig_coverage(combined_df)\r\n # Extract all the contigs\r\n combined_set = set.union(*[value for value in set_dict.values()])\r\n translated_combined_set = set.union(*[value for value in translated_set_dict.values()])\r\n\r\n logger.loud_log(\"Extracting RdRp contigs from the input file.\")\r\n\r\n # Write a fasta file with all the contigs\r\n if not os.path.exists(outputs.fasta_output_dir):\r\n outputs.fasta_output_dir.mkdir(parents=True)\r\n\r\n 
utils.fasta(input_file).write_fasta(utils.fasta(input_file).extract_contigs(combined_set), outputs.fasta_nuc_out_path)\r\n\r\n utils.fasta(outputs.seqkit_translate_output_path).write_fasta(utils.fasta(outputs.seqkit_translate_output_path).extract_contigs(translated_combined_set),\r\n outputs.fasta_prot_out_path)\r\n\r\n if not os.path.exists(outputs.gff_output_dir):\r\n outputs.gff_output_dir.mkdir(parents=True)\r\n hmm_writer = format_pyhmmer_out.hmmsearch_output_writter(logger)\r\n hmm_writer.write_hmmsearch_hits(outputs.combined_tsv_path, seq_type, outputs.rdrpcatch_output_tsv, outputs.gff_output_path)\r\n rdrp_coords_list = hmm_writer.get_rdrp_coords(outputs.rdrpcatch_output_tsv,seq_type)\r\n utils.fasta(outputs.seqkit_translate_output_path, logger).write_fasta_coords(rdrp_coords_list,outputs.fasta_trimmed_out_path, seq_type)\r\n\r\n if verbose:\r\n logger.loud_log(f\"Contigs written to: {outputs.fasta_nuc_out_path}\")\r\n logger.loud_log(f\"Translated contigs written to: {outputs.fasta_prot_out_path}\")\r\n logger.loud_log(f\"Trimmed contigs written to: {outputs.fasta_trimmed_out_path}\")\r\n else:\r\n logger.silent_log(f\"Contigs written to: {outputs.fasta_nuc_out_path}\")\r\n logger.silent_log(f\"Translated contigs written to: {outputs.fasta_prot_out_path}\")\r\n logger.silent_log(f\"Trimmed contigs written to: {outputs.fasta_trimmed_out_path}\")\r\n\r\n if not os.path.exists(outputs.mmseqs_tax_output_dir):\r\n outputs.mmseqs_tax_output_dir.mkdir(parents=True)\r\n\r\n logger.loud_log(\"Running mmseqs easy-taxonomy for taxonomic annotation.\")\r\n\r\n mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_tax_output_prefix,\r\n outputs.mmseqs_tax_output_dir, 7, cpus, outputs.mmseqs_tax_log_path).run_mmseqs_easy_tax_lca()\r\n\r\n logger.loud_log(\"Running mmseqs easy-search for taxonomic annotation.\")\r\n\r\n if not os.path.exists(outputs.mmseqs_e_search_output_dir):\r\n outputs.mmseqs_e_search_output_dir.mkdir(parents=True)\r\n\r\n\r\n 
mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_e_search_output_dir,\r\n outputs.mmseqs_e_search_output_path, 7, cpus, outputs.mmseqs_e_search_log_path).run_mmseqs_e_search()\r\n\r\n utils.mmseqs_parser(outputs.mmseqs_tax_output_lca_path, outputs.mmseqs_e_search_output_path).tax_to_rdrpcatch(\r\n outputs.rdrpcatch_output_tsv, outputs.extended_rdrpcatch_output, seq_type)\r\n\r\n logger.loud_log(\"Taxonomic annotation completed.\")\r\n\r\n elif seq_type == 'prot':\r\n\r\n logger.loud_log(\"Protein sequence detected.\")\r\n\r\n set_dict = {}\r\n df_list = []\r\n\r\n for db_name,db_path in zip (db_name_list, db_path_list):\r\n logger.loud_log(f\"Running HMMsearch for {db_name} database.\")\r\n\r\n if verbose:\r\n logger.loud_log(f\"HMM output path: {outputs.hmm_output_path(db_name)}\")\r\n else:\r\n logger.silent_log(f\"HMM output path: {outputs.hmm_output_path(db_name)}\")\r\n start_hmmsearch_time = logger.start_timer()\r\n hmm_out = run_pyhmmer.pyhmmsearch(outputs.hmm_output_path(db_name), input_file, db_path, cpus, e, incdomE, domE, incE, z).run_pyhmmsearch()\r\n end_hmmsearch_time = logger.stop_timer(start_hmmsearch_time,verbose)\r\n if verbose:\r\n logger.loud_log(f\"{db_name} HMMsearch Runtime: {end_hmmsearch_time}\")\r\n else:\r\n logger.silent_log(f\"{db_name} HMMsearch Runtime: {end_hmmsearch_time}\")\r\n\r\n if verbose:\r\n logger.loud_log(f\"Pyhmmer output written to: {hmm_out}\")\r\n else:\r\n logger.silent_log(f\"Pyhmmer output written to: {hmm_out}\")\r\n if not os.path.exists(outputs.formatted_hmm_output_dir):\r\n outputs.formatted_hmm_output_dir.mkdir(parents=True)\r\n\r\n format_pyhmmer_out.hmmsearch_formatter(hmm_out, outputs.formatted_hmm_output_path(db_name), seq_type)\r\n if verbose:\r\n logger.loud_log(f\"Formatted Pyhmmer output written to: {outputs.formatted_hmm_output_path(db_name)}\")\r\n else:\r\n logger.silent_log(f\"Formatted Pyhmmer output written to: {outputs.formatted_hmm_output_path(db_name)}\")\r\n\r\n # 
Extract Highest Bitscore hits from the formatted hmm output\r\n\r\n if not os.path.exists(outputs.best_hit_dir):\r\n outputs.best_hit_dir.mkdir(parents=True)\r\n\r\n format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name),seq_type, logger).highest_bitscore_hits(outputs.best_hit_path(db_name))\r\n\r\n if verbose:\r\n logger.loud_log(f\"Highest Bitscore hits written to: {outputs.best_hit_path(db_name)}\")\r\n else:\r\n logger.silent_log(f\"Highest Bitscore hits written to: {outputs.best_hit_path(db_name)}\")\r\n # Here I overwrite prot to nuc, because I need the contig name to extract the contigs\r\n set_dict[db_name] = format_pyhmmer_out.hmmsearch_format_helpers(outputs.formatted_hmm_output_path(db_name),\"nuc\", logger).hmm_to_contig_set()\r\n\r\n # Convert to dataframe, add db_name column and append to df_list\r\n df = pl.read_csv(outputs.best_hit_path(db_name), separator='\\t')\r\n df = df.with_columns([\r\n pl.lit(db_name).alias('db_name')\r\n ])\r\n df_list.append(df)\r\n\r\n logger.loud_log(f\"HMMsearch for {db_name} completed.\")\r\n\r\n logger.loud_log(\"HMMsearch completed.\")\r\n\r\n if not os.path.exists(outputs.plot_outdir):\r\n outputs.plot_outdir.mkdir(parents=True)\r\n\r\n if not os.path.exists(outputs.tsv_outdir):\r\n outputs.tsv_outdir.mkdir(parents=True)\r\n\r\n logger.loud_log(\"Consolidating results.\")\r\n\r\n # Combine all the dataframes in the list\r\n combined_df = pl.concat(df_list, how='vertical_relaxed')\r\n # Write the combined dataframe to a tsv file\r\n for col in ['E-value', 'score', 'norm_bitscore_profile', 'norm_bitscore_contig',\r\n 'ID_score', 'profile_coverage', 'contig_coverage']:\r\n combined_df = combined_df.with_columns([\r\n pl.col(col).cast(pl.Float64)\r\n ])\r\n\r\n combined_df.write_csv(outputs.combined_tsv_path, separator=\"\\t\")\r\n\r\n # Check if the combined dataframe is empty\r\n if combined_df.is_empty():\r\n logger.loud_log(\"No hits found by RdRpCATCH. 
Exiting.\")\r\n return None\r\n\r\n # Generate upset plot\r\n logger.loud_log(\"Generating plots.\")\r\n\r\n if len(db_name_list) > 1:\r\n if verbose:\r\n logger.loud_log(\"Generating upset plot.\")\r\n else:\r\n logger.silent_log(\"Generating upset plot.\")\r\n\r\n plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).upset_plotter(set_dict)\r\n\r\n\r\n if verbose:\r\n logger.loud_log(f\"Combined dataframe written to: {outputs.combined_tsv_path}\")\r\n else:\r\n logger.silent_log(f\"Combined dataframe written to: {outputs.combined_tsv_path}\")\r\n\r\n # Generate e-value plot\r\n plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_evalue(combined_df)\r\n # Generate score plot\r\n plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_score(combined_df)\r\n # Generate normalized bitscore plot\r\n plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_norm_bitscore_profile(combined_df)\r\n # Generate normalized bitscore contig plot\r\n plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_norm_bitscore_contig(combined_df)\r\n # Generate ID score plot\r\n plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_ID_score(combined_df)\r\n # Generate Profile coverage plot\r\n plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_profile_coverage(combined_df)\r\n # Generate contig coverage plot\r\n plot.Plotter(outputs.plot_outdir,outputs.tsv_outdir, prefix).plot_contig_coverage(combined_df)\r\n\r\n # Extract all the contigs\r\n combined_set = set.union(*[value for value in set_dict.values()])\r\n # Write a fasta file with all the contigs\r\n\r\n logger.loud_log(\"Extracting RdRp contigs from the input file.\")\r\n\r\n if not os.path.exists(outputs.fasta_output_dir):\r\n outputs.fasta_output_dir.mkdir(parents=True)\r\n\r\n utils.fasta(input_file).write_fasta(utils.fasta(input_file).extract_contigs(combined_set), outputs.fasta_prot_out_path)\r\n\r\n if verbose:\r\n logger.loud_log(f\"Full aminoacid 
contigs written to: {outputs.fasta_prot_out_path}\")\r\n else:\r\n logger.silent_log(f\" Full aminoacid contigs written to: {outputs.fasta_prot_out_path}\")\r\n\r\n if not os.path.exists(outputs.gff_output_dir):\r\n outputs.gff_output_dir.mkdir(parents=True)\r\n\r\n hmm_writer = format_pyhmmer_out.hmmsearch_output_writter(logger)\r\n hmm_writer.write_hmmsearch_hits(outputs.combined_tsv_path, seq_type, outputs.rdrpcatch_output_tsv, outputs.gff_output_path)\r\n rdrp_coords_list = hmm_writer.get_rdrp_coords(outputs.rdrpcatch_output_tsv,seq_type)\r\n utils.fasta(input_file, logger).write_fasta_coords(rdrp_coords_list,outputs.fasta_trimmed_out_path, seq_type)\r\n\r\n if verbose:\r\n logger.loud_log(f\"Trimmed contigs written to: {outputs.fasta_trimmed_out_path}\")\r\n else:\r\n logger.silent_log(f\"Trimmed contigs written to: {outputs.fasta_trimmed_out_path}\")\r\n\r\n if not os.path.exists(outputs.mmseqs_tax_output_dir):\r\n outputs.mmseqs_tax_output_dir.mkdir(parents=True)\r\n\r\n logger.loud_log(\"Running mmseqs easy-taxonomy for taxonomic annotation.\")\r\n\r\n mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_tax_output_prefix,\r\n outputs.mmseqs_tax_output_dir, 7, cpus, outputs.mmseqs_tax_log_path).run_mmseqs_easy_tax_lca()\r\n\r\n if not os.path.exists(outputs.mmseqs_e_search_output_dir):\r\n outputs.mmseqs_e_search_output_dir.mkdir(parents=True)\r\n\r\n logger.loud_log(\"Running mmseqs easy-search for taxonomic annotation.\")\r\n\r\n mmseqs_tax.mmseqs(outputs.fasta_prot_out_path, mmseqs_db_path, outputs.mmseqs_e_search_output_dir,\r\n outputs.mmseqs_e_search_output_path, 7, cpus, outputs.mmseqs_e_search_log_path).run_mmseqs_e_search()\r\n\r\n utils.mmseqs_parser(outputs.mmseqs_tax_output_lca_path, outputs.mmseqs_e_search_output_path).tax_to_rdrpcatch(\r\n outputs.rdrpcatch_output_tsv, outputs.extended_rdrpcatch_output, seq_type)\r\n\r\n\r\n\r\n\r\n\r\n\r\n if not keep_tmp:\r\n if verbose:\r\n logger.loud_log(\"Deleting temporary 
files.\")\r\n else:\r\n logger.silent_log(\"Deleting temporary files.\")\r\n\r\n try:\r\n import shutil\r\n shutil.rmtree(outputs.tmp_dir)\r\n logger.silent_log(f\"Temporary files deleted.\")\r\n except FileNotFoundError:\r\n print(f\"Directory '{outputs.tmp_dir}' does not exist.\")\r\n except PermissionError:\r\n print(f\"Permission denied while trying to delete '{outputs.tmp_dir}'.\")\r\n except Exception as e:\r\n print(f\"An error occurred: {e}\")\r\n\r\n # Bundle results\r\n if bundle:\r\n archive_path = bundle_results(output_dir, prefix)\r\n if verbose:\r\n logger.loud_log(f\"Results bundled into: {archive_path}\")\r\n else:\r\n logger.silent_log(f\"Results bundled into: {archive_path}\")\r\n\r\n end_time = logger.stop_timer(start_time, verbose)\r\n\r\n logger.loud_log(f\"Total Runtime: {end_time}\")\r\n\r\n logger.loud_log(\"RdRpCATCH completed successfully.\")\r\n\r\n\r\n return outputs.extended_rdrpcatch_output\r\n\r\nif __name__ == \"__main__\":\r\n main()\r\n
5
+ Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
6
+ <+>UTF-8
7
+ ===================================================================
8
+ diff --git a/rdrpcatch/rdrpcatch_wrapper.py b/rdrpcatch/rdrpcatch_wrapper.py
9
+ --- a/rdrpcatch/rdrpcatch_wrapper.py (revision 2110790421475da92fd4f5e5dbf44f1191829a02)
10
+ +++ b/rdrpcatch/rdrpcatch_wrapper.py (date 1747134242424)
11
+ @@ -170,7 +170,7 @@
12
+
13
+ logger.loud_log("Fetching HMM databases...")
14
+
15
+ - ## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot
16
+ + ## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot_HMM,Zayed_HMM
17
+ rvmt_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("RVMT")
18
+ if verbose:
19
+ logger.loud_log(f"RVMT HMM database fetched from: {rvmt_hmm_db}")
20
+ @@ -202,19 +202,24 @@
21
+ logger.loud_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
22
+ else:
23
+ logger.silent_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
24
+ - lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot")
25
+ + lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot_HMM")
26
+ if verbose:
27
+ logger.loud_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
28
+ else:
29
+ logger.silent_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
30
+ + zayed_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Zayed_HMM")
31
+ + if verbose:
32
+ + logger.loud_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
33
+ + else:
34
+ + logger.silent_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
35
+
36
+ db_name_list = []
37
+ db_path_list = []
38
+
39
+ ## Set up HMM databases
40
+ if db_options == ['all']:
41
+ - db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "Lucaprot"]
42
+ - db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db]
43
+ + db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "Lucaprot_HMM", "Zayed_HMM"]
44
+ + db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db, zayed_hmm_db]
45
+
46
+ else:
47
+ for db in db_options:
48
+ @@ -236,9 +241,12 @@
49
+ elif db == "RDRP-scan".lower():
50
+ db_name_list.append("RDRP-scan")
51
+ db_path_list.append(rdrpscan_hmm_db)
52
+ - elif db == "Lucaprot".lower():
53
+ - db_name_list.append("Lucaprot")
54
+ + elif db == "Lucaprot_HMM".lower():
55
+ + db_name_list.append("Lucaprot_HMM")
56
+ db_path_list.append(lucaprot_hmm_db)
57
+ + elif db == "Zayed_HMM".lower():
58
+ + db_name_list.append("Zayed_HMM")
59
+ + db_path_list.append(zayed_hmm_db)
60
+ else:
61
+ raise Exception(f"Invalid database option: {db}")
62
+
63
+ Index: rdrpcatch/cli/args.py
64
+ IDEA additional info:
65
+ Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP
66
+ <+>import warnings\r\n# Filter numpy warnings before any imports that might trigger them\r\nwarnings.filterwarnings(\"ignore\", category=UserWarning, module=\"numpy\")\r\nwarnings.filterwarnings(\"ignore\", category=RuntimeWarning, module=\"numpy\")\r\nwarnings.filterwarnings(\"ignore\", message=\".*subnormal.*\")\r\n\r\nimport rich_click as click\r\nfrom rich.console import Console\r\nfrom rich.table import Table\r\nfrom rich.panel import Panel\r\nfrom rich.syntax import Syntax\r\nfrom rich.progress import Progress, BarColumn, TextColumn, DownloadColumn, TimeRemainingColumn\r\nfrom pathlib import Path\r\nimport datetime\r\nfrom ..rdrpcatch_wrapper import run_scan\r\nfrom ..rdrpcatch_scripts.fetch_dbs import ZenodoDownloader, db_fetcher\r\nimport os\r\nimport shutil\r\nimport requests\r\n\r\nconsole = Console()\r\n\r\n## FUNCTIONS\r\ndef parse_comma_separated_options(ctx, param, value):\r\n if not value:\r\n return ['all']\r\n\r\n allowed_choices = ['RVMT', 'NeoRdRp', 'NeoRdRp.2.1', 'TSA_Olendraite_fam', 'TSA_Olendraite_gen', 'RDRP-scan',\r\n 'Lucaprot', 'all']\r\n lower_choices = [choice.lower() for choice in allowed_choices]\r\n options = value.split(',')\r\n lower_options = [option.lower() for option in options]\r\n\r\n for option in options:\r\n if option.lower() not in lower_choices:\r\n raise click.BadParameter(f\"Invalid choice: '{option}' (choose from {', '.join(allowed_choices)})\")\r\n\r\n return lower_options\r\n\r\n\r\ndef format_size(bytes_size: int) -> str:\r\n \"\"\"Convert bytes to human-readable format without external dependencies\"\"\"\r\n units = [\"B\", \"KB\", \"MB\", \"GB\", \"TB\"]\r\n unit_idx = 0\r\n size = float(bytes_size)\r\n\r\n while size >= 1024 and unit_idx < len(units) - 1:\r\n size /= 1024\r\n unit_idx += 1\r\n\r\n return f\"{size:.2f} {units[unit_idx]}\"\r\n\r\n\r\n\r\n## CLI ENTRY POINT\r\n\r\n@click.group()\r\ndef cli():\r\n \"\"\"RdRpCATCH - RNA-dependent RNA polymerase Collaborative Analysis Tool with Collections of 
pHMMs\"\"\"\r\n pass\r\n\r\n@cli.command(\"scan\", help=\"Scan sequences for RdRps.\")\r\n@click.option(\"-i\", \"--input\",\r\n help=\"Path to the input FASTA file.\",\r\n type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path), required=True)\r\n@click.option(\"-o\", \"--output\",\r\n help=\"Path to the output directory.\",\r\n type=click.Path(exists=False, file_okay=False, writable=True, path_type=Path), required=True)\r\n@click.option(\"-db_dir\", \"--db_dir\",\r\n help=\"Path to the directory containing RdRpCATCH databases.\",\r\n type=click.Path(exists=True, dir_okay=True, readable=True, path_type=Path),required=True)\r\n@click.option(\"-dbs\", \"--db_options\",\r\n callback=parse_comma_separated_options,\r\n default=\"all\",\r\n help=\"Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1,\"\r\n \" TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot, all\")\r\n@click.option(\"--custom-dbs\",\r\n help=\"Path to directory containing custom MSAs/pHMM files to use as additional databases\",\r\n type=click.Path(exists=True, path_type=Path))\r\n@click.option(\"-seq_type\", \"--seq_type\",\r\n type=click.STRING,\r\n default=None,\r\n help=\"Type of sequence to search against: (prot,nuc) Default: unknown\")\r\n@click.option(\"-v\", \"--verbose\",\r\n is_flag=True,\r\n help=\"Print verbose output.\")\r\n@click.option('-e', '--evalue',\r\n type=click.FLOAT,\r\n default=1e-5,\r\n help=\"E-value threshold for HMMsearch. (default: 1e-5)\")\r\n@click.option('-incE', '--incevalue',\r\n type=click.FLOAT,\r\n default=1e-5,\r\n help=\"Inclusion E-value threshold for HMMsearch. (default: 1e-5)\")\r\n@click.option('-domE', '--domevalue',\r\n type=click.FLOAT,\r\n default=1e-5,\r\n help=\"Domain E-value threshold for HMMsearch. (default: 1e-5)\")\r\n@click.option('-incdomE', '--incdomevalue',\r\n type=click.FLOAT,\r\n default=1e-5,\r\n help=\"Inclusion domain E-value threshold for HMMsearch. 
(default: 1e-5)\")\r\n@click.option('-z', '--zvalue',\r\n type=click.INT,\r\n default=1000000,\r\n help=\"Number of sequences to search against. (default: 1000000)\")\r\n@click.option('-cpus', '--cpus',\r\n type=click.INT,\r\n default=1,\r\n help=\"Number of CPUs to use for HMMsearch. (default: 1)\")\r\n@click.option('-length_thr', '--length_thr',\r\n type=click.INT,\r\n default=400,\r\n help=\"Minimum length threshold for seqkit seq. (default: 400)\")\r\n@click.option('-gen_code', '--gen_code',\r\n type=click.INT,\r\n default=1,\r\n help='Genetic code to use for translation. (default: 1) Possible genetic codes (supported by seqkit translate) : 1: The Standard Code, '\r\n '2: The Vertebrate Mitochondrial Code, '\r\n '3: The Yeast Mitochondrial Code, '\r\n '4: The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code, '\r\n '5: The Invertebrate Mitochondrial Code, '\r\n '6: The Ciliate, Dasycladacean and Hexamita Nuclear Code, '\r\n '9: The Echinoderm and Flatworm Mitochondrial Code, '\r\n '10: The Euplotid Nuclear Code, '\r\n '11: The Bacterial, Archaeal and Plant Plastid Code, '\r\n '12: The Alternative Yeast Nuclear Code, '\r\n '13: The Ascidian Mitochondrial Code, '\r\n '14: The Alternative Flatworm Mitochondrial Code, '\r\n '16: Chlorophycean Mitochondrial Code, '\r\n '21: Trematode Mitochondrial Code, '\r\n '22: Scenedesmus obliquus Mitochondrial Code, '\r\n '23: Thraustochytrium Mitochondrial Code, '\r\n '24: Pterobranchia Mitochondrial Code, '\r\n '25: Candidate Division SR1 and Gracilibacteria Code, '\r\n '26: Pachysolen tannophilus Nuclear Code, '\r\n '27: Karyorelict Nuclear, '\r\n '28: Condylostoma Nuclear, '\r\n '29: Mesodinium Nuclear, '\r\n '30: Peritrich Nuclear, '\r\n '31: Blastocrithidia Nuclear, ')\r\n@click.option('-bundle', '--bundle',\r\n is_flag=True,\r\n default=False,\r\n help=\"Bundle the output files into a single archive. 
(default: False)\")\r\n@click.option('-keep_tmp', '--keep_tmp',\r\n is_flag=True,\r\n default=False,\r\n help=\"Keep temporary files (Expert users) (default: False)\")\r\n@click.option('-overwrite', '--overwrite',\r\n is_flag=True,\r\n default=False,\r\n help=\"Force overwrite of existing output directory. (default: False)\")\r\n\r\n@click.pass_context\r\ndef scan(ctx, input, output, db_options, db_dir, custom_dbs, seq_type, verbose, evalue,\r\n incevalue, domevalue, incdomevalue, zvalue, cpus, length_thr, gen_code, bundle, keep_tmp, overwrite):\r\n \"\"\"Scan sequences for RdRps.\"\"\"\r\n\r\n # Create a rich table for displaying parameters\r\n table = Table(title=\"Scan Parameters\")\r\n table.add_column(\"Parameter\", style=\"cyan\")\r\n table.add_column(\"Value\", style=\"green\")\r\n\r\n table.add_row(\"Input File\", str(input))\r\n table.add_row(\"Output Directory\", str(output))\r\n table.add_row(\"Databases\", \", \".join(db_options))\r\n table.add_row(\"Database Directory\", str(db_dir))\r\n if custom_dbs:\r\n table.add_row(\"Custom Databases\", str(custom_dbs))\r\n table.add_row(\"Sequence Type\", seq_type or \"unknown\")\r\n table.add_row(\"Verbose Mode\", \"ON\" if verbose else \"OFF\")\r\n table.add_row(\"E-value\", str(evalue))\r\n table.add_row(\"Inclusion E-value\", str(incevalue))\r\n table.add_row(\"Domain E-value\", str(domevalue))\r\n table.add_row(\"Inclusion Domain E-value\", str(incdomevalue))\r\n table.add_row(\"Z-value\", str(zvalue))\r\n table.add_row(\"CPUs\", str(cpus))\r\n table.add_row(\"Length Threshold\", str(length_thr))\r\n table.add_row(\"Genetic Code\", str(gen_code))\r\n table.add_row(\"Bundle Output\", \"ON\" if bundle else \"OFF\")\r\n table.add_row(\"Save Temporary Files\", \"ON\" if keep_tmp else \"OFF\")\r\n table.add_row(\"Force Overwrite\", \"ON\" if overwrite else \"OFF\")\r\n\r\n console.print(Panel(table, title=\"Scan Configuration\"))\r\n\r\n # Add custom databases if provided\r\n if custom_dbs:\r\n db = 
db_fetcher(db_dir)\r\n if os.path.isfile(custom_dbs):\r\n db.add_custom_db(custom_dbs)\r\n else:\r\n for item in os.listdir(custom_dbs):\r\n item_path = os.path.join(custom_dbs, item)\r\n if os.path.isfile(item_path) and item_path.endswith(('.hmm', '.h3m', '.msa', '.sto', '.fasta', '.fa')):\r\n db.add_custom_db(item_path)\r\n elif os.path.isdir(item_path):\r\n db.add_custom_db(item_path, item)\r\n\r\n run_scan(\r\n input_file=input,\r\n output_dir=output,\r\n db_options=db_options,\r\n db_dir=db_dir,\r\n seq_type=seq_type,\r\n verbose=verbose,\r\n e=evalue,\r\n incE=incevalue,\r\n domE=domevalue,\r\n incdomE=incdomevalue,\r\n z=zvalue,\r\n cpus=cpus,\r\n length_thr=length_thr,\r\n gen_code=gen_code,\r\n bundle=bundle,\r\n keep_tmp=keep_tmp,\r\n overwrite=overwrite\r\n )\r\n\r\n# @cli.command(\"download\", help=\"Download RdRpCATCH databases.\")\r\n# @click.option(\"--destination_dir\", \"-dest\",\r\n# help=\"Path to the directory to download HMM databases.\",\r\n# type=click.Path(exists=False, file_okay=False, writable=True, path_type=Path), required=True)\r\n# @click.option(\"--check-updates\", \"-u\",\r\n# is_flag=True,\r\n# help=\"Check for database updates\")\r\n# @click.pass_context\r\n# def download(ctx, destination_dir, check_updates):\r\n# \"\"\"Download RdRpCATCH databases.\"\"\"\r\n#\r\n# # if check_updates:\r\n# # db = db_fetcher(destination_dir)\r\n# # version_info = db.check_db_updates()\r\n# # if version_info:\r\n# # console.print(\"Current database versions:\")\r\n# # for db_name, info in version_info.items():\r\n# # console.print(f\"- {db_name}: {info}\")\r\n# # else:\r\n# # console.print(\"No version information available\")\r\n# # return\r\n#\r\n# run_download(destination_dir)\r\n#\r\n# # @cli.command(\"gui\", help=\"Launch the GUI.\")\r\n# # @click.pass_context\r\n# # def gui(ctx):\r\n# # \"\"\"Launch the GUI.\"\"\"\r\n# #\r\n# # console.print(Panel(\"Starting ColabScan GUI...\", title=\"GUI Launch\"))\r\n# # 
run_gui()\r\n\r\n\r\n\r\n@cli.command(\"download\", help=\"Download & update RdRpCATCH databases. If databases are already installed in the \"\r\n \"specified directory,\"\r\n \" it will check for updates and download the latest version if available.\")\r\n@click.option(\"--destination_dir\", \"-dest\",\r\n help=\"Path to directory to download databases\",\r\n type=click.Path(path_type=Path, file_okay=False, writable=True),\r\n required=True)\r\n@click.option(\"--concept-doi\", default=\"10.5281/zenodo.14358348\",\r\n help=\"Zenodo Concept DOI for database repository\")\r\ndef download(destination_dir: Path, concept_doi: str):\r\n \"\"\"Handle database download/update workflow\"\"\"\r\n downloader = ZenodoDownloader(concept_doi, destination_dir)\r\n\r\n try:\r\n\r\n current_version = downloader.get_current_version()\r\n if downloader.lock_file.exists():\r\n console.print(\"[red]× Another download is already in progress[/red]\")\r\n raise click.Abort()\r\n\r\n if downloader.needs_update() or not current_version:\r\n downloader.lock_file.touch(exist_ok=False)\r\n with Progress(\r\n TextColumn(\"[progress.description]{task.description}\"),\r\n BarColumn(),\r\n TextColumn(\"{task.completed:.2f}/{task.total:.2f} MB\"),\r\n TimeRemainingColumn(),\r\n transient=True\r\n ) as progress:\r\n # Setup main download task\r\n main_task = progress.add_task(\"[cyan]Database Manager\", total=4)\r\n\r\n # Phase 1: Metadata fetching\r\n progress.update(main_task, description=\"Fetching Zenodo metadata...\")\r\n metadata = downloader._fetch_latest_metadata()\r\n progress.advance(main_task)\r\n\r\n # Phase 2: Prepare download\r\n progress.update(main_task, description=\"Analyzing package...\")\r\n tarball_info = downloader._get_tarball_info()\r\n file_size_mb = tarball_info[\"size\"] / (1024 * 1024)\r\n progress.advance(main_task)\r\n\r\n # Phase 3: Download with progress\r\n progress.update(main_task,\r\n description=\"Downloading RdRpCATCH databases...\",\r\n 
total=file_size_mb)\r\n\r\n if not downloader.temp_dir.exists():\r\n downloader.temp_dir.mkdir(parents=True, exist_ok=True)\r\n\r\n temp_tar = downloader.temp_dir / \"download.tmp\"\r\n\r\n with requests.get(tarball_info[\"url\"], stream=True) as response:\r\n response.raise_for_status()\r\n with open(temp_tar, \"wb\") as f:\r\n downloaded = 0\r\n for chunk in response.iter_content(chunk_size=8192):\r\n f.write(chunk)\r\n downloaded += len(chunk)\r\n progress.update(main_task, advance=len(chunk) / (1024 * 1024))\r\n\r\n # Phase 4: Verification & installation\r\n progress.update(main_task, description=\"Verifying checksum...\")\r\n if not downloader._verify_checksum(temp_tar, tarball_info[\"checksum\"]):\r\n raise ValueError(\"Checksum verification failed\")\r\n\r\n progress.update(main_task, description=\"Installing databases...\")\r\n downloader.extract_and_verify(temp_tar)\r\n version_info = downloader.get_latest_version_info()\r\n downloader.atomic_write_version(version_info)\r\n progress.advance(main_task)\r\n\r\n # Success message\r\n size_str = format_size(tarball_info[\"size\"])\r\n console.print(\r\n f\"\\n[bold green]✓ Successfully downloaded version {version_info['record_id']}[/bold green]\",\r\n f\"Release date: {version_info['created']}\",\r\n f\"Size: {size_str}\",\r\n sep=\"\\n\"\r\n )\r\n\r\n else:\r\n installed_date = current_version[\"downloaded\"]\r\n console.print(\r\n f\"[green]✓ Databases are current[/green]\",\r\n f\"Version ID: {current_version['record_id']}\",\r\n f\"Installed: {installed_date}\",\r\n sep=\"\\n\"\r\n )\r\n except FileExistsError:\r\n console.print(\"[red]× Another download is already in progress![/red]\")\r\n console.print(f\"Lock file exists: {downloader.lock_file}\")\r\n raise click.Abort()\r\n\r\n except Exception as e:\r\n console.print(f\"\\n[red]× Download failed: {str(e)}[/red]\")\r\n if downloader.temp_dir.exists():\r\n shutil.rmtree(downloader.temp_dir)\r\n raise click.Abort()\r\n\r\n finally:\r\n # Cleanup 
operations\r\n if downloader.lock_file.exists():\r\n downloader.lock_file.unlink()\r\n if downloader.temp_dir.exists():\r\n shutil.rmtree(downloader.temp_dir)\r\n\r\n\r\nif __name__ == '__main__':\r\n cli(obj={})\r\n\r\n
67
+ Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
68
+ <+>UTF-8
69
+ ===================================================================
70
+ diff --git a/rdrpcatch/cli/args.py b/rdrpcatch/cli/args.py
71
+ --- a/rdrpcatch/cli/args.py (revision 2110790421475da92fd4f5e5dbf44f1191829a02)
72
+ +++ b/rdrpcatch/cli/args.py (date 1747651046007)
73
+ @@ -26,7 +26,7 @@
74
+ return ['all']
75
+
76
+ allowed_choices = ['RVMT', 'NeoRdRp', 'NeoRdRp.2.1', 'TSA_Olendraite_fam', 'TSA_Olendraite_gen', 'RDRP-scan',
77
+ - 'Lucaprot', 'all']
78
+ 'Lucaprot_HMM', 'Zayed_HMM', 'all']
79
+ lower_choices = [choice.lower() for choice in allowed_choices]
80
+ options = value.split(',')
81
+ lower_options = [option.lower() for option in options]
82
+ @@ -73,7 +73,7 @@
83
+ callback=parse_comma_separated_options,
84
+ default="all",
85
+ help="Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1,"
86
+ - " TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot, all")
87
+ " TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot_HMM, Zayed_HMM, all")
88
+ @click.option("--custom-dbs",
89
+ help="Path to directory containing custom MSAs/pHMM files to use as additional databases",
90
+ type=click.Path(exists=True, path_type=Path))
91
+ Index: README.md
92
+ IDEA additional info:
93
+ Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP
94
+ <+># RdRpCATCH\r\n## RNA-dependent RNA polymerase Collaborative Analysis Tool with Collections of pHMMs\r\n\r\n\r\n\r\nRdRpCATCH is collaborative effort to combine various publicly available RNA virus RNA-dependent RNA polymerase pHMM databases in one tool\r\nto facilitate their detection in (meta-)transcriptomics data.\r\n\r\n\r\nRdRpCATCH is written in Python and uses the pyHMMER3\r\nlibrary to perform pHMM searches. In addition, the tool scans each sequence (aa or nt) in the input file with the selected databases and provides the best hit (hit with the highest bitscore across all databases) as output.\r\nIn addition, RdRpCATCH provides information about the number of profiles\r\nthat were positive for each sequence across all pHMM databases, and taxonomic information based on the MMseqs2 easy-taxonomy and search modules against a custom RefSeq Riboviria database.\r\n\r\n** The tool has been modified to use [rolypoly](https://code.jgi.doe.gov/UNeri/rolypoly) code/approaches **\r\n\r\n![rdrpcatch_flowchart_v0.png](images%2Frdrpcatch_flowchart_v0.png)\r\n\r\n### Supported databases\r\n- NeoRdRp <sup>1</sup> : 1182 pHMMs \r\n- NeoRdRp2 <sup>2</sup>: 19394 pHMMs \r\n- RVMT <sup>3</sup>: 710 pHMMs \r\n- RdRp-Scan <sup>4</sup> : 68 pHMMs\r\n- TSA_Oleandrite_fam <sup>5</sup>: 77 pHMMs \r\n- TSA_Oleandrite_gen <sup>6</sup> : 341 pHMMs\r\n- LucaProt_pHMM<sup>7 </sup> : 754 pHMMs \r\n\r\n1. Sakaguchi, S. et al. (2022) 'NeoRdRp: A comprehensive dataset for identifying RNA-dependent RNA polymerases of various RNA viruses from metatranscriptomic data', *Microbes and Environments*, 37(3). [doi:10.1264/jsme2.me22001](https://doi.org/10.1264/jsme2.me22001)\r\n2. Sakaguchi, S., Nakano, T. and Nakagawa, S. (2024) 'Neordrp2 with improved seed data, annotations, and scoring', *Frontiers in Virology*, 4. [doi:10.3389/fviro.2024.1378695](https://doi.org/10.3389/fviro.2024.1378695)\r\n3. Neri, U. et al. 
(2022) 'Expansion of the global RNA virome reveals diverse clades of bacteriophages', *Cell*, 185(21). [doi:10.1016/j.cell.2022.08.023](https://doi.org/10.1016/j.cell.2022.08.023)\r\n4. Charon, J. et al. (2022) 'RDRP-Scan: A bioinformatic resource to identify and annotate divergent RNA viruses in metagenomic sequence data', *Virus Evolution*, 8(2). [doi:10.1093/ve/veac082](https://doi.org/10.1093/ve/veac082)\r\n5. Olendraite, I., Brown, K. and Firth, A.E. (2023) 'Identification of RNA virus–derived rdrp sequences in publicly available transcriptomic data sets', *Molecular Biology and Evolution*, 40(4). [doi:10.1093/molbev/msad060](https://doi.org/10.1093/molbev/msad060)\r\n6. Olendraite, I. (2021) 'Mining diverse and novel RNA viruses in transcriptomic datasets', Apollo. Available at: [https://www.repository.cam.ac.uk/items/1fabebd2-429b-45c9-b6eb-41d27d0a90c2](https://www.repository.cam.ac.uk/items/1fabebd2-429b-45c9-b6eb-41d27d0a90c2)\r\n7. Hou, X. et al. (2024) 'Using artificial intelligence to document the hidden RNA virosphere', *Cell*, 187(24). [doi:10.1016/j.cell.2024.09.027](https://doi.org/10.1016/j.cell.2024.09.027)\r\n\r\n\r\n## Installation\r\n\r\n\r\n#### Prerequisites\r\nFor the installation process, conda is required. If you don't have conda installed, you can find instructions on how to\r\nhttps://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html \r\nMamba is a faster alternative to conda. If you have it installed, you can use it instead of conda. \r\n\r\n#### Installation steps\r\n\r\nThe package is available as a bioconda package. You can install it using the following command:\r\n\r\n```bash\r\nconda env create rdrpcatch -c bioconda rdrpcatch\r\n```\r\n\r\nAlternatively, you can install RdRpCATCH from python package index (PyPI) using pip. This requires the installation of the dependencies\r\nmanually. The dependencies are:\r\n- mmseqs2\r\n- seqkit\r\n\r\nThe dependencies can be installed using conda or mamba. 
Follow these steps:\r\n\r\nCreate a new conda environment and install the dependencies:\r\n```bash\r\nconda env create -n rdrpcatch python=3.12\r\nconda activate rdrpcatch\r\nconda install -c bioconda mmseqs2==17.b804f seqkit==2.10.0\r\n```\r\nInstall the tool from pip:\r\n```bash\r\npip install rdrpcatch\r\n```\r\n\r\nActivate the environment and download the RdRpCATCH databases:\r\n\r\n```bash \r\nconda activate rdrpcatch\r\nrdrpcatch download --destination_dir path/to/store/databases\r\n```\r\n\r\n* Note 1: The databases are large files and may take some time to download (~ 3 GB).\r\n* Note 2: The databases are stored in the specified directory, and the path is required to run RdRpCATCH.\r\n* Note 3: If you encounter an SSL error while downloading, please try again. The error seems to appear sporadically during testing, and a simple re-initiation of the downloading process seems to fix it. \r\n\r\n## Usage\r\nRdRpCATCH can be used as a CLI tool as follows:\r\n\r\n```bash \r\n# make sure the conda environment is activated\r\n# conda activate rdrpcatch\r\n\r\n# scan the input fasta file with the selected databases\r\nrdrpcatch scan -i path/to/input.fasta -o path/to/output_dir -db_dir path/to/database\r\n```\r\n### input: \r\nThe input file can be one or more nucleotide or protein sequences in multi-fasta format. \r\nThe output directory is where the results will be stored. We recommend specifying the type of the sequence in the command line,\r\nAn optional argument `--seq_type` (nuc or prot) can be used to specify if the input fasta file sequences are nucleotide or amino acid.\r\n\r\n## Commands\r\nThe following two commands are available in RdRpCATCH: \r\n* [`rdrpcatch scan`](#rdrpcatch-scan) \r\n* [`rdrpcatch download`](#rdrpcatch-download)\r\n\r\n### rdrpcatch download:\r\nCommand to download pre-compiled databases from Zenodo. 
If the databases are already downloaded in the specified directory\r\n, the command will check for updates and download the latest version if available.\r\n\r\n| Argument | Short Flag | Type | Description |\r\n|----------|------------|------|-------------------------------------------------------------|\r\n| `--destination_dir` | `-dest` | PATH | Path to the directory to download HMM databases. [required] |\r\n| `--concept-doi` | `` | TEXT | Zenodo Concept DOI for database repository |\r\n| `--help` | `` | | Show help message and exit |\r\n### rdrpcatch scan:\r\nSearch a given input using selected RdRp databases. \r\n\r\n| Argument | Short Flag | Type | Description |\r\n|----------|------------|------|-------------|\r\n| `--input` | `-i` | FILE | Path to the input FASTA file. [required] |\r\n| `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |\r\n| `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |\r\n| `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot, all |\r\n| `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |\r\n| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |\r\n| `--verbose` | `-v` | FLAG | Print verbose output. |\r\n| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |\r\n| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |\r\n| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |\r\n| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |\r\n| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. 
(default: 1000000) |\r\n| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |\r\n| `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |\r\n| `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |\r\n| `--bundle` | `-bundle` | | Bundle the output files into a single archive. (default: False) |\r\n| `--keep_tmp` | `-keep_tmp` | | Keep the temporary files generated during the analysis. (default: False) |\r\n\r\n\r\n\r\n#### Output files \r\nrdrpcatch scan will create a folder with the following structure:\r\n\r\n| Output | Description |\r\n|--------|------------------------------------------------------------------------------|\r\n| `{prefix}_rdrpcatch_output_annotated.tsv` | A tab-separated file containing the results of the RdRpCATCH analysis. |\r\n| `{prefix}_rdrpcatch_fasta` | A directory containing the sequences that were identified as RdRp sequences. |\r\n| `{prefix}_rdrpcatch_plots` | A directory containing the plots generated during the analysis. |\r\n| `{prefix}_gff_files` | A directory containing the GFF files generated during the analysis. (For now only based on protein sequences) |\r\n| `tmp` | A directory containing temporary files generated during the analysis. (Only available if the -keep_tmp flag is used )|\r\n\r\n#### Output table fields\r\nA summary of the results is stored in the `{prefix}_rdrpcatch_output_annotated.tsv` file, which contains the following fields:\r\n| Field | Description |\r\n|-------|---------------------------------------------------------------------------------------------------------------------|\r\n| `Contig_name` | The name of the contig. |\r\n| `Translated_contig_name (frame)` | The name of the translated contig and the frame of the RdRp sequence. |\r\n| `Sequence_length(AA)` | The length of the RdRp sequence in amino acids. 
|\r\n| `Total_databases_that_the_contig_was_detected(No_of_Profiles)` | The name of databases and the number of profiles that the RdRp sequence was detected by. |\r\n| `Best_hit_Database` | The database with the best hit. |\r\n| `Best_hit_profile_name` | The name of the profile with the best hit. |\r\n| `Best_hit_profile_length` | The length of the profile with the best hit. |\r\n| `Best_hit_e-value` | The e-value of the best hit. |\r\n| `Best_hit_bitscore` | The bitscore of the best hit. |\r\n| `RdRp_from(AA)` | The start position of the RdRp sequence, in relation to the amino acid sequence. |\r\n| `RdRp_to(AA)` | The end position of the RdRp sequence, in relation to the amino acid sequence. |\r\n| `Best_hit_profile_coverage` | The fraction of the profile that was covered by the RdRp sequence. |\r\n| `Best_hit_contig_coverage` | The fraction of the contig that was covered by the RdRp sequence. (Based on aminoacid sequence) |\r\n| `MMseqs_Taxonomy_2bLCA` | The taxonomy of the RdRp sequence based on MMseqs2 easy-taxonomy module against a custom RefSeq Riboviria database. |\r\n| `MMseqs_TopHit_accession` | The accession of the top hit in the RefSeq Riboviria database. |\r\n| `MMseqs_TopHit_fident` | The fraction of identical matches of the top hit in the RefSeq Riboviria database. |\r\n| `MMseqs_TopHit_alnlen` | The alignment length of the top hit in the RefSeq Riboviria database. |\r\n| `MMseqs_TopHit_eval` | The e-value of the top hit in the RefSeq Riboviria database. |\r\n| `MMseqs_TopHit_bitscore` | The bitscore of the top hit in the RefSeq Riboviria database. |\r\n| `MMseqs_TopHit_qcov` | The query coverage of the top hit in the RefSeq Riboviria database. |\r\n| `MMseqs_TopHit_lineage` | The lineage of the top hit in the RefSeq Riboviria database. |\r\n\r\n## Citations\r\nManuscript still in preparation. 
If you use RdRpCATCH, please cite this GitHub repository \r\nA precompiled version of the used databases is available at Zenodo DOI: [10.5281/zenodo.14358348](https://doi.org/10.5281/zenodo.14358348). \r\nIf you use RdRpCATCH, please cite the [underlying third party databases](#supported-databases) :\r\n\r\n## Acknowledgements\r\nRdRpCATCH is a collaborative effort and we would like to thank all the authors and developers of the underlying databases. \r\n\r\n## Contact\r\nDimitris Karapliafis (dimitris.karapliafis@wur.nl), potentially via slack/teams or an issue in the main repo.\r\n\r\n##TODO:\r\n- [ ] loud logging is linking to the utils.py file, not the actual line of code causing the error.\r\n- [ ] drop `db_dir` argument and use global/environment/config variable that is set after running the `download` command\r\n\r\n\r\n## Contributing\r\nTBD up to Dimitris and Anne\r\n\r\n## Licence\r\n[MIT](LICENSE)\r\n
95
+ Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
96
+ <+>UTF-8
97
+ ===================================================================
98
+ diff --git a/README.md b/README.md
99
+ --- a/README.md (revision 2110790421475da92fd4f5e5dbf44f1191829a02)
100
+ +++ b/README.md (date 1747651045997)
101
+ @@ -111,25 +111,25 @@
102
+ ### rdrpcatch scan:
103
+ Search a given input using selected RdRp databases.
104
+
105
+ -| Argument | Short Flag | Type | Description |
106
+ -|----------|------------|------|-------------|
107
+ -| `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
108
+ -| `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
109
+ -| `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
110
+ -| `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot, all |
111
+ -| `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |
112
+ -| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
113
+ -| `--verbose` | `-v` | FLAG | Print verbose output. |
114
+ -| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
115
+ -| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
116
+ -| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
117
+ -| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
118
+ -| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
119
+ -| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
120
+ -| `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
121
+ -| `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |
122
+ -| `--bundle` | `-bundle` | | Bundle the output files into a single archive. (default: False) |
123
+ -| `--keep_tmp` | `-keep_tmp` | | Keep the temporary files generated during the analysis. (default: False) |
124
+ +| Argument | Short Flag | Type | Description |
125
+ +|----------|------------|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
126
+ +| `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
127
+ +| `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
128
+ +| `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
129
+ +| `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot_HMM, Zayed_HMM, all |
130
+ +| `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |
131
+ +| `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
132
+ +| `--verbose` | `-v` | FLAG | Print verbose output. |
133
+ +| `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
134
+ +| `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
135
+ +| `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
136
+ +| `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
137
+ +| `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
138
+ +| `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
139
+ +| `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
140
+ +| `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |
141
+ +| `--bundle` | `-bundle` | | Bundle the output files into a single archive. (default: False) |
142
+ +| `--keep_tmp` | `-keep_tmp` | | Keep the temporary files generated during the analysis. (default: False) |
143
+
144
+
145
+
@@ -4,8 +4,8 @@
4
4
  <option name="autoReloadType" value="SELECTIVE" />
5
5
  </component>
6
6
  <component name="ChangeListManager">
7
- <list default="true" id="d849e6fa-87f9-4e92-9c33-abef7cc975d3" name="Changes" comment="Updates:&#10;Fixed bug that crushed the script when at least one pHMM DB does not have a match against the sequence database also for nuc branch">
8
- <change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
7
+ <list default="true" id="d849e6fa-87f9-4e92-9c33-abef7cc975d3" name="Changes" comment="Updates:&#10;Add -overwrite as a flag&#10;Add informative progress statements in cli">
8
+ <change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
9
9
  <change beforePath="$PROJECT_DIR$/rdrpcatch/cli/args.py" beforeDir="false" afterPath="$PROJECT_DIR$/rdrpcatch/cli/args.py" afterDir="false" />
10
10
  <change beforePath="$PROJECT_DIR$/rdrpcatch/rdrpcatch_wrapper.py" beforeDir="false" afterPath="$PROJECT_DIR$/rdrpcatch/rdrpcatch_wrapper.py" afterDir="false" />
11
11
  </list>
@@ -40,22 +40,22 @@
40
40
  <option name="hideEmptyMiddlePackages" value="true" />
41
41
  <option name="showLibraryContents" value="true" />
42
42
  </component>
43
- <component name="PropertiesComponent">{
44
- &quot;keyToString&quot;: {
45
- &quot;ASKED_ADD_EXTERNAL_FILES&quot;: &quot;true&quot;,
46
- &quot;RunOnceActivity.OpenProjectViewOnStart&quot;: &quot;true&quot;,
47
- &quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
48
- &quot;ignore.virus.scanning.warn.message&quot;: &quot;true&quot;,
49
- &quot;last_opened_file_path&quot;: &quot;C:/Users/karso/PycharmProjects/rdrpcatch_benchmarks&quot;,
50
- &quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
51
- &quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
52
- &quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
53
- &quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
54
- &quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
55
- &quot;settings.editor.selected.configurable&quot;: &quot;preferences.pluginManager&quot;,
56
- &quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
43
+ <component name="PropertiesComponent"><![CDATA[{
44
+ "keyToString": {
45
+ "ASKED_ADD_EXTERNAL_FILES": "true",
46
+ "RunOnceActivity.OpenProjectViewOnStart": "true",
47
+ "RunOnceActivity.ShowReadmeOnStart": "true",
48
+ "ignore.virus.scanning.warn.message": "true",
49
+ "last_opened_file_path": "C:/Users/karso/PycharmProjects/testing_approaches",
50
+ "node.js.detected.package.eslint": "true",
51
+ "node.js.detected.package.tslint": "true",
52
+ "node.js.selected.package.eslint": "(autodetect)",
53
+ "node.js.selected.package.tslint": "(autodetect)",
54
+ "nodejs_package_manager_path": "npm",
55
+ "settings.editor.selected.configurable": "preferences.pluginManager",
56
+ "vue.rearranger.settings.migration": "true"
57
57
  }
58
- }</component>
58
+ }]]></component>
59
59
  <component name="RecentsManager">
60
60
  <key name="CopyFile.RECENT_KEYS">
61
61
  <recent name="C:\Users\karso\PycharmProjects\ColaB-Scan\testing" />
@@ -121,7 +121,10 @@
121
121
  <workItem from="1743714892367" duration="21775000" />
122
122
  <workItem from="1744200654491" duration="635000" />
123
123
  <workItem from="1744241097621" duration="28847000" />
124
- <workItem from="1745576502650" duration="11360000" />
124
+ <workItem from="1745576502650" duration="11691000" />
125
+ <workItem from="1746005454102" duration="1271000" />
126
+ <workItem from="1746359600096" duration="3517000" />
127
+ <workItem from="1747128382581" duration="8962000" />
125
128
  </task>
126
129
  <task id="LOCAL-00001" summary="First commit: Script for benchmark">
127
130
  <option name="closed" value="true" />
@@ -363,7 +366,23 @@
363
366
  <option name="project" value="LOCAL" />
364
367
  <updated>1744796108058</updated>
365
368
  </task>
366
- <option name="localTasksCounter" value="31" />
369
+ <task id="LOCAL-00031" summary="Updates:&#10;Add -overwrite as a flag&#10;Add informative progress statements in cli">
370
+ <option name="closed" value="true" />
371
+ <created>1745863439863</created>
372
+ <option name="number" value="00031" />
373
+ <option name="presentableId" value="LOCAL-00031" />
374
+ <option name="project" value="LOCAL" />
375
+ <updated>1745863439863</updated>
376
+ </task>
377
+ <task id="LOCAL-00032" summary="Updates:&#10;Add -overwrite as a flag&#10;Add informative progress statements in cli">
378
+ <option name="closed" value="true" />
379
+ <created>1745863445358</created>
380
+ <option name="number" value="00032" />
381
+ <option name="presentableId" value="LOCAL-00032" />
382
+ <option name="project" value="LOCAL" />
383
+ <updated>1745863445358</updated>
384
+ </task>
385
+ <option name="localTasksCounter" value="33" />
367
386
  <servers />
368
387
  </component>
369
388
  <component name="TypeScriptGeneratedFilesManager">
@@ -381,7 +400,6 @@
381
400
  </option>
382
401
  </component>
383
402
  <component name="VcsManagerConfiguration">
384
- <MESSAGE value="Commit: Plots and result summary" />
385
403
  <MESSAGE value="Commit: File name change" />
386
404
  <MESSAGE value="Commit: Upload script and results" />
387
405
  <MESSAGE value="Upload Jupyter notebooks and their respective documentation" />
@@ -406,6 +424,7 @@
406
424
  <MESSAGE value="Updates:&#10;Optimize fasta writer from O(n*m) to O(n+m)" />
407
425
  <MESSAGE value="Updates:&#10;Polishing ReadME&#10;Fixed bug that crushed the script when at least one pHMM DB does not have a match against the sequence database" />
408
426
  <MESSAGE value="Updates:&#10;Fixed bug that crushed the script when at least one pHMM DB does not have a match against the sequence database also for nuc branch" />
409
- <option name="LAST_COMMIT_MESSAGE" value="Updates:&#10;Fixed bug that crushed the script when at least one pHMM DB does not have a match against the sequence database also for nuc branch" />
427
+ <MESSAGE value="Updates:&#10;Add -overwrite as a flag&#10;Add informative progress statements in cli" />
428
+ <option name="LAST_COMMIT_MESSAGE" value="Updates:&#10;Add -overwrite as a flag&#10;Add informative progress statements in cli" />
410
429
  </component>
411
430
  </project>
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rdrpcatch
3
- Version: 0.0.6
3
+ Version: 0.0.7
4
4
  Dynamic: Summary
5
5
  Project-URL: Home, https://github.com/dimitris-karapliafis/RdRpCATCH
6
6
  Project-URL: Source, https://github.com/dimitris-karapliafis/RdRpCATCH
@@ -36,7 +36,7 @@ that were positive for each sequence across all pHMM databases, and taxonomic in
36
36
 
37
37
  ** The tool has been modified to use [rolypoly](https://code.jgi.doe.gov/UNeri/rolypoly) code/approaches **
38
38
 
39
- ![rdrpcatch_flowchart_v0.png](images%2Frdrpcatch_flowchart_v0.png)
39
+ ![rdrpcatch_illustration.png](images%2Frdrpcatch_illustration.png)
40
40
 
41
41
  ### Supported databases
42
42
  - NeoRdRp <sup>1</sup> : 1182 pHMMs
@@ -133,25 +133,25 @@ Command to download pre-compiled databases from Zenodo. If the databases are alr
133
133
  ### rdrpcatch scan:
134
134
  Search a given input using selected RdRp databases.
135
135
 
136
- | Argument | Short Flag | Type | Description |
137
- |----------|------------|------|-------------|
138
- | `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
139
- | `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
140
- | `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
141
- | `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot, all |
142
- | `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |
143
- | `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
144
- | `--verbose` | `-v` | FLAG | Print verbose output. |
145
- | `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
146
- | `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
147
- | `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
148
- | `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
149
- | `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
150
- | `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
151
- | `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
152
- | `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |
153
- | `--bundle` | `-bundle` | | Bundle the output files into a single archive. (default: False) |
154
- | `--keep_tmp` | `-keep_tmp` | | Keep the temporary files generated during the analysis. (default: False) |
136
+ | Argument | Short Flag | Type | Description |
137
+ |----------|------------|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
138
+ | `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
139
+ | `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
140
+ | `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
141
+ | `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot_HMM, Zayed_HMM, all |
142
+ | `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |
143
+ | `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
144
+ | `--verbose` | `-v` | FLAG | Print verbose output. |
145
+ | `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
146
+ | `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
147
+ | `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
148
+ | `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
149
+ | `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
150
+ | `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
151
+ | `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
152
+ | `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |
153
+ | `--bundle` | `-bundle` | | Bundle the output files into a single archive. (default: False) |
154
+ | `--keep_tmp` | `-keep_tmp` | | Keep the temporary files generated during the analysis. (default: False) |
155
155
 
156
156
 
157
157
 
@@ -14,7 +14,7 @@ that were positive for each sequence across all pHMM databases, and taxonomic in
14
14
 
15
15
  ** The tool has been modified to use [rolypoly](https://code.jgi.doe.gov/UNeri/rolypoly) code/approaches **
16
16
 
17
- ![rdrpcatch_flowchart_v0.png](images%2Frdrpcatch_flowchart_v0.png)
17
+ ![rdrpcatch_illustration.png](images%2Frdrpcatch_illustration.png)
18
18
 
19
19
  ### Supported databases
20
20
  - NeoRdRp <sup>1</sup> : 1182 pHMMs
@@ -111,25 +111,25 @@ Command to download pre-compiled databases from Zenodo. If the databases are alr
111
111
  ### rdrpcatch scan:
112
112
  Search a given input using selected RdRp databases.
113
113
 
114
- | Argument | Short Flag | Type | Description |
115
- |----------|------------|------|-------------|
116
- | `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
117
- | `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
118
- | `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
119
- | `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot, all |
120
- | `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |
121
- | `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
122
- | `--verbose` | `-v` | FLAG | Print verbose output. |
123
- | `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
124
- | `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
125
- | `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
126
- | `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
127
- | `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
128
- | `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
129
- | `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
130
- | `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |
131
- | `--bundle` | `-bundle` | | Bundle the output files into a single archive. (default: False) |
132
- | `--keep_tmp` | `-keep_tmp` | | Keep the temporary files generated during the analysis. (default: False) |
114
+ | Argument | Short Flag | Type | Description |
115
+ |----------|------------|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
116
+ | `--input` | `-i` | FILE | Path to the input FASTA file. [required] |
117
+ | `--output` | `-o` | DIRECTORY | Path to the output directory. [required] |
118
+ | `--db_dir` | `-db_dir` | PATH | Path to the directory containing RdRpCATCH databases. [required] |
119
+ | `--db_options` | `-dbs` | TEXT | Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan, Lucaprot_HMM, Zayed_HMM, all |
120
+ | `--custom-dbs` | | PATH | Path to directory containing custom MSAs/pHMM files to use as additional databases |
121
+ | `--seq_type` | `-seq_type` | TEXT | Type of sequence to search against: (prot,nuc) Default: unknown |
122
+ | `--verbose` | `-v` | FLAG | Print verbose output. |
123
+ | `--evalue` | `-e` | FLOAT | E-value threshold for HMMsearch. (default: 1e-5) |
124
+ | `--incevalue` | `-incE` | FLOAT | Inclusion E-value threshold for HMMsearch. (default: 1e-5) |
125
+ | `--domevalue` | `-domE` | FLOAT | Domain E-value threshold for HMMsearch. (default: 1e-5) |
126
+ | `--incdomevalue` | `-incdomE` | FLOAT | Inclusion domain E-value threshold for HMMsearch. (default: 1e-5) |
127
+ | `--zvalue` | `-z` | INTEGER | Number of sequences to search against. (default: 1000000) |
128
+ | `--cpus` | `-cpus` | INTEGER | Number of CPUs to use for HMMsearch. (default: 1) |
129
+ | `--length_thr` | `-length_thr` | INTEGER | Minimum length threshold for seqkit seq. (default: 400) |
130
+ | `--gen_code` | `-gen_code` | INTEGER | Genetic code to use for translation. (default: 1) |
131
+ | `--bundle` | `-bundle` | | Bundle the output files into a single archive. (default: False) |
132
+ | `--keep_tmp` | `-keep_tmp` | | Keep the temporary files generated during the analysis. (default: False) |
133
133
 
134
134
 
135
135
 
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "rdrpcatch"
7
- version = "0.0.6"
7
+ version = "0.0.7"
8
8
  authors = [
9
9
  {name = "Dimitris Karapliafis", email = "dimitris.karapliafis@wur.nl"},
10
10
  {name = "Uri Neri", email = "uneri@lbl.gov"},
@@ -26,7 +26,7 @@ def parse_comma_separated_options(ctx, param, value):
26
26
  return ['all']
27
27
 
28
28
  allowed_choices = ['RVMT', 'NeoRdRp', 'NeoRdRp.2.1', 'TSA_Olendraite_fam', 'TSA_Olendraite_gen', 'RDRP-scan',
29
- 'Lucaprot', 'all']
29
+ 'Lucaprot_HMM, Zayed_HMM', 'all']
30
30
  lower_choices = [choice.lower() for choice in allowed_choices]
31
31
  options = value.split(',')
32
32
  lower_options = [option.lower() for option in options]
@@ -73,7 +73,7 @@ def cli():
73
73
  callback=parse_comma_separated_options,
74
74
  default="all",
75
75
  help="Comma-separated list of databases to search against. Valid options: RVMT, NeoRdRp, NeoRdRp.2.1,"
76
- " TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot, all")
76
+ " TSA_Olendraite_fam, TSA_Olendraite_gen, RDRP-scan,Lucaprot_HMM, Zayed_HMM, all")
77
77
  @click.option("--custom-dbs",
78
78
  help="Path to directory containing custom MSAs/pHMM files to use as additional databases",
79
79
  type=click.Path(exists=True, path_type=Path))
@@ -170,7 +170,7 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
170
170
 
171
171
  logger.loud_log("Fetching HMM databases...")
172
172
 
173
- ## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot
173
+ ## Fetch HMM databases- RVMT, NeoRdRp, NeoRdRp.2.1, TSA_Olendraite, RDRP-scan, Lucaprot_HMM,Zayed_HMM
174
174
  rvmt_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("RVMT")
175
175
  if verbose:
176
176
  logger.loud_log(f"RVMT HMM database fetched from: {rvmt_hmm_db}")
@@ -202,19 +202,24 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
202
202
  logger.loud_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
203
203
  else:
204
204
  logger.silent_log(f"RDRP-scan HMM database fetched from: {rdrpscan_hmm_db}")
205
- lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot")
205
+ lucaprot_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Lucaprot_HMM")
206
206
  if verbose:
207
207
  logger.loud_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
208
208
  else:
209
209
  logger.silent_log(f"Lucaprot HMM database fetched from: {lucaprot_hmm_db}")
210
+ zayed_hmm_db = fetch_dbs.db_fetcher(db_dir).fetch_hmm_db_path("Zayed_HMM")
211
+ if verbose:
212
+ logger.loud_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
213
+ else:
214
+ logger.silent_log(f"Zayed HMM database fetched from: {zayed_hmm_db}")
210
215
 
211
216
  db_name_list = []
212
217
  db_path_list = []
213
218
 
214
219
  ## Set up HMM databases
215
220
  if db_options == ['all']:
216
- db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "Lucaprot"]
217
- db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db]
221
+ db_name_list = ["RVMT", "NeoRdRp", "NeoRdRp.2.1", "TSA_Olendraite_fam","TSA_Olendraite_gen", "RDRP-scan", "Lucaprot_HMM", "Zayed_HMM"]
222
+ db_path_list = [rvmt_hmm_db, neordrp_hmm_db, neordrp_2_hmm_db, tsa_olen_fam_hmm_db,tsa_olen_gen_hmm_db, rdrpscan_hmm_db, lucaprot_hmm_db, zayed_hmm_db]
218
223
 
219
224
  else:
220
225
  for db in db_options:
@@ -236,9 +241,12 @@ def run_scan(input_file, output_dir, db_options, db_dir, seq_type, verbose, e,in
236
241
  elif db == "RDRP-scan".lower():
237
242
  db_name_list.append("RDRP-scan")
238
243
  db_path_list.append(rdrpscan_hmm_db)
239
- elif db == "Lucaprot".lower():
240
- db_name_list.append("Lucaprot")
244
+ elif db == "Lucaprot_HMM".lower():
245
+ db_name_list.append("Lucaprot_HMM")
241
246
  db_path_list.append(lucaprot_hmm_db)
247
+ elif db == "Zayed_HMM".lower():
248
+ db_name_list.append("Zayed_HMM")
249
+ db_path_list.append(zayed_hmm_db)
242
250
  else:
243
251
  raise Exception(f"Invalid database option: {db}")
244
252
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes