PyPI - dayhoff-tools - Versions diffs - 1.1.19__py3-none-any.whl → 1.1.21__py3-none-any.whl - Mend

dayhoff-tools 1.1.19__py3-none-any.whl → 1.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

dayhoff_tools/__init__.py CHANGED Viewed

@@ -0,0 +1,10 @@
+import importlib.metadata
+try:
+    # The package name here should match the 'name' field in your pyproject.toml
+    __version__ = importlib.metadata.version("dayhoff-tools")
+except importlib.metadata.PackageNotFoundError:
+    # This is a fallback for when the package might not be installed (e.g., running from source
+    # without installation, or during development). You can set it to None, "unknown",
+    # or handle it as you see fit.
+    __version__ = "unknown"

dayhoff_tools/cli/utility_commands.py CHANGED Viewed

@@ -437,14 +437,13 @@ def install_dependencies(
         False,
         "--install-project",
         "-p",
-        help="Install the local project package itself into the environment.",
+        help="Install the local project package itself (with 'full' extras) into the environment.",
     ),
 ):
     """Install dependencies based on pyproject.toml.
     Ensures uv.lock matches pyproject.toml and syncs the environment.
-    This is the command to run after changing pyproject.toml manually
-    or cloning/pulling a repository.
+    When -p is used, installs the local project with its [full] optional dependencies.
     """
     # ANSI color codes
     BLUE = "\033[94m"
@@ -457,15 +456,26 @@ def install_dependencies(
         print(f"Running command: {BLUE}{' '.join(lock_cmd)}{RESET}")
         subprocess.run(lock_cmd, check=True, capture_output=True)
-        # Step 2: Sync environment
-        print("Syncing environment with lock file...")
-        sync_cmd = ["uv", "sync", "--all-groups"]
-        if not install_project:
-            sync_cmd.append("--no-install-project")
-        print(f"Running command: {BLUE}{' '.join(sync_cmd)}{RESET}")
-        subprocess.run(sync_cmd, check=True)
+        if install_project:
+            # Step 2a: Install the project with 'full' extras
+            print("Installing the local project with 'full' extras...")
+            # The .[full] syntax tells pip to install the current project ('.')
+            # with its 'full' optional dependencies.
+            pip_install_cmd = ["uv", "pip", "install", "-e", ".[full]"]
+            print(f"Running command: {BLUE}{' '.join(pip_install_cmd)}{RESET}")
+            subprocess.run(pip_install_cmd, check=True)
-        print("Dependencies installed/synced successfully.")
+            print("Project installed with 'full' extras successfully.")
+        else:
+            # Original behavior: Sync environment without installing the project
+            print(
+                "Syncing environment with lock file (project itself will not be installed)..."
+            )
+            # --all-groups ensures all non-project dependencies (like dev) are installed
+            sync_cmd = ["uv", "sync", "--all-groups", "--no-install-project"]
+            print(f"Running command: {BLUE}{' '.join(sync_cmd)}{RESET}")
+            subprocess.run(sync_cmd, check=True)
+            print("Dependencies synced successfully (project not installed).")
     except subprocess.CalledProcessError as e:
         stderr_output = e.stderr.decode() if e.stderr else "No stderr output."

dayhoff_tools/deployment/processors.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import csv
 import logging
 import os
 import shlex
@@ -6,6 +7,8 @@ import subprocess
 from abc import ABC, abstractmethod
 from pathlib import Path
+from dayhoff_tools.fasta import subset_fasta
 logger = logging.getLogger(__name__)
@@ -267,10 +270,12 @@ class MMSeqsProfileProcessor(Processor):
         # Define INTERMEDIATE output file paths within mmseqs_temp_dir
         intermediate_results_m8_file = mmseqs_temp_dir / "results.m8"
         intermediate_hits_fasta_file = mmseqs_temp_dir / "results.fasta"
+        intermediate_results_as_csv_file = mmseqs_temp_dir / "results_as.csv"
         # Define FINAL output file paths within run_base_dir, using target stem
-        final_results_m8_file = run_base_dir / f"{target_fasta_stem}_results.m8"
-        final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}_hits.fasta"
+        final_results_csv_file = run_base_dir / f"{target_fasta_stem}.csv"
+        final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}.fasta"
+        final_hits_txt_file = run_base_dir / f"{target_fasta_stem}.txt"
         # --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
         query_db = mmseqs_temp_dir / "queryDB"
@@ -405,40 +410,147 @@ class MMSeqsProfileProcessor(Processor):
                 run_base_dir,
             )
-            # 9. Extract hit sequences directly to FASTA using createseqfiledb
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "createseqfiledb",
-                    str(target_db),  # <i:sequenceDB> - The DB to pull sequences from
-                    str(
-                        result_db
-                    ),  # <i:resultDB> - Contains IDs of target sequences to pull
-                    str(
-                        intermediate_hits_fasta_file
-                    ),  # <o:fastaDB> - Output FASTA file
-                    "--threads",  # createseqfiledb supports --threads
-                    self.num_threads,
-                ],
-                "Extract hit sequences to FASTA via createseqfiledb",
-                run_base_dir,
+            # 8.5 Convert M8 to CSV with headers
+            logger.info(
+                f"Converting M8 results {intermediate_results_m8_file} to CSV {intermediate_results_as_csv_file}"
+            )
+            csv_headers = [
+                "query_id",
+                "target_id",
+                "percent_identity",
+                "alignment_length",
+                "mismatches",
+                "gap_openings",
+                "query_start",
+                "query_end",
+                "target_start",
+                "target_end",
+                "e_value",
+                "bit_score",
+            ]
+            try:
+                if intermediate_results_m8_file.exists():
+                    with open(intermediate_results_m8_file, "r") as m8_in, open(
+                        intermediate_results_as_csv_file, "w", newline=""
+                    ) as csv_out:
+                        writer = csv.writer(csv_out)
+                        writer.writerow(csv_headers)
+                        for line in m8_in:
+                            if line.strip():  # Ensure line is not empty
+                                columns = line.strip().split("\t")
+                                writer.writerow(columns)
+                    logger.info(
+                        f"Successfully converted M8 to CSV: {intermediate_results_as_csv_file}"
+                    )
+                else:
+                    logger.warning(
+                        f"Intermediate M8 file {intermediate_results_m8_file} not found for CSV conversion. CSV will be empty or not created."
+                    )
+                    # Ensure the CSV file is touched if M8 was missing, so downstream move doesn't fail
+                    intermediate_results_as_csv_file.touch()
+            except Exception as e:
+                logger.error(f"Failed to convert M8 to CSV: {e}")
+                # Touch the CSV file in case of error to prevent downstream errors if it's expected
+                if not intermediate_results_as_csv_file.exists():
+                    intermediate_results_as_csv_file.touch()
+                # We might still want to raise e here, depending on desired error handling for CSV conversion failure
+            # 9. Extract hit sequences from M8 results using subset_fasta
+            logger.info(f"Parsing M8 results from: {intermediate_results_m8_file}")
+            hit_sequence_ids = set()
+            try:
+                if not intermediate_results_m8_file.exists():
+                    logger.warning(
+                        f"M8 results file {intermediate_results_m8_file} not found. Hits FASTA will be empty."
+                    )
+                    intermediate_hits_fasta_file.touch()  # Create empty hits file
+                else:
+                    with open(intermediate_results_m8_file, "r") as m8_file:
+                        for line in m8_file:
+                            if line.strip():  # Ensure line is not empty
+                                columns = line.strip().split("\t")
+                                if len(columns) >= 2:
+                                    hit_sequence_ids.add(
+                                        columns[1]
+                                    )  # Target ID is the second column
+                    logger.info(
+                        f"Found {len(hit_sequence_ids)} unique target IDs in M8 results."
+                    )
+                    if not hit_sequence_ids:
+                        logger.warning(
+                            f"No target IDs found in {intermediate_results_m8_file} after parsing. Output hits FASTA will be empty."
+                        )
+                        intermediate_hits_fasta_file.touch()  # Create empty file
+                    else:
+                        logger.info(
+                            f"Extracting {len(hit_sequence_ids)} hit sequences from {local_target_file} to {intermediate_hits_fasta_file} using subset_fasta."
+                        )
+                        try:
+                            subset_fasta(
+                                fasta_file=str(local_target_file),
+                                output_path=str(intermediate_hits_fasta_file),
+                                target_ids=hit_sequence_ids,
+                                exclude=False,
+                                return_written_ids=False,
+                            )
+                            logger.info(
+                                f"Successfully created hits FASTA: {intermediate_hits_fasta_file} using subset_fasta."
+                            )
+                        except FileNotFoundError as e:
+                            logger.error(
+                                f"subset_fasta FileNotFoundError: {e}. Ensuring {intermediate_hits_fasta_file} exists as empty."
+                            )
+                            if not intermediate_hits_fasta_file.exists():
+                                intermediate_hits_fasta_file.touch()
+                            raise
+                        except Exception as e:
+                            logger.error(
+                                f"subset_fasta failed to create {intermediate_hits_fasta_file}: {e}"
+                            )
+                            if not intermediate_hits_fasta_file.exists():
+                                intermediate_hits_fasta_file.touch()
+                            raise
+            except Exception as e:
+                logger.error(
+                    f"Error processing M8 file {intermediate_results_m8_file} or its hits extraction: {e}"
+                )
+                if not intermediate_hits_fasta_file.exists():
+                    intermediate_hits_fasta_file.touch()
+                raise
+            # 10. Write the set of hit sequence IDs to a .txt file
+            logger.info(
+                f"Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
             )
+            try:
+                with open(final_hits_txt_file, "w") as txt_out:
+                    for seq_id in sorted(
+                        list(hit_sequence_ids)
+                    ):  # Sort for consistent output
+                        txt_out.write(f"{seq_id}\n")
+                logger.info(f"Successfully wrote hit IDs to {final_hits_txt_file}")
+            except Exception as e:
+                logger.error(f"Failed to write hit IDs to {final_hits_txt_file}: {e}")
+                # The main workflow should still proceed even if this supplementary file fails
             logger.info(
                 f"MMseqs2 workflow completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
             )
             # Move and rename final output files from mmseqs_temp_dir to run_base_dir
-            if intermediate_results_m8_file.exists():
+            if intermediate_results_as_csv_file.exists():
                 shutil.move(
-                    str(intermediate_results_m8_file), str(final_results_m8_file)
+                    str(intermediate_results_as_csv_file), str(final_results_csv_file)
+                )
+                logger.info(
+                    f"Moved and renamed M8 results to CSV: {final_results_csv_file}"
                 )
-                logger.info(f"Moved and renamed M8 results to {final_results_m8_file}")
             else:
                 logger.warning(
-                    f"Intermediate M8 file {intermediate_results_m8_file} not found. Creating empty target file."
+                    f"Intermediate CSV file {intermediate_results_as_csv_file} not found. Creating empty target CSV file."
                 )
-                final_results_m8_file.touch()  # Create empty file in run_base_dir if not found
+                final_results_csv_file.touch()  # Create empty file in run_base_dir if not found
             if intermediate_hits_fasta_file.exists():
                 shutil.move(

{dayhoff_tools-1.1.19.dist-info → dayhoff_tools-1.1.21.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dayhoff-tools
-Version: 1.1.19
+Version: 1.1.21
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com

{dayhoff_tools-1.1.19.dist-info → dayhoff_tools-1.1.21.dist-info}/RECORD RENAMED Viewed

@@ -1,17 +1,17 @@
-dayhoff_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+dayhoff_tools/__init__.py,sha256=M5zThPyEBRYa5CfwlzKhcqTevWn3OKu62cjV6Zqie2A,469
 dayhoff_tools/chemistry/standardizer.py,sha256=uMn7VwHnx02nc404eO6fRuS4rsl4dvSPf2ElfZDXEpY,11188
 dayhoff_tools/chemistry/utils.py,sha256=jt-7JgF-GeeVC421acX-bobKbLU_X94KNOW24p_P-_M,2257
 dayhoff_tools/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dayhoff_tools/cli/cloud_commands.py,sha256=NGux28-cjDyCADF-L1tjdEMzkCMYX8V4xNvpK6EWcZA,40802
 dayhoff_tools/cli/main.py,sha256=47EGb28ALaYFc7oAUGlY1D66AIDmc4RZiXxN-gPVrpQ,4519
 dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2zQR6o6M,5614
-dayhoff_tools/cli/utility_commands.py,sha256=PRShdh4O35JfjVxTOsduZ8-grGF5-SfXihcptcFT-hk,23584
+dayhoff_tools/cli/utility_commands.py,sha256=08MMLlQZEjJ_r7Cd6b92aYYPjH7gMvMQvCOcpoNnFVo,24352
 dayhoff_tools/deployment/base.py,sha256=8tXwsPYvRo-zV-aNhHw1c7Rji-KWg8S5xoCCznFnVVI,17412
 dayhoff_tools/deployment/deploy_aws.py,sha256=O0gQxHioSU_sNU8T8MD4wSOPvWc--V8eRRZzlRu035I,16446
 dayhoff_tools/deployment/deploy_gcp.py,sha256=DxBM4sUzwPK9RWLP9bSfr38n1HHl-TVrp4TsbdN8pUA,5795
 dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
 dayhoff_tools/deployment/job_runner.py,sha256=4tmdplpvqSE9bVxRWHo2U5kwkYrYod0Uwzpg2Q7qG5o,4850
-dayhoff_tools/deployment/processors.py,sha256=4a8fYFilcbBGAVUeowjL5n-Pa7NIL3Jmhs69lCQIkpQ,19554
+dayhoff_tools/deployment/processors.py,sha256=0D2r2-NgVcFfxF0QSqLZy86cxEWSn78jLTKYf60fLBQ,25519
 dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
 dayhoff_tools/embedders.py,sha256=CRgcb2z7KeeFrRQawyUZuJ4Yi0-J5jSr0hwuRhjG_FI,36513
 dayhoff_tools/fasta.py,sha256=e7xw3pInoupqCGE0-fJTOzmW_earL1M7qPyoqIPfUT4,46269
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
 dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
 dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
 dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
-dayhoff_tools-1.1.19.dist-info/METADATA,sha256=40SO5LPPs3GIs_bHfr5jt-TpZUb05-5EHz6Uo09TdxI,2225
-dayhoff_tools-1.1.19.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-dayhoff_tools-1.1.19.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
-dayhoff_tools-1.1.19.dist-info/RECORD,,
+dayhoff_tools-1.1.21.dist-info/METADATA,sha256=R6Q1CWHxPpXkuYs0OwJ-WSSb9pnPIS8V4xbZHKjugZE,2225
+dayhoff_tools-1.1.21.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+dayhoff_tools-1.1.21.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
+dayhoff_tools-1.1.21.dist-info/RECORD,,

{dayhoff_tools-1.1.19.dist-info → dayhoff_tools-1.1.21.dist-info}/WHEEL RENAMED Viewed

File without changes

{dayhoff_tools-1.1.19.dist-info → dayhoff_tools-1.1.21.dist-info}/entry_points.txt RENAMED Viewed

File without changes

dayhoff-tools 1.1.19__py3-none-any.whl → 1.1.21__py3-none-any.whl

dayhoff-tools 1.1.19__py3-none-any.whl → 1.1.21__py3-none-any.whl