PyPI - dayhoff-tools - Versions diffs - 1.1.25__py3-none-any.whl → 1.1.27__py3-none-any.whl - Mend

dayhoff-tools 1.1.25__py3-none-any.whl → 1.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

dayhoff_tools/deployment/processors.py CHANGED Viewed

@@ -273,12 +273,10 @@ class MMSeqsProfileProcessor(Processor):
         # Define INTERMEDIATE output file paths within mmseqs_temp_dir
         intermediate_results_m8_file = mmseqs_temp_dir / "results.m8"
-        intermediate_hits_fasta_file = mmseqs_temp_dir / "results.fasta"
         intermediate_results_as_csv_file = mmseqs_temp_dir / "results_as.csv"
         # Define FINAL output file paths within run_base_dir, using target stem
         final_results_csv_file = run_base_dir / f"{target_fasta_stem}.csv"
-        final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}.fasta"
         final_hits_txt_file = run_base_dir / f"{target_fasta_stem}.txt"
         # --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
@@ -416,7 +414,7 @@ class MMSeqsProfileProcessor(Processor):
             # 8.5 Convert M8 to CSV with headers
             logger.info(
-                f"Converting M8 results {intermediate_results_m8_file} to CSV {intermediate_results_as_csv_file}"
+                f"Converting M8 results to CSV: {intermediate_results_m8_file} -> {intermediate_results_as_csv_file}"
             )
             csv_headers = [
                 "query_id",
@@ -433,130 +431,81 @@ class MMSeqsProfileProcessor(Processor):
                 "bit_score",
             ]
             try:
-                if intermediate_results_m8_file.exists():
-                    with open(intermediate_results_m8_file, "r") as m8_in, open(
-                        intermediate_results_as_csv_file, "w", newline=""
-                    ) as csv_out:
-                        writer = csv.writer(csv_out)
-                        writer.writerow(csv_headers)
-                        for line in m8_in:
-                            if line.strip():  # Ensure line is not empty
-                                columns = line.strip().split("\t")
-                                writer.writerow(columns)
-                    logger.info(
-                        f"Successfully converted M8 to CSV: {intermediate_results_as_csv_file}"
-                    )
-                else:
+                if not intermediate_results_m8_file.exists():
                     logger.warning(
-                        f"Intermediate M8 file {intermediate_results_m8_file} not found for CSV conversion. CSV will be empty or not created."
+                        f"M8 results file {intermediate_results_m8_file} not found. CSV will be empty."
                     )
-                    # Ensure the CSV file is touched if M8 was missing, so downstream move doesn't fail
-                    intermediate_results_as_csv_file.touch()
+                    # Create an empty CSV with headers if M8 is missing
+                    with open(
+                        intermediate_results_as_csv_file, "w", newline=""
+                    ) as csvfile:
+                        writer = csv.writer(csvfile)
+                        writer.writerow(m8_columns)
+                else:
+                    with open(intermediate_results_m8_file, "r") as m8file, open(
+                        intermediate_results_as_csv_file, "w", newline=""
+                    ) as csvfile:
+                        writer = csv.writer(csvfile)
+                        writer.writerow(m8_columns)
+                        for line in m8file:
+                            writer.writerow(line.strip().split("\t"))
             except Exception as e:
-                logger.error(f"Failed to convert M8 to CSV: {e}")
-                # Touch the CSV file in case of error to prevent downstream errors if it's expected
+                logger.error(f"Error converting M8 to CSV: {e}", exc_info=True)
+                # Ensure an empty csv is created on error to prevent downstream issues
                 if not intermediate_results_as_csv_file.exists():
-                    intermediate_results_as_csv_file.touch()
-                # We might still want to raise e here, depending on desired error handling for CSV conversion failure
+                    with open(
+                        intermediate_results_as_csv_file, "w", newline=""
+                    ) as csvfile:
+                        writer = csv.writer(csvfile)
+                        writer.writerow(m8_columns)  # write headers even on error
-            # 9. Extract hit sequences from M8 results using subset_fasta
+            # 9. Extract hit sequence IDs from M8 results for the TXT file
+            hit_sequence_ids = set()
             logger.info(
-                f"PROCESSOR: Parsing M8 results from: {intermediate_results_m8_file}"
+                f"Extracting hit IDs from {intermediate_results_m8_file} for TXT output."
             )
-            hit_sequence_ids = set()
             try:
-                if not intermediate_results_m8_file.exists():
-                    logger.warning(
-                        f"PROCESSOR: M8 results file {intermediate_results_m8_file} not found. Hits FASTA will be empty.",
-                        exc_info=True,
-                    )
-                    intermediate_hits_fasta_file.touch()  # Create empty hits file
-                else:
+                if intermediate_results_m8_file.exists():
                     with open(intermediate_results_m8_file, "r") as m8_file:
                         for line in m8_file:
-                            if line.strip():
+                            if line.strip():  # Check if line is not empty
                                 columns = line.strip().split("\t")
                                 if len(columns) >= 2:
-                                    hit_sequence_ids.add(columns[1])
+                                    hit_sequence_ids.add(
+                                        columns[1]
+                                    )  # Add target_accession
                     logger.info(
-                        f"PROCESSOR: Found {len(hit_sequence_ids)} unique target IDs in M8 results."
+                        f"Found {len(hit_sequence_ids)} unique hit IDs in M8 file."
                     )
-                    if not hit_sequence_ids:
-                        logger.warning(
-                            f"PROCESSOR: No target IDs found in {intermediate_results_m8_file} after parsing. Output hits FASTA will be empty.",
-                            exc_info=True,
-                        )
-                        intermediate_hits_fasta_file.touch()
-                    else:
-                        logger.info(f"PROCESSOR: === CALLING subset_fasta ===")
-                        logger.info(
-                            f"PROCESSOR: Input FASTA for subset_fasta: {local_target_file}"
-                        )
-                        logger.info(
-                            f"PROCESSOR: Output FASTA for subset_fasta: {intermediate_hits_fasta_file}"
-                        )
-                        logger.info(
-                            f"PROCESSOR: Number of target IDs for subset_fasta: {len(hit_sequence_ids)}"
-                        )
-                        try:
-                            subset_fasta(
-                                fasta_file=str(local_target_file),
-                                output_path=str(intermediate_hits_fasta_file),
-                                target_ids=hit_sequence_ids,
-                                exclude=False,
-                                return_written_ids=False,
-                            )
-                            logger.info(
-                                f"PROCESSOR: === RETURNED from subset_fasta ==="
-                            )
-                            logger.info(
-                                f"PROCESSOR: Successfully created hits FASTA: {intermediate_hits_fasta_file} using subset_fasta."
-                            )
-                        # More specific error catching can be added if subset_fasta raises custom exceptions
-                        except FileNotFoundError as e_fnf:
-                            logger.error(
-                                f"PROCESSOR: subset_fasta FileNotFoundError: {e_fnf}. Ensuring {intermediate_hits_fasta_file} exists as empty.",
-                                exc_info=True,
-                            )
-                            if not intermediate_hits_fasta_file.exists():
-                                intermediate_hits_fasta_file.touch()
-                            raise
-                        except (
-                            Exception
-                        ) as e_sub:  # Catch any other exception from subset_fasta
-                            logger.error(
-                                f"PROCESSOR: subset_fasta failed to create {intermediate_hits_fasta_file}: {e_sub}",
-                                exc_info=True,
-                            )
-                            if not intermediate_hits_fasta_file.exists():
-                                intermediate_hits_fasta_file.touch()
-                            raise
-            except Exception as e_m8_proc:
+                else:
+                    logger.warning(
+                        f"Intermediate M8 file {intermediate_results_m8_file} not found. Hit TXT file will be empty."
+                    )
+            except Exception as e:
                 logger.error(
-                    f"PROCESSOR: Error processing M8 file {intermediate_results_m8_file} or its hits extraction: {e_m8_proc}",
+                    f"Error reading M8 file {intermediate_results_m8_file} for hit ID extraction: {e}",
                     exc_info=True,
                 )
-                if not intermediate_hits_fasta_file.exists():
-                    intermediate_hits_fasta_file.touch()
-                raise
+                # Proceed even if M8 reading fails, TXT will be empty
-            # 10. Write the set of hit sequence IDs to a .txt file
+            # 10. Write the set of hit sequence IDs to the final .txt file
             logger.info(
-                f"PROCESSOR: Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
+                f"Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
             )
             try:
                 with open(final_hits_txt_file, "w") as txt_out:
-                    for seq_id in sorted(
-                        list(hit_sequence_ids)
-                    ):  # Sort for consistent output
+                    # Sort IDs for consistent output
+                    for seq_id in sorted(list(hit_sequence_ids)):
                         txt_out.write(f"{seq_id}\n")
-                logger.info(
-                    f"PROCESSOR: Successfully wrote hit IDs to {final_hits_txt_file}"
-                )
+                logger.info(f"Successfully wrote hit IDs to {final_hits_txt_file}")
             except Exception as e:
-                logger.error(f"Failed to write hit IDs to {final_hits_txt_file}: {e}")
-                # The main workflow should still proceed even if this supplementary file fails
+                logger.error(
+                    f"Failed to write hit IDs to {final_hits_txt_file}: {e}",
+                    exc_info=True,
+                )
+                # Ensure the file exists even if writing fails
+                if not final_hits_txt_file.exists():
+                    final_hits_txt_file.touch()
             logger.info(
                 f"PROCESSOR: MMseqs2 workflow and FASTA/TXT generation completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
@@ -576,41 +525,25 @@ class MMSeqsProfileProcessor(Processor):
                 )
                 final_results_csv_file.touch()  # Create empty file in run_base_dir if not found
-            if intermediate_hits_fasta_file.exists():
-                shutil.move(
-                    str(intermediate_hits_fasta_file), str(final_hits_fasta_file)
-                )
-                logger.info(f"Moved and renamed hits FASTA to {final_hits_fasta_file}")
-            else:
-                logger.warning(
-                    f"Intermediate hits FASTA {intermediate_hits_fasta_file} not found. Creating empty target file."
-                )
-                final_hits_fasta_file.touch()  # Create empty file in run_base_dir if not found
+            logger.info(
+                f"MMSeqsProfileProcessor run completed for {input_file}. Output CSV: {final_results_csv_file}"
+            )
+        except Exception as e:
+            logger.error(
+                f"MMSeqsProfileProcessor failed for {input_file}: {e}", exc_info=True
+            )
+            raise
         finally:
-            # Clean up the MMseqs2 temporary directory (mmseqs_tmp) which contains intermediate DBs etc.
+            # --- Cleanup --- #
+            logger.info(f"Cleaning up temporary directory: {mmseqs_temp_dir}")
             if mmseqs_temp_dir.exists():
                 shutil.rmtree(mmseqs_temp_dir)
+            if local_target_file.exists() and local_target_file != Path(input_file):
                 logger.info(
-                    f"Cleaned up MMseqs2 temporary directory: {mmseqs_temp_dir}"
+                    f"Cleaning up local copy of target file: {local_target_file}"
                 )
+                local_target_file.unlink()
+            logger.info("MMSeqsProfileProcessor cleanup finished.")
-            # Clean up the copied input file (local_target_file) from the run_base_dir
-            # so it does not get uploaded with the results.
-            if local_target_file.exists():
-                try:
-                    local_target_file.unlink()
-                    logger.info(
-                        f"Cleaned up copied input file from run directory: {local_target_file}"
-                    )
-                except OSError as e:
-                    logger.error(
-                        f"Error deleting copied input file {local_target_file}: {e}"
-                    )
-            # The run_base_dir (containing only the final, meaningfully named output files)
-            # will be cleaned up by the Operator after its contents are uploaded.
-        return str(
-            run_base_dir
-        )  # Return the path to the directory containing meaningfully named results
+        return str(run_base_dir)  # Return the path to the directory containing outputs

dayhoff_tools/deployment/swarm.py CHANGED Viewed

@@ -439,11 +439,6 @@ class Operator:
         - For AWS spot instances, uses IMDSv2 to check instance-action metadata
         - For GCP preemptible VMs, checks both maintenance-event and preempted metadata
         """
-        logger.info(
-            "DEBUG: _check_for_termination has been temporarily disabled for testing."
-        )
-        return  # DEBUG: Temporarily disable to test if this is causing premature shutdown
         while not _shutdown_requested.is_set():
             try:
                 # Check AWS spot termination using IMDSv2 (token-based auth)

dayhoff_tools/embedders.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import logging
 import os
 import time
-from abc import ABC, abstractmethod
 from typing import Dict, List, Literal, Optional, Tuple, cast
 import h5py

{dayhoff_tools-1.1.25.dist-info → dayhoff_tools-1.1.27.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dayhoff-tools
-Version: 1.1.25
+Version: 1.1.27
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com
@@ -10,22 +10,32 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Provides-Extra: embedders
 Provides-Extra: full
 Requires-Dist: biopython (>=1.84) ; extra == "full"
+Requires-Dist: biopython (>=1.85) ; extra == "embedders"
 Requires-Dist: boto3 (>=1.36.8) ; extra == "full"
 Requires-Dist: docker (>=7.1.0) ; extra == "full"
+Requires-Dist: fair-esm (>=2.0.0) ; extra == "embedders"
 Requires-Dist: fair-esm (>=2.0.0) ; extra == "full"
 Requires-Dist: firebase-admin (>=6.5.0)
 Requires-Dist: h5py (>=3.11.0) ; extra == "full"
+Requires-Dist: h5py (>=3.13.0) ; extra == "embedders"
+Requires-Dist: numpy (>=1.26.4) ; extra == "embedders"
+Requires-Dist: pandas (>=2.2.3) ; extra == "embedders"
 Requires-Dist: pandas (>=2.2.3) ; extra == "full"
 Requires-Dist: pyyaml (>=6.0)
 Requires-Dist: questionary (>=2.0.1)
 Requires-Dist: rdkit-pypi (>=2022.9.5) ; extra == "full"
 Requires-Dist: requests (>=2.31.0)
+Requires-Dist: sentencepiece (>=0.2.0) ; extra == "embedders"
 Requires-Dist: sentencepiece (>=0.2.0) ; extra == "full"
 Requires-Dist: sqlalchemy (>=2.0.40,<3.0.0) ; extra == "full"
 Requires-Dist: toml (>=0.10)
+Requires-Dist: torch (>=2.4.0) ; extra == "embedders"
+Requires-Dist: tqdm (>=4.67.1) ; extra == "embedders"
 Requires-Dist: transformers (==4.36.2) ; extra == "full"
+Requires-Dist: transformers (>=4.36.2) ; extra == "embedders"
 Requires-Dist: typer (>=0.9.0)
 Description-Content-Type: text/markdown

{dayhoff_tools-1.1.25.dist-info → dayhoff_tools-1.1.27.dist-info}/RECORD RENAMED Viewed

@@ -11,9 +11,9 @@ dayhoff_tools/deployment/deploy_aws.py,sha256=O0gQxHioSU_sNU8T8MD4wSOPvWc--V8eRR
 dayhoff_tools/deployment/deploy_gcp.py,sha256=DxBM4sUzwPK9RWLP9bSfr38n1HHl-TVrp4TsbdN8pUA,5795
 dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
 dayhoff_tools/deployment/job_runner.py,sha256=4tmdplpvqSE9bVxRWHo2U5kwkYrYod0Uwzpg2Q7qG5o,4850
-dayhoff_tools/deployment/processors.py,sha256=kdXbS354DUKCN-kkj2sbt6T06cNG8Hphjj9e_nyCt1g,26676
-dayhoff_tools/deployment/swarm.py,sha256=Xoe-lLQYDT3FwCrPzImgpbHdWRmsK6WERh1IMMNWb2c,21898
-dayhoff_tools/embedders.py,sha256=CRgcb2z7KeeFrRQawyUZuJ4Yi0-J5jSr0hwuRhjG_FI,36513
+dayhoff_tools/deployment/processors.py,sha256=Ao4hU4rEXlyvLq-8wt8syqnfB05N7fIKYtjjdKCmx3g,22695
+dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
+dayhoff_tools/embedders.py,sha256=svP_ksm3FdyVZ8i8R9R5uoGu2qI_hVQ_eztG0drXkN8,36477
 dayhoff_tools/fasta.py,sha256=HJ25D_u5F-tU6fZMkJfIhvqMSmnR32JK1QdCPXoHJ5g,49785
 dayhoff_tools/file_ops.py,sha256=JlGowvr-CUJFidV-4g_JmhUTN9bsYuaxtqKmnKomm-Q,8506
 dayhoff_tools/h5.py,sha256=j1nxxaiHsMidVX_XwB33P1Pz9d7K8ZKiDZwJWQUUQSY,21158
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
 dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
 dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
 dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
-dayhoff_tools-1.1.25.dist-info/METADATA,sha256=u9c1AI8g1qTnwBU3rPpoZm9BCk1AMu3AZry03FaeduI,2225
-dayhoff_tools-1.1.25.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-dayhoff_tools-1.1.25.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
-dayhoff_tools-1.1.25.dist-info/RECORD,,
+dayhoff_tools-1.1.27.dist-info/METADATA,sha256=b49Gayl-plG0GhoElOAHZm_jAG2qWHNEdkcRbMfEJTM,2761
+dayhoff_tools-1.1.27.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+dayhoff_tools-1.1.27.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
+dayhoff_tools-1.1.27.dist-info/RECORD,,

{dayhoff_tools-1.1.25.dist-info → dayhoff_tools-1.1.27.dist-info}/WHEEL RENAMED Viewed

File without changes

{dayhoff_tools-1.1.25.dist-info → dayhoff_tools-1.1.27.dist-info}/entry_points.txt RENAMED Viewed

File without changes

dayhoff-tools 1.1.25__py3-none-any.whl → 1.1.27__py3-none-any.whl

dayhoff-tools 1.1.25__py3-none-any.whl → 1.1.27__py3-none-any.whl