dayhoff-tools 1.1.39__py3-none-any.whl → 1.1.41__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
@@ -400,425 +400,3 @@ class BoltzPredictor(Processor):
             f"Boltz prediction completed successfully. Output in {expected_output_dir}"
         )
         return expected_output_dir
-
-
-class MMSeqsProfileProcessor(Processor):
-    """Processor for running MMseqs2 profile searches.
-
-    This class wraps the MMseqs2 workflow to perform a profile-based search
-    against a target database using a query FASTA.
-    """
-
-    def __init__(
-        self,
-        query_fasta_path_in_image: str,
-        num_threads: int = 8,
-        mmseqs_args: dict | None = None,
-    ):
-        """Initialize the MMSeqsProfileProcessor.
-
-        Args:
-            query_fasta_path_in_image: Path to the query FASTA file. This path is expected
-                to be accessible within the execution environment (e.g.,
-                packaged in a Docker image).
-            num_threads: Number of threads to use for MMseqs2 commands.
-            mmseqs_args: A dictionary of additional MMseqs2 parameters.
-                Expected keys: "memory_limit_gb", "evalue", "sensitivity",
-                "max_seqs_search", "min_seq_id_cluster", "max_seqs_profile_msa".
-                Defaults are used if not provided.
-        """
-        if not Path(query_fasta_path_in_image).is_file():
-            raise FileNotFoundError(
-                f"Query FASTA file not found at: {query_fasta_path_in_image}"
-            )
-        self.query_fasta_path = query_fasta_path_in_image
-        self.num_threads = str(num_threads)  # MMseqs2 expects string for threads
-
-        default_mmseqs_args = {
-            "memory_limit_gb": "25",
-            "evalue": "10",
-            "sensitivity": "7.5",
-            "max_seqs_search": "300",
-            "min_seq_id_cluster": "0.8",
-            "max_seqs_profile_msa": "1000",
-        }
-        if mmseqs_args:
-            self.mmseqs_args = {**default_mmseqs_args, **mmseqs_args}
-        else:
-            self.mmseqs_args = default_mmseqs_args
-
-        # Log dayhoff-tools version
-        from dayhoff_tools import __version__
-
-        logger.info(f"dayhoff-tools version: {__version__}")
-        logger.info(
-            f"MMSeqsProfileProcessor initialized with query: {self.query_fasta_path}"
-        )
-        logger.info(f"MMSeqs args: {self.mmseqs_args}")
-        logger.info(f"Num threads: {self.num_threads}")
-
-    def _run_mmseqs_command(
-        self, command_parts: list[str], step_description: str, work_dir: Path
-    ):
-        """Runs an MMseqs2 command and logs its execution.
-
-        Args:
-            command_parts: A list of strings representing the command and its arguments.
-            step_description: A human-readable description of the MMseqs2 step.
-            work_dir: The working directory for the command.
-
-        Raises:
-            subprocess.CalledProcessError: If the MMseqs2 command returns a non-zero exit code.
-        """
-        full_command = " ".join(command_parts)
-        logger.info(f"Running MMseqs2 step in {work_dir}: {step_description}")
-        logger.info(f"Command: {full_command}")
-        try:
-            process = subprocess.run(
-                command_parts,
-                check=True,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                text=True,
-                cwd=work_dir,  # Run command in the specified working directory
-            )
-            if process.stdout:
-                logger.info(f"MMseqs2 stdout: {process.stdout.strip()}")
-            if process.stderr:  # MMseqs often outputs informational messages to stderr
-                logger.info(f"MMseqs2 stderr: {process.stderr.strip()}")
-            logger.info(f"MMseqs2 step '{step_description}' completed successfully.")
-        except subprocess.CalledProcessError as e:
-            logger.error(f"MMseqs2 step '{step_description}' failed in {work_dir}.")
-            if e.stdout:
-                logger.error(f"MMseqs2 stdout: {e.stdout.strip()}")
-            if e.stderr:
-                logger.error(f"MMseqs2 stderr: {e.stderr.strip()}")
-            raise
-
-    def run(self, input_file: str) -> str:
-        """Run MMseqs2 profile search.
-
-        The input_file is the target FASTA. The query FASTA is provided
-        during initialization.
-        The method creates an output directory (e.g., {target_stem})
-        which contains the result files, now named meaningfully using the target stem
-        (e.g., {target_stem}_results.m8 and {target_stem}_hits.fasta).
-
-        Args:
-            input_file: Path to the input target FASTA file.
-
-        Returns:
-            Path to the output directory (e.g., {target_stem}) containing
-            the meaningfully named result files.
-
-        Raises:
-            subprocess.CalledProcessError: If any MMseqs2 command fails.
-            FileNotFoundError: If the input_file is not found.
-        """
-        if not Path(input_file).is_file():
-            raise FileNotFoundError(f"Input target FASTA file not found: {input_file}")
-
-        input_file_path = Path(input_file).resolve()  # Ensure absolute path
-        target_fasta_filename = input_file_path.name
-        target_fasta_stem = input_file_path.stem  # Get stem for naming
-
-        # Create a unique base directory for this run's outputs and temp files
-        # This directory will be returned and subsequently uploaded by the Operator
-        run_base_dir_name = f"{target_fasta_stem}"  # Use stem as the dir name
-        run_base_dir = Path(run_base_dir_name).resolve()
-        run_base_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Created run base directory: {run_base_dir}")
-
-        # Define local paths within the run_base_dir
-        local_target_file = run_base_dir / target_fasta_filename
-        # Copy the target file into the run directory to keep inputs and outputs together
-        shutil.copy(input_file_path, local_target_file)
-        logger.info(f"Copied target file {input_file_path} to {local_target_file}")
-
-        # Query file is already specified by self.query_fasta_path (path in image)
-        local_query_file = Path(self.query_fasta_path).resolve()
-
-        # Temporary directory for MMseqs2 intermediate files, created inside run_base_dir
-        mmseqs_temp_dir = run_base_dir / "mmseqs_tmp"
-        mmseqs_temp_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Created MMseqs2 temporary directory: {mmseqs_temp_dir}")
-
-        # Define INTERMEDIATE output file paths within mmseqs_temp_dir
-        intermediate_results_m8_file = mmseqs_temp_dir / "results.m8"
-        intermediate_results_as_csv_file = mmseqs_temp_dir / "results_as.csv"
-
-        # Define FINAL output file paths within run_base_dir, using target stem
-        final_results_csv_file = run_base_dir / f"{target_fasta_stem}.csv"
-        final_hits_txt_file = run_base_dir / f"{target_fasta_stem}.txt"
-
-        # --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
-        query_db = mmseqs_temp_dir / "queryDB"
-        target_db = mmseqs_temp_dir / "targetDB"
-        # Ensure local_target_file is used for creating targetDB
-        target_db_input_file = local_target_file
-
-        query_db_cluster = mmseqs_temp_dir / "queryDB_cluster"
-        query_db_rep = mmseqs_temp_dir / "queryDB_rep"
-        aln_db = mmseqs_temp_dir / "alnDB"
-        profile_db = mmseqs_temp_dir / "profileDB"
-        result_db = mmseqs_temp_dir / "resultDB"
-
-        try:
-            # 1. Create query database
-            self._run_mmseqs_command(
-                ["mmseqs", "createdb", str(local_query_file), str(query_db)],
-                "Create query DB",
-                run_base_dir,  # Working directory for the command
-            )
-
-            # 2. Create target database
-            self._run_mmseqs_command(
-                ["mmseqs", "createdb", str(target_db_input_file), str(target_db)],
-                "Create target DB",
-                run_base_dir,
-            )
-
-            # 3. Cluster query sequences
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "cluster",
-                    str(query_db),
-                    str(query_db_cluster),
-                    str(
-                        mmseqs_temp_dir / "tmp_cluster"
-                    ),  # MMseqs needs a temp dir for cluster
-                    "--min-seq-id",
-                    self.mmseqs_args["min_seq_id_cluster"],
-                    "--threads",
-                    self.num_threads,
-                ],
-                "Cluster query sequences",
-                run_base_dir,
-            )
-
-            # 4. Create representative set from query clusters
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "createsubdb",
-                    str(query_db_cluster),
-                    str(query_db),
-                    str(query_db_rep),
-                ],
-                "Create representative query set",
-                run_base_dir,
-            )
-
-            # 5. Create MSA for profile generation
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "search",
-                    str(query_db_rep),
-                    str(query_db),  # Search representative against full query DB
-                    str(aln_db),
-                    str(mmseqs_temp_dir / "tmp_search_msa"),  # Temp for this search
-                    "--max-seqs",
-                    self.mmseqs_args["max_seqs_profile_msa"],
-                    "--threads",
-                    self.num_threads,
-                ],
-                "Create MSA for profile",
-                run_base_dir,
-            )
-
-            # 6. Create profile database
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "result2profile",
-                    str(query_db_rep),  # Use query_db_rep as input for profile
-                    str(query_db),  # Full query DB as second arg
-                    str(aln_db),
-                    str(profile_db),
-                    "--threads",  # Added threads option
-                    self.num_threads,
-                ],
-                "Create profile DB",
-                run_base_dir,
-            )
-
-            # 7. Perform profile search
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "search",
-                    str(profile_db),
-                    str(target_db),
-                    str(result_db),
-                    str(mmseqs_temp_dir / "tmp_search_profile"),  # Temp for this search
-                    "--split-memory-limit",
-                    f"{self.mmseqs_args['memory_limit_gb']}G",
-                    "-e",
-                    self.mmseqs_args["evalue"],
-                    "--max-seqs",
-                    self.mmseqs_args["max_seqs_search"],
-                    "--threads",
-                    self.num_threads,
-                    "-s",
-                    self.mmseqs_args["sensitivity"],
-                ],
-                "Perform profile search",
-                run_base_dir,
-            )
-
-            # 8. Convert results to tabular format (M8) -> to intermediate file
-            self._run_mmseqs_command(
-                [
-                    "mmseqs",
-                    "convertalis",
-                    str(profile_db),  # Query DB used for search (profileDB)
-                    str(target_db),
-                    str(result_db),
-                    str(intermediate_results_m8_file),  # Output M8 file to temp dir
-                    "--threads",
-                    self.num_threads,
-                ],
-                "Convert results to M8",
-                run_base_dir,
-            )
-
-            # 8.5 Convert M8 to CSV with headers
-            logger.info(
-                f"Converting M8 results to CSV: {intermediate_results_m8_file} -> {intermediate_results_as_csv_file}"
-            )
-            csv_headers = [
-                "query_id",
-                "target_id",
-                "percent_identity",
-                "alignment_length",
-                "mismatches",
-                "gap_openings",
-                "query_start",
-                "query_end",
-                "target_start",
-                "target_end",
-                "e_value",
-                "bit_score",
-            ]
-            try:
-                if not intermediate_results_m8_file.exists():
-                    logger.warning(
-                        f"M8 results file {intermediate_results_m8_file} not found. CSV will be empty."
-                    )
-                    # Create an empty CSV with headers if M8 is missing
-                    with open(
-                        intermediate_results_as_csv_file, "w", newline=""
-                    ) as csvfile:
-                        writer = csv.writer(csvfile)
-                        writer.writerow(csv_headers)
-                else:
-                    with (
-                        open(intermediate_results_m8_file, "r") as m8file,
-                        open(
-                            intermediate_results_as_csv_file, "w", newline=""
-                        ) as csvfile,
-                    ):
-                        writer = csv.writer(csvfile)
-                        writer.writerow(csv_headers)
-                        for line in m8file:
-                            writer.writerow(line.strip().split("\t"))
-            except Exception as e:
-                logger.error(f"Error converting M8 to CSV: {e}", exc_info=True)
-                # Ensure an empty csv is created on error to prevent downstream issues
-                if not intermediate_results_as_csv_file.exists():
-                    with open(
-                        intermediate_results_as_csv_file, "w", newline=""
-                    ) as csvfile:
-                        writer = csv.writer(csvfile)
-                        writer.writerow(csv_headers)  # write headers even on error
-
-            # 9. Extract hit sequence IDs from M8 results for the TXT file
-            hit_sequence_ids = set()
-            logger.info(
-                f"Extracting hit IDs from {intermediate_results_m8_file} for TXT output."
-            )
-            try:
-                if intermediate_results_m8_file.exists():
-                    with open(intermediate_results_m8_file, "r") as m8_file:
-                        for line in m8_file:
-                            if line.strip():  # Check if line is not empty
-                                columns = line.strip().split("\t")
-                                if len(columns) >= 2:
-                                    hit_sequence_ids.add(
-                                        columns[1]
-                                    )  # Add target_accession
-                    logger.info(
-                        f"Found {len(hit_sequence_ids)} unique hit IDs in M8 file."
-                    )
-                else:
-                    logger.warning(
-                        f"Intermediate M8 file {intermediate_results_m8_file} not found. Hit TXT file will be empty."
-                    )
-            except Exception as e:
-                logger.error(
-                    f"Error reading M8 file {intermediate_results_m8_file} for hit ID extraction: {e}",
-                    exc_info=True,
-                )
-                # Proceed even if M8 reading fails, TXT will be empty
-
-            # 10. Write the set of hit sequence IDs to the final .txt file
-            logger.info(
-                f"Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
-            )
-            try:
-                with open(final_hits_txt_file, "w") as txt_out:
-                    # Sort IDs for consistent output
-                    for seq_id in sorted(list(hit_sequence_ids)):
-                        txt_out.write(f"{seq_id}\n")
-                logger.info(f"Successfully wrote hit IDs to {final_hits_txt_file}")
-            except Exception as e:
-                logger.error(
-                    f"Failed to write hit IDs to {final_hits_txt_file}: {e}",
-                    exc_info=True,
-                )
-                # Ensure the file exists even if writing fails
-                if not final_hits_txt_file.exists():
-                    final_hits_txt_file.touch()
-
-            logger.info(
-                f"PROCESSOR: MMseqs2 workflow and FASTA/TXT generation completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
-            )
-
-            # Move and rename final output files from mmseqs_temp_dir to run_base_dir
-            if intermediate_results_as_csv_file.exists():
-                shutil.move(
-                    str(intermediate_results_as_csv_file), str(final_results_csv_file)
-                )
-                logger.info(
-                    f"Moved and renamed M8 results to CSV: {final_results_csv_file}"
-                )
-            else:
-                logger.warning(
-                    f"Intermediate CSV file {intermediate_results_as_csv_file} not found. Creating empty target CSV file."
-                )
-                final_results_csv_file.touch()  # Create empty file in run_base_dir if not found
-
-            logger.info(
-                f"MMSeqsProfileProcessor run completed for {input_file}. Output CSV: {final_results_csv_file}"
-            )
-
-        except Exception as e:
-            logger.error(
-                f"MMSeqsProfileProcessor failed for {input_file}: {e}", exc_info=True
-            )
-            raise
-        finally:
-            # --- Cleanup --- #
-            logger.info(f"Cleaning up temporary directory: {mmseqs_temp_dir}")
-            if mmseqs_temp_dir.exists():
-                shutil.rmtree(mmseqs_temp_dir)
-            if local_target_file.exists() and local_target_file != Path(input_file):
-                logger.info(
-                    f"Cleaning up local copy of target file: {local_target_file}"
-                )
-                local_target_file.unlink()
-            logger.info("MMSeqsProfileProcessor cleanup finished.")
-
-        return str(run_base_dir)  # Return the path to the directory containing outputs
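Note: this release deletes MMSeqsProfileProcessor outright; the drop in processors.py from 34599 to 16572 bytes in the RECORD section below matches this hunk. The removed run() docstring also lagged the code: it advertises {target_stem}_results.m8 and {target_stem}_hits.fasta, while the code actually wrote {target_stem}.csv and {target_stem}.txt. For anyone pinned to 1.1.39, here is a minimal usage sketch against the API shown above; the FASTA paths and the evalue override are illustrative assumptions, not values from the package:

    # Minimal usage sketch of the removed class, per the 1.1.39 API above.
    # The file paths and the evalue override are illustrative assumptions.
    from dayhoff_tools.deployment.processors import MMSeqsProfileProcessor

    processor = MMSeqsProfileProcessor(
        query_fasta_path_in_image="/app/data/query.fasta",  # must exist on disk
        num_threads=8,
        mmseqs_args={"evalue": "1e-5"},  # merged over the defaults shown above
    )
    # run() takes the target FASTA and returns the run directory "{target_stem}/",
    # which holds {target_stem}.csv (tabular hits) and {target_stem}.txt (hit IDs).
    output_dir = processor.run("targets.fasta")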
@@ -442,35 +442,38 @@ class Embedder(Processor):
         for seq_id, seq in small_seqs_sorted:
             seq_len = len(seq)
 
-            if current_size + seq_len > self.batch_residue_limit:
-                if current_batch:
-                    small_batch_count += 1
-                    logger.info(
-                        f"Processing small batch {small_batch_count}/{total_small_batches} with {len(current_batch)} sequences"
-                    )
-                    batch_results = self.embed_batch(current_batch)
-                    results.update(batch_results)
-                    self.cleanup_memory()
+            # Check if adding this sequence would exceed the limit
+            if current_batch and current_size + seq_len > self.batch_residue_limit:
+                # Process current batch before adding the new sequence
+                small_batch_count += 1
+                logger.info(
+                    f"Processing small batch {small_batch_count}/{total_small_batches} with {len(current_batch)} sequences"
+                )
+                batch_results = self.embed_batch(current_batch)
+                results.update(batch_results)
+                self.cleanup_memory()
 
-                    # Update progress
-                    processed_sequences += len(current_batch)
-                    elapsed_time = time.time() - start_time
-                    remaining_sequences = total_sequences - processed_sequences
-                    avg_time_per_seq = (
-                        elapsed_time / processed_sequences
-                        if processed_sequences > 0
-                        else 0
-                    )
-                    estimated_time_left = avg_time_per_seq * remaining_sequences
+                # Update progress
+                processed_sequences += len(current_batch)
+                elapsed_time = time.time() - start_time
+                remaining_sequences = total_sequences - processed_sequences
+                avg_time_per_seq = (
+                    elapsed_time / processed_sequences
+                    if processed_sequences > 0
+                    else 0
+                )
+                estimated_time_left = avg_time_per_seq * remaining_sequences
 
-                    logger.info(
-                        f"Progress: {processed_sequences}/{total_sequences} sequences ({processed_sequences/total_sequences*100:.1f}%) | "
-                        f"Elapsed: {elapsed_time/60:.1f} min | "
-                        f"Est. remaining: {estimated_time_left/60:.1f} min"
-                    )
+                logger.info(
+                    f"Progress: {processed_sequences}/{total_sequences} sequences ({processed_sequences/total_sequences*100:.1f}%) | "
+                    f"Elapsed: {elapsed_time/60:.1f} min | "
+                    f"Est. remaining: {estimated_time_left/60:.1f} min"
+                )
+                # Start new batch
                 current_batch = []
                 current_size = 0
 
+            # Add the current sequence to the batch
             current_batch.append((seq_id, seq, seq_len))
             current_size += seq_len
 
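The Embedder hunk above is a refactor of the small-sequence batching loop: the old code checked the residue limit first and guarded the flush with a nested `if current_batch:`, so an oversized first sequence still fell through to a pointless reset of the already-empty batch; the new code folds the guard into a single condition and flushes a non-empty batch before the incoming sequence is appended. A standalone sketch of this flush-before-add pattern (the function name and signature are mine, not from the package):

    # Standalone sketch of the flush-before-add batching pattern used above.
    # The function name and types are illustrative, not part of dayhoff-tools.
    def batch_by_residues(
        seqs: list[tuple[str, str]], limit: int
    ) -> list[list[tuple[str, str, int]]]:
        """Group (id, seq) pairs so each batch stays within `limit` total residues."""
        batches: list[list[tuple[str, str, int]]] = []
        current: list[tuple[str, str, int]] = []
        size = 0
        for seq_id, seq in seqs:
            n = len(seq)
            # Flush a non-empty batch before adding a sequence that would overflow it.
            if current and size + n > limit:
                batches.append(current)
                current, size = [], 0
            current.append((seq_id, seq, n))
            size += n
        if current:  # keep the final partial batch
            batches.append(current)
        return batches

    # A sequence longer than `limit` still gets its own batch:
    # batch_by_residues([("a", "MKVLTT"), ("b", "MK")], limit=4)
    # -> [[("a", "MKVLTT", 6)], [("b", "MK", 2)]]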
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dayhoff-tools
-Version: 1.1.39
+Version: 1.1.41
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com
@@ -11,9 +11,9 @@ dayhoff_tools/deployment/deploy_aws.py,sha256=jQyQ0fbm2793jEHFO84lr5tNqiOpdBg6U0
 dayhoff_tools/deployment/deploy_gcp.py,sha256=xgaOVsUDmP6wSEMYNkm1yRNcVskfdz80qJtCulkBIAM,8860
 dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
 dayhoff_tools/deployment/job_runner.py,sha256=hljvFpH2Bw96uYyUup5Ths72PZRL_X27KxlYzBMgguo,5086
-dayhoff_tools/deployment/processors.py,sha256=A7zvF47TfCkuLTCvaqZmk1M9ZgZcv6CAoXZCV6rEXuE,34599
+dayhoff_tools/deployment/processors.py,sha256=f4L52ekx_zYirl8C4WfavxtOioyD-c34TdTJVDoLpWs,16572
 dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
-dayhoff_tools/embedders.py,sha256=svP_ksm3FdyVZ8i8R9R5uoGu2qI_hVQ_eztG0drXkN8,36477
+dayhoff_tools/embedders.py,sha256=fRkyWjHo8OmbNUBY_FwrgfvyiLqpmrpI57UAb1Szn1Y,36609
 dayhoff_tools/fasta.py,sha256=_kA2Cpiy7JAGbBqLrjElkzbcUD_p-nO2d5Aj1LVmOvc,50509
 dayhoff_tools/file_ops.py,sha256=JlGowvr-CUJFidV-4g_JmhUTN9bsYuaxtqKmnKomm-Q,8506
 dayhoff_tools/h5.py,sha256=j1nxxaiHsMidVX_XwB33P1Pz9d7K8ZKiDZwJWQUUQSY,21158
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
 dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
 dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
 dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
-dayhoff_tools-1.1.39.dist-info/METADATA,sha256=c-7TpBB15X71z48kz2BUHFFqWddHgywH88BLequz3d0,2843
-dayhoff_tools-1.1.39.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-dayhoff_tools-1.1.39.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
-dayhoff_tools-1.1.39.dist-info/RECORD,,
+dayhoff_tools-1.1.41.dist-info/METADATA,sha256=HgiBVffpoYUtLMGV4uAKXJCyiPVmo39ytRHQ41b6-hg,2843
+dayhoff_tools-1.1.41.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+dayhoff_tools-1.1.41.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
+dayhoff_tools-1.1.41.dist-info/RECORD,,