dayhoff-tools 1.1.18__py3-none-any.whl → 1.1.20__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -437,14 +437,13 @@ def install_dependencies(
437
437
  False,
438
438
  "--install-project",
439
439
  "-p",
440
- help="Install the local project package itself into the environment.",
440
+ help="Install the local project package itself (with 'full' extras) into the environment.",
441
441
  ),
442
442
  ):
443
443
  """Install dependencies based on pyproject.toml.
444
444
 
445
445
  Ensures uv.lock matches pyproject.toml and syncs the environment.
446
- This is the command to run after changing pyproject.toml manually
447
- or cloning/pulling a repository.
446
+ When -p is used, installs the local project with its [full] optional dependencies.
448
447
  """
449
448
  # ANSI color codes
450
449
  BLUE = "\033[94m"
@@ -457,15 +456,35 @@ def install_dependencies(
457
456
  print(f"Running command: {BLUE}{' '.join(lock_cmd)}{RESET}")
458
457
  subprocess.run(lock_cmd, check=True, capture_output=True)
459
458
 
460
- # Step 2: Sync environment
461
- print("Syncing environment with lock file...")
462
- sync_cmd = ["uv", "sync", "--all-groups"]
463
- if not install_project:
464
- sync_cmd.append("--no-install-project")
465
- print(f"Running command: {BLUE}{' '.join(sync_cmd)}{RESET}")
466
- subprocess.run(sync_cmd, check=True)
467
-
468
- print("Dependencies installed/synced successfully.")
459
+ if install_project:
460
+ # Step 2a: Install the project with 'full' extras
461
+ print("Installing the local project with 'full' extras...")
462
+ # The .[full] syntax tells pip to install the current project ('.')
463
+ # with its 'full' optional dependencies.
464
+ pip_install_cmd = ["uv", "pip", "install", "-e", ".[full]"]
465
+ print(f"Running command: {BLUE}{' '.join(pip_install_cmd)}{RESET}")
466
+ subprocess.run(pip_install_cmd, check=True)
467
+
468
+ # Step 2b: Sync other dependency groups (e.g., dev)
469
+ # We use --no-install-project here because the project itself was just installed.
470
+ # --all-groups will ensure other groups like 'dev' are synced.
471
+ print("Syncing other dependency groups (e.g., dev)...")
472
+ sync_cmd = ["uv", "sync", "--all-groups", "--no-install-project"]
473
+ print(f"Running command: {BLUE}{' '.join(sync_cmd)}{RESET}")
474
+ subprocess.run(sync_cmd, check=True)
475
+ print(
476
+ "Project installed with 'full' extras and other dependency groups synced successfully."
477
+ )
478
+ else:
479
+ # Original behavior: Sync environment without installing the project
480
+ print(
481
+ "Syncing environment with lock file (project itself will not be installed)..."
482
+ )
483
+ # --all-groups ensures all non-project dependencies (like dev) are installed
484
+ sync_cmd = ["uv", "sync", "--all-groups", "--no-install-project"]
485
+ print(f"Running command: {BLUE}{' '.join(sync_cmd)}{RESET}")
486
+ subprocess.run(sync_cmd, check=True)
487
+ print("Dependencies synced successfully (project not installed).")
469
488
 
470
489
  except subprocess.CalledProcessError as e:
471
490
  stderr_output = e.stderr.decode() if e.stderr else "No stderr output."
@@ -1,3 +1,4 @@
1
+ import csv
1
2
  import logging
2
3
  import os
3
4
  import shlex
@@ -6,6 +7,8 @@ import subprocess
6
7
  from abc import ABC, abstractmethod
7
8
  from pathlib import Path
8
9
 
10
+ from dayhoff_tools.fasta import subset_fasta
11
+
9
12
  logger = logging.getLogger(__name__)
10
13
 
11
14
 
@@ -267,10 +270,12 @@ class MMSeqsProfileProcessor(Processor):
267
270
  # Define INTERMEDIATE output file paths within mmseqs_temp_dir
268
271
  intermediate_results_m8_file = mmseqs_temp_dir / "results.m8"
269
272
  intermediate_hits_fasta_file = mmseqs_temp_dir / "results.fasta"
273
+ intermediate_results_as_csv_file = mmseqs_temp_dir / "results_as.csv"
270
274
 
271
275
  # Define FINAL output file paths within run_base_dir, using target stem
272
- final_results_m8_file = run_base_dir / f"{target_fasta_stem}_results.m8"
273
- final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}_hits.fasta"
276
+ final_results_csv_file = run_base_dir / f"{target_fasta_stem}.csv"
277
+ final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}.fasta"
278
+ final_hits_txt_file = run_base_dir / f"{target_fasta_stem}.txt"
274
279
 
275
280
  # --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
276
281
  query_db = mmseqs_temp_dir / "queryDB"
@@ -405,36 +410,147 @@ class MMSeqsProfileProcessor(Processor):
405
410
  run_base_dir,
406
411
  )
407
412
 
408
- # 9. Extract hit sequences directly to FASTA using result2flat
409
- self._run_mmseqs_command(
410
- [
411
- "mmseqs",
412
- "createseqfiledb",
413
- str(result_db), # resultDB containing target hits
414
- str(target_db), # targetDB containing actual sequences
415
- str(intermediate_hits_fasta_file), # output FASTA
416
- "--db-output",
417
- "1", # Output in FASTA format
418
- ],
419
- "Extract hit sequences to FASTA via createseqfiledb",
420
- run_base_dir,
413
+ # 8.5 Convert M8 to CSV with headers
414
+ logger.info(
415
+ f"Converting M8 results {intermediate_results_m8_file} to CSV {intermediate_results_as_csv_file}"
421
416
  )
417
+ csv_headers = [
418
+ "query_id",
419
+ "target_id",
420
+ "percent_identity",
421
+ "alignment_length",
422
+ "mismatches",
423
+ "gap_openings",
424
+ "query_start",
425
+ "query_end",
426
+ "target_start",
427
+ "target_end",
428
+ "e_value",
429
+ "bit_score",
430
+ ]
431
+ try:
432
+ if intermediate_results_m8_file.exists():
433
+ with open(intermediate_results_m8_file, "r") as m8_in, open(
434
+ intermediate_results_as_csv_file, "w", newline=""
435
+ ) as csv_out:
436
+ writer = csv.writer(csv_out)
437
+ writer.writerow(csv_headers)
438
+ for line in m8_in:
439
+ if line.strip(): # Ensure line is not empty
440
+ columns = line.strip().split("\t")
441
+ writer.writerow(columns)
442
+ logger.info(
443
+ f"Successfully converted M8 to CSV: {intermediate_results_as_csv_file}"
444
+ )
445
+ else:
446
+ logger.warning(
447
+ f"Intermediate M8 file {intermediate_results_m8_file} not found for CSV conversion. CSV will be empty or not created."
448
+ )
449
+ # Ensure the CSV file is touched if M8 was missing, so downstream move doesn't fail
450
+ intermediate_results_as_csv_file.touch()
451
+ except Exception as e:
452
+ logger.error(f"Failed to convert M8 to CSV: {e}")
453
+ # Touch the CSV file in case of error to prevent downstream errors if it's expected
454
+ if not intermediate_results_as_csv_file.exists():
455
+ intermediate_results_as_csv_file.touch()
456
+ # We might still want to raise e here, depending on desired error handling for CSV conversion failure
457
+
458
+ # 9. Extract hit sequences from M8 results using subset_fasta
459
+ logger.info(f"Parsing M8 results from: {intermediate_results_m8_file}")
460
+ hit_sequence_ids = set()
461
+ try:
462
+ if not intermediate_results_m8_file.exists():
463
+ logger.warning(
464
+ f"M8 results file {intermediate_results_m8_file} not found. Hits FASTA will be empty."
465
+ )
466
+ intermediate_hits_fasta_file.touch() # Create empty hits file
467
+ else:
468
+ with open(intermediate_results_m8_file, "r") as m8_file:
469
+ for line in m8_file:
470
+ if line.strip(): # Ensure line is not empty
471
+ columns = line.strip().split("\t")
472
+ if len(columns) >= 2:
473
+ hit_sequence_ids.add(
474
+ columns[1]
475
+ ) # Target ID is the second column
476
+ logger.info(
477
+ f"Found {len(hit_sequence_ids)} unique target IDs in M8 results."
478
+ )
479
+
480
+ if not hit_sequence_ids:
481
+ logger.warning(
482
+ f"No target IDs found in {intermediate_results_m8_file} after parsing. Output hits FASTA will be empty."
483
+ )
484
+ intermediate_hits_fasta_file.touch() # Create empty file
485
+ else:
486
+ logger.info(
487
+ f"Extracting {len(hit_sequence_ids)} hit sequences from {local_target_file} to {intermediate_hits_fasta_file} using subset_fasta."
488
+ )
489
+ try:
490
+ subset_fasta(
491
+ fasta_file=str(local_target_file),
492
+ output_path=str(intermediate_hits_fasta_file),
493
+ target_ids=hit_sequence_ids,
494
+ exclude=False,
495
+ return_written_ids=False,
496
+ )
497
+ logger.info(
498
+ f"Successfully created hits FASTA: {intermediate_hits_fasta_file} using subset_fasta."
499
+ )
500
+ except FileNotFoundError as e:
501
+ logger.error(
502
+ f"subset_fasta FileNotFoundError: {e}. Ensuring {intermediate_hits_fasta_file} exists as empty."
503
+ )
504
+ if not intermediate_hits_fasta_file.exists():
505
+ intermediate_hits_fasta_file.touch()
506
+ raise
507
+ except Exception as e:
508
+ logger.error(
509
+ f"subset_fasta failed to create {intermediate_hits_fasta_file}: {e}"
510
+ )
511
+ if not intermediate_hits_fasta_file.exists():
512
+ intermediate_hits_fasta_file.touch()
513
+ raise
514
+ except Exception as e:
515
+ logger.error(
516
+ f"Error processing M8 file {intermediate_results_m8_file} or its hits extraction: {e}"
517
+ )
518
+ if not intermediate_hits_fasta_file.exists():
519
+ intermediate_hits_fasta_file.touch()
520
+ raise
521
+
522
+ # 10. Write the set of hit sequence IDs to a .txt file
523
+ logger.info(
524
+ f"Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
525
+ )
526
+ try:
527
+ with open(final_hits_txt_file, "w") as txt_out:
528
+ for seq_id in sorted(
529
+ list(hit_sequence_ids)
530
+ ): # Sort for consistent output
531
+ txt_out.write(f"{seq_id}\n")
532
+ logger.info(f"Successfully wrote hit IDs to {final_hits_txt_file}")
533
+ except Exception as e:
534
+ logger.error(f"Failed to write hit IDs to {final_hits_txt_file}: {e}")
535
+ # The main workflow should still proceed even if this supplementary file fails
422
536
 
423
537
  logger.info(
424
538
  f"MMseqs2 workflow completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
425
539
  )
426
540
 
427
541
  # Move and rename final output files from mmseqs_temp_dir to run_base_dir
428
- if intermediate_results_m8_file.exists():
542
+ if intermediate_results_as_csv_file.exists():
429
543
  shutil.move(
430
- str(intermediate_results_m8_file), str(final_results_m8_file)
544
+ str(intermediate_results_as_csv_file), str(final_results_csv_file)
545
+ )
546
+ logger.info(
547
+ f"Moved and renamed M8 results to CSV: {final_results_csv_file}"
431
548
  )
432
- logger.info(f"Moved and renamed M8 results to {final_results_m8_file}")
433
549
  else:
434
550
  logger.warning(
435
- f"Intermediate M8 file {intermediate_results_m8_file} not found. Creating empty target file."
551
+ f"Intermediate CSV file {intermediate_results_as_csv_file} not found. Creating empty target CSV file."
436
552
  )
437
- final_results_m8_file.touch() # Create empty file in run_base_dir if not found
553
+ final_results_csv_file.touch() # Create empty file in run_base_dir if not found
438
554
 
439
555
  if intermediate_hits_fasta_file.exists():
440
556
  shutil.move(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dayhoff-tools
3
- Version: 1.1.18
3
+ Version: 1.1.20
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -5,13 +5,13 @@ dayhoff_tools/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
5
5
  dayhoff_tools/cli/cloud_commands.py,sha256=NGux28-cjDyCADF-L1tjdEMzkCMYX8V4xNvpK6EWcZA,40802
6
6
  dayhoff_tools/cli/main.py,sha256=47EGb28ALaYFc7oAUGlY1D66AIDmc4RZiXxN-gPVrpQ,4519
7
7
  dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2zQR6o6M,5614
8
- dayhoff_tools/cli/utility_commands.py,sha256=PRShdh4O35JfjVxTOsduZ8-grGF5-SfXihcptcFT-hk,23584
8
+ dayhoff_tools/cli/utility_commands.py,sha256=_8l5o0il_wyC96Bx_UHvHRrdnaJUgji7D0L6VYsdops,24918
9
9
  dayhoff_tools/deployment/base.py,sha256=8tXwsPYvRo-zV-aNhHw1c7Rji-KWg8S5xoCCznFnVVI,17412
10
10
  dayhoff_tools/deployment/deploy_aws.py,sha256=O0gQxHioSU_sNU8T8MD4wSOPvWc--V8eRRZzlRu035I,16446
11
11
  dayhoff_tools/deployment/deploy_gcp.py,sha256=DxBM4sUzwPK9RWLP9bSfr38n1HHl-TVrp4TsbdN8pUA,5795
12
12
  dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
13
13
  dayhoff_tools/deployment/job_runner.py,sha256=4tmdplpvqSE9bVxRWHo2U5kwkYrYod0Uwzpg2Q7qG5o,4850
14
- dayhoff_tools/deployment/processors.py,sha256=1msmRHjSToup2f7DQVpAfpFkBMSzMh1YG_7WJEpChWg,19382
14
+ dayhoff_tools/deployment/processors.py,sha256=0D2r2-NgVcFfxF0QSqLZy86cxEWSn78jLTKYf60fLBQ,25519
15
15
  dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
16
16
  dayhoff_tools/embedders.py,sha256=CRgcb2z7KeeFrRQawyUZuJ4Yi0-J5jSr0hwuRhjG_FI,36513
17
17
  dayhoff_tools/fasta.py,sha256=e7xw3pInoupqCGE0-fJTOzmW_earL1M7qPyoqIPfUT4,46269
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
26
26
  dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
27
27
  dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
28
28
  dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
29
- dayhoff_tools-1.1.18.dist-info/METADATA,sha256=9uJXFcLDGpuxssU_xBvmehDR58gPs-gFtKShHhEw4u4,2225
30
- dayhoff_tools-1.1.18.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
- dayhoff_tools-1.1.18.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
- dayhoff_tools-1.1.18.dist-info/RECORD,,
29
+ dayhoff_tools-1.1.20.dist-info/METADATA,sha256=FfqFxLcWkWI83uz38_0HKCCwfOB2uHt4q2u2VnZuRXQ,2225
30
+ dayhoff_tools-1.1.20.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
+ dayhoff_tools-1.1.20.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
+ dayhoff_tools-1.1.20.dist-info/RECORD,,