dayhoff-tools 1.1.19__py3-none-any.whl → 1.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dayhoff_tools/__init__.py CHANGED
@@ -0,0 +1,10 @@
1
+ import importlib.metadata
2
+
3
+ try:
4
+ # The package name here should match the 'name' field in your pyproject.toml
5
+ __version__ = importlib.metadata.version("dayhoff-tools")
6
+ except importlib.metadata.PackageNotFoundError:
7
+ # Fallback for when the package is not installed (e.g., running from a source
8
+ # checkout without installation, or during development): report the version
9
+ # as "unknown" rather than letting the import fail.
10
+ __version__ = "unknown"
dayhoff_tools/cli/utility_commands.py CHANGED
@@ -437,14 +437,13 @@ def install_dependencies(
437
437
  False,
438
438
  "--install-project",
439
439
  "-p",
440
- help="Install the local project package itself into the environment.",
440
+ help="Install the local project package itself (with 'full' extras) into the environment.",
441
441
  ),
442
442
  ):
443
443
  """Install dependencies based on pyproject.toml.
444
444
 
445
445
  Ensures uv.lock matches pyproject.toml and syncs the environment.
446
- This is the command to run after changing pyproject.toml manually
447
- or cloning/pulling a repository.
446
+ When -p is used, installs the local project (editable, with its [full] optional dependencies) via `uv pip install` instead of running `uv sync`.
448
447
  """
449
448
  # ANSI color codes
450
449
  BLUE = "\033[94m"
@@ -457,15 +456,26 @@ def install_dependencies(
457
456
  print(f"Running command: {BLUE}{' '.join(lock_cmd)}{RESET}")
458
457
  subprocess.run(lock_cmd, check=True, capture_output=True)
459
458
 
460
- # Step 2: Sync environment
461
- print("Syncing environment with lock file...")
462
- sync_cmd = ["uv", "sync", "--all-groups"]
463
- if not install_project:
464
- sync_cmd.append("--no-install-project")
465
- print(f"Running command: {BLUE}{' '.join(sync_cmd)}{RESET}")
466
- subprocess.run(sync_cmd, check=True)
459
+ if install_project:
460
+ # Step 2a: Install the project with 'full' extras
461
+ print("Installing the local project with 'full' extras...")
462
+ # The .[full] syntax tells pip to install the current project ('.')
463
+ # with its 'full' optional dependencies.
464
+ pip_install_cmd = ["uv", "pip", "install", "-e", ".[full]"]
465
+ print(f"Running command: {BLUE}{' '.join(pip_install_cmd)}{RESET}")
466
+ subprocess.run(pip_install_cmd, check=True)
467
467
 
468
- print("Dependencies installed/synced successfully.")
468
+ print("Project installed with 'full' extras successfully.")
469
+ else:
470
+ # Original behavior: Sync environment without installing the project
471
+ print(
472
+ "Syncing environment with lock file (project itself will not be installed)..."
473
+ )
474
+ # --all-groups ensures all non-project dependencies (like dev) are installed
475
+ sync_cmd = ["uv", "sync", "--all-groups", "--no-install-project"]
476
+ print(f"Running command: {BLUE}{' '.join(sync_cmd)}{RESET}")
477
+ subprocess.run(sync_cmd, check=True)
478
+ print("Dependencies synced successfully (project not installed).")
469
479
 
470
480
  except subprocess.CalledProcessError as e:
471
481
  stderr_output = e.stderr.decode() if e.stderr else "No stderr output."
dayhoff_tools/deployment/processors.py CHANGED
@@ -1,3 +1,4 @@
1
+ import csv
1
2
  import logging
2
3
  import os
3
4
  import shlex
@@ -6,6 +7,8 @@ import subprocess
6
7
  from abc import ABC, abstractmethod
7
8
  from pathlib import Path
8
9
 
10
+ from dayhoff_tools.fasta import subset_fasta
11
+
9
12
  logger = logging.getLogger(__name__)
10
13
 
11
14
 
@@ -267,10 +270,12 @@ class MMSeqsProfileProcessor(Processor):
267
270
  # Define INTERMEDIATE output file paths within mmseqs_temp_dir
268
271
  intermediate_results_m8_file = mmseqs_temp_dir / "results.m8"
269
272
  intermediate_hits_fasta_file = mmseqs_temp_dir / "results.fasta"
273
+ intermediate_results_as_csv_file = mmseqs_temp_dir / "results_as.csv"
270
274
 
271
275
  # Define FINAL output file paths within run_base_dir, using target stem
272
- final_results_m8_file = run_base_dir / f"{target_fasta_stem}_results.m8"
273
- final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}_hits.fasta"
276
+ final_results_csv_file = run_base_dir / f"{target_fasta_stem}.csv"
277
+ final_hits_fasta_file = run_base_dir / f"{target_fasta_stem}.fasta"
278
+ final_hits_txt_file = run_base_dir / f"{target_fasta_stem}.txt"
274
279
 
275
280
  # --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
276
281
  query_db = mmseqs_temp_dir / "queryDB"
@@ -405,40 +410,147 @@ class MMSeqsProfileProcessor(Processor):
405
410
  run_base_dir,
406
411
  )
407
412
 
408
- # 9. Extract hit sequences directly to FASTA using createseqfiledb
409
- self._run_mmseqs_command(
410
- [
411
- "mmseqs",
412
- "createseqfiledb",
413
- str(target_db), # <i:sequenceDB> - The DB to pull sequences from
414
- str(
415
- result_db
416
- ), # <i:resultDB> - Contains IDs of target sequences to pull
417
- str(
418
- intermediate_hits_fasta_file
419
- ), # <o:fastaDB> - Output FASTA file
420
- "--threads", # createseqfiledb supports --threads
421
- self.num_threads,
422
- ],
423
- "Extract hit sequences to FASTA via createseqfiledb",
424
- run_base_dir,
413
+ # 8.5 Convert M8 to CSV with headers
414
+ logger.info(
415
+ f"Converting M8 results {intermediate_results_m8_file} to CSV {intermediate_results_as_csv_file}"
416
+ )
417
+ csv_headers = [
418
+ "query_id",
419
+ "target_id",
420
+ "percent_identity",
421
+ "alignment_length",
422
+ "mismatches",
423
+ "gap_openings",
424
+ "query_start",
425
+ "query_end",
426
+ "target_start",
427
+ "target_end",
428
+ "e_value",
429
+ "bit_score",
430
+ ]
431
+ try:
432
+ if intermediate_results_m8_file.exists():
433
+ with open(intermediate_results_m8_file, "r") as m8_in, open(
434
+ intermediate_results_as_csv_file, "w", newline=""
435
+ ) as csv_out:
436
+ writer = csv.writer(csv_out)
437
+ writer.writerow(csv_headers)
438
+ for line in m8_in:
439
+ if line.strip(): # Ensure line is not empty
440
+ columns = line.strip().split("\t")
441
+ writer.writerow(columns)
442
+ logger.info(
443
+ f"Successfully converted M8 to CSV: {intermediate_results_as_csv_file}"
444
+ )
445
+ else:
446
+ logger.warning(
447
+ f"Intermediate M8 file {intermediate_results_m8_file} not found for CSV conversion. CSV will be empty or not created."
448
+ )
449
+ # Ensure the CSV file is touched if M8 was missing, so downstream move doesn't fail
450
+ intermediate_results_as_csv_file.touch()
451
+ except Exception as e:
452
+ logger.error(f"Failed to convert M8 to CSV: {e}")
453
+ # Touch the CSV file in case of error to prevent downstream errors if it's expected
454
+ if not intermediate_results_as_csv_file.exists():
455
+ intermediate_results_as_csv_file.touch()
456
+ # NOTE: the error is logged but not re-raised, so the workflow continues without a complete CSV; re-raise here if a CSV conversion failure should abort the run.
457
+
458
+ # 9. Extract hit sequences from M8 results using subset_fasta
459
+ logger.info(f"Parsing M8 results from: {intermediate_results_m8_file}")
460
+ hit_sequence_ids = set()
461
+ try:
462
+ if not intermediate_results_m8_file.exists():
463
+ logger.warning(
464
+ f"M8 results file {intermediate_results_m8_file} not found. Hits FASTA will be empty."
465
+ )
466
+ intermediate_hits_fasta_file.touch() # Create empty hits file
467
+ else:
468
+ with open(intermediate_results_m8_file, "r") as m8_file:
469
+ for line in m8_file:
470
+ if line.strip(): # Ensure line is not empty
471
+ columns = line.strip().split("\t")
472
+ if len(columns) >= 2:
473
+ hit_sequence_ids.add(
474
+ columns[1]
475
+ ) # Target ID is the second column
476
+ logger.info(
477
+ f"Found {len(hit_sequence_ids)} unique target IDs in M8 results."
478
+ )
479
+
480
+ if not hit_sequence_ids:
481
+ logger.warning(
482
+ f"No target IDs found in {intermediate_results_m8_file} after parsing. Output hits FASTA will be empty."
483
+ )
484
+ intermediate_hits_fasta_file.touch() # Create empty file
485
+ else:
486
+ logger.info(
487
+ f"Extracting {len(hit_sequence_ids)} hit sequences from {local_target_file} to {intermediate_hits_fasta_file} using subset_fasta."
488
+ )
489
+ try:
490
+ subset_fasta(
491
+ fasta_file=str(local_target_file),
492
+ output_path=str(intermediate_hits_fasta_file),
493
+ target_ids=hit_sequence_ids,
494
+ exclude=False,
495
+ return_written_ids=False,
496
+ )
497
+ logger.info(
498
+ f"Successfully created hits FASTA: {intermediate_hits_fasta_file} using subset_fasta."
499
+ )
500
+ except FileNotFoundError as e:
501
+ logger.error(
502
+ f"subset_fasta FileNotFoundError: {e}. Ensuring {intermediate_hits_fasta_file} exists as empty."
503
+ )
504
+ if not intermediate_hits_fasta_file.exists():
505
+ intermediate_hits_fasta_file.touch()
506
+ raise
507
+ except Exception as e:
508
+ logger.error(
509
+ f"subset_fasta failed to create {intermediate_hits_fasta_file}: {e}"
510
+ )
511
+ if not intermediate_hits_fasta_file.exists():
512
+ intermediate_hits_fasta_file.touch()
513
+ raise
514
+ except Exception as e:
515
+ logger.error(
516
+ f"Error processing M8 file {intermediate_results_m8_file} or its hits extraction: {e}"
517
+ )
518
+ if not intermediate_hits_fasta_file.exists():
519
+ intermediate_hits_fasta_file.touch()
520
+ raise
521
+
522
+ # 10. Write the set of hit sequence IDs to a .txt file
523
+ logger.info(
524
+ f"Writing {len(hit_sequence_ids)} hit sequence IDs to {final_hits_txt_file}"
425
525
  )
526
+ try:
527
+ with open(final_hits_txt_file, "w") as txt_out:
528
+ for seq_id in sorted(
529
+ list(hit_sequence_ids)
530
+ ): # Sort for consistent output
531
+ txt_out.write(f"{seq_id}\n")
532
+ logger.info(f"Successfully wrote hit IDs to {final_hits_txt_file}")
533
+ except Exception as e:
534
+ logger.error(f"Failed to write hit IDs to {final_hits_txt_file}: {e}")
535
+ # The main workflow should still proceed even if this supplementary file fails
426
536
 
427
537
  logger.info(
428
538
  f"MMseqs2 workflow completed successfully. Intermediate outputs in {mmseqs_temp_dir}"
429
539
  )
430
540
 
431
541
  # Move and rename final output files from mmseqs_temp_dir to run_base_dir
432
- if intermediate_results_m8_file.exists():
542
+ if intermediate_results_as_csv_file.exists():
433
543
  shutil.move(
434
- str(intermediate_results_m8_file), str(final_results_m8_file)
544
+ str(intermediate_results_as_csv_file), str(final_results_csv_file)
545
+ )
546
+ logger.info(
547
+ f"Moved and renamed M8 results to CSV: {final_results_csv_file}"
435
548
  )
436
- logger.info(f"Moved and renamed M8 results to {final_results_m8_file}")
437
549
  else:
438
550
  logger.warning(
439
- f"Intermediate M8 file {intermediate_results_m8_file} not found. Creating empty target file."
551
+ f"Intermediate CSV file {intermediate_results_as_csv_file} not found. Creating empty target CSV file."
440
552
  )
441
- final_results_m8_file.touch() # Create empty file in run_base_dir if not found
553
+ final_results_csv_file.touch() # Create empty file in run_base_dir if not found
442
554
 
443
555
  if intermediate_hits_fasta_file.exists():
444
556
  shutil.move(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dayhoff-tools
3
- Version: 1.1.19
3
+ Version: 1.1.21
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -1,17 +1,17 @@
1
- dayhoff_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
1
+ dayhoff_tools/__init__.py,sha256=M5zThPyEBRYa5CfwlzKhcqTevWn3OKu62cjV6Zqie2A,469
2
2
  dayhoff_tools/chemistry/standardizer.py,sha256=uMn7VwHnx02nc404eO6fRuS4rsl4dvSPf2ElfZDXEpY,11188
3
3
  dayhoff_tools/chemistry/utils.py,sha256=jt-7JgF-GeeVC421acX-bobKbLU_X94KNOW24p_P-_M,2257
4
4
  dayhoff_tools/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  dayhoff_tools/cli/cloud_commands.py,sha256=NGux28-cjDyCADF-L1tjdEMzkCMYX8V4xNvpK6EWcZA,40802
6
6
  dayhoff_tools/cli/main.py,sha256=47EGb28ALaYFc7oAUGlY1D66AIDmc4RZiXxN-gPVrpQ,4519
7
7
  dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2zQR6o6M,5614
8
- dayhoff_tools/cli/utility_commands.py,sha256=PRShdh4O35JfjVxTOsduZ8-grGF5-SfXihcptcFT-hk,23584
8
+ dayhoff_tools/cli/utility_commands.py,sha256=08MMLlQZEjJ_r7Cd6b92aYYPjH7gMvMQvCOcpoNnFVo,24352
9
9
  dayhoff_tools/deployment/base.py,sha256=8tXwsPYvRo-zV-aNhHw1c7Rji-KWg8S5xoCCznFnVVI,17412
10
10
  dayhoff_tools/deployment/deploy_aws.py,sha256=O0gQxHioSU_sNU8T8MD4wSOPvWc--V8eRRZzlRu035I,16446
11
11
  dayhoff_tools/deployment/deploy_gcp.py,sha256=DxBM4sUzwPK9RWLP9bSfr38n1HHl-TVrp4TsbdN8pUA,5795
12
12
  dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
13
13
  dayhoff_tools/deployment/job_runner.py,sha256=4tmdplpvqSE9bVxRWHo2U5kwkYrYod0Uwzpg2Q7qG5o,4850
14
- dayhoff_tools/deployment/processors.py,sha256=4a8fYFilcbBGAVUeowjL5n-Pa7NIL3Jmhs69lCQIkpQ,19554
14
+ dayhoff_tools/deployment/processors.py,sha256=0D2r2-NgVcFfxF0QSqLZy86cxEWSn78jLTKYf60fLBQ,25519
15
15
  dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
16
16
  dayhoff_tools/embedders.py,sha256=CRgcb2z7KeeFrRQawyUZuJ4Yi0-J5jSr0hwuRhjG_FI,36513
17
17
  dayhoff_tools/fasta.py,sha256=e7xw3pInoupqCGE0-fJTOzmW_earL1M7qPyoqIPfUT4,46269
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
26
26
  dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
27
27
  dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
28
28
  dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
29
- dayhoff_tools-1.1.19.dist-info/METADATA,sha256=40SO5LPPs3GIs_bHfr5jt-TpZUb05-5EHz6Uo09TdxI,2225
30
- dayhoff_tools-1.1.19.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
- dayhoff_tools-1.1.19.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
- dayhoff_tools-1.1.19.dist-info/RECORD,,
29
+ dayhoff_tools-1.1.21.dist-info/METADATA,sha256=R6Q1CWHxPpXkuYs0OwJ-WSSb9pnPIS8V4xbZHKjugZE,2225
30
+ dayhoff_tools-1.1.21.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
31
+ dayhoff_tools-1.1.21.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
32
+ dayhoff_tools-1.1.21.dist-info/RECORD,,