dayhoff-tools 1.1.4__tar.gz → 1.1.6__tar.gz

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (32)
  1. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/PKG-INFO +1 -1
  2. dayhoff_tools-1.1.6/dayhoff_tools/deployment/processors.py +429 -0
  3. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/fasta.py +144 -41
  4. dayhoff_tools-1.1.6/dayhoff_tools/intake/gtdb.py +269 -0
  5. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/pyproject.toml +1 -1
  6. dayhoff_tools-1.1.4/dayhoff_tools/deployment/processors.py +0 -125
  7. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/README.md +0 -0
  8. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/__init__.py +0 -0
  9. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/chemistry/standardizer.py +0 -0
  10. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/chemistry/utils.py +0 -0
  11. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/cli/__init__.py +0 -0
  12. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/cli/cloud_commands.py +0 -0
  13. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/cli/main.py +0 -0
  14. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/cli/swarm_commands.py +0 -0
  15. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/cli/utility_commands.py +0 -0
  16. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/deployment/base.py +0 -0
  17. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/deployment/deploy_aws.py +0 -0
  18. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
  19. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/deployment/deploy_utils.py +0 -0
  20. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/deployment/job_runner.py +0 -0
  21. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/deployment/swarm.py +0 -0
  22. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/embedders.py +0 -0
  23. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/file_ops.py +0 -0
  24. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/h5.py +0 -0
  25. {dayhoff_tools-1.1.4/dayhoff_tools → dayhoff_tools-1.1.6/dayhoff_tools/intake}/gcp.py +0 -0
  26. {dayhoff_tools-1.1.4/dayhoff_tools → dayhoff_tools-1.1.6/dayhoff_tools/intake}/kegg.py +0 -0
  27. {dayhoff_tools-1.1.4/dayhoff_tools → dayhoff_tools-1.1.6/dayhoff_tools/intake}/mmseqs.py +0 -0
  28. {dayhoff_tools-1.1.4/dayhoff_tools → dayhoff_tools-1.1.6/dayhoff_tools/intake}/structure.py +0 -0
  29. {dayhoff_tools-1.1.4/dayhoff_tools → dayhoff_tools-1.1.6/dayhoff_tools/intake}/uniprot.py +0 -0
  30. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/logs.py +0 -0
  31. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/sqlite.py +0 -0
  32. {dayhoff_tools-1.1.4 → dayhoff_tools-1.1.6}/dayhoff_tools/warehouse.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dayhoff-tools
-Version: 1.1.4
+Version: 1.1.6
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com
@@ -0,0 +1,429 @@
1
+ import logging
2
+ import os
3
+ import shlex
4
+ import shutil
5
+ import subprocess
6
+ from abc import ABC, abstractmethod
7
+ from pathlib import Path
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class Processor(ABC):
13
+ """Processes data locally. Abstract class for specific calculations.
14
+ Takes in a single file and produces a single file or folder of outputs."""
15
+
16
+ @abstractmethod
17
+ def run(self, input_file: str) -> str:
18
+ """Do the calculation, including reading from input_file
19
+ and writing to output_file"""
20
+ output_path = "output_file"
21
+
22
+ return output_path
23
+
24
+
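For orientation (an editorial sketch, not part of the released code): a concrete Processor only needs to implement run() and return the path it wrote. The class name and file handling below are purely illustrative.

# Hypothetical subclass, shown only to illustrate the Processor contract.
class UppercaseFastaProcessor(Processor):
    def run(self, input_file: str) -> str:
        output_path = "uppercased.fasta"
        with open(input_file) as src, open(output_path, "w") as dst:
            for line in src:
                # Keep headers as-is; upper-case the sequence lines.
                dst.write(line if line.startswith(">") else line.upper())
        return output_path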
25
+ class BoltzPredictor(Processor):
26
+ """Processor for running Boltz docking predictions.
27
+
28
+ This class wraps the Boltz docking tool to predict protein structures
29
+ from sequence data.
30
+ """
31
+
32
+ def __init__(self, num_workers: int, boltz_options: str | None = None):
33
+ """Initialize the BoltzPredictor.
34
+
35
+ Args:
36
+ num_workers: Number of worker threads to use as a default.
37
+ This can be overridden if --num_workers is present
38
+ in boltz_options.
39
+ boltz_options: A string containing additional command-line options
40
+ to pass to the Boltz predictor. Options should be
41
+ space-separated (e.g., "--option1 value1 --option2").
42
+ """
43
+ self.num_workers = num_workers
44
+ self.boltz_options = boltz_options
45
+
46
+ def run(self, input_file: str) -> str:
47
+ """Run Boltz prediction on the input file.
48
+
49
+ Constructs the command using the input file, default number of workers,
50
+ and any additional options provided via `boltz_options`. If `--num_workers`
51
+ is specified in `boltz_options`, it overrides the default `num_workers`.
52
+
53
+ Args:
54
+ input_file: Path to the input file containing sequences
55
+
56
+ Returns:
57
+ Path to the output directory created by Boltz
58
+
59
+ Raises:
60
+ subprocess.CalledProcessError: If Boltz prediction fails
61
+ """
62
+ # Determine expected output directory name
63
+ input_base = os.path.splitext(os.path.basename(input_file))[0]
64
+ expected_output_dir = f"boltz_results_{input_base}"
65
+ logger.info(f"Expected output directory: {expected_output_dir}")
66
+
67
+ # Start building the command
68
+ cmd = ["boltz", "predict", input_file]
69
+
70
+ # Parse additional options if provided
71
+ additional_args = []
72
+ num_workers_in_opts = False
73
+ if self.boltz_options:
74
+ try:
75
+ parsed_opts = shlex.split(self.boltz_options)
76
+ additional_args.extend(parsed_opts)
77
+ if "--num_workers" in parsed_opts:
78
+ num_workers_in_opts = True
79
+ logger.info(
80
+ f"Using --num_workers from BOLTZ_OPTIONS: {self.boltz_options}"
81
+ )
82
+ except ValueError as e:
83
+ logger.error(f"Error parsing BOLTZ_OPTIONS '{self.boltz_options}': {e}")
84
+ # Decide if we should raise an error or proceed without options
85
+ # For now, proceed without the additional options
86
+ additional_args = [] # Clear potentially partially parsed args
87
+
88
+ # Add num_workers if not specified in options
89
+ if not num_workers_in_opts:
90
+ logger.info(f"Using default num_workers: {self.num_workers}")
91
+ cmd.extend(["--num_workers", str(self.num_workers)])
92
+
93
+ # Add the parsed additional arguments
94
+ cmd.extend(additional_args)
95
+
96
+ # Log the final command
97
+ # Use shlex.join for safer command logging, especially if paths/args have spaces
98
+ try:
99
+ safe_cmd_str = shlex.join(cmd)
100
+ logger.info(f"Running command: {safe_cmd_str}")
101
+ except AttributeError: # shlex.join is Python 3.8+
102
+ logger.info(f"Running command: {' '.join(cmd)}")
103
+
104
+ # Stream output in real-time
105
+ process = subprocess.Popen(
106
+ cmd,
107
+ stdout=subprocess.PIPE,
108
+ stderr=subprocess.STDOUT,
109
+ text=True,
110
+ bufsize=1,
111
+ )
112
+
113
+ stdout = process.stdout
114
+ if stdout:
115
+ for line in iter(stdout.readline, ""):
116
+ logger.info(f"BOLTZ: {line.rstrip()}")
117
+
118
+ # Wait for process to complete
119
+ return_code = process.wait()
120
+ if return_code != 0:
121
+ logger.error(f"Boltz prediction failed with exit code {return_code}")
122
+ raise subprocess.CalledProcessError(return_code, cmd)
123
+
124
+ logger.info(
125
+ f"Boltz prediction completed successfully. Output in {expected_output_dir}"
126
+ )
127
+ return expected_output_dir
128
+
129
+
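A hedged usage sketch for BoltzPredictor (editorial, not part of the diff). It assumes the boltz CLI is installed and on PATH; the input filename and the extra option shown are illustrative only.

# Extra CLI flags depend on the installed boltz version; the one below is illustrative.
predictor = BoltzPredictor(num_workers=4, boltz_options="--recycling_steps 3")
# run() streams Boltz output to the module logger and returns the results
# directory, e.g. "boltz_results_my_sequences" for an input "my_sequences.yaml".
results_dir = predictor.run("my_sequences.yaml")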
130
+ class MMSeqsProfileProcessor(Processor):
131
+ """Processor for running MMseqs2 profile searches.
132
+
133
+ This class wraps the MMseqs2 workflow to perform a profile-based search
134
+ against a target database using a query FASTA.
135
+ """
136
+
137
+ def __init__(
138
+ self,
139
+ query_fasta_path_in_image: str,
140
+ num_threads: int = 8,
141
+ mmseqs_args: dict | None = None,
142
+ ):
143
+ """Initialize the MMSeqsProfileProcessor.
144
+
145
+ Args:
146
+ query_fasta_path_in_image: Path to the query FASTA file. This path is expected
147
+ to be accessible within the execution environment (e.g.,
148
+ packaged in a Docker image).
149
+ num_threads: Number of threads to use for MMseqs2 commands.
150
+ mmseqs_args: A dictionary of additional MMseqs2 parameters.
151
+ Expected keys: "memory_limit_gb", "evalue", "sensitivity",
152
+ "max_seqs_search", "min_seq_id_cluster", "max_seqs_profile_msa".
153
+ Defaults are used if not provided.
154
+ """
155
+ if not Path(query_fasta_path_in_image).is_file():
156
+ raise FileNotFoundError(
157
+ f"Query FASTA file not found at: {query_fasta_path_in_image}"
158
+ )
159
+ self.query_fasta_path = query_fasta_path_in_image
160
+ self.num_threads = str(num_threads) # MMseqs2 expects string for threads
161
+
162
+ default_mmseqs_args = {
163
+ "memory_limit_gb": "30",
164
+ "evalue": "10",
165
+ "sensitivity": "7.5",
166
+ "max_seqs_search": "300",
167
+ "min_seq_id_cluster": "0.8",
168
+ "max_seqs_profile_msa": "1000",
169
+ }
170
+ if mmseqs_args:
171
+ self.mmseqs_args = {**default_mmseqs_args, **mmseqs_args}
172
+ else:
173
+ self.mmseqs_args = default_mmseqs_args
174
+
175
+ logger.info(
176
+ f"MMSeqsProfileProcessor initialized with query: {self.query_fasta_path}"
177
+ )
178
+ logger.info(f"MMSeqs args: {self.mmseqs_args}")
179
+ logger.info(f"Num threads: {self.num_threads}")
180
+
181
+ def _run_mmseqs_command(
182
+ self, command_parts: list[str], step_description: str, work_dir: Path
183
+ ):
184
+ """Runs an MMseqs2 command and logs its execution.
185
+
186
+ Args:
187
+ command_parts: A list of strings representing the command and its arguments.
188
+ step_description: A human-readable description of the MMseqs2 step.
189
+ work_dir: The working directory for the command.
190
+
191
+ Raises:
192
+ subprocess.CalledProcessError: If the MMseqs2 command returns a non-zero exit code.
193
+ """
194
+ full_command = " ".join(command_parts)
195
+ logger.info(f"Running MMseqs2 step in {work_dir}: {step_description}")
196
+ logger.info(f"Command: {full_command}")
197
+ try:
198
+ process = subprocess.run(
199
+ command_parts,
200
+ check=True,
201
+ stdout=subprocess.PIPE,
202
+ stderr=subprocess.PIPE,
203
+ text=True,
204
+ cwd=work_dir, # Run command in the specified working directory
205
+ )
206
+ if process.stdout:
207
+ logger.info(f"MMseqs2 stdout: {process.stdout.strip()}")
208
+ if process.stderr: # MMseqs often outputs informational messages to stderr
209
+ logger.info(f"MMseqs2 stderr: {process.stderr.strip()}")
210
+ logger.info(f"MMseqs2 step '{step_description}' completed successfully.")
211
+ except subprocess.CalledProcessError as e:
212
+ logger.error(f"MMseqs2 step '{step_description}' failed in {work_dir}.")
213
+ if e.stdout:
214
+ logger.error(f"MMseqs2 stdout: {e.stdout.strip()}")
215
+ if e.stderr:
216
+ logger.error(f"MMseqs2 stderr: {e.stderr.strip()}")
217
+ raise
218
+
219
+ def run(self, input_file: str) -> str:
220
+ """Run MMseqs2 profile search.
221
+
222
+ The input_file is the target FASTA. The query FASTA is provided
223
+ during initialization.
224
+
225
+ Args:
226
+ input_file: Path to the input target FASTA file.
227
+
228
+ Returns:
229
+ Path to the output directory containing results.m8 and results.fasta.
230
+
231
+ Raises:
232
+ subprocess.CalledProcessError: If any MMseqs2 command fails.
233
+ FileNotFoundError: If the input_file is not found.
234
+ """
235
+ if not Path(input_file).is_file():
236
+ raise FileNotFoundError(f"Input target FASTA file not found: {input_file}")
237
+
238
+ input_file_path = Path(input_file).resolve() # Ensure absolute path
239
+ target_fasta_filename = input_file_path.name
240
+
241
+ # Create a unique base directory for this run's outputs and temp files
242
+ # This directory will be returned and subsequently uploaded by the Operator
243
+ run_base_dir_name = f"mmseqs_run_{Path(target_fasta_filename).stem}"
244
+ run_base_dir = Path(run_base_dir_name).resolve()
245
+ run_base_dir.mkdir(parents=True, exist_ok=True)
246
+ logger.info(f"Created run base directory: {run_base_dir}")
247
+
248
+ # Define local paths within the run_base_dir
249
+ local_target_file = run_base_dir / target_fasta_filename
250
+ # Copy the target file into the run directory to keep inputs and outputs together
251
+ shutil.copy(input_file_path, local_target_file)
252
+ logger.info(f"Copied target file {input_file_path} to {local_target_file}")
253
+
254
+ # Query file is already specified by self.query_fasta_path (path in image)
255
+ local_query_file = Path(self.query_fasta_path).resolve()
256
+
257
+ # Temporary directory for MMseqs2 intermediate files, created inside run_base_dir
258
+ mmseqs_temp_dir = run_base_dir / "mmseqs_tmp"
259
+ mmseqs_temp_dir.mkdir(parents=True, exist_ok=True)
260
+ logger.info(f"Created MMseqs2 temporary directory: {mmseqs_temp_dir}")
261
+
262
+ # Define output file paths directly within run_base_dir
263
+ local_results_m8_file = run_base_dir / "results.m8"
264
+ local_hits_fasta_file = run_base_dir / "results.fasta"
265
+
266
+ # --- MMseqs2 Workflow Paths (intermediate files in mmseqs_temp_dir) ---
267
+ query_db = mmseqs_temp_dir / "queryDB"
268
+ target_db = mmseqs_temp_dir / "targetDB"
269
+ # Ensure local_target_file is used for creating targetDB
270
+ target_db_input_file = local_target_file
271
+
272
+ query_db_cluster = mmseqs_temp_dir / "queryDB_cluster"
273
+ query_db_rep = mmseqs_temp_dir / "queryDB_rep"
274
+ aln_db = mmseqs_temp_dir / "alnDB"
275
+ profile_db = mmseqs_temp_dir / "profileDB"
276
+ result_db = mmseqs_temp_dir / "resultDB"
277
+ hits_db = mmseqs_temp_dir / "hitsDB"
278
+
279
+ try:
280
+ # 1. Create query database
281
+ self._run_mmseqs_command(
282
+ ["mmseqs", "createdb", str(local_query_file), str(query_db)],
283
+ "Create query DB",
284
+ run_base_dir, # Working directory for the command
285
+ )
286
+
287
+ # 2. Create target database
288
+ self._run_mmseqs_command(
289
+ ["mmseqs", "createdb", str(target_db_input_file), str(target_db)],
290
+ "Create target DB",
291
+ run_base_dir,
292
+ )
293
+
294
+ # 3. Cluster query sequences
295
+ self._run_mmseqs_command(
296
+ [
297
+ "mmseqs",
298
+ "cluster",
299
+ str(query_db),
300
+ str(query_db_cluster),
301
+ str(
302
+ mmseqs_temp_dir / "tmp_cluster"
303
+ ), # MMseqs needs a temp dir for cluster
304
+ "--min-seq-id",
305
+ self.mmseqs_args["min_seq_id_cluster"],
306
+ "--threads",
307
+ self.num_threads,
308
+ ],
309
+ "Cluster query sequences",
310
+ run_base_dir,
311
+ )
312
+
313
+ # 4. Create representative set from query clusters
314
+ self._run_mmseqs_command(
315
+ [
316
+ "mmseqs",
317
+ "createsubdb",
318
+ str(query_db_cluster),
319
+ str(query_db),
320
+ str(query_db_rep),
321
+ ],
322
+ "Create representative query set",
323
+ run_base_dir,
324
+ )
325
+
326
+ # 5. Create MSA for profile generation
327
+ self._run_mmseqs_command(
328
+ [
329
+ "mmseqs",
330
+ "search",
331
+ str(query_db_rep),
332
+ str(query_db), # Search representative against full query DB
333
+ str(aln_db),
334
+ str(mmseqs_temp_dir / "tmp_search_msa"), # Temp for this search
335
+ "--max-seqs",
336
+ self.mmseqs_args["max_seqs_profile_msa"],
337
+ "--threads",
338
+ self.num_threads,
339
+ ],
340
+ "Create MSA for profile",
341
+ run_base_dir,
342
+ )
343
+
344
+ # 6. Create profile database
345
+ self._run_mmseqs_command(
346
+ [
347
+ "mmseqs",
348
+ "result2profile",
349
+ str(query_db_rep), # Use query_db_rep as input for profile
350
+ str(query_db), # Full query DB as second arg
351
+ str(aln_db),
352
+ str(profile_db),
353
+ "--threads", # Added threads option
354
+ self.num_threads,
355
+ ],
356
+ "Create profile DB",
357
+ run_base_dir,
358
+ )
359
+
360
+ # 7. Perform profile search
361
+ self._run_mmseqs_command(
362
+ [
363
+ "mmseqs",
364
+ "search",
365
+ str(profile_db),
366
+ str(target_db),
367
+ str(result_db),
368
+ str(mmseqs_temp_dir / "tmp_search_profile"), # Temp for this search
369
+ "--split-memory-limit",
370
+ f"{self.mmseqs_args['memory_limit_gb']}G",
371
+ "-e",
372
+ self.mmseqs_args["evalue"],
373
+ "--max-seqs",
374
+ self.mmseqs_args["max_seqs_search"],
375
+ "--threads",
376
+ self.num_threads,
377
+ "-s",
378
+ self.mmseqs_args["sensitivity"],
379
+ ],
380
+ "Perform profile search",
381
+ run_base_dir,
382
+ )
383
+
384
+ # 8. Convert results to tabular format (M8)
385
+ self._run_mmseqs_command(
386
+ [
387
+ "mmseqs",
388
+ "convertalis",
389
+ str(profile_db), # Query DB used for search (profileDB)
390
+ str(target_db),
391
+ str(result_db),
392
+ str(local_results_m8_file), # Output M8 file
393
+ "--threads",
394
+ self.num_threads,
395
+ ],
396
+ "Convert results to M8",
397
+ run_base_dir,
398
+ )
399
+
400
+ # 9. Create subdatabase of hits from original target_db
401
+ self._run_mmseqs_command(
402
+ ["mmseqs", "createsubdb", str(result_db), str(target_db), str(hits_db)],
403
+ "Create hits subDB from target_db",
404
+ run_base_dir,
405
+ )
406
+
407
+ # 10. Convert hit sequences to FASTA
408
+ self._run_mmseqs_command(
409
+ ["mmseqs", "convert2fasta", str(hits_db), str(local_hits_fasta_file)],
410
+ "Convert hits to FASTA",
411
+ run_base_dir,
412
+ )
413
+
414
+ logger.info(
415
+ f"MMseqs2 workflow completed successfully. Outputs in {run_base_dir}"
416
+ )
417
+
418
+ finally:
419
+ # Clean up the MMseqs2 temporary directory
420
+ if mmseqs_temp_dir.exists():
421
+ shutil.rmtree(mmseqs_temp_dir)
422
+ logger.info(
423
+ f"Cleaned up MMseqs2 temporary directory: {mmseqs_temp_dir}"
424
+ )
425
+ # The input_file (original target) is managed by the Operator
426
+ # The local_target_file (copy inside run_base_dir) will be cleaned up
427
+ # by the Operator when run_base_dir is deleted after upload.
428
+
429
+ return str(run_base_dir) # Return the path to the directory containing results
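A hedged usage sketch for MMSeqsProfileProcessor (editorial, not part of the diff). It assumes the mmseqs binary is on PATH; all paths below are placeholders.

processor = MMSeqsProfileProcessor(
    query_fasta_path_in_image="/app/data/query_enzymes.fasta",  # placeholder path
    num_threads=8,
    mmseqs_args={"evalue": "1e-5", "sensitivity": "7.5"},  # merged onto the defaults
)
# Returns a run directory containing results.m8 (tabular hits) and results.fasta
# (hit sequences pulled from the target database); intermediates are cleaned up.
output_dir = processor.run("target_proteins.fasta")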
@@ -264,65 +264,168 @@ def split_fasta(
264
264
  target_folder: str,
265
265
  base_name: str,
266
266
  sequences_per_file: int = 1000,
267
- max_files=None,
267
+ max_files: int | None = None,
268
+ show_progress: bool = True,
269
+ target_chunk_size_bytes: int | None = None,
268
270
  ) -> int:
269
- """Split a FASTA file into multiple smaller files within a target folder.
271
+ """Split a FASTA file into multiple smaller files within a target folder,
272
+ with an overall progress bar. Files can be split based on a target number
273
+ of sequences or an approximate target file size in bytes.
270
274
 
271
275
  Args:
272
276
  fasta_file (str): Path to the input FASTA file.
273
277
  target_folder (str): Path to the folder where output files will be saved.
274
278
  base_name (str): Used to make output filenames: e.g., {base_name}_1.fasta.
275
279
  sequences_per_file (int): Number of sequences per output file.
276
- max_files (int, optional): Maximum number of files to create. If None, all sequences are processed.
280
+ This is used if target_chunk_size_bytes is None.
281
+ max_files (int, optional): Maximum number of files to create.
282
+ If None, all sequences are processed.
283
+ show_progress (bool): If True, display a progress bar based on
284
+ file size processed. Defaults to True.
285
+ target_chunk_size_bytes (int, optional): Approximate target size for
286
+ each output file in bytes. If set, this takes precedence over
287
+ sequences_per_file. The actual file size may be slightly larger to
288
+ ensure full FASTA entries. Defaults to None.
289
+
290
+ Returns:
291
+ int: The number of output files created.
277
292
  """
278
293
  # Ensure the target folder exists
279
294
  os.makedirs(target_folder, exist_ok=True)
280
295
 
281
- # Initialize counters
282
- file_count = 1
283
- sequence_count = 0
284
-
285
- # Open the large FASTA file for reading
286
- with open(fasta_file, "r", buffering=1024 * 1024) as fasta:
287
- # Prepare the first output file
288
- output_file_path = os.path.join(
289
- target_folder, f"{base_name}_{file_count}.fasta"
296
+ # We create output files lazily (on first sequence) so we don't end up with
297
+ # spurious empty files. `files_created` tracks the number of *real* files
298
+ # present on disk when we finish.
299
+ files_created = 0
300
+ current_output_file_sequence_count = 0
301
+ current_output_file_bytes_written = 0
302
+ pbar: tqdm | None = None
303
+ output_file = None # Will be opened when we encounter the first header line
304
+ output_file_path = ""
305
+
306
+ if target_chunk_size_bytes is not None:
307
+ print(
308
+ f"Splitting by target chunk size: {target_chunk_size_bytes / (1024*1024):.2f} MB"
290
309
  )
291
- output_file = open(output_file_path, "w", buffering=1024 * 1024)
292
-
293
- for line in fasta:
294
- # Check if we've reached the maximum number of files, if specified
295
- if max_files is not None and file_count > max_files:
296
- break
310
+ else:
311
+ print(f"Splitting by sequences per file: {sequences_per_file}")
297
312
 
298
- # If line starts with ">", it's the beginning of a new sequence
299
- if line.startswith(">"):
300
- sequence_count += 1
313
+ try:
314
+ # Open the large FASTA file for reading
315
+ with open(fasta_file, "r", buffering=1024 * 1024) as fasta:
316
+ if show_progress:
317
+ total_size = os.path.getsize(fasta_file)
318
+ pbar = tqdm(
319
+ total=total_size,
320
+ unit="B",
321
+ unit_scale=True,
322
+ desc=f"Splitting {os.path.basename(fasta_file)}",
323
+ )
301
324
 
302
- # If we reached the limit, start a new file
303
- if sequence_count > sequences_per_file:
304
- # Close current file and open a new one
305
- output_file.close()
306
- print(f"File written: {output_file}")
307
- file_count += 1
308
- sequence_count = 1 # Reset sequence count for the new file
325
+ # We create output files on demand. The very first file is not
326
+ # opened until we see the first sequence header. This prevents
327
+ # an empty file from being created when the input FASTA is empty
328
+ # or when `max_files` is reached before any data are written.
329
+ def _open_new_output_file():
330
+ nonlocal output_file, output_file_path, files_created
309
331
 
310
- # Check again after incrementing file_count
311
- if max_files is not None and file_count > max_files:
312
- break
332
+ files_created += 1
333
+ output_file_path = os.path.join(
334
+ target_folder, f"{base_name}_{files_created}.fasta"
335
+ )
336
+ output_file = open(output_file_path, "w", buffering=1024 * 1024)
313
337
 
314
- output_file_path = os.path.join(
315
- target_folder, f"{base_name}_{file_count}.fasta"
338
+ # Helper for logging and closing the current file
339
+ def _close_current_output_file():
340
+ nonlocal output_file, current_output_file_sequence_count, current_output_file_bytes_written
341
+ if output_file and not output_file.closed:
342
+ output_file.close()
343
+ print(
344
+ f"File written: {output_file_path} "
345
+ f"(Sequences: {current_output_file_sequence_count}, "
346
+ f"Bytes: {current_output_file_bytes_written} / {(current_output_file_bytes_written / (1024*1024)):.2f} MB)"
316
347
  )
317
- output_file = open(output_file_path, "w", buffering=1024 * 1024)
318
-
319
- # Write the line to the current output file
320
- output_file.write(line)
321
348
 
322
- # Close the last output file
323
- output_file.close()
324
-
325
- return file_count
349
+ for line in fasta:
350
+ line_bytes = len(line.encode("utf-8"))
351
+ if pbar:
352
+ pbar.update(line_bytes)
353
+
354
+ # Note: we don't enforce `max_files` here; we enforce it only when we
355
+ # are about to create *another* file (see logic further below). This
356
+ # ensures we finish writing the current file before stopping.
357
+
358
+ # If line starts with ">", it's the beginning of a new sequence
359
+ if line.startswith(">"):
360
+ # Decide whether we need to roll over to a new output file.
361
+ needs_new_file = False # reset each time we encounter a header
362
+
363
+ if (
364
+ output_file is not None
365
+ and current_output_file_sequence_count > 0
366
+ ):
367
+ if target_chunk_size_bytes is not None:
368
+ # Size-based splitting takes precedence over sequence count.
369
+ if (
370
+ current_output_file_bytes_written
371
+ >= target_chunk_size_bytes
372
+ ):
373
+ needs_new_file = True
374
+ else:
375
+ # Fallback to sequence-count based splitting.
376
+ if current_output_file_sequence_count >= sequences_per_file:
377
+ needs_new_file = True
378
+
379
+ if needs_new_file:
380
+ _close_current_output_file()
381
+
382
+ # Respect `max_files`: do not create another file if limit reached
383
+ if max_files is not None and files_created >= max_files:
384
+ break
385
+
386
+ _open_new_output_file()
387
+ current_output_file_sequence_count = 0
388
+ current_output_file_bytes_written = 0
389
+
390
+ # Opening first file if not already open
391
+ if output_file is None:
392
+ _open_new_output_file()
393
+
394
+ current_output_file_sequence_count += 1
395
+
396
+ # Write the line to the current output file (which should now exist)
397
+ if output_file is not None:
398
+ output_file.write(line)
399
+ current_output_file_bytes_written += line_bytes
400
+
401
+ # After loop, ensure the last file is handled
402
+ _close_current_output_file()
403
+
404
+ finally:
405
+ if pbar:
406
+ pbar.close()
407
+ # Ensure the file is closed in case of an exception before the natural end
408
+ if output_file and not output_file.closed:
409
+ output_file.close()
410
+ # It's hard to know the state to print a meaningful message here if an exception occurred mid-file.
411
+ # The primary 'File written' messages are handled within the loop and at the end of normal processing.
412
+
413
+ # Note: output files are opened lazily (only when a sequence header is about
+ # to be written), so `files_created` matches the number of real files on disk
+ # and a trailing empty file should not occur in normal operation. The check
+ # below is kept as a harmless no-op guard for that edge case.
+ if os.path.exists(output_file_path) and os.path.getsize(output_file_path) == 0:
+ pass
427
+
428
+ return files_created
326
429
 
327
430
 
328
431
  def subtract_fasta_files(file1: str, file2: str, output_file: str):
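A hedged usage sketch of split_fasta's new size-based mode, as added in the hunk above (editorial; filenames are placeholders).

from dayhoff_tools.fasta import split_fasta

# When target_chunk_size_bytes is set it takes precedence over sequences_per_file;
# each shard may run slightly past the target so FASTA entries are never split.
n_files = split_fasta(
    fasta_file="uniref50.fasta",
    target_folder="shards",
    base_name="uniref50_part",
    target_chunk_size_bytes=100 * 1024 * 1024,  # ~100 MB per shard
)
print(f"Wrote {n_files} shard(s) to shards/")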
@@ -0,0 +1,269 @@
1
+ import collections.abc
2
+ import csv
3
+ import gzip
4
+ import pathlib
5
+ import re
6
+
7
+ from tqdm import tqdm
8
+
9
+ _ACCESSION_REGEX = re.compile(r"(GC[AF]_[0-9]+\.[0-9]+)")
10
+
11
+
12
+ def _extract_accession_from_filename(filename_str: str) -> str:
13
+ """
14
+ Extracts the genome assembly accession (e.g., GCA_XXXXXXXXX.X or GCF_XXXXXXXXX.X)
15
+ from a filename.
16
+
17
+ Args:
18
+ filename_str (str): The filename string.
19
+
20
+ Returns:
21
+ str: The extracted accession or "UNKNOWN_ACCESSION" if not found.
22
+ """
23
+ match = _ACCESSION_REGEX.search(filename_str)
24
+ if match:
25
+ return match.group(1)
26
+ return "UNKNOWN_ACCESSION"
27
+
28
+
29
+ def process_gtdb_files_to_fasta(
30
+ gtdb_top_folder: str,
31
+ output_fasta_path: str,
32
+ chunk_size: int = 10000,
33
+ ) -> None:
34
+ """
35
+ Processes a top-level GTDB folder containing gzipped FASTA files (.faa.gz)
36
+ and combines all protein sequences into a single FASTA file.
37
+
38
+ Output is written in chunks for efficiency with large datasets.
39
+ A progress bar is displayed during processing.
40
+
41
+ Args:
42
+ gtdb_top_folder (str): Path to the top-level GTDB directory.
43
+ output_fasta_path (str): Path to write the combined FASTA file.
44
+ chunk_size (int, optional): Number of sequences to process before
45
+ writing a chunk to the output file. Defaults to 10000.
46
+ """
47
+ gtdb_path = pathlib.Path(gtdb_top_folder)
48
+ faa_files = list(gtdb_path.rglob("*.faa.gz"))
49
+
50
+ if not faa_files:
51
+ print(f"No .faa.gz files found in {gtdb_top_folder}")
52
+ return
53
+
54
+ fasta_entries_chunk = []
55
+ sequences_in_current_chunk = 0
56
+
57
+ with open(output_fasta_path, "w") as fasta_out_file:
58
+ current_header_id = None
59
+ current_sequence_lines = []
60
+
61
+ for faa_file_path in tqdm(faa_files, desc="Processing GTDB files to FASTA"):
62
+ try:
63
+ with gzip.open(faa_file_path, "rt") as gz_file:
64
+ for line_content in gz_file:
65
+ line = line_content.strip()
66
+ if not line: # Skip empty lines
67
+ continue
68
+ if line.startswith(">"):
69
+ if current_header_id and current_sequence_lines:
70
+ sequence_string = "".join(current_sequence_lines)
71
+ fasta_entries_chunk.append(
72
+ f">{current_header_id}\n{sequence_string}\n"
73
+ )
74
+ sequences_in_current_chunk += 1
75
+
76
+ # Parse new header
77
+ header_content = line[1:]
78
+ parts = header_content.split(None, 1)
79
+ current_header_id = parts[0]
80
+ current_sequence_lines = []
81
+
82
+ if sequences_in_current_chunk >= chunk_size:
83
+ if fasta_entries_chunk:
84
+ fasta_out_file.write("".join(fasta_entries_chunk))
85
+ fasta_entries_chunk = []
86
+ sequences_in_current_chunk = 0
87
+ else:
88
+ if current_header_id:
89
+ current_sequence_lines.append(line)
90
+
91
+ if current_header_id and current_sequence_lines:
92
+ sequence_string = "".join(current_sequence_lines)
93
+ fasta_entries_chunk.append(
94
+ f">{current_header_id}\n{sequence_string}\n"
95
+ )
96
+ sequences_in_current_chunk += 1
97
+
98
+ # Reset state for the next file to ensure clean parsing start for that file
99
+ current_header_id = None
100
+ current_sequence_lines = []
101
+
102
+ except gzip.BadGzipFile:
103
+ tqdm.write(
104
+ f"Warning: Skipping corrupted or non-gzipped file: {faa_file_path}"
105
+ )
106
+ current_header_id = None
107
+ current_sequence_lines = []
108
+ except Exception as e:
109
+ tqdm.write(f"Warning: Error processing file {faa_file_path}: {e}")
110
+ current_header_id = None
111
+ current_sequence_lines = []
112
+
113
+ if fasta_entries_chunk:
114
+ fasta_out_file.write("".join(fasta_entries_chunk))
115
+
116
+ print(f"Processing complete. Output FASTA file created: {output_fasta_path}")
117
+
118
+
119
+ def process_gtdb_files_to_csv(
120
+ gtdb_top_folder: str,
121
+ output_csv_path: str,
122
+ chunk_size: int = 10000,
123
+ ) -> None:
124
+ """
125
+ Processes a top-level GTDB folder containing gzipped FASTA files (.faa.gz)
126
+ and creates a CSV file with detailed information for each sequence entry.
127
+
128
+ The CSV includes the genome assembly accession, original FASTA header ID,
129
+ and header description for each entry. Output is written in chunks for
130
+ efficiency with large datasets. A progress bar is displayed during processing.
131
+
132
+ Args:
133
+ gtdb_top_folder (str): Path to the top-level GTDB directory.
134
+ output_csv_path (str): Path to write the CSV file.
135
+ chunk_size (int, optional): Number of sequences to process before
136
+ writing a chunk to the output file. Defaults to 10000.
137
+ """
138
+ gtdb_path = pathlib.Path(gtdb_top_folder)
139
+ faa_files = list(gtdb_path.rglob("*.faa.gz"))
140
+
141
+ if not faa_files:
142
+ print(f"No .faa.gz files found in {gtdb_top_folder}")
143
+ return
144
+
145
+ def _serial_iter(paths):
146
+ """Yield the same structure as the parallel branch but serially."""
147
+ for p in paths:
148
+ row_generator_for_file, file_warnings = _csv_rows_from_single_faa(str(p))
149
+ yield row_generator_for_file, file_warnings
150
+
151
+ # Open output CSV for streaming writes.
152
+ with open(output_csv_path, "w", newline="") as csv_out_file:
153
+ csv_writer = csv.writer(csv_out_file)
154
+ csv_writer.writerow(
155
+ [
156
+ "genome_assembly_accession",
157
+ "original_fasta_header_id",
158
+ "original_fasta_header_description",
159
+ ]
160
+ )
161
+
162
+ rows_buffer: list[list[str]] = []
163
+
164
+ # Build the per-file result iterator (only the serial path exists in this version).
165
+ result_iter = _serial_iter(faa_files)
166
+ progress_iter = tqdm(
167
+ result_iter, total=len(faa_files), desc="Processing GTDB files to CSV"
168
+ )
169
+
170
+ # Consume iterator and stream rows to disk in chunks.
171
+ for row_generator_for_file, file_warnings in progress_iter:
172
+ # Add rows to buffer and flush in chunk-size batches.
173
+ # This will consume the generator, and in doing so, populate file_warnings if errors occur.
174
+ for r in row_generator_for_file:
175
+ rows_buffer.append(r)
176
+ if len(rows_buffer) >= chunk_size:
177
+ csv_writer.writerows(rows_buffer)
178
+ rows_buffer.clear()
179
+
180
+ # Now that the generator for the file has been processed (or attempted),
181
+ # emit any warnings that were collected for this specific file.
182
+ for w in file_warnings:
183
+ tqdm.write(w)
184
+
185
+ # Flush remaining rows.
186
+ if rows_buffer:
187
+ csv_writer.writerows(rows_buffer)
188
+
189
+ print(f"Processing complete. Output CSV file created: {output_csv_path}")
190
+
191
+
192
+ # ---------------------------------------------------------------------------
193
+ # Helper functions (private)
194
+ # ---------------------------------------------------------------------------
195
+
196
+
197
+ def _csv_rows_from_single_faa(
198
+ faa_file_path: str,
199
+ ) -> tuple[collections.abc.Iterable[list[str]], list[str]]:
200
+ """Parse a single gzipped FASTA (`.faa.gz`) file into CSV rows.
201
+
202
+ Parameters
203
+ ----------
204
+ faa_file_path
205
+ Path (as ``str``) to the ``.faa.gz`` file.
206
+
207
+ Returns
208
+ -------
209
+ tuple[collections.abc.Iterable[list[str]], list[str]]
210
+ * First element – an iterable (generator) of CSV rows ``[accession, header_id, description]``.
211
+ * Second element – list of warning strings produced while processing
212
+ the file. The caller is responsible for emitting them.
213
+ """
214
+ warnings: list[str] = [] # Outer scope warnings list
215
+ faa_path = pathlib.Path(faa_file_path)
216
+ current_file_accession = _extract_accession_from_filename(faa_path.name)
217
+
218
+ def _generate_rows_iter_inner() -> (
219
+ collections.abc.Iterable[list[str]]
220
+ ): # Renamed for clarity
221
+ # Local parsing state for the generator
222
+ current_header_id_gen = None
223
+ current_header_desc_gen = ""
224
+ has_sequence_lines_gen = False
225
+
226
+ try:
227
+ with gzip.open(faa_file_path, "rt") as gz_file:
228
+ for line_content in gz_file:
229
+ line = line_content.strip()
230
+ if not line:
231
+ continue
232
+ if line.startswith(">"):
233
+ if current_header_id_gen and has_sequence_lines_gen:
234
+ yield [
235
+ current_file_accession,
236
+ current_header_id_gen,
237
+ current_header_desc_gen,
238
+ ]
239
+
240
+ header_content = line[1:]
241
+ parts = header_content.split(None, 1)
242
+ current_header_id_gen = parts[0]
243
+ current_header_desc_gen = parts[1] if len(parts) > 1 else ""
244
+ has_sequence_lines_gen = False
245
+ else:
246
+ if current_header_id_gen:
247
+ has_sequence_lines_gen = True
248
+
249
+ # Add final entry if the file ended after sequence lines.
250
+ if current_header_id_gen and has_sequence_lines_gen:
251
+ yield [
252
+ current_file_accession,
253
+ current_header_id_gen,
254
+ current_header_desc_gen,
255
+ ]
256
+ except gzip.BadGzipFile:
257
+ # Exception handled inside the generator.
258
+ # Append to the outer warnings list and terminate generator.
259
+ warnings.append(
260
+ f"Warning: Skipping corrupted or non-gzipped file: {faa_file_path}"
261
+ )
262
+ return # Stop generation
263
+ except Exception as exc:
264
+ warnings.append(f"Warning: Error processing file {faa_file_path}: {exc}")
265
+ return # Stop generation
266
+
267
+ # Directly return the generator instance and the warnings list.
268
+ # The warnings list will be populated by the generator if errors occur during its execution.
269
+ return _generate_rows_iter_inner(), warnings
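A hedged usage sketch for the new GTDB intake helpers (editorial, not part of the diff; directory and file names are placeholders).

from dayhoff_tools.intake.gtdb import (
    process_gtdb_files_to_csv,
    process_gtdb_files_to_fasta,
)

# Recursively collects every *.faa.gz under the folder into one combined FASTA.
process_gtdb_files_to_fasta("gtdb/protein_faa_reps", "gtdb_all_proteins.fasta")

# Same walk, but writes one CSV row per sequence: the assembly accession parsed
# from the filename (e.g. GCF_000005845.2), the header ID, and the description.
process_gtdb_files_to_csv("gtdb/protein_faa_reps", "gtdb_protein_index.csv")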
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "dayhoff-tools"
-version = "1.1.4"
+version = "1.1.6"
 description = "Common tools for all the repos at Dayhoff Labs"
 authors = [
     {name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}
1
- import logging
2
- import os
3
- import subprocess
4
- import shlex
5
- from abc import ABC, abstractmethod
6
-
7
- logger = logging.getLogger(__name__)
8
-
9
-
10
- class Processor(ABC):
11
- """Processes data locally. Abstract class for specific calculations.
12
- Takes in a single file and produces a single file or folder of outputs."""
13
-
14
- @abstractmethod
15
- def run(self, input_file: str) -> str:
16
- """Do the calculation, including reading from input_file
17
- and writing to output_file"""
18
- output_path = "output_file"
19
-
20
- return output_path
21
-
22
-
23
- class BoltzPredictor(Processor):
24
- """Processor for running Boltz docking predictions.
25
-
26
- This class wraps the Boltz docking tool to predict protein structures
27
- from sequence data.
28
- """
29
-
30
- def __init__(self, num_workers: int, boltz_options: str | None = None):
31
- """Initialize the BoltzPredictor.
32
-
33
- Args:
34
- num_workers: Number of worker threads to use as a default.
35
- This can be overridden if --num_workers is present
36
- in boltz_options.
37
- boltz_options: A string containing additional command-line options
38
- to pass to the Boltz predictor. Options should be
39
- space-separated (e.g., "--option1 value1 --option2").
40
- """
41
- self.num_workers = num_workers
42
- self.boltz_options = boltz_options
43
-
44
- def run(self, input_file: str) -> str:
45
- """Run Boltz prediction on the input file.
46
-
47
- Constructs the command using the input file, default number of workers,
48
- and any additional options provided via `boltz_options`. If `--num_workers`
49
- is specified in `boltz_options`, it overrides the default `num_workers`.
50
-
51
- Args:
52
- input_file: Path to the input file containing sequences
53
-
54
- Returns:
55
- Path to the output directory created by Boltz
56
-
57
- Raises:
58
- subprocess.CalledProcessError: If Boltz prediction fails
59
- """
60
- # Determine expected output directory name
61
- input_base = os.path.splitext(os.path.basename(input_file))[0]
62
- expected_output_dir = f"boltz_results_{input_base}"
63
- logger.info(f"Expected output directory: {expected_output_dir}")
64
-
65
- # Start building the command
66
- cmd = ["boltz", "predict", input_file]
67
-
68
- # Parse additional options if provided
69
- additional_args = []
70
- num_workers_in_opts = False
71
- if self.boltz_options:
72
- try:
73
- parsed_opts = shlex.split(self.boltz_options)
74
- additional_args.extend(parsed_opts)
75
- if "--num_workers" in parsed_opts:
76
- num_workers_in_opts = True
77
- logger.info(
78
- f"Using --num_workers from BOLTZ_OPTIONS: {self.boltz_options}"
79
- )
80
- except ValueError as e:
81
- logger.error(f"Error parsing BOLTZ_OPTIONS '{self.boltz_options}': {e}")
82
- # Decide if we should raise an error or proceed without options
83
- # For now, proceed without the additional options
84
- additional_args = [] # Clear potentially partially parsed args
85
-
86
- # Add num_workers if not specified in options
87
- if not num_workers_in_opts:
88
- logger.info(f"Using default num_workers: {self.num_workers}")
89
- cmd.extend(["--num_workers", str(self.num_workers)])
90
-
91
- # Add the parsed additional arguments
92
- cmd.extend(additional_args)
93
-
94
- # Log the final command
95
- # Use shlex.join for safer command logging, especially if paths/args have spaces
96
- try:
97
- safe_cmd_str = shlex.join(cmd)
98
- logger.info(f"Running command: {safe_cmd_str}")
99
- except AttributeError: # shlex.join is Python 3.8+
100
- logger.info(f"Running command: {' '.join(cmd)}")
101
-
102
- # Stream output in real-time
103
- process = subprocess.Popen(
104
- cmd,
105
- stdout=subprocess.PIPE,
106
- stderr=subprocess.STDOUT,
107
- text=True,
108
- bufsize=1,
109
- )
110
-
111
- stdout = process.stdout
112
- if stdout:
113
- for line in iter(stdout.readline, ""):
114
- logger.info(f"BOLTZ: {line.rstrip()}")
115
-
116
- # Wait for process to complete
117
- return_code = process.wait()
118
- if return_code != 0:
119
- logger.error(f"Boltz prediction failed with exit code {return_code}")
120
- raise subprocess.CalledProcessError(return_code, cmd)
121
-
122
- logger.info(
123
- f"Boltz prediction completed successfully. Output in {expected_output_dir}"
124
- )
125
- return expected_output_dir