dayhoff-tools 1.15.0__tar.gz → 1.15.1__tar.gz

This diff compares the contents of two package versions as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
Files changed (76)
  1. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/PKG-INFO +2 -2
  2. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/batch/workers/boltz.py +5 -5
  3. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/cancel.py +6 -2
  4. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/clean.py +6 -2
  5. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/embed_t5.py +0 -1
  6. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/finalize.py +39 -20
  7. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/status.py +9 -3
  8. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/submit.py +0 -1
  9. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/utility_commands.py +0 -2
  10. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/embedders.py +2 -2
  11. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/fasta.py +2 -4
  12. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/intake/kegg.py +1 -3
  13. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/intake/structure.py +4 -4
  14. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/pyproject.toml +2 -2
  15. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/README.md +0 -0
  16. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/__init__.py +0 -0
  17. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/batch/__init__.py +0 -0
  18. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/batch/workers/__init__.py +0 -0
  19. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/batch/workers/base.py +0 -0
  20. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/batch/workers/embed_t5.py +0 -0
  21. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/chemistry/standardizer.py +0 -0
  22. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/chemistry/utils.py +0 -0
  23. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/__init__.py +0 -0
  24. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/__init__.py +0 -0
  25. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/aws_batch.py +0 -0
  26. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/__init__.py +0 -0
  27. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/boltz.py +0 -0
  28. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/list_jobs.py +0 -0
  29. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/local.py +0 -0
  30. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/logs.py +0 -0
  31. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/commands/retry.py +0 -0
  32. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/job_id.py +0 -0
  33. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/batch/manifest.py +0 -0
  34. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/cloud_commands.py +0 -0
  35. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/__init__.py +0 -0
  36. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/engine_core.py +0 -0
  37. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/engine_lifecycle.py +0 -0
  38. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/engine_maintenance.py +0 -0
  39. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/engine_management.py +0 -0
  40. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/shared.py +0 -0
  41. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engine1/studio_commands.py +0 -0
  42. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/__init__.py +0 -0
  43. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/api_client.py +0 -0
  44. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/auth.py +0 -0
  45. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/engine-studio-cli.md +0 -0
  46. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/engine_commands.py +0 -0
  47. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/progress.py +0 -0
  48. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +0 -0
  49. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/demo.sh +0 -0
  50. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +0 -0
  51. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +0 -0
  52. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/idle_status_simulator.py +0 -0
  53. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +0 -0
  54. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +0 -0
  55. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +0 -0
  56. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/ssh_config.py +0 -0
  57. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/engines_studios/studio_commands.py +0 -0
  58. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/github_commands.py +0 -0
  59. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/main.py +0 -0
  60. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/cli/swarm_commands.py +0 -0
  61. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/base.py +0 -0
  62. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/deploy_aws.py +0 -0
  63. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
  64. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/deploy_utils.py +0 -0
  65. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/job_runner.py +0 -0
  66. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/processors.py +0 -0
  67. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/deployment/swarm.py +0 -0
  68. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/file_ops.py +0 -0
  69. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/h5.py +0 -0
  70. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/intake/gcp.py +0 -0
  71. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/intake/gtdb.py +0 -0
  72. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/intake/mmseqs.py +0 -0
  73. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/intake/uniprot.py +0 -0
  74. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/logs.py +0 -0
  75. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/sqlite.py +0 -0
  76. {dayhoff_tools-1.15.0 → dayhoff_tools-1.15.1}/dayhoff_tools/warehouse.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dayhoff-tools
- Version: 1.15.0
+ Version: 1.15.1
  Summary: Common tools for all the repos at Dayhoff Labs
  Author: Daniel Martin-Alarcon
  Author-email: dma@dayhofflabs.com
@@ -40,8 +40,8 @@ Requires-Dist: sqlalchemy (>=2.0.40,<3.0.0) ; extra == "full"
  Requires-Dist: toml (>=0.10)
  Requires-Dist: tqdm (>=4.67.1) ; extra == "embedders"
  Requires-Dist: tqdm (>=4.67.1) ; extra == "full"
- Requires-Dist: transformers (==4.36.2) ; extra == "full"
  Requires-Dist: transformers (>=4.36.2) ; extra == "embedders"
+ Requires-Dist: transformers (>=4.36.2) ; extra == "full"
  Requires-Dist: typer (>=0.9.0)
  Requires-Dist: tzdata (>=2025.2)
  Description-Content-Type: text/markdown
dayhoff_tools/batch/workers/boltz.py
@@ -235,7 +235,7 @@ class BoltzProcessor:
          # Determine output directory
          # Boltz always creates boltz_results_{input_name} inside --out_dir
          input_base = os.path.splitext(os.path.basename(input_file))[0]
-
+
          if output_dir is None:
              # No output_dir specified, boltz creates in current directory
              expected_output_dir = f"boltz_results_{input_base}"
@@ -244,7 +244,9 @@ class BoltzProcessor:
              # output_dir specified - use its parent for --out_dir
              # and expect boltz_results_{input_base} inside it
              parent_dir = os.path.dirname(output_dir)
-             expected_output_dir = os.path.join(parent_dir, f"boltz_results_{input_base}")
+             expected_output_dir = os.path.join(
+                 parent_dir, f"boltz_results_{input_base}"
+             )
              out_dir_arg = parent_dir if parent_dir else None

          logger.info(f"Running Boltz prediction for {input_file}")
@@ -455,9 +457,7 @@ def main():
              completed += 1
              continue

-         logger.info(
-             f"[{file_idx + 1}/{len(my_files)}] Processing {file_stem}..."
-         )
+         logger.info(f"[{file_idx + 1}/{len(my_files)}] Processing {file_stem}...")

          try:
              # Determine output directory
dayhoff_tools/cli/batch/commands/cancel.py
@@ -152,9 +152,13 @@ def _cancel_retry_job(manifest, retry_id: str, force: bool, base_path: str):
          )

          click.echo()
-         click.echo(click.style(f"✓ Retry job {retry_id} cancelled successfully", fg="green"))
+         click.echo(
+             click.style(f"✓ Retry job {retry_id} cancelled successfully", fg="green")
+         )
          click.echo(f"Parent job: {manifest.job_id}")

      except BatchError as e:
-         click.echo(click.style(f"✗ Failed to cancel retry job: {e}", fg="red"), err=True)
+         click.echo(
+             click.style(f"✗ Failed to cancel retry job: {e}", fg="red"), err=True
+         )
          raise SystemExit(1)
dayhoff_tools/cli/batch/commands/clean.py
@@ -20,7 +20,9 @@ from .status import format_time_ago, _aws_status_to_job_status
      default=7,
      help="Only clean jobs older than N days [default: 7]",
  )
- @click.option("--dry-run", is_flag=True, help="Show what would be cleaned without deleting")
+ @click.option(
+     "--dry-run", is_flag=True, help="Show what would be cleaned without deleting"
+ )
  @click.option("--force", is_flag=True, help="Delete without confirmation")
  @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
  def clean(user, older_than, dry_run, force, base_path):
@@ -81,7 +83,9 @@ def clean(user, older_than, dry_run, force, base_path):
          live_statuses = client.get_job_statuses_batch(batch_job_ids)
      except BatchError as e:
          click.echo(f"Error: Could not fetch status from AWS Batch: {e}", err=True)
-         click.echo("Cannot safely clean jobs without knowing their status.", err=True)
+         click.echo(
+             "Cannot safely clean jobs without knowing their status.", err=True
+         )
          raise SystemExit(1)

      # Find jobs that are safe to clean (SUCCEEDED or FAILED)
dayhoff_tools/cli/batch/commands/embed_t5.py
@@ -21,7 +21,6 @@ from ..manifest import (
      save_manifest,
  )

-
  # Default settings for T5 embedding
  DEFAULT_QUEUE = "t4-1x-spot"
  DEFAULT_WORKERS = 50
dayhoff_tools/cli/batch/commands/finalize.py
@@ -40,7 +40,9 @@ from ..manifest import (
      help="Skip deduplication step (use if input has no duplicates)",
  )
  @click.option("--base-path", default=BATCH_JOBS_BASE, help="Base path for job data")
- def finalize(job_id, output, force, keep_intermediates, full_output, skip_dedup, base_path):
+ def finalize(
+     job_id, output, force, keep_intermediates, full_output, skip_dedup, base_path
+ ):
      """Combine results and clean up job intermediates.

      For embedding jobs, combines H5 files into a single output file.
@@ -238,14 +240,18 @@ def _finalize_embeddings(output_dir: Path, output_path: Path, skip_dedup: bool = False):
          if skip_dedup:
              # Skip dedup - optimize directly from combined
              click.echo("Optimizing chunks...")
-             optimize_protein_embedding_chunks(str(combined_path), str(output_path))
+             optimize_protein_embedding_chunks(
+                 str(combined_path), str(output_path)
+             )
          else:
              # Full pipeline: combine -> dedup -> optimize
              deduped_path = Path(tmpdir) / "deduped.h5"
              click.echo("Deduplicating...")
              deduplicate_h5_file(str(combined_path), str(deduped_path))
              click.echo("Optimizing chunks...")
-             optimize_protein_embedding_chunks(str(deduped_path), str(output_path))
+             optimize_protein_embedding_chunks(
+                 str(deduped_path), str(output_path)
+             )

      click.echo(click.style("✓ H5 files combined successfully", fg="green"))

@@ -269,7 +275,7 @@ def _finalize_embeddings(output_dir: Path, output_path: Path, skip_dedup: bool = False):

  def _finalize_boltz(output_dir: Path, output_path: Path, full_output: bool = False):
      """Move Boltz output to destination.
-
+
      Args:
          output_dir: Source directory containing boltz_results_* folders
          output_path: Destination directory for outputs
@@ -277,20 +283,24 @@ def _finalize_boltz(output_dir: Path, output_path: Path, full_output: bool = False):
      extract only essential files (CIF structures and confidence JSON).
      """
      # Find all output directories (one per complex)
-     complex_dirs = [d for d in output_dir.iterdir() if d.is_dir() and d.name.startswith("boltz_results_")]
+     complex_dirs = [
+         d
+         for d in output_dir.iterdir()
+         if d.is_dir() and d.name.startswith("boltz_results_")
+     ]

      if not complex_dirs:
          click.echo("No output directories found.", err=True)
          raise SystemExit(1)

      click.echo(f"Found {len(complex_dirs)} structure predictions")
-
+
      if full_output:
          click.echo("Mode: Copying full output (all files)")
      else:
          click.echo("Mode: Extracting essential files only (CIF + confidence JSON)")
          click.echo(" Use --full-output to copy all files")
-
+
      # Confirm before proceeding
      click.echo()
      if not click.confirm(f"Copy results to {output_path}?"):
@@ -302,16 +312,16 @@ def _finalize_boltz(output_dir: Path, output_path: Path, full_output: bool = False):

      copied_count = 0
      skipped_count = 0
-
+
      for complex_dir in complex_dirs:
          complex_name = complex_dir.name.replace("boltz_results_", "")
          dest = output_path / complex_name
-
+
          if dest.exists():
              click.echo(f" Skipping {complex_name} (already exists)")
              skipped_count += 1
              continue
-
+
          if full_output:
              # Copy entire directory
              shutil.copytree(complex_dir, dest)
@@ -320,44 +330,53 @@ def _finalize_boltz(output_dir: Path, output_path: Path, full_output: bool = False):
              # Extract only essential files
              _extract_essential_boltz_files(complex_dir, dest, complex_name)
              click.echo(f" Extracted {complex_name} (essential files)")
-
+
          copied_count += 1

      click.echo()
      if skipped_count > 0:
-         click.echo(f"Copied {copied_count} predictions, skipped {skipped_count} existing")
+         click.echo(
+             f"Copied {copied_count} predictions, skipped {skipped_count} existing"
+         )
      else:
-         click.echo(click.style(f"✓ Copied {copied_count} structure predictions successfully", fg="green"))
+         click.echo(
+             click.style(
+                 f"✓ Copied {copied_count} structure predictions successfully",
+                 fg="green",
+             )
+         )


  def _extract_essential_boltz_files(source_dir: Path, dest_dir: Path, complex_name: str):
      """Extract only essential files from Boltz output.
-
+
      Essential files are:
      - predictions/*/*.cif (structure files)
      - predictions/*/confidence_*.json (confidence metrics)
-
+
      Args:
          source_dir: Source boltz_results_* directory
          dest_dir: Destination directory to create
          complex_name: Name of the complex (for better error messages)
      """
      dest_dir.mkdir(parents=True, exist_ok=True)
-
+
      predictions_dir = source_dir / "predictions"
      if not predictions_dir.exists():
-         click.echo(f" Warning: No predictions directory found for {complex_name}", err=True)
+         click.echo(
+             f" Warning: No predictions directory found for {complex_name}", err=True
+         )
          return
-
+
      # Find all subdirectories in predictions/ (usually just one named after the complex)
      for pred_subdir in predictions_dir.iterdir():
          if not pred_subdir.is_dir():
              continue
-
+
          # Copy CIF files (structures)
          for cif_file in pred_subdir.glob("*.cif"):
              shutil.copy2(cif_file, dest_dir / cif_file.name)
-
+
          # Copy confidence JSON files
          for json_file in pred_subdir.glob("confidence_*.json"):
              shutil.copy2(json_file, dest_dir / json_file.name)
dayhoff_tools/cli/batch/commands/status.py
@@ -264,7 +264,9 @@ def _show_job_details(job_id: str, base_path: str):
          reslice_info = ""
          if retry.reslice_prefix:
              reslice_info = f" (resliced to {retry.reslice_count} chunks)"
-         click.echo(f" - {retry.retry_id}: {len(retry.indices)} indices{reslice_info}")
+         click.echo(
+             f" - {retry.retry_id}: {len(retry.indices)} indices{reslice_info}"
+         )
          click.echo(f" Indices: {retry.indices}")
          if retry.batch_job_id:
              # Show brief status for retry job
@@ -273,7 +275,9 @@ def _show_job_details(job_id: str, base_path: str):
              array_status = client.get_array_job_status(retry.batch_job_id)
              if array_status.is_complete:
                  pct = array_status.success_rate * 100
-                 color = "green" if pct == 100 else "yellow" if pct > 90 else "red"
+                 color = (
+                     "green" if pct == 100 else "yellow" if pct > 90 else "red"
+                 )
                  click.echo(
                      f" Status: Complete - {click.style(f'{pct:.0f}%', fg=color)} "
                      f"({array_status.succeeded}/{array_status.total} succeeded)"
@@ -364,7 +368,9 @@ def _show_retry_details(manifest, retry_id: str):
      click.echo("Retry Config:")
      click.echo(f" Indices: {retry_info.indices}")
      if retry_info.reslice_prefix:
-         click.echo(f" Reslice: {retry_info.reslice_prefix} ({retry_info.reslice_count} chunks)")
+         click.echo(
+             f" Reslice: {retry_info.reslice_prefix} ({retry_info.reslice_count} chunks)"
+         )
      else:
          click.echo(f" Reslice: No (retrying original chunks)")

dayhoff_tools/cli/batch/commands/submit.py
@@ -18,7 +18,6 @@ from ..manifest import (
      save_manifest,
  )

-
  # Default job definition for generic jobs
  DEFAULT_JOB_DEFINITION = "dayhoff-batch-base"
  DEFAULT_QUEUE = "t4-1x-spot"
dayhoff_tools/cli/utility_commands.py
@@ -258,5 +258,3 @@ def build_and_upload_wheel(bump_part: str = "patch"):
              print(f"Warning: Could not find version {new_version} to revert.")
      except Exception as revert_e:
          print(f"Warning: Failed to revert version change: {revert_e}")
-
-
dayhoff_tools/embedders.py
@@ -179,8 +179,8 @@ class H5Reformatter(Processor):
      def embedding_file_to_df(self, file_name: str) -> pd.DataFrame:
          with h5py.File(file_name, "r") as f:
              gene_names = list(f.keys())
-             Xg = [f[key][()] for key in gene_names]  # type:ignore
-             return pd.DataFrame(np.asmatrix(Xg), index=gene_names)  # type:ignore
+             Xg = [f[key][()] for key in gene_names]  # type: ignore
+             return pd.DataFrame(np.asmatrix(Xg), index=gene_names)  # type: ignore

      def write_df_to_h5(self, df: pd.DataFrame, filename: str, description: str) -> None:
          """
dayhoff_tools/fasta.py
@@ -857,14 +857,12 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> None:
      # Create the SQLite database and table
      print("Creating SQLite database...")
      with sqlite3.connect(db_file) as conn:
-         conn.execute(
-             """
+         conn.execute("""
              CREATE TABLE IF NOT EXISTS proteins (
                  protein_id TEXT PRIMARY KEY,
                  sequence TEXT NOT NULL
              )
-             """
-         )
+         """)
      print("Database created successfully.")

      # Estimate number of records for progress bar
dayhoff_tools/intake/kegg.py
@@ -25,9 +25,7 @@ def get_ko2gene_df(db: str, ko: str | list[str] | None = None) -> pd.DataFrame:
              query = (
                  f"SELECT gene,ko FROM gene_to_ko WHERE ko IN ({','.join('?' * len(ko))})"
              )
-             result_df = pd.read_sql_query(
-                 query, conn, params=ko  # type:ignore
-             )
+             result_df = pd.read_sql_query(query, conn, params=ko)  # type: ignore
          else:
              query = f"SELECT gene,ko FROM gene_to_ko"
              result_df = pd.read_sql_query(query, conn)
dayhoff_tools/intake/structure.py
@@ -409,10 +409,10 @@ class PDBFolderProcessor:
      def _get_pdb_files(self) -> list[str]:
          """
          Get a list of PDB files in the specified directory, optionally filtered by ID set.
-         Files are sorted by creation time to ensure consistent processing order.
+         Files are sorted alphabetically to ensure consistent, reproducible processing order.

          Returns:
-             List of PDB file names sorted by creation time.
+             List of PDB file names sorted alphabetically.
          """
          print("Scanning directory for PDB files...")
          pdb_files = [
@@ -424,8 +424,8 @@ class PDBFolderProcessor:
              f for f in pdb_files if self._extract_id_from_filename(f) in self.id_set
          ]

-         # Sort files by creation time
-         pdb_files.sort(key=lambda f: os.path.getctime(os.path.join(self.pdb_dir, f)))
+         # Sort files alphabetically for deterministic, reproducible order
+         pdb_files.sort()

          print(f"Found {len(pdb_files)} PDB files")
          return pdb_files
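
Note on the structure.py change above: swapping os.path.getctime for a plain alphabetical sort is behavioral, not just cosmetic. Creation-time order depends on filesystem state, so the same folder copied to another machine could be processed in a different order. A minimal standalone sketch of the difference (hypothetical file names, not part of the package):

    import os
    import tempfile
    import time

    # Create three files in non-alphabetical order so ctime order and name order differ.
    with tempfile.TemporaryDirectory() as d:
        for name in ["b.pdb", "a.pdb", "c.pdb"]:
            open(os.path.join(d, name), "w").close()
            time.sleep(0.01)  # give each file a distinct creation time

        files = os.listdir(d)
        by_ctime = sorted(files, key=lambda f: os.path.getctime(os.path.join(d, f)))
        by_name = sorted(files)  # what 1.15.1 does, via pdb_files.sort()

        print(by_ctime)  # likely ['b.pdb', 'a.pdb', 'c.pdb'], but machine-dependent
        print(by_name)   # always ['a.pdb', 'b.pdb', 'c.pdb']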
pyproject.toml
@@ -11,7 +11,7 @@ build-backend = "poetry.core.masonry.api"

  [project]
  name = "dayhoff-tools"
- version = "1.15.0"
+ version = "1.15.1"
  description = "Common tools for all the repos at Dayhoff Labs"
  authors = [
      {name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}
@@ -43,7 +43,7 @@ full = [
      "sentencepiece>=0.2.0",
      "sqlalchemy>=2.0.40,<3.0.0",
      "tqdm>=4.67.1",
-     "transformers==4.36.2",
+     "transformers>=4.36.2",  # Relaxed: exact pin broke conda envs with huggingface-hub>=1.0
  ]

  # Embedding models (requires torch - user must install separately for their platform)
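
For context on the relaxed transformers constraint: per the comment in the diff, the exact pin became unsatisfiable in conda environments once huggingface-hub>=1.0 was present (transformers 4.36.2 caps huggingface-hub below 1.0, while newer releases lift that cap). A minimal sketch of what the specifier change means for a dependency resolver, using the third-party packaging library (assumed installed here for illustration; it is not a dayhoff-tools dependency):

    from packaging.specifiers import SpecifierSet

    pinned = SpecifierSet("==4.36.2")   # the 1.15.0 constraint
    relaxed = SpecifierSet(">=4.36.2")  # the 1.15.1 constraint

    for candidate in ["4.36.2", "4.45.0"]:
        print(candidate, candidate in pinned, candidate in relaxed)

    # 4.36.2 True True
    # 4.45.0 False True   (only the relaxed spec lets the resolver pick a newer release)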