amina-cli 0.2.7__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. {amina_cli-0.2.7 → amina_cli-0.2.9}/PKG-INFO +2 -1
  2. {amina_cli-0.2.7 → amina_cli-0.2.9}/pyproject.toml +2 -1
  3. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/__init__.py +1 -1
  4. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/__init__.py +68 -1
  5. amina_cli-0.2.9/src/amina_cli/commands/tools/analysis/docs/hydrophobicity.yaml +243 -0
  6. amina_cli-0.2.9/src/amina_cli/commands/tools/analysis/docs/mmseqs2_cluster.yaml +166 -0
  7. amina_cli-0.2.9/src/amina_cli/commands/tools/analysis/docs/residue_accessibility.yaml +186 -0
  8. amina_cli-0.2.9/src/amina_cli/commands/tools/analysis/docs/rmsd.yaml +195 -0
  9. amina_cli-0.2.9/src/amina_cli/commands/tools/analysis/docs/sasa.yaml +92 -0
  10. amina_cli-0.2.9/src/amina_cli/commands/tools/analysis/docs/simple_rmsd.yaml +142 -0
  11. amina_cli-0.2.9/src/amina_cli/commands/tools/analysis/docs/surface_charge.yaml +205 -0
  12. amina_cli-0.2.9/src/amina_cli/commands/tools/analysis/docs/usalign.yaml +227 -0
  13. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/analysis/hydrophobicity.py +6 -24
  14. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/analysis/mmseqs2_cluster.py +6 -48
  15. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/analysis/residue_accessibility.py +6 -27
  16. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/analysis/rmsd.py +7 -69
  17. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/analysis/sasa.py +6 -24
  18. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/analysis/simple_rmsd.py +6 -49
  19. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/analysis/surface_charge.py +6 -29
  20. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/analysis/usalign.py +6 -37
  21. amina_cli-0.2.9/src/amina_cli/commands/tools/design/docs/esm_if1.yaml +247 -0
  22. amina_cli-0.2.9/src/amina_cli/commands/tools/design/docs/protein_mc.yaml +189 -0
  23. amina_cli-0.2.9/src/amina_cli/commands/tools/design/docs/proteinmpnn.yaml +238 -0
  24. amina_cli-0.2.9/src/amina_cli/commands/tools/design/docs/rfdiffusion.yaml +272 -0
  25. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/design/esm_if1.py +6 -40
  26. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/design/protein_mc.py +6 -28
  27. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/design/proteinmpnn.py +6 -24
  28. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/design/rfdiffusion.py +6 -40
  29. amina_cli-0.2.9/src/amina_cli/commands/tools/doccard.py +89 -0
  30. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/folding/boltz2.py +6 -102
  31. amina_cli-0.2.9/src/amina_cli/commands/tools/folding/docs/boltz2.yaml +401 -0
  32. amina_cli-0.2.9/src/amina_cli/commands/tools/folding/docs/esmfold.yaml +148 -0
  33. amina_cli-0.2.9/src/amina_cli/commands/tools/folding/docs/openfold3.yaml +323 -0
  34. amina_cli-0.2.9/src/amina_cli/commands/tools/folding/docs/protenix.yaml +442 -0
  35. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/folding/esmfold.py +6 -25
  36. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/folding/openfold3.py +6 -53
  37. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/folding/protenix.py +6 -58
  38. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/interactions/autodock_vina.py +6 -24
  39. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/interactions/diffdock.py +6 -33
  40. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/interactions/dockq.py +6 -56
  41. amina_cli-0.2.9/src/amina_cli/commands/tools/interactions/docs/autodock_vina.yaml +244 -0
  42. amina_cli-0.2.9/src/amina_cli/commands/tools/interactions/docs/diffdock.yaml +185 -0
  43. amina_cli-0.2.9/src/amina_cli/commands/tools/interactions/docs/dockq.yaml +288 -0
  44. amina_cli-0.2.9/src/amina_cli/commands/tools/interactions/docs/emngly.yaml +181 -0
  45. amina_cli-0.2.9/src/amina_cli/commands/tools/interactions/docs/glycosylation_ensemble.yaml +244 -0
  46. amina_cli-0.2.9/src/amina_cli/commands/tools/interactions/docs/interface_identifier.yaml +176 -0
  47. amina_cli-0.2.9/src/amina_cli/commands/tools/interactions/docs/isoglyp.yaml +225 -0
  48. amina_cli-0.2.9/src/amina_cli/commands/tools/interactions/docs/lmngly.yaml +183 -0
  49. amina_cli-0.2.9/src/amina_cli/commands/tools/interactions/docs/p2rank.yaml +138 -0
  50. amina_cli-0.2.9/src/amina_cli/commands/tools/interactions/docs/pesto.yaml +231 -0
  51. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/interactions/emngly.py +6 -23
  52. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/interactions/glycosylation_ensemble.py +6 -35
  53. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/interactions/interface_identifier.py +6 -32
  54. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/interactions/isoglyp.py +6 -33
  55. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/interactions/lmngly.py +6 -29
  56. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/interactions/p2rank.py +6 -24
  57. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/interactions/pesto.py +6 -47
  58. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/properties/aminosol.py +6 -27
  59. amina_cli-0.2.9/src/amina_cli/commands/tools/properties/docs/aminosol.yaml +153 -0
  60. amina_cli-0.2.9/src/amina_cli/commands/tools/properties/docs/esm1v.yaml +178 -0
  61. amina_cli-0.2.9/src/amina_cli/commands/tools/properties/docs/esm2_embedding.yaml +226 -0
  62. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/properties/esm1v.py +6 -32
  63. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/properties/esm2_embedding.py +6 -49
  64. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/utilities/activesite_verifier.py +6 -37
  65. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/utilities/chain_select.py +6 -35
  66. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/utilities/distance_calculator.py +6 -36
  67. amina_cli-0.2.9/src/amina_cli/commands/tools/utilities/docs/activesite_verifier.yaml +204 -0
  68. amina_cli-0.2.9/src/amina_cli/commands/tools/utilities/docs/chain_select.yaml +248 -0
  69. amina_cli-0.2.9/src/amina_cli/commands/tools/utilities/docs/distance_calculator.yaml +203 -0
  70. amina_cli-0.2.9/src/amina_cli/commands/tools/utilities/docs/maxit_convert.yaml +120 -0
  71. amina_cli-0.2.9/src/amina_cli/commands/tools/utilities/docs/mol_size_calculator.yaml +139 -0
  72. amina_cli-0.2.9/src/amina_cli/commands/tools/utilities/docs/obabel_convert.yaml +155 -0
  73. amina_cli-0.2.9/src/amina_cli/commands/tools/utilities/docs/pdb_bfactor_overwrite.yaml +151 -0
  74. amina_cli-0.2.9/src/amina_cli/commands/tools/utilities/docs/pdb_cleaner.yaml +220 -0
  75. amina_cli-0.2.9/src/amina_cli/commands/tools/utilities/docs/pdb_quality_assessment.yaml +183 -0
  76. amina_cli-0.2.9/src/amina_cli/commands/tools/utilities/docs/pdb_to_fasta.yaml +104 -0
  77. amina_cli-0.2.9/src/amina_cli/commands/tools/utilities/docs/protein_relaxer.yaml +204 -0
  78. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/utilities/maxit_convert.py +6 -25
  79. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/utilities/mol_size_calculator.py +6 -23
  80. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/utilities/obabel_convert.py +6 -27
  81. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/utilities/pdb_bfactor_overwrite.py +6 -35
  82. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/utilities/pdb_cleaner.py +6 -24
  83. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/utilities/pdb_quality_assessment.py +6 -25
  84. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/utilities/pdb_to_fasta.py +6 -22
  85. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/utilities/protein_relaxer.py +6 -25
  86. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools_cmd.py +140 -1
  87. {amina_cli-0.2.7 → amina_cli-0.2.9}/.gitignore +0 -0
  88. {amina_cli-0.2.7 → amina_cli-0.2.9}/LICENSE +0 -0
  89. {amina_cli-0.2.7 → amina_cli-0.2.9}/README.md +0 -0
  90. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/auth.py +0 -0
  91. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/client.py +0 -0
  92. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/__init__.py +0 -0
  93. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/auth_cmd.py +0 -0
  94. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/jobs_cmd.py +0 -0
  95. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/run_cmd.py +0 -0
  96. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/analysis/__init__.py +0 -0
  97. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/design/__init__.py +0 -0
  98. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/display.py +0 -0
  99. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/folding/__init__.py +0 -0
  100. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/interactions/__init__.py +0 -0
  101. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/properties/__init__.py +0 -0
  102. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/commands/tools/utilities/__init__.py +0 -0
  103. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/main.py +0 -0
  104. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/registry.py +0 -0
  105. {amina_cli-0.2.7 → amina_cli-0.2.9}/src/amina_cli/storage.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: amina-cli
3
- Version: 0.2.7
3
+ Version: 0.2.9
4
4
  Summary: CLI for AminoAnalytica protein engineering platform
5
5
  Project-URL: Homepage, https://aminoanalytica.com
6
6
  Project-URL: Documentation, https://docs.aminoanalytica.com
@@ -20,6 +20,7 @@ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
20
20
  Requires-Python: >=3.11
21
21
  Requires-Dist: httpx>=0.27.0
22
22
  Requires-Dist: pydantic>=2.0
23
+ Requires-Dist: pyyaml>=6.0
23
24
  Requires-Dist: rich>=13.0.0
24
25
  Requires-Dist: supabase>=2.0.0
25
26
  Requires-Dist: typer>=0.9.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "amina-cli"
3
- version = "0.2.7"
3
+ version = "0.2.9"
4
4
  description = "CLI for AminoAnalytica protein engineering platform"
5
5
  readme = "README.md"
6
6
  license = {text = "Apache-2.0"}
@@ -26,6 +26,7 @@ dependencies = [
26
26
  "rich>=13.0.0", # Beautiful terminal output (included with typer)
27
27
  "pydantic>=2.0", # Data validation
28
28
  "supabase>=2.0.0", # Supabase client for file operations
29
+ "pyyaml>=6.0", # YAML parsing for tool doccards
29
30
  ]
30
31
 
31
32
  [project.scripts]
@@ -9,4 +9,4 @@ Quick start:
9
9
  amina run esmfold --sequence "MKFLILLFNILCLFPVLAADNH"
10
10
  """
11
11
 
12
- __version__ = "0.2.7"
12
+ __version__ = "0.2.9"
@@ -22,9 +22,15 @@ def discover_tools() -> Iterator[tuple[str, dict, callable]]:
22
22
  """
23
23
  Discover all tools in category subfolders.
24
24
 
25
+ For each tool, attempts to load a doccard YAML first. If a valid doccard
26
+ exists, it becomes the METADATA for that tool. Otherwise falls back to
27
+ the Python METADATA dict.
28
+
25
29
  Yields:
26
30
  (module_name, METADATA dict, register function)
27
31
  """
32
+ from amina_cli.commands.tools.doccard import load_doccard
33
+
28
34
  tools_dir = Path(__file__).parent
29
35
 
30
36
  for category_dir in sorted(tools_dir.iterdir()):
@@ -39,7 +45,14 @@ def discover_tools() -> Iterator[tuple[str, dict, callable]]:
39
45
  try:
40
46
  module = importlib.import_module(module_name)
41
47
  if hasattr(module, "METADATA") and hasattr(module, "register"):
42
- yield module_name, module.METADATA, module.register
48
+ metadata = module.METADATA
49
+ # Try doccard YAML override
50
+ category = category_dir.name
51
+ tool_name = metadata.get("name", tool_file.stem)
52
+ doccard = load_doccard(category, tool_name)
53
+ if doccard is not None:
54
+ metadata = doccard
55
+ yield module_name, metadata, module.register
43
56
  except ImportError as e:
44
57
  console.print(f"[dim]Warning: Could not load {module_name}: {e}[/dim]")
45
58
 
@@ -339,6 +352,53 @@ def run_tool_with_progress(
339
352
  raise typer.Exit(1)
340
353
 
341
354
 
355
+ def _get_metrics_reference(metadata: dict | None, result: dict) -> list[tuple[str, str]]:
356
+ """
357
+ Build a metrics reference from doccard output_metrics, filtered to only
358
+ include metrics that appear in the actual result data.
359
+
360
+ Returns list of (display_name, description) tuples.
361
+ """
362
+ if not metadata:
363
+ return []
364
+ output_metrics = metadata.get("output_metrics")
365
+ if not output_metrics:
366
+ return []
367
+
368
+ # Collect all keys present in result data (top-level and nested in data)
369
+ data = result.get("data", {}) or {}
370
+ all_keys = set(result.keys()) | set(data.keys())
371
+ # Also check inside nested structures (e.g., predictions[0])
372
+ output_display = metadata.get("output_display", {})
373
+ if output_display:
374
+ from amina_cli.commands.tools.display import _get_nested_value
375
+
376
+ data_path = output_display.get("data_path")
377
+ if data_path:
378
+ merged = {**result, **data}
379
+ extracted = _get_nested_value(merged, data_path)
380
+ if isinstance(extracted, dict):
381
+ all_keys |= set(extracted.keys())
382
+
383
+ references = []
384
+ for metric_key, metric_info in output_metrics.items():
385
+ if metric_key in all_keys:
386
+ display_name = metric_info.get("display_name", metric_key)
387
+ desc = metric_info.get("description", "").strip()
388
+ metric_range = metric_info.get("range")
389
+ interp = metric_info.get("interpretation", "").strip()
390
+
391
+ # Build compact reference line
392
+ parts = [desc]
393
+ if metric_range:
394
+ parts.append(f"Range: {metric_range[0]}–{metric_range[1]}.")
395
+ if interp:
396
+ parts.append(interp)
397
+ references.append((display_name, " ".join(parts)))
398
+
399
+ return references
400
+
401
+
342
402
  def _display_result(
343
403
  result: dict,
344
404
  downloaded: list[Path],
@@ -403,6 +463,13 @@ def _display_result(
403
463
  for key, value in display_params.items():
404
464
  console.print(f" {key}: {value}")
405
465
 
466
+ # Append metrics reference from doccard (anti-hallucination)
467
+ metrics_ref = _get_metrics_reference(metadata, result)
468
+ if metrics_ref:
469
+ console.print("\n[bold]Metrics Reference[/bold]")
470
+ for display_name, description in metrics_ref:
471
+ console.print(f" [cyan]{display_name}[/cyan]: {description}")
472
+
406
473
  else:
407
474
  # Check error_details first (from gateway), then error, then message
408
475
  error_msg = result.get("error_details") or result.get("error") or result.get("message", "Unknown error")
@@ -0,0 +1,243 @@
1
+ # ─── Identity & Routing ───
2
+ name: hydrophobicity
3
+ display_name: Hydrophobicity Analysis
4
+ category: analysis
5
+ status: available
6
+ modal_app_name: hydrophobicity-analysis-api
7
+ modal_function_name: hydrophobicity_worker
8
+
9
+ # ─── Documentation ───
10
+ description: |
11
+ Analyze the distribution of hydrophobic residues between a protein's core and
12
+ surface regions. Classifies residues using an automatically determined SASA
13
+ threshold and reports hydrophobic burial, surface exposure, and core contact metrics.
14
+
15
+ when_to_use: |
16
+ - Assessing how well hydrophobic residues are buried in the protein core
17
+ - Comparing hydrophobic packing between wild-type and mutant structures
18
+ - Evaluating protein stability indicators (hydrophobic core quality)
19
+ - Analyzing surface hydrophobicity for aggregation risk assessment
20
+
21
+ when_not_to_use: |
22
+ - Simple solvent exposure per residue → use **SASA**
23
+ - Binder-accessible surface scoring with depth/visibility → use **Residue Accessibility**
24
+ - Protein surface electrostatics → use **Surface Charge**
25
+
26
+ tool_algorithm: |
27
+ 1. **SASA calculation**: Per-residue solvent accessible surface area is computed
28
+ using the Shrake-Rupley algorithm (probe radius 1.4 Angstroms) via mdtraj.
29
+
30
+ 2. **Relative SASA normalization**: Each residue's SASA is divided by the maximum
31
+ theoretical SASA for its amino acid type (reference values as tabulated in Tien et al. 2013).
32
+
33
+ 3. **Optimal threshold detection**: The relative SASA threshold that best separates
34
+ core from surface residues is found by scanning thresholds from 0 to 1.0 in steps
35
+ of `threshold_step` (default 0.001) and locating the point of maximum change in
36
+ core residue count (the inflection point of the sigmoidal transition curve).
37
+ Falls back to 0.5 if no pronounced transition is detected.
38
+
39
+ 4. **Core/surface classification**: Residues with relative SASA below the optimal
40
+ threshold are classified as core; those at or above are classified as surface.
41
+
42
+ 5. **Hydrophobic residue identification**: Residues are identified as hydrophobic
43
+ using the Kyte-Doolittle scale: ALA, VAL, ILE, LEU, MET, PHE, TRP, TYR.
44
+
45
+ 6. **Contact analysis**: Pairwise distances between CA atoms of core hydrophobic
46
+ residues are computed. Pairs within the critical distance (default 5.0 Angstroms)
47
+ are counted as hydrophobic-hydrophobic contacts, indicating core packing quality.
48
+
49
+ additional_context: |
50
+ - The sigmoidal transition plot (enabled by default) visualizes how core residue
51
+ count changes as the SASA threshold varies, with the optimal threshold marked.
52
+ This is useful for validating the automatic threshold selection.
53
+ - A well-folded protein typically has >40% of hydrophobic residues buried in the core.
54
+ - High hydrophobic-hydrophobic contact counts indicate tight core packing.
55
+ - The tool runs on CPU only (no GPU required), typically completing in under 60 seconds.
56
+
57
+ # ─── Parameters ───
58
+ # Parameter definitions are canonical here — keep in sync with hydrophobicity.py
59
+ parameters:
60
+ pdb:
61
+ type: file
62
+ required: true
63
+ description: |
64
+ Path to PDB file containing the protein structure to analyze.
65
+
66
+ savecsv:
67
+ type: boolean
68
+ default: true
69
+ description: |
70
+ Save detailed results to a CSV file. Use `--no-csv` to disable.
71
+
72
+ plot:
73
+ type: boolean
74
+ default: true
75
+ description: |
76
+ Save the sigmoidal transition plot showing core residue count vs
77
+ SASA threshold. Use `--no-plot` to disable.
78
+
79
+ threshold-step:
80
+ type: float
81
+ default: 0.001
82
+ range: [0.0, 1.0]
83
+ description: |
84
+ Step size for scanning SASA thresholds when detecting the optimal
85
+ core/surface boundary. Smaller values give finer resolution but
86
+ take longer. Default 0.001 is usually sufficient.
87
+
88
+ critical-distance:
89
+ type: float
90
+ default: 5.0
91
+ description: |
92
+ Maximum distance in Angstroms between CA atoms for two core
93
+ hydrophobic residues to be counted as a contact. Default 5.0
94
+ Angstroms is standard for residue-residue contact analysis.
95
+
96
+ job-name:
97
+ type: string
98
+ required: false
99
+ description: |
100
+ Custom job name for output files. Defaults to a random 4-letter code.
101
+
102
+ background:
103
+ type: boolean
104
+ default: false
105
+ description: |
106
+ Submit job and return immediately without waiting for completion.
107
+
108
+ # ─── Output Files ───
109
+ outputs:
110
+ csv_filepath: Detailed hydrophobicity analysis results CSV
111
+ plot_filepath: Sigmoidal transition plot (if enabled)
112
+
113
+ # ─── Output Metrics ───
114
+ output_metrics:
115
+ optimal_threshold_pct:
116
+ display_name: Optimal Threshold
117
+ description: |
118
+ **Optimal Surface/Core Threshold** (%) — the relative SASA percentage that
119
+ best separates core from surface residues, determined automatically by finding
120
+ the inflection point of the sigmoidal transition curve. Residues with relative
121
+ SASA below this value are classified as core.
122
+ range: [0, 100]
123
+ interpretation: |
124
+ - Typical range: 20–50%
125
+ - Lower threshold = stricter core definition (fewer, more deeply buried residues)
126
+ - If fallback to 50% is used, the protein may lack a clear core/surface transition
127
+
128
+ total_residues:
129
+ display_name: Total Residues
130
+ description: |
131
+ Total number of residues in the analyzed protein structure.
132
+
133
+ core_residues:
134
+ display_name: Core Residues
135
+ description: |
136
+ Number of residues classified as core (relative SASA below the optimal threshold).
137
+ interpretation: |
138
+ Compare to total residues for proportion buried. Well-folded globular proteins
139
+ typically have 30–60% of residues in the core.
140
+
141
+ surface_residues:
142
+ display_name: Surface Residues
143
+ description: |
144
+ Number of residues classified as surface (relative SASA at or above the optimal threshold).
145
+
146
+ hydrophobic_core_pct:
147
+ display_name: Hydrophobic Core %
148
+ description: |
149
+ Percentage of total residues that are both hydrophobic (Kyte-Doolittle:
150
+ ALA, VAL, ILE, LEU, MET, PHE, TRP, TYR) and buried in the core.
151
+ range: [0, 100]
152
+ interpretation: |
153
+ - >40%: Strong hydrophobic core — typical of well-folded globular proteins
154
+ - 20–40%: Moderate core burial
155
+ - <20%: Weak hydrophobic core — may indicate poor folding or intrinsic disorder
156
+
157
+ total_sasa:
158
+ display_name: Total SASA
159
+ description: |
160
+ Total solvent accessible surface area of the entire protein, measured in nm^2.
161
+ Computed via Shrake-Rupley algorithm.
162
+
163
+ hydrophobic_sasa:
164
+ display_name: Hydrophobic SASA
165
+ description: |
166
+ Solvent accessible surface area contributed by hydrophobic residues only,
167
+ measured in nm^2.
168
+
169
+ hydrophobic_sasa_pct:
170
+ display_name: Hydrophobic SASA %
171
+ description: |
172
+ Percentage of total SASA contributed by hydrophobic residues.
173
+ range: [0, 100]
174
+ interpretation: |
175
+ - Lower values indicate better hydrophobic burial (hydrophobic residues shielded from solvent)
176
+ - Higher values suggest more hydrophobic surface exposure (potential aggregation risk)
177
+
178
+ hydrophobic_contacts:
179
+ display_name: Hydrophobic Contacts
180
+ description: |
181
+ Number of pairwise contacts between core hydrophobic residues (CA atoms
182
+ within the critical distance, default 5.0 Angstroms). Measures the density
183
+ of hydrophobic packing in the protein core.
184
+ interpretation: |
185
+ - Higher count = denser hydrophobic core packing (generally favorable for stability)
186
+ - Scales with protein size — compare between variants of the same protein
187
+
188
+ # ─── Output Display ───
189
+ output_display:
190
+ data_path: hydrophobicity_results
191
+ sections:
192
+ - title: Classification
193
+ fields:
194
+ - key: optimal_threshold_pct
195
+ label: Optimal Threshold (%)
196
+ format: "{:.1f}%"
197
+ - key: total_residues
198
+ label: Total Residues
199
+ - key: core_residues
200
+ label: Core Residues
201
+ - key: surface_residues
202
+ label: Surface Residues
203
+ - title: Hydrophobicity Metrics
204
+ fields:
205
+ - key: hydrophobic_core_pct
206
+ label: Hydrophobic Core (%)
207
+ format: "{:.1f}%"
208
+ - key: total_sasa
209
+ label: Total SASA (nm^2)
210
+ format: "{:.2f}"
211
+ - key: hydrophobic_sasa
212
+ label: Hydrophobic SASA (nm^2)
213
+ format: "{:.2f}"
214
+ - key: hydrophobic_sasa_pct
215
+ label: Hydrophobic SASA (%)
216
+ format: "{:.1f}%"
217
+ - key: hydrophobic_contacts
218
+ label: Hydrophobic-Hydrophobic Contacts
219
+
220
+ # ─── Examples ───
221
+ examples:
222
+ - title: Basic analysis
223
+ command: amina run hydrophobicity --pdb ./protein.pdb -o ./results/
224
+ - title: Without plot
225
+ command: amina run hydrophobicity --pdb ./protein.pdb --no-plot -o ./results/
226
+ - title: Custom threshold resolution
227
+ command: amina run hydrophobicity --pdb ./protein.pdb --threshold-step 0.0001 -o ./results/
228
+
229
+ # ─── References ───
230
+ references:
231
+ - title: "Kyte & Doolittle (1982) - Hydropathy scale"
232
+ url: "https://doi.org/10.1016/0022-2836(82)90515-0"
233
+ - title: "Tien et al. (2013) - Maximum SASA reference values"
234
+ url: "https://doi.org/10.1371/journal.pone.0080635"
235
+ - title: "Shrake & Rupley (1973) - SASA algorithm"
236
+ url: "https://doi.org/10.1016/0022-2836(73)90011-9"
237
+
238
+ # ─── REVIEW STATUS: All definitions verified from source code and documentation ───
239
+ # - All metric keys verified against worker.py lines 236-245
240
+ # - Algorithm verified against analyzer.py
241
+ # - Hydrophobic residue set verified against analyzer.py line 49 (Kyte-Doolittle)
242
+ # - MAX_SASA references verified against analyzer.py lines 25-46 (Miller et al. 2013)
243
+ # - Shrake-Rupley confirmed in analyzer.py SASA computation
@@ -0,0 +1,166 @@
1
+ # ─── Identity & Routing ───
2
+ name: mmseqs2-cluster
3
+ display_name: MMseqs2 Cluster
4
+ category: analysis
5
+ status: available
6
+ modal_app_name: mmseqs2-cluster-api
7
+ modal_function_name: mmseqs2_cluster_worker
8
+
9
+ # ─── Documentation ───
10
+ description: |
11
+ Cluster protein sequences using MMseqs2 at a specified sequence identity threshold.
12
+ Accepts multiple FASTA files with source tracking, removes exact duplicates, and
13
+ returns cluster assignments, representative sequences, and size distribution
14
+ visualizations.
15
+
16
+ when_to_use: |
17
+ - Ensuring experimental diversity when selecting designs from hundreds or thousands of candidates
18
+ - Reducing redundancy in a set of designed or retrieved sequences
19
+ - Identifying groups of closely related sequences across multiple design runs
20
+ - Selecting representative sequences from large candidate pools
21
+
22
+ when_not_to_use: |
23
+ - Structural similarity comparison → use **US-Align** or **RMSD Analysis**
24
+ - Multiple sequence alignment → use an external MSA tool (e.g., Clustal Omega)
25
+ - Phylogenetic analysis → use dedicated phylogenetics software
26
+
27
+ tool_algorithm: |
28
+ 1. **FASTA parsing**: Input sequences are parsed and validated using the shared
29
+ FASTAParser with source tracking per file.
30
+
31
+ 2. **Duplicate removal**: Exact duplicate sequences (100% identity) are detected
32
+ by MD5 hash and removed before clustering.
33
+
34
+ 3. **MMseqs2 easy-cluster**: The combined, deduplicated sequences are clustered
35
+ using `mmseqs easy-cluster` with bidirectional coverage mode (`--cov-mode 0`),
36
+ meaning both query and target must meet the coverage threshold. Sequence identity
37
+ and coverage thresholds are user-configurable.
38
+
39
+ 4. **Cluster analysis**: Cluster assignments are parsed from the MMseqs2 TSV output,
40
+ statistics are computed (sizes, medians, singletons), and representative sequences
41
+ are extracted.
42
+
43
+ 5. **Visualization**: A bar chart of cluster sizes (sorted descending) is generated
44
+ with matplotlib, and an interactive HTML chart is produced with Plotly. If multiple
45
+ input files are provided, a source distribution pie chart is also generated.
46
+
47
+ additional_context: |
48
+ - Accepts multiple FASTA files via repeated `--fasta` / `-f` flags — each file's
49
+ sequences are tracked by source filename in the output CSV.
50
+ - Common identity thresholds: 0.30 (broad, distant homologs), 0.50 (moderate),
51
+ 0.70 (tight, closely related), 0.90 (near-identical only).
52
+ - Runs on CPU only (4 cores, 8 GB RAM), with a 1-hour timeout for large datasets.
53
+
54
+ # ─── Parameters ───
55
+ parameters:
56
+ fasta:
57
+ type: file
58
+ required: true
59
+ description: |
60
+ Path to FASTA file(s) containing protein sequences. Can be specified
61
+ multiple times for multiple input files (e.g., -f mpnn.fasta -f rfdiff.fasta).
62
+
63
+ identity:
64
+ type: float
65
+ default: 0.5
66
+ range: [0.0, 1.0]
67
+ description: |
68
+ Sequence identity threshold (0.0-1.0). Sequences whose identity to a
69
+ cluster representative meets this threshold are grouped into that cluster.
70
+
71
+ coverage:
72
+ type: float
73
+ default: 0.8
74
+ range: [0.0, 1.0]
75
+ description: |
76
+ Coverage threshold (0.0-1.0). Uses bidirectional coverage mode, meaning
77
+ both query and target must have this fraction of their sequence aligned.
78
+ Default 0.8 means at least 80% of both the query and the target sequence must align.
79
+
80
+ job-name:
81
+ type: string
82
+ required: false
83
+ description: |
84
+ Custom job name for output files. Defaults to a random 4-letter code.
85
+
86
+ background:
87
+ type: boolean
88
+ default: false
89
+ description: |
90
+ Submit job and return immediately without waiting for completion.
91
+
92
+ # ─── Output Files ───
93
+ outputs:
94
+ clusters_csv_filepath: CSV with full cluster assignments (sequence ID, cluster ID, source, representative flag)
95
+ representatives_fasta_filepath: FASTA with one representative sequence per cluster
96
+ summary_json_filepath: JSON summary with clustering statistics and parameters
97
+ cluster_sizes_png: Bar chart of all cluster sizes (sorted descending)
98
+ cluster_sizes_html: Interactive Plotly bar chart of cluster sizes
99
+
100
+ # ─── Output Metrics ───
101
+ output_metrics:
102
+ num_sequences:
103
+ display_name: Sequences Clustered
104
+ description: |
105
+ Number of unique sequences after duplicate removal that were clustered.
106
+
107
+ num_duplicates_removed:
108
+ display_name: Duplicates Removed
109
+ description: |
110
+ Number of exact duplicate sequences (100% identity) removed before clustering.
111
+
112
+ num_clusters:
113
+ display_name: Number of Clusters
114
+ description: |
115
+ Total number of clusters produced by MMseqs2.
116
+ interpretation: |
117
+ Fewer clusters relative to input sequences indicates higher redundancy.
118
+ More clusters suggests greater sequence diversity in the input set.
119
+
120
+ largest_cluster_size:
121
+ display_name: Largest Cluster
122
+ description: |
123
+ Number of sequences in the largest cluster.
124
+
125
+ smallest_cluster_size:
126
+ display_name: Smallest Cluster
127
+ description: |
128
+ Number of sequences in the smallest cluster.
129
+
130
+ median_cluster_size:
131
+ display_name: Median Cluster Size
132
+ description: |
133
+ Median number of sequences per cluster.
134
+
135
+ singleton_count:
136
+ display_name: Singletons
137
+ description: |
138
+ Number of clusters containing only a single sequence (no close neighbors
139
+ at the given identity threshold).
140
+ interpretation: |
141
+ High singleton count indicates many unique sequences that don't cluster
142
+ with others — good for diversity but may also indicate outliers.
143
+
144
+ # ─── Examples ───
145
+ examples:
146
+ - title: Cluster at 50% identity
147
+ command: amina run mmseqs2-cluster -f designs.fasta --identity 0.5 -o ./results/
148
+ - title: Cluster multiple files at 70% identity
149
+ command: amina run mmseqs2-cluster -f mpnn.fasta -f rfdiff.fasta -i 0.7 -o ./results/
150
+ - title: Custom coverage threshold
151
+ command: amina run mmseqs2-cluster -f designs.fasta -i 0.5 -c 0.6 -o ./results/
152
+ - title: Background submission
153
+ command: amina run mmseqs2-cluster -f designs.fasta -i 0.5 --background
154
+
155
+ # ─── References ───
156
+ references:
157
+ - title: "Steinegger & Soding (2017) - MMseqs2"
158
+ url: "https://doi.org/10.1038/nbt.3988"
159
+ - title: "MMseqs2 GitHub repository"
160
+ url: "https://github.com/soedinglab/MMseqs2"
161
+
162
+ # ─── REVIEW STATUS: All definitions verified from source code ───
163
+ # - Metric keys verified against worker.py _success_response (lines 43-80)
164
+ # - Algorithm verified: mmseqs easy-cluster with --cov-mode 0 (line 319)
165
+ # - Output file keys verified against write_outputs() (lines 544-605)
166
+ # - Duplicate removal verified: MD5 hash-based (lines 162-187)