PyPI - amina-cli - Versions diffs - 0.2.7__tar.gz → 0.2.8__tar.gz - Mend

amina-cli 0.2.7__tar.gz → 0.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105) hide show

{amina_cli-0.2.7 → amina_cli-0.2.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: amina-cli
-Version: 0.2.7
+Version: 0.2.8
 Summary: CLI for AminoAnalytica protein engineering platform
 Project-URL: Homepage, https://aminoanalytica.com
 Project-URL: Documentation, https://docs.aminoanalytica.com
@@ -20,6 +20,7 @@ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
 Requires-Python: >=3.11
 Requires-Dist: httpx>=0.27.0
 Requires-Dist: pydantic>=2.0
+Requires-Dist: pyyaml>=6.0
 Requires-Dist: rich>=13.0.0
 Requires-Dist: supabase>=2.0.0
 Requires-Dist: typer>=0.9.0

{amina_cli-0.2.7 → amina_cli-0.2.8}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "amina-cli"
-version = "0.2.7"
+version = "0.2.8"
 description = "CLI for AminoAnalytica protein engineering platform"
 readme = "README.md"
 license = {text = "Apache-2.0"}
@@ -26,6 +26,7 @@ dependencies = [
     "rich>=13.0.0",      # Beautiful terminal output (included with typer)
     "pydantic>=2.0",     # Data validation
     "supabase>=2.0.0",   # Supabase client for file operations
+    "pyyaml>=6.0",       # YAML parsing for tool doccards
 ]
 [project.scripts]

{amina_cli-0.2.7 → amina_cli-0.2.8}/src/amina_cli/__init__.py RENAMED Viewed

@@ -9,4 +9,4 @@ Quick start:
     amina run esmfold --sequence "MKFLILLFNILCLFPVLAADNH"
 """
-__version__ = "0.2.7"
+__version__ = "0.2.8"

{amina_cli-0.2.7 → amina_cli-0.2.8}/src/amina_cli/commands/tools/__init__.py RENAMED Viewed

@@ -22,9 +22,15 @@ def discover_tools() -> Iterator[tuple[str, dict, callable]]:
     """
     Discover all tools in category subfolders.
+    For each tool, attempts to load a doccard YAML first. If a valid doccard
+    exists, it becomes the METADATA for that tool. Otherwise falls back to
+    the Python METADATA dict.
     Yields:
         (module_name, METADATA dict, register function)
     """
+    from amina_cli.commands.tools.doccard import load_doccard
     tools_dir = Path(__file__).parent
     for category_dir in sorted(tools_dir.iterdir()):
@@ -39,7 +45,14 @@ def discover_tools() -> Iterator[tuple[str, dict, callable]]:
             try:
                 module = importlib.import_module(module_name)
                 if hasattr(module, "METADATA") and hasattr(module, "register"):
-                    yield module_name, module.METADATA, module.register
+                    metadata = module.METADATA
+                    # Try doccard YAML override
+                    category = category_dir.name
+                    tool_name = metadata.get("name", tool_file.stem)
+                    doccard = load_doccard(category, tool_name)
+                    if doccard is not None:
+                        metadata = doccard
+                    yield module_name, metadata, module.register
             except ImportError as e:
                 console.print(f"[dim]Warning: Could not load {module_name}: {e}[/dim]")
@@ -339,6 +352,53 @@ def run_tool_with_progress(
         raise typer.Exit(1)
+def _get_metrics_reference(metadata: dict | None, result: dict) -> list[tuple[str, str]]:
+    """
+    Build a metrics reference from doccard output_metrics, filtered to only
+    include metrics that appear in the actual result data.
+    Returns list of (display_name, description) tuples.
+    """
+    if not metadata:
+        return []
+    output_metrics = metadata.get("output_metrics")
+    if not output_metrics:
+        return []
+    # Collect all keys present in result data (top-level and nested in data)
+    data = result.get("data", {}) or {}
+    all_keys = set(result.keys()) | set(data.keys())
+    # Also check inside nested structures (e.g., predictions[0])
+    output_display = metadata.get("output_display", {})
+    if output_display:
+        from amina_cli.commands.tools.display import _get_nested_value
+        data_path = output_display.get("data_path")
+        if data_path:
+            merged = {**result, **data}
+            extracted = _get_nested_value(merged, data_path)
+            if isinstance(extracted, dict):
+                all_keys |= set(extracted.keys())
+    references = []
+    for metric_key, metric_info in output_metrics.items():
+        if metric_key in all_keys:
+            display_name = metric_info.get("display_name", metric_key)
+            desc = metric_info.get("description", "").strip()
+            metric_range = metric_info.get("range")
+            interp = metric_info.get("interpretation", "").strip()
+            # Build compact reference line
+            parts = [desc]
+            if metric_range:
+                parts.append(f"Range: {metric_range[0]}–{metric_range[1]}.")
+            if interp:
+                parts.append(interp)
+            references.append((display_name, " ".join(parts)))
+    return references
 def _display_result(
     result: dict,
     downloaded: list[Path],
@@ -403,6 +463,13 @@ def _display_result(
                 for key, value in display_params.items():
                     console.print(f"  {key}: {value}")
+        # Append metrics reference from doccard (anti-hallucination)
+        metrics_ref = _get_metrics_reference(metadata, result)
+        if metrics_ref:
+            console.print("\n[bold]Metrics Reference[/bold]")
+            for display_name, description in metrics_ref:
+                console.print(f"  [cyan]{display_name}[/cyan]: {description}")
     else:
         # Check error_details first (from gateway), then error, then message
         error_msg = result.get("error_details") or result.get("error") or result.get("message", "Unknown error")

amina_cli-0.2.8/src/amina_cli/commands/tools/analysis/docs/hydrophobicity.yaml ADDED Viewed

@@ -0,0 +1,243 @@
+# ─── Identity & Routing ───
+name: hydrophobicity
+display_name: Hydrophobicity Analysis
+category: analysis
+status: available
+modal_app_name: hydrophobicity-analysis-api
+modal_function_name: hydrophobicity_worker
+# ─── Documentation ───
+description: |
+  Analyze the distribution of hydrophobic residues between a protein's core and
+  surface regions. Classifies residues using an automatically determined SASA
+  threshold and reports hydrophobic burial, surface exposure, and core contact metrics.
+when_to_use: |
+  - Assessing how well hydrophobic residues are buried in the protein core
+  - Comparing hydrophobic packing between wild-type and mutant structures
+  - Evaluating protein stability indicators (hydrophobic core quality)
+  - Analyzing surface hydrophobicity for aggregation risk assessment
+when_not_to_use: |
+  - Simple solvent exposure per residue → use **SASA**
+  - Binder-accessible surface scoring with depth/visibility → use **Residue Accessibility**
+  - Protein surface electrostatics → use **Surface Charge**
+tool_algorithm: |
+  1. **SASA calculation**: Per-residue solvent accessible surface area is computed
+     using the Shrake-Rupley algorithm (probe radius 1.4 Angstroms) via mdtraj.
+  2. **Relative SASA normalization**: Each residue's SASA is divided by the maximum
+     theoretical SASA for its amino acid type (reference values from Miller et al. 2013).
+  3. **Optimal threshold detection**: The relative SASA threshold that best separates
+     core from surface residues is found by scanning thresholds from 0 to 1.0 in steps
+     of `threshold_step` (default 0.001) and locating the point of maximum change in
+     core residue count (the inflection point of the sigmoidal transition curve).
+     Falls back to 0.5 if no pronounced transition is detected.
+  4. **Core/surface classification**: Residues with relative SASA below the optimal
+     threshold are classified as core; those at or above are classified as surface.
+  5. **Hydrophobic residue identification**: Residues are identified as hydrophobic
+     using the Kyte-Doolittle scale: ALA, VAL, ILE, LEU, MET, PHE, TRP, TYR.
+  6. **Contact analysis**: Pairwise distances between CA atoms of core hydrophobic
+     residues are computed. Pairs within the critical distance (default 5.0 Angstroms)
+     are counted as hydrophobic-hydrophobic contacts, indicating core packing quality.
+additional_context: |
+  - The sigmoidal transition plot (enabled by default) visualizes how core residue
+    count changes as the SASA threshold varies, with the optimal threshold marked.
+    This is useful for validating the automatic threshold selection.
+  - A well-folded protein typically has >40% of hydrophobic residues buried in the core.
+  - High hydrophobic-hydrophobic contact counts indicate tight core packing.
+  - The tool runs on CPU only (no GPU required), typically completing in under 60 seconds.
+# ─── Parameters ───
+# Parameter definitions are canonical here — keep in sync with hydrophobicity.py
+parameters:
+  pdb:
+    type: file
+    required: true
+    description: |
+      Path to PDB file containing the protein structure to analyze.
+  savecsv:
+    type: boolean
+    default: true
+    description: |
+      Save detailed results to a CSV file. Use `--no-csv` to disable.
+  plot:
+    type: boolean
+    default: true
+    description: |
+      Save the sigmoidal transition plot showing core residue count vs
+      SASA threshold. Use `--no-plot` to disable.
+  threshold-step:
+    type: float
+    default: 0.001
+    range: [0.0, 1.0]
+    description: |
+      Step size for scanning SASA thresholds when detecting the optimal
+      core/surface boundary. Smaller values give finer resolution but
+      take longer. Default 0.001 is usually sufficient.
+  critical-distance:
+    type: float
+    default: 5.0
+    description: |
+      Maximum distance in Angstroms between CA atoms for two core
+      hydrophobic residues to be counted as a contact. Default 5.0
+      Angstroms is standard for residue-residue contact analysis.
+  job-name:
+    type: string
+    required: false
+    description: |
+      Custom job name for output files. Defaults to a random 4-letter code.
+  background:
+    type: boolean
+    default: false
+    description: |
+      Submit job and return immediately without waiting for completion.
+# ─── Output Files ───
+outputs:
+  csv_filepath: Detailed hydrophobicity analysis results CSV
+  plot_filepath: Sigmoidal transition plot (if enabled)
+# ─── Output Metrics ───
+output_metrics:
+  optimal_threshold_pct:
+    display_name: Optimal Threshold
+    description: |
+      **Optimal Surface/Core Threshold** (%) — the relative SASA percentage that
+      best separates core from surface residues, determined automatically by finding
+      the inflection point of the sigmoidal transition curve. Residues with relative
+      SASA below this value are classified as core.
+    range: [0, 100]
+    interpretation: |
+      - Typical range: 20–50%
+      - Lower threshold = stricter core definition (fewer, more deeply buried residues)
+      - If fallback to 50% is used, the protein may lack a clear core/surface transition
+  total_residues:
+    display_name: Total Residues
+    description: |
+      Total number of residues in the analyzed protein structure.
+  core_residues:
+    display_name: Core Residues
+    description: |
+      Number of residues classified as core (relative SASA below the optimal threshold).
+    interpretation: |
+      Compare to total residues for proportion buried. Well-folded globular proteins
+      typically have 30–60% of residues in the core.
+  surface_residues:
+    display_name: Surface Residues
+    description: |
+      Number of residues classified as surface (relative SASA at or above the optimal threshold).
+  hydrophobic_core_pct:
+    display_name: Hydrophobic Core %
+    description: |
+      Percentage of total residues that are both hydrophobic (Kyte-Doolittle:
+      ALA, VAL, ILE, LEU, MET, PHE, TRP, TYR) and buried in the core.
+    range: [0, 100]
+    interpretation: |
+      - >40%: Strong hydrophobic core — typical of well-folded globular proteins
+      - 20–40%: Moderate core burial
+      - <20%: Weak hydrophobic core — may indicate poor folding or intrinsic disorder
+  total_sasa:
+    display_name: Total SASA
+    description: |
+      Total solvent accessible surface area of the entire protein, measured in nm^2.
+      Computed via Shrake-Rupley algorithm.
+  hydrophobic_sasa:
+    display_name: Hydrophobic SASA
+    description: |
+      Solvent accessible surface area contributed by hydrophobic residues only,
+      measured in nm^2.
+  hydrophobic_sasa_pct:
+    display_name: Hydrophobic SASA %
+    description: |
+      Percentage of total SASA contributed by hydrophobic residues.
+    range: [0, 100]
+    interpretation: |
+      - Lower values indicate better hydrophobic burial (hydrophobic residues shielded from solvent)
+      - Higher values suggest more hydrophobic surface exposure (potential aggregation risk)
+  hydrophobic_contacts:
+    display_name: Hydrophobic Contacts
+    description: |
+      Number of pairwise contacts between core hydrophobic residues (CA atoms
+      within the critical distance, default 5.0 Angstroms). Measures the density
+      of hydrophobic packing in the protein core.
+    interpretation: |
+      - Higher count = denser hydrophobic core packing (generally favorable for stability)
+      - Scales with protein size — compare between variants of the same protein
+# ─── Output Display ───
+output_display:
+  data_path: hydrophobicity_results
+  sections:
+    - title: Classification
+      fields:
+        - key: optimal_threshold_pct
+          label: Optimal Threshold (%)
+          format: "{:.1f}%"
+        - key: total_residues
+          label: Total Residues
+        - key: core_residues
+          label: Core Residues
+        - key: surface_residues
+          label: Surface Residues
+    - title: Hydrophobicity Metrics
+      fields:
+        - key: hydrophobic_core_pct
+          label: Hydrophobic Core (%)
+          format: "{:.1f}%"
+        - key: total_sasa
+          label: Total SASA (nm^2)
+          format: "{:.2f}"
+        - key: hydrophobic_sasa
+          label: Hydrophobic SASA (nm^2)
+          format: "{:.2f}"
+        - key: hydrophobic_sasa_pct
+          label: Hydrophobic SASA (%)
+          format: "{:.1f}%"
+        - key: hydrophobic_contacts
+          label: Hydrophobic-Hydrophobic Contacts
+# ─── Examples ───
+examples:
+  - title: Basic analysis
+    command: amina run hydrophobicity --pdb ./protein.pdb -o ./results/
+  - title: Without plot
+    command: amina run hydrophobicity --pdb ./protein.pdb --no-plot -o ./results/
+  - title: Custom threshold resolution
+    command: amina run hydrophobicity --pdb ./protein.pdb --threshold-step 0.0001 -o ./results/
+# ─── References ───
+references:
+  - title: "Kyte & Doolittle (1982) - Hydropathy scale"
+    url: "https://doi.org/10.1016/0022-2836(82)90515-0"
+  - title: "Tien et al. (2013) - Maximum SASA reference values"
+    url: "https://doi.org/10.1371/journal.pone.0080635"
+  - title: "Shrake & Rupley (1973) - SASA algorithm"
+    url: "https://doi.org/10.1016/0022-2836(73)90011-9"
+# ─── REVIEW STATUS: All definitions verified from source code and documentation ───
+# - All metric keys verified against worker.py lines 236-245
+# - Algorithm verified against analyzer.py
+# - Hydrophobic residue set verified against analyzer.py line 49 (Kyte-Doolittle)
+# - MAX_SASA references verified against analyzer.py lines 25-46 (Miller et al. 2013)
+# - Shrake-Rupley confirmed in analyzer.py SASA computation

amina_cli-0.2.8/src/amina_cli/commands/tools/analysis/docs/mmseqs2_cluster.yaml ADDED Viewed

@@ -0,0 +1,166 @@
+# ─── Identity & Routing ───
+name: mmseqs2-cluster
+display_name: MMseqs2 Cluster
+category: analysis
+status: available
+modal_app_name: mmseqs2-cluster-api
+modal_function_name: mmseqs2_cluster_worker
+# ─── Documentation ───
+description: |
+  Cluster protein sequences using MMseqs2 at a specified sequence identity threshold.
+  Accepts multiple FASTA files with source tracking, removes exact duplicates, and
+  returns cluster assignments, representative sequences, and size distribution
+  visualizations.
+when_to_use: |
+  - Ensuring experimental diversity when selecting designs from hundreds or thousands of candidates
+  - Reducing redundancy in a set of designed or retrieved sequences
+  - Identifying groups of closely related sequences across multiple design runs
+  - Selecting representative sequences from large candidate pools
+when_not_to_use: |
+  - Structural similarity comparison → use **US-Align** or **RMSD Analysis**
+  - Multiple sequence alignment → use an external MSA tool (e.g., Clustal Omega)
+  - Phylogenetic analysis → use dedicated phylogenetics software
+tool_algorithm: |
+  1. **FASTA parsing**: Input sequences are parsed and validated using the shared
+     FASTAParser with source tracking per file.
+  2. **Duplicate removal**: Exact duplicate sequences (100% identity) are detected
+     by MD5 hash and removed before clustering.
+  3. **MMseqs2 easy-cluster**: The combined, deduplicated sequences are clustered
+     using `mmseqs easy-cluster` with bidirectional coverage mode (`--cov-mode 0`),
+     meaning both query and target must meet the coverage threshold. Sequence identity
+     and coverage thresholds are user-configurable.
+  4. **Cluster analysis**: Cluster assignments are parsed from the MMseqs2 TSV output,
+     statistics are computed (sizes, medians, singletons), and representative sequences
+     are extracted.
+  5. **Visualization**: A bar chart of cluster sizes (sorted descending) is generated
+     with matplotlib, and an interactive HTML chart is produced with Plotly. If multiple
+     input files are provided, a source distribution pie chart is also generated.
+additional_context: |
+  - Accepts multiple FASTA files via repeated `--fasta` / `-f` flags — each file's
+    sequences are tracked by source filename in the output CSV.
+  - Common identity thresholds: 0.30 (broad, distant homologs), 0.50 (moderate),
+    0.70 (tight, closely related), 0.90 (near-identical only).
+  - Runs on CPU only (4 cores, 8 GB RAM), with a 1-hour timeout for large datasets.
+# ─── Parameters ───
+parameters:
+  fasta:
+    type: file
+    required: true
+    description: |
+      Path to FASTA file(s) containing protein sequences. Can be specified
+      multiple times for multiple input files (e.g., -f mpnn.fasta -f rfdiff.fasta).
+  identity:
+    type: float
+    default: 0.5
+    range: [0.0, 1.0]
+    description: |
+      Sequence identity threshold (0.0-1.0). Sequences that align to a cluster
+      representative at or above this identity are assigned to the same cluster.
+  coverage:
+    type: float
+    default: 0.8
+    range: [0.0, 1.0]
+    description: |
+      Coverage threshold (0.0-1.0). Uses bidirectional coverage mode, meaning
+      both query and target must have this fraction of their sequence aligned.
+      Default 0.8 means the alignment must cover at least 80% of both sequences.
+  job-name:
+    type: string
+    required: false
+    description: |
+      Custom job name for output files. Defaults to a random 4-letter code.
+  background:
+    type: boolean
+    default: false
+    description: |
+      Submit job and return immediately without waiting for completion.
+# ─── Output Files ───
+outputs:
+  clusters_csv_filepath: CSV with full cluster assignments (sequence ID, cluster ID, source, representative flag)
+  representatives_fasta_filepath: FASTA with one representative sequence per cluster
+  summary_json_filepath: JSON summary with clustering statistics and parameters
+  cluster_sizes_png: Bar chart of all cluster sizes (sorted descending)
+  cluster_sizes_html: Interactive Plotly bar chart of cluster sizes
+# ─── Output Metrics ───
+output_metrics:
+  num_sequences:
+    display_name: Sequences Clustered
+    description: |
+      Number of unique sequences after duplicate removal that were clustered.
+  num_duplicates_removed:
+    display_name: Duplicates Removed
+    description: |
+      Number of exact duplicate sequences (100% identity) removed before clustering.
+  num_clusters:
+    display_name: Number of Clusters
+    description: |
+      Total number of clusters produced by MMseqs2.
+    interpretation: |
+      Fewer clusters relative to input sequences indicates higher redundancy.
+      More clusters suggests greater sequence diversity in the input set.
+  largest_cluster_size:
+    display_name: Largest Cluster
+    description: |
+      Number of sequences in the largest cluster.
+  smallest_cluster_size:
+    display_name: Smallest Cluster
+    description: |
+      Number of sequences in the smallest cluster.
+  median_cluster_size:
+    display_name: Median Cluster Size
+    description: |
+      Median number of sequences per cluster.
+  singleton_count:
+    display_name: Singletons
+    description: |
+      Number of clusters containing only a single sequence (no close neighbors
+      at the given identity threshold).
+    interpretation: |
+      High singleton count indicates many unique sequences that don't cluster
+      with others — good for diversity but may also indicate outliers.
+# ─── Examples ───
+examples:
+  - title: Cluster at 50% identity
+    command: amina run mmseqs2-cluster -f designs.fasta --identity 0.5 -o ./results/
+  - title: Cluster multiple files at 70% identity
+    command: amina run mmseqs2-cluster -f mpnn.fasta -f rfdiff.fasta -i 0.7 -o ./results/
+  - title: Custom coverage threshold
+    command: amina run mmseqs2-cluster -f designs.fasta -i 0.5 -c 0.6 -o ./results/
+  - title: Background submission
+    command: amina run mmseqs2-cluster -f designs.fasta -i 0.5 --background
+# ─── References ───
+references:
+  - title: "Steinegger & Soding (2017) - MMseqs2"
+    url: "https://doi.org/10.1038/nbt.3988"
+  - title: "MMseqs2 GitHub repository"
+    url: "https://github.com/soedinglab/MMseqs2"
+# ─── REVIEW STATUS: All definitions verified from source code ───
+# - Metric keys verified against worker.py _success_response (lines 43-80)
+# - Algorithm verified: mmseqs easy-cluster with --cov-mode 0 (line 319)
+# - Output file keys verified against write_outputs() (lines 544-605)
+# - Duplicate removal verified: MD5 hash-based (lines 162-187)

amina-cli 0.2.7__tar.gz → 0.2.8__tar.gz

amina-cli 0.2.7__tar.gz → 0.2.8__tar.gz