PyPI - weirdo - Versions diffs - 2.1.0__py3-none-any.whl - Mend

weirdo 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

weirdo/__init__.py +104 -0
weirdo/amino_acid.py +33 -0
weirdo/amino_acid_alphabet.py +158 -0
weirdo/amino_acid_properties.py +358 -0
weirdo/api.py +372 -0
weirdo/blosum.py +74 -0
weirdo/chou_fasman.py +73 -0
weirdo/cli.py +597 -0
weirdo/common.py +22 -0
weirdo/data_manager.py +475 -0
weirdo/distances.py +16 -0
weirdo/matrices/BLOSUM30 +25 -0
weirdo/matrices/BLOSUM50 +21 -0
weirdo/matrices/BLOSUM62 +27 -0
weirdo/matrices/__init__.py +0 -0
weirdo/matrices/amino_acid_properties.txt +829 -0
weirdo/matrices/helix_vs_coil.txt +28 -0
weirdo/matrices/helix_vs_strand.txt +27 -0
weirdo/matrices/pmbec.mat +21 -0
weirdo/matrices/strand_vs_coil.txt +27 -0
weirdo/model_manager.py +346 -0
weirdo/peptide_vectorizer.py +78 -0
weirdo/pmbec.py +85 -0
weirdo/reduced_alphabet.py +61 -0
weirdo/residue_contact_energies.py +74 -0
weirdo/scorers/__init__.py +95 -0
weirdo/scorers/base.py +223 -0
weirdo/scorers/config.py +299 -0
weirdo/scorers/mlp.py +1126 -0
weirdo/scorers/reference.py +265 -0
weirdo/scorers/registry.py +282 -0
weirdo/scorers/similarity.py +386 -0
weirdo/scorers/swissprot.py +510 -0
weirdo/scorers/trainable.py +219 -0
weirdo/static_data.py +17 -0
weirdo-2.1.0.dist-info/METADATA +294 -0
weirdo-2.1.0.dist-info/RECORD +41 -0
weirdo-2.1.0.dist-info/WHEEL +5 -0
weirdo-2.1.0.dist-info/entry_points.txt +2 -0
weirdo-2.1.0.dist-info/licenses/LICENSE +201 -0
weirdo-2.1.0.dist-info/top_level.txt +1 -0

weirdo/reduced_alphabet.py ADDED Viewed

@@ -0,0 +1,61 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+def dict_from_list(groups):
+    aa_to_group = {}
+    for _, group in enumerate(groups):
+        for c in group:
+            sorted_group = sorted(group)
+            aa_to_group[c] = sorted_group[0]
+    return aa_to_group
+"""
+Amino acid groupings from
+'Reduced amino acid alphabets improve the sensitivity...' by
+Peterson, Kondev, et al.
+http://www.rpgroup.caltech.edu/publications/Peterson2008.pdf
+"""
+"""
+Other alphabets from
+http://bio.math-inf.uni-greifswald.de/viscose/html/alphabets.html
+"""
+alphabets = dict(
+    gbmr4=dict_from_list(["ADKERNTSQ", "YFLIVMCWH", "G", "P"]),
+    sdm12=dict_from_list([
+        "A", "D", "KER", "N", "TSQ", "YF", "LIVM", "C", "W", "H", "G", "P"]),
+    hsdm17 = dict_from_list([
+        "A", "D", "KE", "R", "N", "T", "S", "Q", "Y",
+        "F", "LIV", "M", "C", "W", "H", "G", "P"
+    ]),
+    # hydrophilic vs. hydrophobic
+    hp2 = dict_from_list(["AGTSNQDEHRKP", "CMFILVWY"]),
+    # Murphy reduced alphabets (groupings derived from murphy10 splits/merges)
+    murphy8 = dict_from_list([
+        "LVIM", "C", "AG", "STP", "FYW", "EDNQ", "KR", "H"
+    ]),
+    murphy10 = dict_from_list([
+        "LVIM", "C", "A", "G", "ST", "P", "FYW", "EDNQ", "KR", "H"
+    ]),
+    murphy15 = dict_from_list([
+        "LIV", "M", "C", "A", "G", "S", "T", "P", "FY", "W", "ED", "NQ", "K", "R", "H"
+    ]),
+    alex6=dict_from_list(["C", "G", "P", "FYW", "AVILM", "STNQRHKDE"]),
+    aromatic2=dict_from_list(["FHWY", "ADKERNTSQLIVMCGP"]),
+    hp_vs_aromatic = dict_from_list(["H", "CMILV", "FWY", "ADKERNTSQGP"]),
+)

weirdo/residue_contact_energies.py ADDED Viewed

@@ -0,0 +1,74 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from os.path import join
+from .amino_acid_alphabet import canonical_amino_acid_letters, dict_to_amino_acid_matrix
+from .static_data import MATRIX_DIR
+def parse_interaction_table(table, amino_acid_order="ARNDCQEGHILKMFPSTWYV"):
+    table = table.strip()
+    while "  " in table:
+        table = table.replace("  ", " ")
+    lines = [l.strip() for l in table.split("\n")]
+    lines = [l for l in lines if len(l) > 0 and not l.startswith("#")]
+    assert len(lines) == 20, "Malformed amino acid interaction table"
+    d = {}
+    for i, line in enumerate(lines):
+        coeff_strings = line.split(" ")
+        assert len(coeff_strings) == 20, \
+            "Malformed row in amino acid interaction table"
+        x = amino_acid_order[i]
+        d[x] = {}
+        for j, coeff_str in enumerate(coeff_strings):
+            value = float(coeff_str)
+            y = amino_acid_order[j]
+            d[x][y] = value
+    return d
+def transpose_interaction_dict(d):
+    transposed = {}
+    for x in canonical_amino_acid_letters:
+        transposed[x] = {}
+        for y in canonical_amino_acid_letters:
+            transposed[x][y] = d[y][x]
+    return transposed
+# Load the three pairwise interaction tables stored under MATRIX_DIR when the
+# module is imported. For each file:
+#   * parse_interaction_table turns the text into a nested dict d[x][y];
+#   * transpose_interaction_dict builds the reversed pairing (d[y][x]);
+#   * dict_to_amino_acid_matrix converts each dict into the *_array variant
+#     (presumably a 2-D array indexed by amino acid — confirm in
+#     amino_acid_alphabet).
+# NOTE(review): files are opened with the platform default text encoding;
+# presumably the tables are plain ASCII — confirm.
+with open(join(MATRIX_DIR, 'strand_vs_coil.txt'), 'r') as f:
+    # Strand vs. Coil: table exactly as stored in the file
+    strand_vs_coil_dict = parse_interaction_table(f.read())
+    strand_vs_coil_array = dict_to_amino_acid_matrix(strand_vs_coil_dict)
+    # Coil vs. Strand: same values with row/column roles swapped
+    coil_vs_strand_dict = transpose_interaction_dict(strand_vs_coil_dict)
+    coil_vs_strand_array = dict_to_amino_acid_matrix(coil_vs_strand_dict)
+with open(join(MATRIX_DIR, 'helix_vs_strand.txt'), 'r') as f:
+    # Helix vs. Strand: table exactly as stored in the file
+    helix_vs_strand_dict = parse_interaction_table(f.read())
+    helix_vs_strand_array = dict_to_amino_acid_matrix(helix_vs_strand_dict)
+    # Strand vs. Helix: same values with row/column roles swapped
+    strand_vs_helix_dict = transpose_interaction_dict(helix_vs_strand_dict)
+    strand_vs_helix_array = dict_to_amino_acid_matrix(strand_vs_helix_dict)
+with open(join(MATRIX_DIR, 'helix_vs_coil.txt'), 'r') as f:
+    # Helix vs. Coil: table exactly as stored in the file
+    helix_vs_coil_dict = parse_interaction_table(f.read())
+    helix_vs_coil_array = dict_to_amino_acid_matrix(helix_vs_coil_dict)
+    # Coil vs. Helix: same values with row/column roles swapped
+    coil_vs_helix_dict = transpose_interaction_dict(helix_vs_coil_dict)
+    coil_vs_helix_array = dict_to_amino_acid_matrix(coil_vs_helix_dict)

weirdo/scorers/__init__.py ADDED Viewed

@@ -0,0 +1,95 @@
+"""Extensible foreignness scoring system.
+This module provides a plugin-style architecture for scoring peptides
+based on how "foreign" they are relative to a reference dataset.
+Quick Start
+-----------
+>>> from weirdo.scorers import MLPScorer
+>>> scorer = MLPScorer(k=8, hidden_layer_sizes=(128, 64))
+>>> scorer.train(peptides, labels, target_categories=['human', 'viruses'])
+>>> scores = scorer.score(['MTMDKSEL', 'ACDEFGHI'])
+Using Presets
+-------------
+>>> from weirdo.scorers import ScorerConfig
+>>> config = ScorerConfig.from_preset('default')
+>>> scorer = config.build()
+>>> scorer.train(peptides, labels, target_categories=['human', 'viruses'])
+>>> scores = scorer.score(['MTMDKSEL'])
+Adding Custom Scorers
+---------------------
+>>> from weirdo.scorers import register_scorer, BaseScorer
+>>>
+>>> @register_scorer('my_scorer', description='My custom scorer')
+... class MyScorer(BaseScorer):
+...     def fit(self, reference): ...
+...     def score(self, peptides): ...
+"""
+# Base classes
+from .base import BaseScorer, BatchScorer
+# Reference classes
+from .reference import BaseReference, StreamingReference
+# Registry
+from .registry import (
+    ScorerRegistry,
+    registry,
+    register_scorer,
+    register_reference,
+    get_scorer,
+    get_reference,
+    create_scorer,
+    create_reference,
+    list_scorers,
+    list_references,
+)
+# Configuration
+from .config import (
+    ScorerConfig,
+    PRESETS,
+    get_preset,
+    list_presets,
+)
+# Trainable base class
+from .trainable import TrainableScorer
+# Concrete implementations (import to trigger registration)
+from .swissprot import SwissProtReference
+# ML-based scorer
+from .mlp import MLPScorer
+# Names exported by ``from weirdo.scorers import *``; each entry is one of
+# the names imported into this package module above.
+__all__ = [
+    # Base classes
+    'BaseScorer',
+    'BatchScorer',
+    'BaseReference',
+    'StreamingReference',
+    'TrainableScorer',
+    # Registry
+    'ScorerRegistry',
+    'registry',
+    'register_scorer',
+    'register_reference',
+    'get_scorer',
+    'get_reference',
+    'create_scorer',
+    'create_reference',
+    'list_scorers',
+    'list_references',
+    # Configuration
+    'ScorerConfig',
+    'PRESETS',
+    'get_preset',
+    'list_presets',
+    # Implementations
+    'SwissProtReference',
+    # ML scorer
+    'MLPScorer',
+]

weirdo/scorers/base.py ADDED Viewed

@@ -0,0 +1,223 @@
+"""Base classes for foreignness scorers.
+Provides abstract base classes defining the scorer interface,
+following sklearn-style fit/score patterns.
+"""
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional, Sequence, Union
+import numpy as np
+class BaseScorer(ABC):
+    """Abstract base class for foreignness scorers.
+    Scorers follow a fit/score pattern similar to sklearn:
+    1. Initialize with configuration parameters
+    2. Call fit() with a reference dataset
+    3. Call score() on new peptides
+    Example
+    -------
+    >>> scorer = MyScorer(k=8, aggregate='mean')
+    >>> scorer.fit(reference)
+    >>> scores = scorer.score(['MTMDKSEL', 'ACDEFGHI'])
+    """
+    def __init__(self, **params):
+        """Initialize scorer with parameters.
+        Parameters
+        ----------
+        **params : dict
+            Scorer-specific configuration parameters.
+        """
+        self._params = params
+        self._is_fitted = False
+        self._reference = None
+    @abstractmethod
+    def fit(self, reference: Any) -> 'BaseScorer':
+        """Fit the scorer to a reference dataset.
+        Parameters
+        ----------
+        reference : BaseReference
+            Reference dataset providing k-mer frequencies or other data.
+        Returns
+        -------
+        self : BaseScorer
+            Returns self for method chaining.
+        """
+        pass
+    @abstractmethod
+    def score(self, peptides: Union[str, Sequence[str]]) -> np.ndarray:
+        """Score peptide(s) for foreignness.
+        Parameters
+        ----------
+        peptides : str or sequence of str
+            Single peptide or list of peptides to score.
+        Returns
+        -------
+        scores : np.ndarray
+            Array of foreignness scores. Higher = more foreign.
+            Shape: (n_peptides,)
+        """
+        pass
+    def fit_score(self, reference: Any, peptides: Union[str, Sequence[str]]) -> np.ndarray:
+        """Fit to reference and score peptides in one call.
+        Parameters
+        ----------
+        reference : BaseReference
+            Reference dataset to fit.
+        peptides : str or sequence of str
+            Peptides to score.
+        Returns
+        -------
+        scores : np.ndarray
+            Foreignness scores.
+        """
+        self.fit(reference)
+        return self.score(peptides)
+    def get_params(self, deep: bool = True) -> Dict[str, Any]:
+        """Get scorer parameters.
+        Parameters
+        ----------
+        deep : bool, default=True
+            If True, return parameters of nested objects.
+        Returns
+        -------
+        params : dict
+            Parameter names mapped to their values.
+        """
+        return self._params.copy()
+    def set_params(self, **params) -> 'BaseScorer':
+        """Set scorer parameters.
+        Parameters
+        ----------
+        **params : dict
+            Scorer parameters to update.
+        Returns
+        -------
+        self : BaseScorer
+            Returns self for method chaining.
+        """
+        self._params.update(params)
+        self._is_fitted = False  # Invalidate fit when params change
+        return self
+    @property
+    def is_fitted(self) -> bool:
+        """Check if scorer has been fitted."""
+        return self._is_fitted
+    def _check_is_fitted(self) -> None:
+        """Raise error if scorer is not fitted."""
+        if not self._is_fitted:
+            raise RuntimeError(
+                f"{self.__class__.__name__} is not fitted. "
+                "Call fit() before score()."
+            )
+    def _ensure_list(self, peptides: Union[str, Sequence[str]]) -> List[str]:
+        """Convert single peptide to list if needed."""
+        if isinstance(peptides, str):
+            return [peptides]
+        return list(peptides)
+class BatchScorer(BaseScorer):
+    """Base class for scorers that support efficient batch operations.
+    Extends BaseScorer with score_batch() for vectorized scoring
+    of large peptide sets.
+    """
+    def __init__(self, batch_size: int = 10000, **params):
+        """Initialize batch scorer.
+        Parameters
+        ----------
+        batch_size : int, default=10000
+            Number of peptides to process per batch.
+        **params : dict
+            Additional scorer parameters.
+        """
+        super().__init__(**params)
+        self._params['batch_size'] = batch_size
+    @property
+    def batch_size(self) -> int:
+        """Get batch size for vectorized operations."""
+        return self._params.get('batch_size', 10000)
+    def score_batch(
+        self,
+        peptides: Sequence[str],
+        show_progress: bool = False
+    ) -> np.ndarray:
+        """Score peptides in batches for memory efficiency.
+        Parameters
+        ----------
+        peptides : sequence of str
+            Peptides to score.
+        show_progress : bool, default=False
+            If True, show progress bar (requires tqdm).
+        Returns
+        -------
+        scores : np.ndarray
+            Foreignness scores.
+        """
+        self._check_is_fitted()
+        peptides = self._ensure_list(peptides)
+        n_peptides = len(peptides)
+        scores = np.zeros(n_peptides)
+        # Create batch iterator
+        batches = range(0, n_peptides, self.batch_size)
+        if show_progress:
+            try:
+                from tqdm import tqdm
+                batches = tqdm(batches, desc="Scoring", unit="batch")
+            except ImportError:
+                pass
+        for i in batches:
+            batch = peptides[i:i + self.batch_size]
+            scores[i:i + len(batch)] = self._score_batch_impl(batch)
+        return scores
+    def _score_batch_impl(self, batch: List[str]) -> np.ndarray:
+        """Implementation of batch scoring.
+        Override this for efficient vectorized scoring.
+        Default implementation calls score() on each peptide.
+        Parameters
+        ----------
+        batch : list of str
+            Batch of peptides to score.
+        Returns
+        -------
+        scores : np.ndarray
+            Scores for the batch.
+        """
+        return self.score(batch)