PyPI - weirdo - Versions diffs - 2.1.0__py3-none-any.whl - Mend

weirdo 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

weirdo/__init__.py +104 -0
weirdo/amino_acid.py +33 -0
weirdo/amino_acid_alphabet.py +158 -0
weirdo/amino_acid_properties.py +358 -0
weirdo/api.py +372 -0
weirdo/blosum.py +74 -0
weirdo/chou_fasman.py +73 -0
weirdo/cli.py +597 -0
weirdo/common.py +22 -0
weirdo/data_manager.py +475 -0
weirdo/distances.py +16 -0
weirdo/matrices/BLOSUM30 +25 -0
weirdo/matrices/BLOSUM50 +21 -0
weirdo/matrices/BLOSUM62 +27 -0
weirdo/matrices/__init__.py +0 -0
weirdo/matrices/amino_acid_properties.txt +829 -0
weirdo/matrices/helix_vs_coil.txt +28 -0
weirdo/matrices/helix_vs_strand.txt +27 -0
weirdo/matrices/pmbec.mat +21 -0
weirdo/matrices/strand_vs_coil.txt +27 -0
weirdo/model_manager.py +346 -0
weirdo/peptide_vectorizer.py +78 -0
weirdo/pmbec.py +85 -0
weirdo/reduced_alphabet.py +61 -0
weirdo/residue_contact_energies.py +74 -0
weirdo/scorers/__init__.py +95 -0
weirdo/scorers/base.py +223 -0
weirdo/scorers/config.py +299 -0
weirdo/scorers/mlp.py +1126 -0
weirdo/scorers/reference.py +265 -0
weirdo/scorers/registry.py +282 -0
weirdo/scorers/similarity.py +386 -0
weirdo/scorers/swissprot.py +510 -0
weirdo/scorers/trainable.py +219 -0
weirdo/static_data.py +17 -0
weirdo-2.1.0.dist-info/METADATA +294 -0
weirdo-2.1.0.dist-info/RECORD +41 -0
weirdo-2.1.0.dist-info/WHEEL +5 -0
weirdo-2.1.0.dist-info/entry_points.txt +2 -0
weirdo-2.1.0.dist-info/licenses/LICENSE +201 -0
weirdo-2.1.0.dist-info/top_level.txt +1 -0

weirdo/api.py ADDED Viewed

@@ -0,0 +1,372 @@
+"""High-level convenience API for foreignness scoring.
+Provides simple functions for common use cases without
+needing to understand the full scorer architecture.
+Example
+-------
+>>> from weirdo import score_peptide, load_model
+>>> scorer = load_model('my-mlp')
+>>> score = score_peptide('MTMDKSEL', model=scorer)
+>>> from weirdo import score_peptides
+>>> scores = score_peptides(['MTMDKSEL', 'ACDEFGHI'], model=scorer)
+"""
+from typing import Any, Dict, List, Optional, Sequence, Union
+import numpy as np
+from .scorers import ScorerConfig, BaseScorer, TrainableScorer
+# Cache of already-built scorer instances. Keys are the strings assembled in
+# create_scorer() from the preset name, the sorted overrides and the
+# auto_download flag (not the preset name alone).
+_scorer_cache: Dict[str, BaseScorer] = {}
+def create_scorer(
+    preset: str = 'default',
+    cache: bool = True,
+    auto_download: bool = False,
+    train_data: Optional[Sequence[str]] = None,
+    train_labels: Optional[Any] = None,
+    target_categories: Optional[List[str]] = None,
+    **overrides
+) -> BaseScorer:
+    """Build a scorer from a preset, reusing a cached instance when allowed.
+    Parameters
+    ----------
+    preset : str, default='default'
+        Name of the preset passed to ``ScorerConfig.from_preset``.
+    cache : bool, default=True
+        If True, scorers built without training input are stored in and
+        served from the module-level cache. Pass False to always get a
+        fresh, independent instance.
+    auto_download : bool, default=False
+        If True, ``auto_download=True`` is added to the reference params.
+    train_data : sequence of str, optional
+        Training peptides, forwarded to ``config.build`` as a list.
+    train_labels : array-like, optional
+        Training labels, forwarded to ``config.build``.
+    target_categories : list of str, optional
+        Category names, forwarded to ``config.build``.
+    **overrides : dict
+        Config overrides. ``k``, ``scorer`` and ``reference`` replace the
+        config attribute of the same name; ``categories``, ``lazy``,
+        ``use_set`` and ``data_path`` go to the reference params; every
+        other key (e.g. ``hidden_layer_sizes``, ``use_dipeptides``) goes to
+        the scorer params.
+    Returns
+    -------
+    scorer : BaseScorer
+        The object returned by ``config.build``, or the cached instance
+        for the same preset / overrides / auto_download combination.
+    Example
+    -------
+    >>> scorer = create_scorer('default', use_dipeptides=False)
+    >>> scorer.train(peptides, labels, target_categories=['human', 'viruses'])
+    >>> # Auto-download data on first use
+    >>> scorer = create_scorer('default', auto_download=True)
+    """
+    # The key is built from the preset, the overrides and the download flag.
+    cache_key = f"{preset}:{sorted(overrides.items())}:auto={auto_download}"
+    # Calls that carry training input never read from or write to the cache.
+    cacheable = cache and train_data is None and train_labels is None
+    if cacheable and cache_key in _scorer_cache:
+        return _scorer_cache[cache_key]
+    config = ScorerConfig.from_preset(preset)
+    # Overrides that replace an attribute on the config object itself.
+    config_fields = ('k', 'scorer', 'reference')
+    # Overrides that belong to the reference; all remaining ones are
+    # treated as scorer params.
+    reference_keys = {'categories', 'lazy', 'use_set', 'data_path'}
+    for name, value in overrides.items():
+        if name in config_fields:
+            setattr(config, name, value)
+        elif name in reference_keys:
+            config.reference_params[name] = value
+        else:
+            config.scorer_params[name] = value
+    if auto_download:
+        config.reference_params['auto_download'] = True
+    training_peptides = None if train_data is None else list(train_data)
+    scorer = config.build(
+        train_data=training_peptides,
+        train_labels=train_labels,
+        target_categories=target_categories,
+    )
+    if cacheable:
+        _scorer_cache[cache_key] = scorer
+    return scorer
+def score_peptide(
+    peptide: str,
+    model: Optional[Union[str, BaseScorer]] = None,
+    model_dir: Optional[str] = None,
+    preset: Optional[str] = None,
+    aggregate: str = 'mean',
+    **kwargs
+) -> float:
+    """Score a single peptide.
+    Parameters
+    ----------
+    peptide : str
+        Peptide sequence to score.
+    model : str or BaseScorer, optional
+        Model name (loaded via load_model) or an instantiated scorer.
+    model_dir : str, optional
+        Custom model directory when loading by name.
+    preset : str, optional
+        Preset passed to create_scorer() when no model is given.
+    aggregate : str, default='mean'
+        Aggregation mode forwarded to the scorer's ``score`` method.
+    **kwargs : dict
+        Additional arguments passed to create_scorer().
+    Returns
+    -------
+    score : float
+        First (and only) element of the scorer output, converted to float.
+    Raises
+    ------
+    ValueError
+        If neither ``model`` nor ``preset`` is given.
+    RuntimeError
+        If the resolved scorer is a TrainableScorer that is not trained.
+    Example
+    -------
+    >>> scorer = load_model('my-mlp')
+    >>> score = score_peptide('MTMDKSEL', model=scorer)
+    """
+    # Delegate to the batch entry point so that model resolution, the
+    # "is trained" check and the aggregate handling live in one place
+    # instead of being duplicated here.
+    scores = score_peptides(
+        [peptide],
+        model=model,
+        model_dir=model_dir,
+        preset=preset,
+        aggregate=aggregate,
+        **kwargs
+    )
+    return float(scores[0])
+def score_peptides(
+    peptides: Sequence[str],
+    model: Optional[Union[str, BaseScorer]] = None,
+    model_dir: Optional[str] = None,
+    preset: Optional[str] = None,
+    aggregate: str = 'mean',
+    **kwargs
+) -> np.ndarray:
+    """Score multiple peptides.
+    Parameters
+    ----------
+    peptides : sequence of str
+        Peptide sequences to score.
+    model : str or BaseScorer, optional
+        Model name (loaded via load_model) or an instantiated scorer.
+    model_dir : str, optional
+        Custom model directory when loading by name.
+    preset : str, optional
+        Preset passed to create_scorer() when no model is given.
+    aggregate : str, default='mean'
+        Aggregation mode forwarded to the scorer's ``score`` method. It is
+        dropped only for scorers whose ``score`` does not declare it.
+    **kwargs : dict
+        Additional arguments passed to create_scorer().
+    Returns
+    -------
+    scores : np.ndarray
+        Whatever the scorer's ``score`` method returns for ``peptides``.
+    Raises
+    ------
+    ValueError
+        If neither ``model`` nor ``preset`` is given.
+    RuntimeError
+        If the resolved scorer is a TrainableScorer that is not trained.
+    TypeError
+        Re-raised when ``score`` declares ``aggregate`` and still raises
+        TypeError, i.e. the error comes from inside the scorer.
+    Example
+    -------
+    >>> scorer = load_model('my-mlp')
+    >>> scores = score_peptides(['MTMDKSEL'], model=scorer)
+    """
+    if model is None:
+        if preset is None:
+            raise ValueError("Provide a trained model or a preset for non-trainable scorers.")
+        scorer = create_scorer(preset, **kwargs)
+    elif isinstance(model, str):
+        scorer = load_model(model, model_dir)
+    else:
+        scorer = model
+    if isinstance(scorer, TrainableScorer) and not scorer.is_trained:
+        raise RuntimeError("Scorer is not trained. Train or load a trained model before scoring.")
+    try:
+        return scorer.score(peptides, aggregate=aggregate)
+    except TypeError:
+        if _score_accepts_aggregate(scorer.score):
+            # The scorer does take ``aggregate``, so this TypeError was
+            # raised inside score(). Retrying without the argument would
+            # hide the real error and silently ignore the requested mode.
+            raise
+        # Scorer without an ``aggregate`` parameter: call it without one.
+        return scorer.score(peptides)
+def _score_accepts_aggregate(score_fn) -> bool:
+    """Return True if ``score_fn`` declares a parameter named ``aggregate``."""
+    import inspect
+    try:
+        parameters = inspect.signature(score_fn).parameters
+    except (TypeError, ValueError):
+        # No introspectable signature: keep the permissive fallback.
+        return False
+    return 'aggregate' in parameters
+def clear_cache() -> None:
+    """Remove every entry from the module-level scorer cache.
+    After this call create_scorer() builds a new scorer instead of
+    returning a previously cached one. Scorer objects that callers still
+    hold are not touched by this function.
+    """
+    _scorer_cache.clear()
+def get_available_presets() -> List[str]:
+    """Get list of available preset names.
+    Thin wrapper that returns the result of ``.scorers.list_presets()``.
+    Returns
+    -------
+    presets : list of str
+        Available preset names.
+    """
+    # Imported at call time rather than at module import.
+    from .scorers import list_presets
+    return list_presets()
+def get_preset_info(preset: str) -> Dict[str, Any]:
+    """Describe a preset as a plain dictionary.
+    Parameters
+    ----------
+    preset : str
+        Preset name, resolved with ``ScorerConfig.from_preset``.
+    Returns
+    -------
+    info : dict
+        The preset's configuration as returned by its ``to_dict()``.
+    """
+    return ScorerConfig.from_preset(preset).to_dict()
+# =============================================================================
+# Model Management Functions
+# =============================================================================
+def list_models(model_dir: Optional[str] = None) -> List[Any]:
+    """List all available trained models.
+    Thin wrapper around ``.model_manager.list_models``.
+    Parameters
+    ----------
+    model_dir : str, optional
+        Custom model directory, passed through unchanged. The existing
+        documentation states the default is ~/.weirdo/models; that default
+        is applied by the model manager, not here.
+    Returns
+    -------
+    models : list of ModelInfo
+        Information about each saved model.
+    Example
+    -------
+    >>> models = list_models()
+    >>> for m in models:
+    ...     print(f"{m.name}: {m.scorer_type}")
+    """
+    # Imported at call time; aliased so it does not shadow this function.
+    from .model_manager import list_models as _list_models
+    return _list_models(model_dir)
+def load_model(name: str, model_dir: Optional[str] = None) -> BaseScorer:
+    """Load a trained model by name.
+    Thin wrapper around ``.model_manager.load_model``.
+    Parameters
+    ----------
+    name : str
+        Model name.
+    model_dir : str, optional
+        Custom model directory, passed through unchanged.
+    Returns
+    -------
+    scorer : BaseScorer
+        Loaded model, as returned by the model manager (presumably a
+        TrainableScorer ready for scoring -- confirm in model_manager).
+    Example
+    -------
+    >>> model = load_model('my-mlp')
+    >>> scores = model.score(['MTMDKSEL'])
+    """
+    # Imported at call time; aliased so it does not shadow this function.
+    from .model_manager import load_model as _load_model
+    return _load_model(name, model_dir)
+def save_model(
+    scorer: BaseScorer,
+    name: str,
+    model_dir: Optional[str] = None,
+    overwrite: bool = False,
+) -> str:
+    """Save a trained model.
+    Thin wrapper around ``.model_manager.save_model``.
+    Parameters
+    ----------
+    scorer : BaseScorer
+        Model to save (the existing documentation describes it as a
+        trained TrainableScorer).
+    name : str
+        Name for the saved model.
+    model_dir : str, optional
+        Custom model directory, passed through unchanged.
+    overwrite : bool, default=False
+        Overwrite existing model; passed through unchanged.
+    Returns
+    -------
+    path : str
+        The model manager's return value converted with ``str()``.
+    Example
+    -------
+    >>> scorer = MLPScorer()
+    >>> scorer.train(peptides, labels)
+    >>> save_model(scorer, 'my-mlp')
+    """
+    # Imported at call time; aliased so it does not shadow this function.
+    from .model_manager import save_model as _save_model
+    return str(_save_model(scorer, name, model_dir, overwrite))
+def get_available_scorers() -> List[str]:
+    """Get list of available scorer types.
+    Thin wrapper that returns the result of ``.scorers.list_scorers()``.
+    Returns
+    -------
+    scorers : list of str
+        Available scorer names.
+    Example
+    -------
+    The output below is illustrative; the actual names depend on what
+    ``list_scorers()`` reports.
+    >>> print(get_available_scorers())
+    ['mlp']
+    """
+    # Imported at call time rather than at module import.
+    from .scorers import list_scorers
+    return list_scorers()

weirdo/blosum.py ADDED Viewed

@@ -0,0 +1,74 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from os.path import join
+from .static_data import MATRIX_DIR
+from .amino_acid_alphabet import dict_to_amino_acid_matrix
+def parse_blosum_table(table, coeff_type=int, key_type='row'):
+    """
+    Parse a table of pairwise amino acid coefficients (e.g. BLOSUM50).
+    Parameters
+    ----------
+    table : str
+        Text of the matrix. Lines starting with "#" are comments and blank
+        lines are skipped. The first remaining line holds the column
+        labels; every following line is a row label followed by its
+        coefficients.
+    coeff_type : callable, default=int
+        Applied to each coefficient string.
+    key_type : {'row', 'pair', 'pair_string'}, default='row'
+        'row' returns ``{x: {y: coeff}}``, 'pair' returns
+        ``{(x, y): coeff}`` and 'pair_string' returns ``{x + y: coeff}``.
+    Returns
+    -------
+    coeffs : dict
+        Coefficients keyed as selected by ``key_type``.
+    Raises
+    ------
+    ValueError
+        If ``key_type`` is unknown, the table has no data lines, the header
+        has fewer than 20 labels, or a row has fewer than 21 fields or more
+        coefficients than there are column labels.
+    """
+    # Validate once up front with a real exception: asserts disappear
+    # under ``python -O`` and an unknown key type would then silently be
+    # treated as 'row'.
+    if key_type not in ('row', 'pair', 'pair_string'):
+        raise ValueError("Unknown key type: %s" % key_type)
+    lines = []
+    for raw_line in table.split("\n"):
+        # drop comments
+        if raw_line.startswith("#"):
+            continue
+        # drop CR endline characters
+        line = raw_line.replace("\r", "")
+        # skip empty and whitespace-only lines
+        if line.strip():
+            lines.append(line)
+    if not lines:
+        raise ValueError("Expected a header line but table has no data lines")
+    labels = lines[0].split()
+    if len(labels) < 20:
+        raise ValueError(
+            "Expected 20+ amino acids but first line '%s' has %d fields" % (
+                lines[0],
+                len(labels)))
+    coeffs = {}
+    for line in lines[1:]:
+        fields = line.split()
+        if len(fields) < 21:
+            raise ValueError(
+                "Expected AA and 20+ coefficients but '%s' has %d fields" % (
+                    line, len(fields)))
+        if len(fields) - 1 > len(labels):
+            raise ValueError(
+                "Row '%s' has %d coefficients but header has only %d labels" % (
+                    line, len(fields) - 1, len(labels)))
+        x = fields[0]
+        for y, coeff_str in zip(labels, fields[1:]):
+            coeff = coeff_type(coeff_str)
+            if key_type == 'pair':
+                coeffs[(x, y)] = coeff
+            elif key_type == 'pair_string':
+                coeffs[x + y] = coeff
+            else:
+                coeffs.setdefault(x, {})[y] = coeff
+    return coeffs
+def _load_blosum(name):
+    """Read the matrix file ``name`` from MATRIX_DIR; return (dict, matrix)."""
+    with open(join(MATRIX_DIR, name), 'r') as f:
+        coeff_dict = parse_blosum_table(f.read())
+    # Convert after the file is closed; only the read needs the handle.
+    return coeff_dict, dict_to_amino_acid_matrix(coeff_dict)
+# Parsed once at import time; the same module-level names as before.
+blosum30_dict, blosum30_matrix = _load_blosum('BLOSUM30')
+blosum50_dict, blosum50_matrix = _load_blosum('BLOSUM50')
+blosum62_dict, blosum62_matrix = _load_blosum('BLOSUM62')

weirdo/chou_fasman.py ADDED Viewed

@@ -0,0 +1,73 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .amino_acid_alphabet import amino_acid_name_indices
+# Chou-Fasman structural parameters per amino acid, taken from
+# http://prowl.rockefeller.edu/aainfo/chou.htm
+# Each row is the amino acid name followed by seven numeric columns.
+# parse_chou_fasman() reads only the first three of them, as the
+# alpha-helix, beta-sheet and turn scores; it does not read the other four.
+chou_fasman_table = """
+Alanine        142     83       66      0.06    0.076   0.035   0.058
+Arginine        98     93       95      0.070   0.106   0.099   0.085
+Aspartic Acid  101     54      146      0.147   0.110   0.179   0.081
+Asparagine      67     89      156      0.161   0.083   0.191   0.091
+Cysteine        70    119      119      0.149   0.050   0.117   0.128
+Glutamic Acid  151    037       74      0.056   0.060   0.077   0.064
+Glutamine      111    110       98      0.074   0.098   0.037   0.098
+Glycine         57     75      156      0.102   0.085   0.190   0.152
+Histidine      100     87       95      0.140   0.047   0.093   0.054
+Isoleucine     108    160       47      0.043   0.034   0.013   0.056
+Leucine        121    130       59      0.061   0.025   0.036   0.070
+Lysine         114     74      101      0.055   0.115   0.072   0.095
+Methionine     145    105       60      0.068   0.082   0.014   0.055
+Phenylalanine  113    138       60      0.059   0.041   0.065   0.065
+Proline         57     55      152      0.102   0.301   0.034   0.068
+Serine          77     75      143      0.120   0.139   0.125   0.106
+Threonine       83    119       96      0.086   0.108   0.065   0.079
+Tryptophan     108    137       96      0.077   0.013   0.064   0.167
+Tyrosine        69    147      114      0.082   0.065   0.114   0.125
+Valine         106    170       50      0.062   0.048   0.028   0.053
+"""
+def parse_chou_fasman(table):
+    """Parse a Chou-Fasman table into three score dictionaries.
+    Parameters
+    ----------
+    table : str
+        One amino acid per line: the name (one word, or two words when the
+        second is "Acid") followed by space-separated numeric columns.
+    Returns
+    -------
+    (alpha_helix, beta_sheet, turn) : tuple of dict
+        Each maps ``amino_acid_name_indices[name]`` to the integer in the
+        first, second and third numeric column respectively.
+    """
+    helix_scores = {}
+    sheet_scores = {}
+    turn_scores = {}
+    for row in table.split("\n"):
+        tokens = [token for token in row.split(" ") if len(token.strip()) > 0]
+        if not tokens:
+            continue
+        # Two-word names ("Aspartic Acid", "Glutamic Acid") take two tokens.
+        if tokens[1] == 'Acid':
+            name = tokens[0] + " " + tokens[1]
+            values = tokens[2:]
+        else:
+            name = tokens[0]
+            values = tokens[1:]
+        assert name in amino_acid_name_indices, "Invalid amino acid name %s" % name
+        key = amino_acid_name_indices[name]
+        alpha, beta, turn = int(values[0]), int(values[1]), int(values[2])
+        helix_scores[key] = alpha
+        sheet_scores[key] = beta
+        turn_scores[key] = turn
+    # Sanity check: every one of the 20 amino acids must be present.
+    for scores in (helix_scores, sheet_scores, turn_scores):
+        assert len(scores) == 20
+    return helix_scores, sheet_scores, turn_scores
+# Module-level lookup tables, built once at import from the embedded table.
+(alpha_helix_score, beta_sheet_score, turn_score) = parse_chou_fasman(
+    chou_fasman_table)