imspy_search-0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imspy_search/__init__.py +126 -0
- imspy_search/cli/__init__.py +11 -0
- imspy_search/cli/imspy_ccs.py +322 -0
- imspy_search/cli/imspy_dda.py +836 -0
- imspy_search/cli/imspy_rescore_sage.py +289 -0
- imspy_search/configs/config_ccs.toml +15 -0
- imspy_search/configs/config_hla.toml +83 -0
- imspy_search/configs/config_tryptic.toml +84 -0
- imspy_search/dda_extensions.py +209 -0
- imspy_search/mgf.py +139 -0
- imspy_search/rescoring.py +166 -0
- imspy_search/sage_output_utility.py +318 -0
- imspy_search/utility.py +585 -0
- imspy_search-0.4.0.dist-info/METADATA +108 -0
- imspy_search-0.4.0.dist-info/RECORD +17 -0
- imspy_search-0.4.0.dist-info/WHEEL +4 -0
- imspy_search-0.4.0.dist-info/entry_points.txt +5 -0
imspy_search/mgf.py
ADDED
@@ -0,0 +1,139 @@
"""MGF file parsing for sagepy queries."""

import numpy as np
from sagepy.core import (
    Precursor, ProcessedSpectrum, RawSpectrum, SpectrumProcessor
)
from typing import List, Dict, Tuple


def iter_spectra(
    codelines,
    _start_tag: str = "BEGIN IONS",
    _stop_tag: str = "END IONS",
) -> list[dict]:
    """
    Iterate over the spectra in an MGF file.

    Args:
        codelines: List of lines from an MGF file
        _start_tag: the tag that indicates the start of a spectrum
        _stop_tag: the tag that indicates the end of a spectrum

    Yields:
        A list of lines representing each spectrum
    """
    recording = False
    for line in codelines:
        line = line.strip()
        if line.startswith(_start_tag):
            recording = True
            buffer: list[str] = []
        elif line.startswith(_stop_tag):
            assert recording
            recording = False
            yield buffer
        elif recording:
            buffer.append(line)
        else:
            pass
    assert not recording


def parse_spectrum(line_spectrum) -> Tuple[Dict, np.ndarray, np.ndarray]:
    """
    Parse the spectrum from the lines in the MGF file.

    Args:
        line_spectrum: List of lines from an MGF file

    Returns:
        precursor_info: Dictionary containing precursor information
        fragment_mzs: Numpy array of fragment m/z values
        fragment_intensities: Numpy array of fragment intensities
    """
    precursor_info = {}
    fragment_mzs = []
    fragment_intensities = []
    precursor_intensity = 0

    for l in line_spectrum:
        if not l[0].isdigit():
            name, val = l.split("=", 1)
            if name == "CHARGE":
                val = int(val.replace("+", ""))
            elif name == "PEPMASS":
                name = "MZ"
                val, precursor_intensity = val.split(" ")
                precursor_intensity = int(precursor_intensity)
                val = float(val)
            elif name == "ION_MOBILITY":
                val = float(val.split(" ")[-1])
            elif name == "RTINSECONDS":
                val = float(val)
            precursor_info[name] = val
        else:
            frag = l.split("\t")
            frag_mz = frag[0]
            frag_intensity = frag[1]
            fragment_mzs.append(float(frag_mz))
            fragment_intensities.append(int(frag_intensity))

    assert precursor_intensity != 0, "We did not manage to parse out precursor intensity!!!"
    precursor_info["intensity"] = precursor_intensity

    try:
        precursor_id = precursor_info["TITLE"].split(',')[0].split(' ')[1]
        precursor_info["ID"] = precursor_id
    except Exception as e:
        raise Exception(f"{e}\nERROR IN PARSING PRECURSOR ID")

    try:
        COLLISION_ENERGY = float(precursor_info["TITLE"].split(',')[2].replace(" ", "").replace("eV", ""))
        precursor_info["COLLISION_ENERGY"] = COLLISION_ENERGY
    except Exception as e:
        raise Exception(f"{e}\nERROR IN PARSING COLLISION ENERGY")

    return precursor_info, np.array(fragment_mzs), np.array(fragment_intensities)


def mgf_to_sagepy_query(mgf_path: str, top_n: int = 150) -> List[ProcessedSpectrum]:
    """
    Read an MGF file and return a list of ProcessedSpectrum.

    Args:
        mgf_path: Path to the MGF file
        top_n: Number of top peaks to keep in the spectrum

    Returns:
        List of ProcessedSpectrum objects
    """
    spec_processor = SpectrumProcessor(take_top_n=top_n)
    queries = []

    with open(mgf_path, "r") as mgf:
        for specNo, line_spectrum in enumerate(iter_spectra(mgf)):
            precursor_info, fragment_mzs, fragment_intensities = parse_spectrum(line_spectrum)

            precursor = Precursor(
                mz=precursor_info["MZ"],
                charge=precursor_info.get("CHARGE", None),
                intensity=precursor_info["intensity"],
                inverse_ion_mobility=precursor_info.get("ION_MOBILITY", None),
                collision_energy=precursor_info.get("COLLISION_ENERGY", None),
            )

            prec_rt = precursor_info["RTINSECONDS"] / 60.0
            raw_spectrum = RawSpectrum(
                file_id=1,
                spec_id=str(specNo) + "-" + str(precursor_info["ID"]),
                total_ion_current=fragment_intensities.sum(),
                precursors=[precursor],
                mz=fragment_mzs.astype(np.float32),
                intensity=fragment_intensities.astype(np.float32),
                scan_start_time=prec_rt,
                ion_injection_time=prec_rt,
            )
            queries.append(spec_processor.process(raw_spectrum))

    return queries
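
A minimal usage sketch for the module above. The path `run.mgf` is hypothetical; sagepy must be installed, and the MGF export must match the format `parse_spectrum` assumes (tab-separated peak lines, a `PEPMASS` value followed by an intensity, and `TITLE` lines whose first field ends in a precursor id and whose third comma-separated field is a collision energy in eV):

from imspy_search.mgf import mgf_to_sagepy_query

# Parse all spectra, keeping the 150 most intense peaks each (the default above).
queries = mgf_to_sagepy_query("run.mgf", top_n=150)
print(f"Parsed {len(queries)} processed spectra.")
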
imspy_search/rescoring.py
ADDED
@@ -0,0 +1,166 @@
"""PSM rescoring with deep learning features."""

import numpy as np

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sagepy.rescore.rescore import rescore_psms as sagepy_rescore_psms
from sagepy.qfdr.tdc import assign_sage_spectrum_q, assign_sage_peptide_q, assign_sage_protein_q

from sagepy.core.scoring import Psm
from sagepy.utility import psm_collection_to_pandas
from typing import List

from imspy_predictors import (
    load_tokenizer_from_resources,
    load_deep_ccs_predictor,
    DeepPeptideIonMobilityApex,
    load_deep_retention_time_predictor,
    DeepChromatographyApex,
    Prosit2023TimsTofWrapper,
    get_collision_energy_calibration_factor,
)
from sagepy.core.scoring import associate_fragment_ions_with_prosit_predicted_intensities
from imspy_search.utility import (
    linear_map,
    generate_balanced_rt_dataset,
    generate_balanced_im_dataset,
)


def re_score_psms(psms: List[Psm], use_logreg: bool = True) -> List[Psm]:
    """Re-score the PSMs using machine learning.

    Args:
        psms: The PSMs to rescore
        use_logreg: Whether to use logistic regression (True) or SVM (False)

    Returns:
        The re-scored PSMs
    """
    if use_logreg:
        model = LogisticRegression()
    else:
        model = SVC(probability=True)

    psms_rescored = sagepy_rescore_psms(
        psm_collection=psms,
        model=model,
        num_splits=3,
        verbose=True
    )

    psms_rescored = list(filter(lambda x: x.rank == 1, psms_rescored))

    assign_sage_spectrum_q(psms_rescored, use_hyper_score=False)
    assign_sage_peptide_q(psms_rescored, use_hyper_score=False)
    assign_sage_protein_q(psms_rescored, use_hyper_score=False)

    return psms_rescored


def create_feature_space(
    psms: List[Psm],
    fine_tune_im: bool = True,
    fine_tune_rt: bool = True,
    verbose: bool = False
) -> List[Psm]:
    """Create a feature space for the PSMs with predicted properties.

    Args:
        psms: The PSMs to add features to
        fine_tune_im: Whether to fine-tune the ion mobility predictor
        fine_tune_rt: Whether to fine-tune the retention time predictor
        verbose: Whether to print information

    Returns:
        The PSMs with added feature space
    """
    # Take the top-n scoring PSMs to calibrate collision energy
    sample = sorted(psms, key=lambda s: s.hyperscore)[-2 ** 8:]

    # Load prosit model
    prosit_model = Prosit2023TimsTofWrapper(verbose=verbose)

    # Load ion mobility predictor
    im_predictor = DeepPeptideIonMobilityApex(
        load_deep_ccs_predictor(),
        load_tokenizer_from_resources("tokenizer-ptm")
    )

    # Load retention time predictor
    rt_predictor = DeepChromatographyApex(
        load_deep_retention_time_predictor(),
        load_tokenizer_from_resources("tokenizer-ptm"),
        verbose=verbose
    )

    # Calculate the collision energy calibration factor
    collision_energy_calibration_factor, angles = get_collision_energy_calibration_factor(
        sample,
        prosit_model,
        verbose=verbose
    )

    # Add the calibration factor to the PSMs
    for p in psms:
        p.collision_energy_calibrated = p.collision_energy + collision_energy_calibration_factor

    # Predict the intensity values
    I = prosit_model.predict_intensities(
        [p.sequence_modified if p.decoy == False else p.sequence_decoy_modified for p in psms],
        np.array([p.charge for p in psms]),
        [p.collision_energy_calibrated for p in psms],
        batch_size=2048,
        flatten=True,
    )

    # Add intensity values to PSMs
    psms = associate_fragment_ions_with_prosit_predicted_intensities(psms, I, num_threads=16)

    if fine_tune_im:
        # Fit ion mobility predictor
        im_predictor.fine_tune_model(
            data=psm_collection_to_pandas(generate_balanced_im_dataset(psms=psms)),
            batch_size=1024,
            re_compile=True,
            verbose=verbose,
        )

    # Predict ion mobilities
    inv_mob = im_predictor.simulate_ion_mobilities(
        sequences=[x.sequence_modified if x.decoy == False else x.sequence_decoy_modified for x in psms],
        charges=[x.charge for x in psms],
        mz=[x.mono_mz_calculated for x in psms]
    )

    # Set ion mobilities
    for mob, p in zip(inv_mob, psms):
        p.inverse_ion_mobility_predicted = mob

    rt_min = min([p.retention_time for p in psms])
    rt_max = max([p.retention_time for p in psms])

    # Map the observed retention time into the domain [0, 60]
    for value in psms:
        value.retention_time_projected = linear_map(value.retention_time, rt_min, rt_max, 0.0, 60.0)

    if fine_tune_rt:
        # Fit retention time predictor
        rt_predictor.fine_tune_model(
            data=psm_collection_to_pandas(generate_balanced_rt_dataset(psms=psms)),
            batch_size=1024,
            re_compile=True,
            verbose=verbose,
        )

    # Predict retention times
    rt_pred = rt_predictor.simulate_separation_times(
        sequences=[x.sequence_modified if x.decoy == False else x.sequence_decoy_modified for x in psms],
    )

    # Set retention times
    for rt, p in zip(rt_pred, psms):
        p.retention_time_predicted = rt

    return psms
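
The two functions above are designed to compose: `create_feature_space` decorates each PSM with Prosit-predicted fragment intensities plus fine-tuned ion mobility and retention time predictions, and `re_score_psms` then refits a classifier on the enriched features and reassigns q-values. A sketch of that flow, assuming `psms` is a `List[Psm]` coming out of a sagepy database search (model loading is handled inside imspy_predictors):

from imspy_search.rescoring import create_feature_space, re_score_psms

def rescore_search_results(psms):
    # psms: List[Psm] produced by a sagepy database search (assumption).
    psms = create_feature_space(psms, fine_tune_im=True, fine_tune_rt=True, verbose=True)
    # use_logreg=False would swap in SVC(probability=True) instead.
    return re_score_psms(psms, use_logreg=True)
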
imspy_search/sage_output_utility.py
ADDED
@@ -0,0 +1,318 @@
"""Utilities for processing SAGE search output and rescoring PSMs."""

import re
import numpy as np
import pandas as pd
import random

from sagepy.core import Fragments, IonType
from scipy.spatial import distance

from sagepy.utility import get_features
from sagepy.qfdr.tdc import target_decoy_competition_pandas
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler

from matplotlib import pyplot as plt

from numpy.typing import NDArray
from typing import Tuple

from tqdm import tqdm


def break_into_equal_size_sets(sequence_set, k: int = 10):
    """
    Breaks a set of objects into k sets of equal size at random.

    Args:
        sequence_set: Set of sequences to be divided
        k: Number of sets to divide the objects into

    Returns:
        A list containing k sets, each with equal number of randomly chosen sequences
    """
    objects_list = list(sequence_set)
    random.shuffle(objects_list)

    set_size = len(objects_list) // k
    remainder = len(objects_list) % k

    sets = []
    start = 0
    for i in range(k):
        end = start + set_size + (1 if i < remainder else 0)
        sets.append(set(objects_list[start:end]))
        start = end

    return sets


def split_dataframe_randomly(df: pd.DataFrame, n: int) -> list:
    """Split a DataFrame randomly into n parts based on unique sequences."""
    sequences_set = set(df.sequence.values)
    split_sets = break_into_equal_size_sets(sequences_set, n)

    ret_list = []
    for seq_set in split_sets:
        ret_list.append(df[df['sequence'].apply(lambda s: s in seq_set)])

    return ret_list


def generate_training_data(
    psms: pd.DataFrame,
    method: str = "psm",
    q_max: float = 0.01,
    balance: bool = True
) -> Tuple[NDArray, NDArray]:
    """Generate training data for LDA rescoring.

    Args:
        psms: DataFrame of PSMs
        method: Method to use for training data generation
        q_max: Maximum q-value allowed for positive examples
        balance: Whether to balance the dataset

    Returns:
        Tuple of X_train and Y_train
    """
    PSM_pandas = psms
    PSM_q = target_decoy_competition_pandas(PSM_pandas, method=method)
    PSM_pandas = PSM_pandas.drop(columns=["q_value", "score"])

    TDC = pd.merge(PSM_q, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"],
                   right_on=["spec_idx", "match_idx", "decoy"])

    TARGET = TDC[(TDC.decoy == False) & (TDC.q_value <= q_max)]
    X_target, Y_target = get_features(TARGET)

    DECOY = TDC[TDC.decoy]
    X_decoy, Y_decoy = get_features(DECOY)

    if balance:
        num_target = np.min((len(DECOY), len(TARGET)))
        target_indices = np.random.choice(np.arange(len(X_target)), size=num_target)
        X_target = X_target[target_indices, :]
        Y_target = Y_target[target_indices]

    X_train = np.vstack((X_target, X_decoy))
    Y_train = np.hstack((Y_target, Y_decoy))

    return X_train, Y_train


def re_score_psms(
    psms: pd.DataFrame,
    num_splits: int = 10,
    verbose: bool = True,
    balance: bool = True,
    score: str = "hyperscore",
    positive_example_q_max: float = 0.01,
) -> pd.DataFrame:
    """Re-score PSMs using LDA.

    Args:
        psms: DataFrame of PSMs
        num_splits: Number of splits for cross-validation
        verbose: Whether to print progress
        balance: Whether to balance the dataset
        score: Score to use for re-scoring
        positive_example_q_max: Maximum q-value allowed for positive examples

    Returns:
        DataFrame with re-scored PSMs
    """
    scaler = StandardScaler()
    X_all, _ = get_features(psms, score=score)
    X_all = np.nan_to_num(X_all, nan=0.0)
    scaler.fit(X_all)

    splits = split_dataframe_randomly(df=psms, n=num_splits)
    predictions, ids, ranks = [], [], []

    for i in tqdm(range(num_splits), disable=not verbose, desc='Re-scoring PSMs', ncols=100):
        target = splits[i]
        ids.extend(target["spec_idx"].values)
        ranks.extend(target["rank"].values)
        features = []

        for j in range(num_splits):
            if j != i:
                features.append(splits[j])

        if num_splits == 1:
            features = [target]

        X_train, Y_train = generate_training_data(pd.concat(features), balance=balance, q_max=positive_example_q_max)
        X_train, Y_train = np.nan_to_num(X_train, nan=0.0), np.nan_to_num(Y_train, nan=0.0)
        X, _ = get_features(target)
        X = np.nan_to_num(X, nan=0.0)

        lda = LinearDiscriminantAnalysis(solver="eigen", shrinkage="auto")
        lda.fit(scaler.transform(X_train), Y_train)

        try:
            score_flip = 1.0 if Y_train[
                np.argmax(np.squeeze(lda.transform(scaler.transform(X_train))))] == 1.0 else -1.0
        except:
            score_flip = 1.0

        Y_pred = np.squeeze(lda.transform(scaler.transform(X))) * score_flip
        predictions.extend(Y_pred)

    return pd.DataFrame({
        "spec_idx": ids,
        "rank": ranks,
        "re_score": predictions
    })


def cosim_from_dict(observed, predicted):
    """Calculate cosine similarity between observed and predicted intensities."""
    intensities_a = []
    intensities_b = []

    for k, v in observed.items():
        if k in predicted:
            intensities_a.append(v)
            intensities_b.append(predicted[k])
        else:
            intensities_a.append(v)
            intensities_b.append(0)

    a, b = np.array(intensities_a), np.array(intensities_b)
    return 1 - distance.cosine(a, b)


def row_to_fragment(r):
    """Convert a DataFrame row to a Fragments object."""
    charges = r.fragment_charge
    ion_types = r.fragment_type
    fragment_ordinals = r.fragment_ordinals
    intensities = r.fragment_intensity
    mz_calculated = r.fragment_mz_calculated
    mz_experimental = r.fragment_mz_experimental

    ion_types_parsed = []
    for ion in ion_types:
        if ion == "b":
            ion_types_parsed.append(IonType("b"))
        else:
            ion_types_parsed.append(IonType("y"))

    return Fragments(charges, ion_types_parsed, fragment_ordinals, intensities, mz_calculated, mz_experimental)


def remove_substrings(input_string: str) -> str:
    """Remove bracket substrings from a string."""
    result = re.sub(r'\[.*?\]', '', input_string)
    return result


# Token replacements for modification annotation conversion
replace_tokens = {
    "[+42]": "[UNIMOD:1]",
    "[+42.010565]": "[UNIMOD:1]",
    "[+57.0215]": "[UNIMOD:4]",
    "[+57.021464]": "[UNIMOD:4]",
    "[+79.9663]": "[UNIMOD:21]",
    "[+15.9949]": "[UNIMOD:35]",
    "[+15.994915]": "[UNIMOD:35]",
}


class PatternReplacer:
    """Replace patterns in strings using a replacement dictionary."""

    def __init__(
        self,
        replacements: dict[str, str],
        pattern: str | re.Pattern = r"\[.*?\]",
    ):
        self.pattern = re.compile(pattern)
        self.replacements = replacements
        for _in, _out in replacements.items():
            assert (
                len(re.findall(self.pattern, _in)) > 0
            ), f"The submitted replacement, `{_in}->{_out}`, cannot be used with pattern `{pattern}`."

    def apply(self, string: str) -> str:
        """Apply replacements to a string."""
        out_sequence = string
        for _in in set(re.findall(self.pattern, string)):
            try:
                _out = self.replacements[_in]
            except KeyError:
                raise KeyError(
                    f"Modification {_in} not among those specified in the replacements."
                )
            out_sequence = out_sequence.replace(_in, _out)
        return out_sequence


def fragments_to_dict(fragments: Fragments):
    """Convert Fragments object to dictionary."""
    d = {}
    for charge, ion_type, ordinal, intensity in zip(
        fragments.charges, fragments.ion_types, fragments.fragment_ordinals, fragments.intensities
    ):
        d[(charge, ion_type, ordinal)] = intensity
    return d


def plot_summary(TARGET, DECOY, save_path, dpi=300, file_format='png'):
    """Create a summary plot for target and decoy PSMs."""
    fig, axs = plt.subplots(3, 2, figsize=(15, 18))

    # Plot 1 - RT prediction
    axs[0, 0].scatter(TARGET.projected_rt, TARGET.rt_predicted, s=1, alpha=.1, c="darkblue", label="Target")
    axs[0, 0].scatter(DECOY.projected_rt, DECOY.rt_predicted, s=1, alpha=.1, c="orange", label="Decoy")
    axs[0, 0].set_xlabel("Retention time observed")
    axs[0, 0].set_ylabel("Retention time predicted")
    axs[0, 0].legend()
    axs[0, 0].set_title("Retention Time Prediction")

    # Plot 2 - IM prediction
    axs[0, 1].scatter(TARGET.ion_mobility, TARGET.inv_mob_predicted, s=1, alpha=.1, c="darkblue", label="Target")
    axs[0, 1].scatter(DECOY.ion_mobility, DECOY.inv_mob_predicted, s=1, alpha=.1, c="orange", label="Decoy")
    axs[0, 1].set_xlabel("Ion mobility observed")
    axs[0, 1].set_ylabel("Ion mobility predicted")
    axs[0, 1].legend()
    axs[0, 1].set_title("Ion Mobility Prediction")

    # Plot 3 - RT delta
    axs[1, 0].hist(TARGET.projected_rt - TARGET.rt_predicted, alpha=.8, bins="auto", density=True, color="darkblue", label="Target")
    axs[1, 0].hist(DECOY.projected_rt - DECOY.rt_predicted, alpha=.5, bins="auto", density=True, color="orange", label="Decoy")
    axs[1, 0].set_xlabel("Retention time delta")
    axs[1, 0].set_ylabel("Density")
    axs[1, 0].legend()
    axs[1, 0].set_title("Retention Time Delta")

    # Plot 4 - IM delta
    axs[1, 1].hist(TARGET.ion_mobility - TARGET.inv_mob_predicted, alpha=.8, bins="auto", density=True, color="darkblue", label="Target")
    axs[1, 1].hist(DECOY.ion_mobility - DECOY.inv_mob_predicted, alpha=.5, bins="auto", density=True, color="orange", label="Decoy")
    axs[1, 1].set_xlim((-0.4, 0.4))
    axs[1, 1].set_xlabel("Ion mobility delta")
    axs[1, 1].set_ylabel("Density")
    axs[1, 1].legend()
    axs[1, 1].set_title("Ion Mobility Delta")

    # Plot 5 - Cosine similarity
    axs[2, 0].hist(TARGET.cosine_similarity, bins="auto", density=True, alpha=.8, color="darkblue", label="Target")
    axs[2, 0].hist(DECOY.cosine_similarity, bins="auto", density=True, alpha=.5, color="orange", label="Decoy")
    axs[2, 0].set_xlabel("Cosine similarity")
    axs[2, 0].set_ylabel("Density")
    axs[2, 0].legend()
    axs[2, 0].set_title("Cosine Similarity")

    # Plot 6 - Score
    axs[2, 1].hist(TARGET.re_score, bins="auto", density=True, alpha=.8, color="darkblue", label="Target")
    axs[2, 1].hist(DECOY.re_score, bins="auto", density=True, alpha=.5, color="orange", label="Decoy")
    axs[2, 1].set_xlabel("Score")
    axs[2, 1].set_ylabel("Density")
    axs[2, 1].legend()
    axs[2, 1].set_title("Score Information")

    plt.tight_layout()
    plt.savefig(save_path, dpi=dpi, format=file_format)
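
The pure helpers in this module can be exercised standalone. A self-contained sketch with made-up values; the tuple keys mirror the `(charge, ion_type, ordinal)` layout produced by `fragments_to_dict`:

from imspy_search.sage_output_utility import (
    PatternReplacer, replace_tokens, remove_substrings, cosim_from_dict
)

# Map SAGE-style delta-mass annotations onto UNIMOD accessions.
replacer = PatternReplacer(replace_tokens)
print(replacer.apply("C[+57.0215]PEPTIDEM[+15.9949]"))  # C[UNIMOD:4]PEPTIDEM[UNIMOD:35]
print(remove_substrings("C[+57.0215]PEPTIDE"))          # CPEPTIDE

# Cosine similarity over intensity dicts: peaks only in `predicted` are ignored,
# peaks only in `observed` are matched against zero.
observed = {(1, "b", 2): 0.5, (1, "y", 3): 1.0}
predicted = {(1, "b", 2): 0.4, (1, "y", 3): 0.9, (1, "y", 4): 0.2}
print(cosim_from_dict(observed, predicted))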