imspy-search 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imspy_search/__init__.py +126 -0
- imspy_search/cli/__init__.py +11 -0
- imspy_search/cli/imspy_ccs.py +322 -0
- imspy_search/cli/imspy_dda.py +836 -0
- imspy_search/cli/imspy_rescore_sage.py +289 -0
- imspy_search/configs/config_ccs.toml +15 -0
- imspy_search/configs/config_hla.toml +83 -0
- imspy_search/configs/config_tryptic.toml +84 -0
- imspy_search/dda_extensions.py +209 -0
- imspy_search/mgf.py +139 -0
- imspy_search/rescoring.py +166 -0
- imspy_search/sage_output_utility.py +318 -0
- imspy_search/utility.py +585 -0
- imspy_search-0.4.0.dist-info/METADATA +108 -0
- imspy_search-0.4.0.dist-info/RECORD +17 -0
- imspy_search-0.4.0.dist-info/WHEEL +4 -0
- imspy_search-0.4.0.dist-info/entry_points.txt +5 -0
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
"""IMSPY SAGE Rescore CLI - Re-score SAGE search results using deep learning features."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import argparse
|
|
5
|
+
import sys
|
|
6
|
+
import os
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from imspy_predictors import (
|
|
12
|
+
DeepPeptideIonMobilityApex, load_deep_ccs_predictor,
|
|
13
|
+
DeepChromatographyApex, load_deep_retention_time_predictor,
|
|
14
|
+
Prosit2023TimsTofWrapper, load_tokenizer_from_resources,
|
|
15
|
+
)
|
|
16
|
+
from imspy_core.chemistry.utility import calculate_mz
|
|
17
|
+
from imspy_search.utility import linear_map
|
|
18
|
+
from sagepy.core.scoring import prosit_intensities_to_fragments_par
|
|
19
|
+
from sagepy.qfdr.tdc import target_decoy_competition_pandas
|
|
20
|
+
|
|
21
|
+
from imspy_search.sage_output_utility import (
|
|
22
|
+
re_score_psms, row_to_fragment, remove_substrings,
|
|
23
|
+
PatternReplacer, replace_tokens, cosim_from_dict,
|
|
24
|
+
fragments_to_dict, plot_summary
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Suppress pandas warnings about column assignment
|
|
28
|
+
pd.options.mode.chained_assignment = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _read_sage_table(path: str, label: str) -> pd.DataFrame:
    """Load a SAGE output table from a .tsv or .parquet file.

    Args:
        path: Path to the SAGE table.
        label: Human-readable table name ("results" or "fragments") used in
            the error message on unsupported formats.

    Returns:
        The table as a pandas DataFrame. Exits the process with status 1 on
        an unknown file extension.
    """
    if path.endswith(".tsv"):
        return pd.read_csv(path, sep="\t")
    if path.endswith(".parquet"):
        return pd.read_parquet(path)
    logging.error(f"Unknown file format for SAGE {label} file {path}.")
    sys.exit(1)


def main():
    """Main entry point for imspy-rescore-sage CLI.

    Reads SAGE results and fragment tables, predicts fragment intensities,
    retention times and ion mobilities with deep-learning models, derives
    comparison features (cosine similarity, delta RT/IMS), re-scores PSMs
    with cross-validated learning, runs target-decoy competition before and
    after re-scoring, and writes the FDR-controlled tables (plus an optional
    summary plot) into the output directory.
    """
    # BUG FIX: configure logging before any logging.error/info call so the
    # input-validation messages below are actually emitted.
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(
        description='IMSPY - SAGE Parser DDA - Re-score SAGE search results using imspy and sagepy.'
    )

    parser.add_argument("sage_results", help="The path to the SAGE results file")
    parser.add_argument("sage_fragments", help="The path to the SAGE fragments file")
    parser.add_argument("output", help="The path to where the output files should be created")

    parser.add_argument(
        "--tdc_method",
        default="peptide_psm_peptide",
        help="The target decoy competition method",
        choices=["psm", "peptide_psm_only", "peptide_peptide_only", "peptide_psm_peptide"]
    )

    parser.add_argument(
        "--num_splits",
        default=5,
        type=int,
        help="The number of splits for cross-validation"
    )

    parser.add_argument(
        "--no_balanced_split",
        action="store_false",
        dest="balance",
        help="Do not balance the training dataset"
    )
    parser.set_defaults(balance=True)

    parser.add_argument(
        "--no_store_hyperscore",
        action="store_false",
        dest="store_hyperscore",
        help="Do not store the results with the hyperscore"
    )
    parser.set_defaults(store_hyperscore=True)

    parser.add_argument(
        "--fine_tune_predictors",
        action="store_true",
        help="Fine tune the rt and inv-mob predictors"
    )
    parser.set_defaults(fine_tune_predictors=False)

    parser.add_argument(
        "--positive_example_q_max",
        default=0.01,
        type=float,
        help="Maximum q-value allowed for positive examples"
    )

    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Print verbose output"
    )
    parser.set_defaults(verbose=False)

    parser.add_argument(
        "--no_summary_plot",
        action="store_false",
        dest="summary_plot",
        help="Do not create a summary plot"
    )
    parser.set_defaults(summary_plot=True)

    args = parser.parse_args()

    # Validate inputs before doing any heavy work.
    if not os.path.exists(args.sage_results):
        logging.error(f"The SAGE results file {args.sage_results} does not exist.")
        sys.exit(1)

    if not os.path.exists(args.sage_fragments):
        logging.error(f"The SAGE fragments file {args.sage_fragments} does not exist.")
        sys.exit(1)

    results = _read_sage_table(args.sage_results, "results")
    fragments = _read_sage_table(args.sage_fragments, "fragments")

    # Load models.
    prosit_model = Prosit2023TimsTofWrapper(verbose=False)
    im_predictor = DeepPeptideIonMobilityApex(
        load_deep_ccs_predictor(),
        load_tokenizer_from_resources("tokenizer-ptm")
    )
    rt_predictor = DeepChromatographyApex(
        load_deep_retention_time_predictor(),
        load_tokenizer_from_resources("tokenizer-ptm"),
        verbose=True
    )

    # Filter out sequences longer than 30 residues (length computed on the
    # bare sequence with modification substrings removed).
    results["sequence_length"] = results.apply(lambda s: len(remove_substrings(s.peptide)), axis=1)
    # BUG FIX: take an explicit copy so the column assignments below do not
    # go through pandas' chained-assignment path on a view of `results`.
    results_filtered = results[results.sequence_length <= 30].copy()

    results_filtered["decoy"] = results_filtered.apply(lambda r: r.label == -1, axis=1)

    token_replacer = PatternReplacer(replace_tokens)
    results_filtered["sequence"] = results_filtered.apply(lambda r: token_replacer.apply(r.peptide), axis=1)

    results_filtered["mono_mz_calculated"] = results_filtered.apply(
        lambda r: calculate_mz(r.calcmass, r.charge), axis=1
    )
    # BUG FIX: read the observed ion mobility from the filtered frame itself;
    # the original read from the unfiltered `results` and relied on implicit
    # index alignment.
    results_filtered["inverse_mobility_observed"] = results_filtered.ion_mobility

    # Hoist the min/max out of the per-row call instead of recomputing them
    # for every row.
    rt_min, rt_max = results_filtered.rt.min(), results_filtered.rt.max()
    results_filtered["projected_rt"] = results_filtered.rt.apply(
        lambda rt: linear_map(rt, old_min=rt_min, old_max=rt_max)
    )

    # Columns required by sagepy's target-decoy competition.
    results_filtered["match_idx"] = results_filtered.sequence
    results_filtered["spec_idx"] = [str(x) for x in results_filtered.psm_id]
    results_filtered["score"] = results_filtered.hyperscore
    results_filtered["q_value"] = None

    if len(results_filtered) < len(results):
        s = len(results) - len(results_filtered)
        logging.info(f"Removed {s} sequences with sequence length > 30.")

    kept_psm_ids = set(results_filtered.psm_id)

    # Build the fine-tuning set: confident (q <= 0.01) target PSMs only.
    FT_data = None
    if args.fine_tune_predictors:
        TDC_train = target_decoy_competition_pandas(results_filtered, method="psm")
        TDC_train_f = TDC_train[(TDC_train.decoy == False) & (TDC_train.q_value <= 0.01)].copy()
        TDC_train_f["spec_idxi"] = [int(x) for x in TDC_train_f.spec_idx]
        FT_data = pd.merge(TDC_train_f, results_filtered, left_on=["spec_idxi"], right_on=["psm_id"])

    # Keep only fragments belonging to a retained PSM. Series.isin is O(n)
    # instead of the original per-element list-comprehension membership test.
    fragments = fragments[fragments.psm_id.isin(kept_psm_ids)]

    logging.info(f"Processing {len(results_filtered)} PSMs.")

    # Group fragments by PSM: one list-valued row per psm_id.
    fragments_grouped = fragments.groupby("psm_id").agg({
        "fragment_type": list,
        "fragment_ordinals": list,
        "fragment_charge": list,
        "fragment_mz_calculated": list,
        "fragment_mz_experimental": list,
        "fragment_intensity": list,
    }).reset_index()

    fragments_grouped["fragments_observed"] = fragments_grouped.apply(lambda r: row_to_fragment(r), axis=1)
    fragments_grouped = fragments_grouped[["psm_id", "fragments_observed"]]

    logging.info("Predicting intensities...")

    # Fixed collision energy of 30 for all precursors — TODO confirm this is
    # the intended default for this instrument setup.
    intensity_pred = prosit_model.predict_intensities(
        results_filtered.sequence.values,
        results_filtered.charge.values,
        collision_energies=np.zeros_like(results_filtered.charge.values) + 30,
        batch_size=2048,
        flatten=True,
    )

    logging.info("Predicting peptide retention times...")

    if args.fine_tune_predictors:
        rt_predictor.fine_tune_model(data=FT_data, verbose=args.verbose)

    rt_pred = rt_predictor.simulate_separation_times(sequences=results_filtered.sequence.values)

    logging.info("Predicting ion mobilities...")

    if args.fine_tune_predictors:
        im_predictor.fine_tune_model(data=FT_data, verbose=args.verbose)

    inv_mob = im_predictor.simulate_ion_mobilities(
        sequences=results_filtered.sequence.values,
        charges=results_filtered.charge.values,
        mz=results_filtered.mono_mz_calculated.values,
    )

    results_filtered["inv_mob_predicted"] = inv_mob
    results_filtered["rt_predicted"] = rt_pred
    results_filtered["fragments_predicted"] = prosit_intensities_to_fragments_par(intensity_pred)

    PSMS = pd.merge(results_filtered, fragments_grouped, on="psm_id")

    # Derived re-scoring features: observed/predicted spectrum agreement and
    # RT / ion-mobility residuals.
    PSMS["observed_dict"] = PSMS.apply(lambda r: fragments_to_dict(r.fragments_observed), axis=1)
    PSMS["predicted_dict"] = PSMS.apply(lambda r: fragments_to_dict(r.fragments_predicted), axis=1)
    PSMS["cosine_similarity"] = PSMS.apply(lambda s: cosim_from_dict(s.observed_dict, s.predicted_dict), axis=1)
    PSMS["delta_rt"] = PSMS.projected_rt - PSMS.rt_predicted
    PSMS["delta_ims"] = PSMS.ion_mobility - PSMS.inv_mob_predicted
    # Not available from SAGE output; zero-filled placeholders.
    PSMS["intensity_ms1"] = 0.0
    PSMS["collision_energy"] = 0.0
    PSMS = PSMS.rename(columns={
        "ms2_intensity": "intensity_ms2",
        "fragment_ppm": "average_ppm",
        "precursor_ppm": "delta_mass"
    })

    logging.info("Re-scoring PSMs...")

    RE_SCORE = re_score_psms(
        PSMS,
        num_splits=args.num_splits,
        balance=args.balance,
        positive_example_q_max=args.positive_example_q_max
    )

    PSMS = pd.merge(PSMS, RE_SCORE, on=["spec_idx", "rank"])

    # Target-decoy competition with the original and the learned score.
    TDC = target_decoy_competition_pandas(PSMS, method=args.tdc_method, score="hyperscore")
    TDC_rescore = target_decoy_competition_pandas(PSMS, method=args.tdc_method, score="re_score")

    TDC = TDC.rename(columns={"match_idx": "peptide", "spec_idx": "psm_id"})
    TDC_rescore = TDC_rescore.rename(columns={"match_idx": "peptide", "spec_idx": "psm_id"})

    before = len(TDC[TDC.q_value <= 0.01])
    after = len(TDC_rescore[TDC_rescore.q_value <= 0.01])
    logging.info(f"Before re-scoring: {before} PSMs with q-value <= 0.01")
    logging.info(f"After re-scoring: {after} PSMs with q-value <= 0.01")

    # BUG FIX: create the output directory before anything is written into it
    # (the original only created it after the summary plot, which failed when
    # the directory did not exist, and wrote the TSVs into the *parent* of
    # `output`, inconsistent with the plot location).
    output_dir = args.output
    os.makedirs(output_dir, exist_ok=True)

    if args.summary_plot:
        TARGET = PSMS[PSMS.decoy == False]
        DECOY = PSMS[PSMS.decoy]

        logging.info("Creating summary plot...")
        plot_path = os.path.join(output_dir, "summary_plot.png")
        plot_summary(TARGET, DECOY, plot_path, dpi=300)

    file_name = os.path.join(output_dir, "imspy_sage_hyperscore.tsv")
    file_name_rescore = os.path.join(output_dir, "imspy_sage_rescore.tsv")

    if args.store_hyperscore:
        TDC.to_csv(file_name, sep="\t", index=False)
        logging.info(f"Output file {file_name} saved.")

    TDC_rescore.to_csv(file_name_rescore, sep="\t", index=False)
    logging.info(f"Output file {file_name_rescore} saved.")
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
# Allow invoking the rescoring workflow directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Configuration for CCS extraction on a tryptic yeast dataset.

# Input locations: directory with raw acquisitions and the FASTA database.
raw_data_path = "/media/hd02/data/raw/dda/ccs/Raw_Yeast_Trp/"
fasta_path = "/media/hd02/data/fasta/yeast/yeast_proteome.fasta"
# -1 presumably means "use all available cores" — confirm against the CLI.
num_threads = -1
# Tryptic digestion: cleave after K/R, restricted by P.
cleave_at = "KR"
restrict = "P"
n_terminal = false
verbose = true
# Number of FASTA chunks processed per batch.
fasta_batch_size = 1

[static_modifications]
# Carbamidomethylation of cysteine.
C = "[UNIMOD:4]"

[variable_modifications]
# Oxidation of methionine.
M = ["[UNIMOD:35]"]
# Acetylation of the peptide N-terminus.
"[" = ["[UNIMOD:1]"]
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Search configuration for HLA (immunopeptidomics) data.
# This file contains the modifications that are used in the database search.
# For a detailed description of the supported modification types, consult the SAGE documentation: https://sage-docs.vercel.app/docs/configuration#file
# compared to sage, variable modifications are not put into a list, please provide each modified variant of the amino acid as a single key-value pair
[variable_modifications]
M = ["[UNIMOD:35]"] # Oxidation of methionine
"[" = ["[UNIMOD:1]"] # Acetylation of the peptide N-terminus of proteins

# No static modifications for this workflow.
[static_modifications]

[scoring]
score_type = "openmshyperscore"
report_psms = 5          # Number of candidate PSMs to report per spectrum.
min_matched_peaks = 5
annotate_matches = true  # Required for downstream fragment-level rescoring.
max_fragment_charge = 2

# Tolerances are in ppm (use_da = false).
[precursor_tolerance]
use_da = false
lower = -25.0
upper = 25.0

[fragment_tolerance]
use_da = false
lower = -20.0
upper = 20.0

# Precursor isolation window in Da around the selected m/z.
[isolation_window]
lower = -3.0
upper = 3.0

[preprocessing]
take_top_n = 150  # Keep only the N most intense peaks per spectrum.

[enzyme]
missed_cleavages = 2
min_len = 7
max_len = 25
# Empty cleave_at/restrict — no enzymatic specificity, as expected for
# HLA-presented peptides (confirm against SAGE enzyme semantics).
cleave_at = ""
restrict = ""
c_terminal = true

[database]
generate_decoys = true
shuffle_decoys = false  # Decoys are reversed rather than shuffled.
keep_ends = true
bucket_size = 16384

[search]
fragment_max_mz = 1700.0

# Parameters for the machine-learning rescoring step.
[re_scoring]
re_score_num_splits = 5
balanced_re_score = true
re_score_metric = "hyperscore"
re_score_mokapot = true

[fdr]
fdr_threshold = 0.01
remove_decoys = true
fdr_psm_method = "psm"
fdr_peptide_method = "peptide_psm_peptide"
fdr_score = "re_score"

[parallelization]
num_threads = -1  # -1 presumably means all available cores.

# RT / ion-mobility predictor refinement is disabled for this workflow.
[refinement]
refine_rt = false
refine_im = false
refinement_verbose = false

[batch_sizes]
intensity_prediction_batch_size = 2048
model_fine_tune_batch_size = 1024
sample_size_collision_energy_calibration = 256

[other]
calibrate_mz = false
in_memory = false
bruker_sdk = true
randomize_fasta_split = false
verbose = true
fasta_batch_size = 10
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# Search configuration for tryptic digests.
# This file contains the modifications that are used in the database search.
# For a detailed description of the supported modification types, consult the SAGE documentation: https://sage-docs.vercel.app/docs/configuration#file
# compared to sage, variable modifications are not put into a list, please provide each modified variant of the amino acid as a single key-value pair
[variable_modifications]
M = ["[UNIMOD:35]"] # Oxidation of methionine
"[" = ["[UNIMOD:1]"] # Acetylation of the peptide N-terminus of proteins

[static_modifications]
C = "[UNIMOD:4]" # Carbamidomethylation of cysteine

[scoring]
score_type = "hyperscore"
report_psms = 5          # Number of candidate PSMs to report per spectrum.
min_matched_peaks = 5
annotate_matches = true  # Required for downstream fragment-level rescoring.
max_fragment_charge = 2

# Tolerances are in ppm (use_da = false).
[precursor_tolerance]
use_da = false
lower = -15.0
upper = 15.0

[fragment_tolerance]
use_da = false
lower = -20.0
upper = 20.0

# Precursor isolation window in Da around the selected m/z.
[isolation_window]
lower = -3.0
upper = 3.0

[preprocessing]
take_top_n = 150  # Keep only the N most intense peaks per spectrum.

# Tryptic digestion: cleave C-terminally after K/R, restricted by P.
[enzyme]
missed_cleavages = 2
min_len = 7
max_len = 30
cleave_at = "KR"
restrict = "P"
c_terminal = true

[database]
generate_decoys = true
shuffle_decoys = false  # Decoys are reversed rather than shuffled.
keep_ends = true
bucket_size = 16384

[search]
fragment_max_mz = 1700.0

# Parameters for the machine-learning rescoring step.
[re_scoring]
re_score_num_splits = 5
balanced_re_score = true
re_score_metric = "hyperscore"
re_score_mokapot = true

[fdr]
fdr_threshold = 0.01
remove_decoys = true
fdr_psm_method = "psm"
fdr_peptide_method = "peptide_psm_peptide"
fdr_score = "re_score"

[parallelization]
num_threads = -1  # -1 presumably means all available cores.

# RT and ion-mobility predictor refinement is enabled for tryptic data.
[refinement]
refine_rt = true
refine_im = true
refinement_verbose = false

[batch_sizes]
intensity_prediction_batch_size = 2048
model_fine_tune_batch_size = 1024
sample_size_collision_energy_calibration = 256

[other]
calibrate_mz = true
in_memory = false
bruker_sdk = true
randomize_fasta_split = false
verbose = true
fasta_batch_size = 1
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
"""Extensions to TimsDatasetDDA for sagepy integration.
|
|
2
|
+
|
|
3
|
+
This module provides methods that were removed from imspy-core's dda.py
|
|
4
|
+
to eliminate the sagepy dependency from the core package.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from sagepy.core import (
|
|
11
|
+
Precursor, ProcessedSpectrum, SpectrumProcessor, Tolerance
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
from imspy_core.timstof import TimsDatasetDDA
|
|
15
|
+
|
|
16
|
+
from imspy_search.utility import sanitize_mz, sanitize_charge, get_searchable_spec
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def to_sage_precursor(
    row: pd.Series,
    isolation_window_lower: float = -3.0,
    isolation_window_upper: float = 3.0,
) -> Precursor:
    """Build a sagepy Precursor from a single PASEF fragment row.

    Args:
        row: pandas Series holding the PASEF fragment fields.
        isolation_window_lower: Lower isolation-window bound (Da).
        isolation_window_upper: Upper isolation-window bound (Da).

    Returns:
        The corresponding sagepy Precursor object.
    """
    # Derive a cleaned precursor m/z from the monoisotopic and largest-peak
    # candidates, and the isolation window from the Da bounds.
    precursor_mz = sanitize_mz(row['monoisotopic_mz'], row['largest_peak_mz'])
    window = Tolerance(da=(isolation_window_lower, isolation_window_upper))

    # The mobility column is optional in the input; fall back to None.
    if 'mobility' in row.index:
        inv_mobility = row.mobility
    else:
        inv_mobility = None

    return Precursor(
        mz=precursor_mz,
        intensity=row['intensity'],
        charge=sanitize_charge(row['charge']),
        isolation_window=window,
        collision_energy=row.collision_energy,
        inverse_ion_mobility=inv_mobility,
    )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_sage_processed_precursors(
    dataset: TimsDatasetDDA,
    num_threads: int = 16,
    take_top_n: int = 150,
    isolation_window_lower: float = -3.0,
    isolation_window_upper: float = 3.0,
    ds_name: Optional[str] = None,
) -> pd.DataFrame:
    """Extract PASEF fragments and turn them into searchable sagepy spectra.

    Fragments are pulled from the dataset, collapsed to one row per precursor
    id, annotated with ion mobility and a spectrum id, and converted into
    sagepy Precursor / ProcessedSpectrum objects for database search.

    Args:
        dataset: TimsDatasetDDA object.
        num_threads: Number of threads for extraction.
        take_top_n: Number of top peaks to keep.
        isolation_window_lower: Lower bound for isolation window (Da).
        isolation_window_upper: Upper bound for isolation window (Da).
        ds_name: Dataset name for spec_id generation (defaults to the
            basename of the dataset path).

    Returns:
        DataFrame with columns including: precursor_id, mobility, spec_id,
        sage_precursor, processed_spec.
    """
    import os

    if ds_name is None:
        ds_name = os.path.basename(str(dataset.data_path))

    # One row per precursor: keep metadata from the first fragment row and
    # sum the raw spectra belonging to the same precursor.
    aggregation = {
        'frame_id': 'first',
        'time': 'first',
        'precursor_id': 'first',
        'raw_data': 'sum',
        'scan_begin': 'first',
        'scan_end': 'first',
        'isolation_mz': 'first',
        'isolation_width': 'first',
        'collision_energy': 'first',
        'largest_peak_mz': 'first',
        'average_mz': 'first',
        'monoisotopic_mz': 'first',
        'charge': 'first',
        'average_scan': 'first',
        'intensity': 'first',
        'parent_id': 'first',
    }
    frames = dataset.get_pasef_fragments(num_threads=num_threads)
    frames = frames.groupby('precursor_id').agg(aggregation)

    # Inverse ion mobility from the scan marginal of the summed raw data.
    # NOTE: must be set before building the sage precursors, which read it.
    frames['mobility'] = frames.apply(
        lambda r: r.raw_data.get_inverse_mobility_along_scan_marginal(),
        axis=1,
    )

    # Unique spectrum identifier: "<frame>-<precursor>-<dataset>".
    frames['spec_id'] = frames.apply(
        lambda r: str(r['frame_id']) + '-' + str(r['precursor_id']) + '-' + ds_name,
        axis=1,
    )

    frames['sage_precursor'] = frames.apply(
        lambda r: to_sage_precursor(
            r,
            isolation_window_lower=isolation_window_lower,
            isolation_window_upper=isolation_window_upper,
        ),
        axis=1,
    )

    # One shared processor instance for all spectra.
    processor = SpectrumProcessor(take_top_n=take_top_n)
    frames['processed_spec'] = frames.apply(
        lambda r: get_searchable_spec(
            precursor=r.sage_precursor,
            raw_fragment_data=r.raw_data,
            spec_processor=processor,
            spec_id=r.spec_id,
            time=r['time'],
        ),
        axis=1,
    )

    return frames
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def get_processed_spectra_for_search(
    dataset: TimsDatasetDDA,
    num_threads: int = 16,
    take_top_n: int = 150,
    isolation_window_lower: float = -3.0,
    isolation_window_upper: float = 3.0,
) -> List[ProcessedSpectrum]:
    """Return the dataset's spectra as a plain list of ProcessedSpectrum.

    Thin convenience wrapper around :func:`get_sage_processed_precursors`
    that discards everything except the processed spectra.

    Args:
        dataset: TimsDatasetDDA object.
        num_threads: Number of threads for extraction.
        take_top_n: Number of top peaks to keep.
        isolation_window_lower: Lower bound for isolation window (Da).
        isolation_window_upper: Upper bound for isolation window (Da).

    Returns:
        List of ProcessedSpectrum objects.
    """
    precursor_frame = get_sage_processed_precursors(
        dataset=dataset,
        num_threads=num_threads,
        take_top_n=take_top_n,
        isolation_window_lower=isolation_window_lower,
        isolation_window_upper=isolation_window_upper,
    )
    # Materialize the column as a plain Python list.
    return list(precursor_frame['processed_spec'])
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def search_timstof_dda(
    dataset: TimsDatasetDDA,
    scorer,
    indexed_db,
    num_threads: int = 16,
    take_top_n: int = 150,
    isolation_window_lower: float = -3.0,
    isolation_window_upper: float = 3.0,
):
    """Score a TimsDatasetDDA against an indexed database via sagepy.

    Extracts and preprocesses the dataset's spectra, then hands the whole
    collection to the scorer in one call.

    Args:
        dataset: TimsDatasetDDA object.
        scorer: sagepy Scorer object.
        indexed_db: Indexed database from sagepy.
        num_threads: Number of threads for extraction and search.
        take_top_n: Number of top peaks to keep.
        isolation_window_lower: Lower bound for isolation window (Da).
        isolation_window_upper: Upper bound for isolation window (Da).

    Returns:
        Dictionary of PSMs from scorer.score_collection_psm.
    """
    spectrum_collection = get_processed_spectra_for_search(
        dataset=dataset,
        num_threads=num_threads,
        take_top_n=take_top_n,
        isolation_window_lower=isolation_window_lower,
        isolation_window_upper=isolation_window_upper,
    )
    psms = scorer.score_collection_psm(
        db=indexed_db,
        spectrum_collection=spectrum_collection,
        num_threads=num_threads,
    )
    return psms
|