seer-pas-sdk 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
seer_pas_sdk/common/__init__.py CHANGED
@@ -99,7 +99,7 @@ def dict_to_df(data):


 # Most cases appear to be a .tsv file.
-def download_df(url, is_tsv=True, dtype={}):
+def download_df(url, is_tsv=True, dtype={}, usecols=None):
     """
     Fetches a TSV/CSV file from a URL and returns as a Pandas DataFrame.

@@ -114,6 +114,9 @@ def download_df(url, is_tsv=True, dtype={}):
     dtype : dict
         Data type conversion when intaking columns. e.g. {'a': str, 'b': np.float64}

+    usecols : list
+        Subset of columns to download. If not specified, downloads all columns.
+
     Returns
     -------
     pandas.core.frame.DataFrame
@@ -139,12 +142,10 @@ def download_df(url, is_tsv=True, dtype={}):

    if not url:
        return pd.DataFrame()
-   url_content = io.StringIO(requests.get(url).content.decode("utf-8"))
-   if is_tsv:
-       csv = pd.read_csv(url_content, sep="\t", dtype=dtype)
-   else:
-       csv = pd.read_csv(url_content, dtype=dtype)
-   return csv
+   csv = pd.read_csv(
+       url, sep="\t" if is_tsv else ",", usecols=usecols, engine="pyarrow"
+   )
+   return csv.astype(dtype=dtype) if dtype else csv


 def get_sample_info(
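
A minimal usage sketch of the reworked helper (the URL and column names below are placeholders, not values from this package):

    # pandas reads the URL directly with the pyarrow engine; dtype coercion
    # now happens after the read via DataFrame.astype
    df = download_df(
        "https://example.com/protein_np.tsv",  # placeholder URL
        is_tsv=True,
        dtype={"Protein Group": str},
        usecols=["Protein Group", "Intensity"],  # fetch only these columns
    )
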
seer_pas_sdk/core/sdk.py CHANGED
@@ -7,7 +7,6 @@ import requests
 import urllib.request
 import ssl

-
 from typing import List as _List, Tuple as _Tuple

 from ..common import *
@@ -15,8 +14,6 @@ from ..auth import Auth
 from ..objects.volcanoplot import VolcanoPlotBuilder
 from ..objects.headers import *

-import warnings
-

 class SeerSDK:
     """
@@ -1228,8 +1225,8 @@ class SeerSDK:

        >>> seer_sdk.get_msruns(sample_ids)
        >>> [
-               {"id": "SAMPLE_ID_1_HERE" ... },
-               {"id": "SAMPLE_ID_2_HERE" ... }
+               {"id": "MSRUN_ID_1_HERE" ... },
+               {"id": "MSRUN_ID_2_HERE" ... }
        ]

        >>> seer_sdk.get_msruns(sample_ids, as_df=True)
@@ -1295,8 +1292,8 @@ class SeerSDK:

        >>> seer_sdk.find_msruns(sample_ids)
        >>> [
-               {"id": "SAMPLE_ID_1_HERE" ... },
-               {"id": "SAMPLE_ID_2_HERE" ... }
+               {"id": "MSRUN_ID_1_HERE" ... },
+               {"id": "MSRUN_ID_2_HERE" ... }
        ]

        >>> seer_sdk.find_msruns(sample_ids, as_df=True)
@@ -1310,25 +1307,34 @@ class SeerSDK:
        URL = f"{self._auth.url}api/v1/msdatas/items"

        res = []
-       for sample_id in sample_ids:

-           with self._get_auth_session("findmsdatas") as s:
+       params = {"all": "true"}

-               msdatas = s.post(URL, json={"sampleId": sample_id})
+       with self._get_auth_session("findmsdatas") as s:

-               if msdatas.status_code != 200 or not msdatas.json()["data"]:
-                   raise ValueError(
-                       f"Failed to fetch MS data for sample ID={sample_id}."
-                   )
+           msdatas = s.post(
+               URL, json={"sampleId": ",".join(sample_ids)}, params=params
+           )

-               res += [x for x in msdatas.json()["data"]]
+           if msdatas.status_code != 200 or not msdatas.json()["data"]:
+               raise ValueError(
+                   f"Failed to fetch MS data for sample IDs={sample_ids}."
+               )
+
+           res += [x for x in msdatas.json()["data"]]

        spaces = {x["id"]: x["usergroup_name"] for x in self.get_spaces()}
+
+       def filepath_to_msrunid(filepath):
+           return os.path.basename(filepath).split(".")[0]
+
        for entry in res:
            if "tenant_id" in entry:
                del entry["tenant_id"]

            if "raw_file_path" in entry:
+               # Provide a human-readable MS run id
+               entry["Run"] = filepath_to_msrunid(entry["raw_file_path"])
                # Simple lambda function to find the third occurrence of '/' in the raw file path
                location = lambda s: len(s) - len(s.split("/", 3)[-1])
                # Slicing the string from the location
@@ -1339,6 +1345,13 @@ class SeerSDK:
            entry["space"] = spaces.get(entry["user_group"], "General")
            del entry["user_group"]

+           # Rename the key sample_id to sample_uuid
+           if "sample_id" in entry:
+               entry["sample_uuid"] = entry.pop("sample_id")
+           # Rename the key sample_id_tracking to sample_id
+           if "sample_id_tracking" in entry:
+               entry["sample_id"] = entry.pop("sample_id_tracking")
+
        if not res and as_df:
            return pd.DataFrame(columns=MSRUN_COLUMNS)
        return res if not as_df else dict_to_df(res)
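
A quick sketch of the new call pattern (sample IDs are placeholders): all IDs now go out in a single POST instead of one request per sample, and each returned entry carries the renamed keys:

    msruns = seer_sdk.find_msruns(sample_ids=["SAMPLE_UUID_1", "SAMPLE_UUID_2"])
    for run in msruns:
        # "Run" is the raw file basename without extension (added when the
        # entry has raw_file_path); sample_uuid holds the former sample_id
        # value, and sample_id the former sample_id_tracking value
        print(run["Run"], run["sample_uuid"], run["sample_id"])
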
@@ -1853,7 +1866,7 @@ class SeerSDK:
                    )
                )
            except Exception as e:
-               print("Warning: Could not fetch fasta files.")
+               print("Error: Could not fetch fasta files.")
                res["fasta"] = None
        else:
            res["fasta"] = None
@@ -2066,7 +2079,7 @@ class SeerSDK:
                    )
            except:
                print(
-                   f"Warning: Could not fetch fasta files for analysis {res[entry].get('analysis_name')}."
+                   f"Error: Could not fetch fasta files for analysis {res[entry].get('analysis_name')}."
                )
            else:
                res[entry]["fasta"] = None
@@ -2382,7 +2395,11 @@ class SeerSDK:
        return files

    def get_search_result(
-       self, analysis_id: str, analyte_type: str, rollup: str
+       self,
+       analysis_id: str,
+       analyte_type: str,
+       rollup: str,
+       columns: _List[str] = None,
    ):
        """
        Load one of the files available via the "Download result files" button on the PAS UI.
@@ -2423,6 +2440,7 @@ class SeerSDK:
                        "npLink"
                    ]["url"],
                    dtype=dtype,
+                   usecols=columns,
                )
            elif rollup == "panel":
                return download_df(
@@ -2430,6 +2448,7 @@ class SeerSDK:
                        "panelLink"
                    ]["url"],
                    dtype=dtype,
+                   usecols=columns,
                )
        elif analyte_type == "peptide":
            if rollup == "np":
@@ -2438,6 +2457,7 @@ class SeerSDK:
                        "npLink"
                    ]["url"],
                    dtype=dtype,
+                   usecols=columns,
                )
            elif rollup == "panel":
                return download_df(
@@ -2445,12 +2465,14 @@ class SeerSDK:
                        "panelLink"
                    ]["url"],
                    dtype=dtype,
+                   usecols=columns,
                )
        else:
            return download_df(
                self.get_search_result_file_url(
                    analysis_id, filename="report.tsv"
-               )["url"]
+               )["url"],
+               usecols=columns,
            )

    def download_search_output_file(
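
A hedged usage sketch (the analysis id is a placeholder; the column names follow the DIA-NN report headers referenced elsewhere in this diff):

    # pull only a subset of report.tsv columns instead of the full file
    report = seer_sdk.get_search_result(
        analysis_id="ANALYSIS_ID_HERE",
        analyte_type="precursor",
        rollup="np",
        columns=["Run", "Protein.Group", "Precursor.Quantity"],
    )
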
seer_pas_sdk/core/unsupported.py CHANGED
@@ -1471,37 +1471,70 @@ class _UnsupportedSDK(_SeerSDK):
        Get analyte intensities data for a given PAS analysis.
        Args:
            analysis_id (str): ID of the analysis.
-           analyte_type (str): Type of the analyte. Must be either 'protein', 'peptide', precursor.
+           analyte_type (str): Type of the analyte. Must be either 'protein', 'peptide', 'precursor'.
            rollup (str): Intensities rollup method. Must be either 'np' or 'panel'.
-           norm_method (str): Search engine. Supported engines are: raw, engine, median, median80, pepcal. Default is 'pepcal'.
+           norm_method (str): Search engine. Supported engines are: raw, engine, median, median80, pepcal, pepcal_batch. Default is 'pepcal'.

        Returns:
            pd.DataFrame: A dataframe with each row containing the analyte intensity measurement:
            'msrun_id', 'sample_id', 'nanoparticle' (if rollup is 'np'), 'protein_group', 'peptide' (for 'peptide' and 'precursor' analyte types), 'charge' (for 'precursor' analyte type),
            'intensity_log10', 'protein_group_q_value', 'q_value' (for 'precursor' analyte type), 'rt' and 'irt' (for 'peptide' and 'precursor' analyte types)
        """
-       # 1. Get msrun data for analysis
+
+       def filepath_to_msrunid(filepath):
+           return os.path.basename(filepath).split(".")[0]
+
+       # 1. Get samples and msrun data for analysis
        samples = self.find_samples(analysis_id=analysis_id)
-       sample_name_to_id = {s["sample_name"]: s["id"] for s in samples}
+
        sample_uuid_to_id = {s["id"]: s["sample_id"] for s in samples}
-       # for np rollup, a row represents an msrun
-       msruns = self.find_msruns(sample_ids=sample_name_to_id.values())
-       file_to_msrun = {
-           os.path.basename(msrun["raw_file_path"]).split(".")[0]: msrun
-           for msrun in msruns
-       }
-       sample_to_msrun = {msrun["sample_id"]: msrun for msrun in msruns}
+       sample_id_to_uuid = {s["sample_id"]: s["id"] for s in samples}
+       # FIXME sample_name is not guaranteed to be unique (within PAS analysis)
+       sample_name_to_uuid = {s["sample_name"]: s["id"] for s in samples}

-       # for panel rollup, a row represents a sample
+       msruns = self.find_msruns(sample_ids=[s["id"] for s in samples])
+       msrunid_to_info = {msrun["Run"]: msrun for msrun in msruns}

        # 2. Get search results
-       # pull the np/panel file, or report.tsv for precursor mode
+       # pull the np/panel file, or the relevant columns from the report.tsv for precursor mode
+       columns = None
+       if analyte_type == "precursor" and rollup == "np":
+           columnsExperiment = ["Run"]
+           columnsProtein = [
+               "Protein.Group",
+           ]
+           columnsPeptide = [
+               "Stripped.Sequence",
+           ]
+           columnsPrecursor = [
+               "Precursor.Id",
+               "Precursor.Charge",
+               "Precursor.Quantity",
+               "RT",
+               "iRT",
+               "IM",
+               "iIM",
+           ]
+           columnsQValue = [
+               "Q.Value",
+               "Protein.Q.Value",
+           ]
+           columns = [
+               *columnsExperiment,
+               *columnsProtein,
+               *columnsPeptide,
+               *columnsPrecursor,
+               *columnsQValue,
+           ]
        search_results = self.get_search_result(
            analysis_id=analysis_id,
            analyte_type=analyte_type,
            rollup=rollup,
+           columns=columns,
        )
+
        if analyte_type in ["protein", "peptide"]:
+           # set the intensity column based on norm_method and PAS analysis protocol version
            intensity_column = None
            if norm_method == "raw":
                intensity_column = (
@@ -1543,139 +1576,171 @@ class _UnsupportedSDK(_SeerSDK):
                    raise ValueError(
                        "Pepcal normalized intensities not found in search results. This is only available with analyses processed with DIA-NN Seer Protocol v2.0 or later with the Seer Peptide Calibrant option enabled. \n Please retry using different norm_method, such as 'median'"
                    )
-
                intensity_column = "PepCal Intensities Log10"
-
+           elif norm_method == "pepcal_batch":
+               if not (
+                   "PepCal Batch Intensities Log10" in search_results.columns
+               ):
+                   raise ValueError(
+                       "Pepcal normalized batch corrected intensities not found in search results. This is only available with analyses processed with DIA-NN Seer Protocol v2.0 or later with the Seer Peptide Calibrant option enabled. \n Please retry using different norm_method, such as 'median'"
+                   )
+               intensity_column = "PepCal Batch Intensities Log10"
            else:
                raise ValueError(
                    f"norm_method = {norm_method} is not supported. Supported normalization methods are: raw, pepcal, engine, median, median80."
                )
-           if rollup == "panel":
-               search_results.fillna({"Sample Name": ""}, inplace=True)
-               search_results["File Name"] = search_results[
-                   "Sample Name"
-               ].apply(
-                   lambda x: (
-                       os.path.basename(
-                           sample_to_msrun[sample_name_to_id[x]][
-                               "raw_file_path"
-                           ]
-                       ).split(".")[0]
-                       if x
-                       else None
-                   )
-               )
-               search_results["File Name"] = search_results["File Name"].apply(
-                   lambda x: os.path.basename(x).split(".")[0] if x else None
-               )

            search_results["Intensity Log10"] = search_results[
                intensity_column
            ]

-           # 3. Merge report to search results to get Q value and other properties
-           report = self.get_search_result(
-               analysis_id=analysis_id,
-               analyte_type="precursor",
-               rollup="np",
-           )
-           report["File Name"] = report["Run"]
-           report["Protein Group"] = report["Protein.Group"]
-
-           if analyte_type == "protein":
-               report["Protein Q Value"] = report["Protein.Q.Value"]
-
-               report = report[
-                   ["File Name", "Protein Group", "Protein Q Value"]
-               ]
-               report.drop_duplicates(
-                   subset=["File Name", "Protein Group"], inplace=True
+           if rollup == "panel":
+               search_results.rename(
+                   columns={"Sample ID": "Sample UUID"}, inplace=True
                )
-               df = pd.merge(
-                   search_results,
-                   report,
-                   on=["File Name", "Protein Group"],
-                   how="left",
+               search_results["Sample UUID"] = search_results[
+                   "Sample Name"
+               ].map(sample_name_to_uuid)
+               search_results["Sample ID"] = search_results[
+                   "Sample UUID"
+               ].map(sample_uuid_to_id)
+               experiment_columns = ["Sample UUID", "Sample ID"]
+
+               # analyte info is limited to the id in the panel rollup
+               if analyte_type == "protein":
+                   analyte_id_column = "Protein Group"
+               else:
+                   analyte_id_column = "Peptide"
+
+               analyte_columns = [analyte_id_column]
+               df = search_results
+           else:
+               # np rollup, extract basename without extension
+               path_to_msrunid = {
+                   path: filepath_to_msrunid(path)
+                   for path in search_results["File Name"].unique()
+               }
+               # strip path from the filename to allow merging with the precursor report
+               search_results["Run"] = search_results["File Name"].map(
+                   path_to_msrunid
                )
-               included_columns = [
-                   "MsRun ID",
-                   "Sample ID",
-                   "Protein Group",
-                   "Intensity Log10",
-                   "Protein Q Value",
-               ]

-           else:
-               report["Peptide"] = report["Stripped.Sequence"]
-               # If analyte_type is peptide, attach retention time (RT, iRT)
-               report = report[["File Name", "Peptide", "RT", "iRT"]]
-               report.drop_duplicates(
-                   subset=["File Name", "Peptide"], inplace=True
+               search_results["MsRun UUID"] = search_results["Run"].map(
+                   {k: v["id"] for k, v in msrunid_to_info.items()}
                )
-               df = pd.merge(
-                   search_results,
-                   report,
-                   on=["File Name", "Peptide"],
-                   how="left",
+               search_results["Sample ID"] = search_results["Run"].map(
+                   {k: v["sample_id"] for k, v in msrunid_to_info.items()}
+               )
+               search_results["Sample UUID"] = search_results["Run"].map(
+                   {k: v["sample_uuid"] for k, v in msrunid_to_info.items()}
                )
-               included_columns = [
-                   "MsRun ID",
+               search_results["Nanoparticle"] = search_results["Run"].map(
+                   {k: v["nanoparticle"] for k, v in msrunid_to_info.items()}
+               )
+               experiment_columns = [
+                   "MsRun UUID",
+                   "Run",
+                   "Nanoparticle",
+                   "Sample UUID",
                    "Sample ID",
-                   "Peptide",
-                   "Protein Group",
-                   "Intensity Log10",
-                   "RT",
-                   "iRT",
                ]
-               # endif

-           if rollup == "np":
-               included_columns.insert(
-                   included_columns.index("Sample ID") + 1, "Nanoparticle"
+               # Merge report to search results to get Q value and other properties
+               if analyte_type == "protein":
+                   columns = ["Run", "Protein.Group", "Protein.Q.Value"]
+               elif analyte_type == "peptide":
+                   columns = ["Run", "Stripped.Sequence", "Protein.Q.Value"]
+               analytes = self.get_search_result(
+                   analysis_id=analysis_id,
+                   analyte_type="precursor",
+                   rollup="np",
+                   columns=columns,
+               )
+               # pandas Dataframe.rename() default behavior is to ignore the columns that do not exist in the data frame.
+               analytes.rename(
+                   columns={
+                       "Protein.Group": "Protein Group",
+                       "Protein.Q.Value": "Protein Q Value",
+                       "Stripped.Sequence": "Peptide",
+                   },
+                   inplace=True,
                )

-           df["MsRun ID"] = df["File Name"].apply(
-               lambda x: (
-                   file_to_msrun[x]["id"] if x in file_to_msrun else None
+               if analyte_type == "protein":
+                   analyte_id_column = "Protein Group"
+                   analyte_columns = [
+                       analyte_id_column,
+                       "Protein Q Value",
+                   ]
+
+               else:
+                   analyte_id_column = "Peptide"
+                   analyte_columns = [analyte_id_column]
+               # endif analyte_type
+
+               analytes.drop(
+                   columns=[
+                       col
+                       for col in analytes.columns
+                       if col != "Run" and col not in analyte_columns
+                   ],
+                   inplace=True,
                )
-           )
-           df["Sample ID"] = df["File Name"].apply(
-               lambda x: (
-                   file_to_msrun[x]["sample_id"]
-                   if x in file_to_msrun
-                   else None
+               analytes.drop_duplicates(
+                   subset=["Run", analyte_id_column], inplace=True
                )
-           )
-           df = df[included_columns]
+               df = pd.merge(
+                   search_results,
+                   analytes,
+                   on=["Run", analyte_id_column],
+                   how="left",
+                   validate="one_to_one",
+               )
+
+               df = df[experiment_columns + analyte_columns + ["Intensity Log10"]]

        else:
            # precursor
            # working only in report.tsv
-           search_results["Intensity"] = search_results["Precursor.Quantity"]
-           search_results["MsRun ID"] = search_results["Run"].apply(
-               lambda x: (
-                   file_to_msrun[x]["id"] if x in file_to_msrun else None
+           if norm_method != "raw":
+               raise ValueError(
+                   "For precursor analyte type, only 'raw' norm_method is supported."
                )
+
+           search_results["MsRun UUID"] = search_results["Run"].map(
+               {k: v["id"] for k, v in msrunid_to_info.items()}
            )
-           search_results["Sample ID"] = search_results["Run"].apply(
-               lambda x: (
-                   file_to_msrun[x]["sample_id"]
-                   if x in file_to_msrun
-                   else None
-               )
+           search_results["Sample ID"] = search_results["Run"].map(
+               {k: v["sample_id"] for k, v in msrunid_to_info.items()}
            )
-           search_results["Protein Group"] = search_results["Protein.Group"]
-           search_results["Peptide"] = search_results["Stripped.Sequence"]
-           search_results["Charge"] = search_results["Precursor.Charge"]
-           search_results["Precursor Id"] = search_results["Precursor.Id"]
-           search_results["Precursor Q Value"] = search_results["Q.Value"]
-           search_results["Protein Q Value"] = search_results[
-               "Protein.Q.Value"
+           search_results["Sample UUID"] = search_results["Sample ID"].map(
+               sample_id_to_uuid
+           )
+           search_results["Nanoparticle"] = search_results["Run"].map(
+               {k: v["nanoparticle"] for k, v in msrunid_to_info.items()}
+           )
+           experiment_columns = [
+               "MsRun UUID",
+               "Run",
+               "Nanoparticle",
+               "Sample UUID",
+               "Sample ID",
            ]

-           included_columns = [
-               "MsRun ID",
-               "Sample ID",
+           search_results.rename(
+               columns={
+                   "Protein.Group": "Protein Group",
+                   "Stripped.Sequence": "Peptide",
+                   "Precursor.Charge": "Charge",
+                   "Precursor.Id": "Precursor Id",
+                   "Q.Value": "Precursor Q Value",
+                   "Protein.Q.Value": "Protein Q Value",
+                   "Precursor.Quantity": "Intensity",
+               },
+               inplace=True,
+           )
+
+           analyte_columns = [
                "Protein Group",
                "Protein Q Value",
                "Peptide",
@@ -1688,16 +1753,12 @@ class _UnsupportedSDK(_SeerSDK):
                "IM",
                "iIM",
            ]
-           df = pd.DataFrame(search_results[included_columns])
+           df = pd.DataFrame(
+               search_results[experiment_columns + analyte_columns]
+           )

            df.columns = [title_case_to_snake_case(x) for x in df.columns]
-           df["sample_uuid"] = df["sample_id"]
-           df["sample_id"] = df["sample_uuid"].apply(
-               lambda x: sample_uuid_to_id.get(x)
-           )

-           if rollup == "panel":
-               df.drop(columns=["msrun_id"], inplace=True, errors="ignore")
        return df

    def get_search_data_analytes(self, analysis_id: str, analyte_type: str):
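
The refactor above replaces per-row apply lambdas with dictionary-backed Series.map lookups; a minimal standalone sketch of the idiom (toy data, not from this package):

    import pandas as pd

    # toy stand-in for the msrun records keyed by run id
    msrunid_to_info = {"run01": {"id": "uuid-a", "sample_id": "S1"}}
    df = pd.DataFrame({"Run": ["run01", "run02"]})
    # map each Run to one field of its record; unmatched runs become NaN
    df["MsRun UUID"] = df["Run"].map(
        {k: v["id"] for k, v in msrunid_to_info.items()}
    )
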
@@ -1714,10 +1775,6 @@ class _UnsupportedSDK(_SeerSDK):
            analysis_id=analysis_id, analyte_type="protein", rollup="np"
        )

-       report_results = self.get_search_result(
-           analysis_id=analysis_id, analyte_type="precursor", rollup="np"
-       )
-
        search_results = search_results[
            [
                "Protein Group",
@@ -1729,18 +1786,87 @@ class _UnsupportedSDK(_SeerSDK):
            ]
        ]
        search_results.drop_duplicates(subset=["Protein Group"], inplace=True)
-       report_results["Protein Group"] = report_results["Protein.Group"]
-       report_results["Peptide"] = report_results["Stripped.Sequence"]

-       if analyte_type == "protein":
-           report_results = report_results[
+       # 2. fetch precursor report to extract analyte-specific details
+       columnsPG = [
+           "Protein.Group",
+       ]
+       columnsPeptide = [
+           "Protein.Ids",
+           "Stripped.Sequence",
+           "Proteotypic",
+       ]
+       columnsPrecursor = [
+           "Precursor.Id",
+           "Precursor.Charge",
+           "Precursor.Quantity",
+           "Modified.Sequence",
+       ]
+       columnsPGQValue = [
+           "Global.PG.Q.Value",
+           "Lib.PG.Q.Value",
+       ]
+       columnsPrecursorQValue = [
+           "Global.Q.Value",
+           "Lib.Q.Value",
+       ]
+       columns = [
+           *columnsPG,
+           *columnsPGQValue,
+       ]
+       if analyte_type == "peptide":
+           columns += [*columnsPeptide]
+       elif analyte_type == "precursor":
+           columns += [
+               *columnsPeptide,
+               *columnsPrecursor,
+               *columnsPrecursorQValue,
+           ]
+       report_results = self.get_search_result(
+           analysis_id=analysis_id,
+           analyte_type="precursor",
+           rollup="np",
+           columns=columns,
+       )
+       report_results.rename(
+           columns={
+               "Protein.Group": "Protein Group",
+               "Stripped.Sequence": "Peptide",
+               "Modified.Sequence": "Modified.Peptide",
+           },
+           inplace=True,
+       )
+
+       # function to fix the potential bug, where different precursors
+       # of the same peptide map to different protein groups
+       def fix_peptide_to_protein_group_assignment(
+           df: pd.DataFrame,
+       ) -> pd.DataFrame:
+           # for each peptide, sort protein groups by confidence
+           df = df.sort_values(
                [
-                   "Protein Group",
-                   "Protein.Ids",
+                   "Peptide",
                    "Global.PG.Q.Value",
                    "Lib.PG.Q.Value",
+                   "Protein Group",
                ]
-           ]
+           )
+
+           # broadcast the best protein group across all rows with the same peptide
+           # to fix the potential bug, where different precursors of the same peptide
+           # map to different protein groups
+           for col in [
+               "Protein Group",
+               "Protein.Ids",
+               "Protein.Names",
+               "Genes",
+           ]:
+               if col in df.columns:
+                   df[col] = df.groupby("Peptide")[col].transform("first")
+
+           return df
+
+       if analyte_type == "protein":
            report_results.drop_duplicates(
                subset=["Protein Group"], inplace=True
            )
@@ -1751,41 +1877,18 @@ class _UnsupportedSDK(_SeerSDK):
                how="left",
            )
        elif analyte_type == "peptide":
-
-           # The below logic performs the following:
-           # 1. orders each peptide group by Global.PG.Q.Value, Lib.PG.Q.Value, and Protein Group (ascending)
-           # 2. for each peptide group, select the first row to find the precursor with the lowest Q values
-           # 3. broadcasts the associated protein group columns across all rows with the same peptide.
-           #
-           # This ensures that for each peptide, we retain consistent protein information while avoiding duplication.
-
-           report_results = report_results.sort_values(
+           search_results = search_results[
                [
-                   "Peptide",
-                   "Global.PG.Q.Value",
-                   "Lib.PG.Q.Value",
                    "Protein Group",
+                   "Protein Names",
+                   "Gene Names",
                ]
+           ]
+           report_results.drop_duplicates(inplace=True)
+           report_results = fix_peptide_to_protein_group_assignment(
+               report_results
            )
-
-           columns_to_broadcast = ["Protein Group", "Protein.Ids"]
-           broadcasted = (
-               report_results.groupby("Peptide")
-               .apply(
-                   lambda x: pd.Series(
-                       {
-                           col: x.iloc[0][col]
-                           for col in columns_to_broadcast + ["Peptide"]
-                       }
-                   )
-               )
-               .reset_index(drop=True)
-           )
-           report_results = (
-               report_results.drop(columns=columns_to_broadcast)
-               .merge(broadcasted, on="Peptide", how="left")
-               .drop_duplicates(subset=["Peptide"])
-           )
+           report_results.drop_duplicates(subset=["Peptide"], inplace=True)

            df = pd.merge(
                report_results,
@@ -1793,15 +1896,6 @@ class _UnsupportedSDK(_SeerSDK):
                on=["Protein Group"],
                how="left",
            )
-           df = df[
-               [
-                   "Peptide",
-                   "Protein Group",
-                   "Protein.Ids",
-                   "Protein Names",
-                   "Gene Names",
-               ]
-           ]
        else:
            # precursor
            search_results = search_results[
@@ -1811,91 +1905,23 @@ class _UnsupportedSDK(_SeerSDK):
                    "Gene Names",
                ]
            ]
-           search_results.drop_duplicates(
-               subset=["Protein Group"], inplace=True
-           )
-           report_results = report_results[
-               [
-                   "Precursor.Id",
-                   "Precursor.Charge",
-                   "Peptide",
-                   "Protein Group",
-                   "Protein.Ids",
-                   "Protein.Names",
-                   "Genes",
-                   "Modified.Sequence",
-                   "Proteotypic",
-                   "Global.Q.Value",
-                   "Global.PG.Q.Value",
-                   "Lib.Q.Value",
-                   "Lib.PG.Q.Value",
-               ]
-           ]
+           report_results.drop_duplicates(inplace=True)

-           # The below logic performs the following:
-           # 1. orders each peptide group by Global.PG.Q.Value, Lib.PG.Q.Value, and Protein Group (ascending)
-           # 2. for each peptide group, select the first row to find the precursor with the lowest Q values
-           # 3. broadcasts the associated protein group columns across all rows with the same peptide.
-           #
-           # This ensures that for each peptide, we retain consistent protein information while avoiding duplication.
-           columns_to_broadcast = [
-               "Protein Group",
-               "Protein.Ids",
-               "Protein.Names",
-               "Genes",
-           ]
-           report_results = report_results.sort_values(
-               [
-                   "Peptide",
-                   "Global.PG.Q.Value",
-                   "Lib.PG.Q.Value",
-                   "Protein Group",
-               ],
-           )
-           broadcasted = (
-               report_results.groupby("Peptide")
-               .apply(
-                   lambda x: pd.Series(
-                       {
-                           col: x.iloc[0][col]
-                           for col in columns_to_broadcast + ["Peptide"]
-                       }
-                   )
-               )
-               .reset_index(drop=True)
+           report_results = fix_peptide_to_protein_group_assignment(
+               report_results
            )
-           report_results = (
-               report_results.drop(columns=columns_to_broadcast)
-               .merge(broadcasted, on="Peptide", how="left")
-               .drop_duplicates(subset=["Peptide", "Precursor.Charge"])
+           report_results.drop_duplicates(
+               subset=["Peptide", "Modified.Peptide", "Precursor.Charge"],
+               inplace=True,
            )
+
            df = pd.merge(
                report_results,
                search_results,
                on=["Protein Group"],
                how="left",
            )
-           df = df[
-               [
-                   "Precursor.Id",
-                   "Precursor.Charge",
-                   "Peptide",
-                   "Protein Group",
-                   "Protein.Ids",
-                   "Protein.Names",
-                   "Genes",
-                   "Modified.Sequence",
-                   "Proteotypic",
-                   "Global.Q.Value",
-                   "Global.PG.Q.Value",
-                   "Lib.Q.Value",
-                   "Lib.PG.Q.Value",
-                   "Gene Names",
-               ]
-           ]
-           df.rename(
-               columns={"Modified.Sequence": "Modified.Peptide"}, inplace=True
-           )
        # endif
        df.columns = [title_case_to_snake_case(x) for x in df.columns]
+
        return df
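
The broadcast step inside fix_peptide_to_protein_group_assignment can be checked in isolation; a toy sketch of the sort-then-transform("first") idiom (fabricated rows, for illustration only):

    import pandas as pd

    df = pd.DataFrame({
        "Peptide": ["PEPTIDEK", "PEPTIDEK"],
        "Protein Group": ["PG2", "PG1"],
        "Global.PG.Q.Value": [0.010, 0.001],
        "Lib.PG.Q.Value": [0.010, 0.001],
    })
    # sort so the most confident protein group comes first within each peptide,
    df = df.sort_values(
        ["Peptide", "Global.PG.Q.Value", "Lib.PG.Q.Value", "Protein Group"]
    )
    # then broadcast that first value to every row of the same peptide
    df["Protein Group"] = df.groupby("Peptide")["Protein Group"].transform("first")
    # both rows now read "PG1"
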
seer_pas_sdk-1.2.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: seer-pas-sdk
-Version: 1.2.0
+Version: 1.2.2
 Summary: SDK for Seer Proteograph Analysis Suite (PAS)
 Author-email: Ryan Sun <rsun@seer.bio>
 License:
@@ -194,9 +194,10 @@ License-File: LICENSE.txt
 Requires-Dist: boto3>=1.26.152
 Requires-Dist: botocore>=1.29.152
 Requires-Dist: pandas>=2.0.1
+Requires-Dist: pyarrow>=17.0.0
 Requires-Dist: PyJWT>=2.8.0
 Requires-Dist: python-dotenv>=1.0.0
-Requires-Dist: Requests>=2.31.0
+Requires-Dist: requests>=2.31.0
 Requires-Dist: tqdm>=4.65.0
 Requires-Dist: deprecation
 Dynamic: license-file
seer_pas_sdk-1.2.2.dist-info/RECORD CHANGED
@@ -1,19 +1,19 @@
 seer_pas_sdk/__init__.py,sha256=Ie6atdmdBV-OmdHHXjhrGhdFGXiyP3JKhKrr3hyvSsA,563
 seer_pas_sdk/auth/__init__.py,sha256=e_eM4jJnnyKUdg4Nggzi9ypt2MLWcEJ8CmCPkUaQDSs,23
 seer_pas_sdk/auth/auth.py,sha256=_SI5CdEkfqfr4o5BQ79BuPbxGeI9p7tqxJd7mUqSAkI,8854
-seer_pas_sdk/common/__init__.py,sha256=LLfkbsZMXXty_T8xkOAws_WWBpbfwWZAdkNTduS8Abc,24443
+seer_pas_sdk/common/__init__.py,sha256=WrRwmSONUdFD0ysT5jHwG2zWDd-v2wverjXY7BWZhHU,24488
 seer_pas_sdk/common/errors.py,sha256=4HFORWnaQQCMXRE8kwdsJWvQRB_3KFEZ7yMb391e4gA,142
 seer_pas_sdk/common/groupanalysis.py,sha256=DxB-gbQfYzl7p9MTYWDIqghcH-IeakzdYdrRZrlIHek,1730
 seer_pas_sdk/core/__init__.py,sha256=rxbKgg-Qe24OaxX2zyHHYPYgDCTEKE_-41bB2wvpvL4,25
-seer_pas_sdk/core/sdk.py,sha256=yDml92xhZtWR54-MgWG3rYVVlcaaAl2i6EzlWgbit8Q,160705
-seer_pas_sdk/core/unsupported.py,sha256=SpxKQx_SN0o7SEBGXko_vmTQVxYDAvXEQGH2VWTK63M,71915
+seer_pas_sdk/core/sdk.py,sha256=0ukg287lsjlSNoV0WqFbiPMURhVogsy_sTR7gg1fr9Q,161512
+seer_pas_sdk/core/unsupported.py,sha256=WcF_Z6ZUpzOWkWQHaMtm9SnE2NveuRmljVfNe8QSbms,72732
 seer_pas_sdk/objects/__init__.py,sha256=r-lY7axLTzToAI-Dme019YfcJLDe2ok1f_e6OQx3j64,130
 seer_pas_sdk/objects/groupanalysis.py,sha256=x3D_5NmYBoPDilNCQqUoCFARIfIeUq4FBY3_N6u8tfM,994
 seer_pas_sdk/objects/headers.py,sha256=RilNzB_Nhid3U8j93BxJYcRrgDmd_1bAuI0P465xd0g,2727
 seer_pas_sdk/objects/platemap.py,sha256=8IvJPAecs_e_FyqibzhCw-O4zjCFnf-zMUp_5krTEsg,5864
 seer_pas_sdk/objects/volcanoplot.py,sha256=lTrTOVg74nT3uo-P1edQJC1ZbdoiLMtQ3VJd9CnzmoM,9396
-seer_pas_sdk-1.2.0.dist-info/licenses/LICENSE.txt,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
-seer_pas_sdk-1.2.0.dist-info/METADATA,sha256=6mQ4VXcrUHCVfn3PL0pD2j_u7yJNMmw1HRqtd_lATDg,13413
-seer_pas_sdk-1.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-seer_pas_sdk-1.2.0.dist-info/top_level.txt,sha256=-2kZ-KFMGtXwr8H1O5llMKlcJ8gRKohEmrIvazXB61s,13
-seer_pas_sdk-1.2.0.dist-info/RECORD,,
+seer_pas_sdk-1.2.2.dist-info/licenses/LICENSE.txt,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
+seer_pas_sdk-1.2.2.dist-info/METADATA,sha256=Lw-pb90n0vo7K4I-2wnQm_LvsrVzXpEYmj8t4vwToAA,13444
+seer_pas_sdk-1.2.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+seer_pas_sdk-1.2.2.dist-info/top_level.txt,sha256=-2kZ-KFMGtXwr8H1O5llMKlcJ8gRKohEmrIvazXB61s,13
+seer_pas_sdk-1.2.2.dist-info/RECORD,,
seer_pas_sdk-1.2.2.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any