seer-pas-sdk 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
seer_pas_sdk/common/__init__.py CHANGED
@@ -99,7 +99,7 @@ def dict_to_df(data):


 # Most cases appear to be a .tsv file.
-def download_df(url, is_tsv=True, dtype={}):
+def download_df(url, is_tsv=True, dtype={}, usecols=None):
     """
     Fetches a TSV/CSV file from a URL and returns as a Pandas DataFrame.

@@ -114,6 +114,9 @@ def download_df(url, is_tsv=True, dtype={}):
     dtype : dict
         Data type conversion when intaking columns. e.g. {'a': str, 'b': np.float64}

+    usecols : list
+        Subset of columns to download. If not specified, downloads all columns.
+
     Returns
     -------
     pandas.core.frame.DataFrame
@@ -139,12 +142,10 @@ def download_df(url, is_tsv=True, dtype={}):

    if not url:
        return pd.DataFrame()
-   url_content = io.StringIO(requests.get(url).content.decode("utf-8"))
-   if is_tsv:
-       csv = pd.read_csv(url_content, sep="\t", dtype=dtype)
-   else:
-       csv = pd.read_csv(url_content, dtype=dtype)
-   return csv
+   csv = pd.read_csv(
+       url, sep="\t" if is_tsv else ",", usecols=usecols, engine="pyarrow"
+   )
+   return csv.astype(dtype=dtype) if dtype else csv


 def get_sample_info(
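
A minimal usage sketch of the reworked helper (the URL and column names below are placeholders, not values from this package):

    # pandas reads the URL directly with the pyarrow engine; dtype coercion
    # now happens after the read via DataFrame.astype
    df = download_df(
        "https://example.com/protein_np.tsv",  # placeholder URL
        is_tsv=True,
        dtype={"Protein Group": str},
        usecols=["Protein Group", "Intensity"],  # fetch only these columns
    )
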
seer_pas_sdk/core/sdk.py CHANGED
@@ -7,7 +7,6 @@ import requests
 import urllib.request
 import ssl

-
 from typing import List as _List, Tuple as _Tuple

 from ..common import *
@@ -15,8 +14,6 @@ from ..auth import Auth
 from ..objects.volcanoplot import VolcanoPlotBuilder
 from ..objects.headers import *

-import warnings
-

 class SeerSDK:
     """
@@ -1228,8 +1225,8 @@ class SeerSDK:

        >>> seer_sdk.get_msruns(sample_ids)
        >>> [
-               {"id": "SAMPLE_ID_1_HERE" ... },
-               {"id": "SAMPLE_ID_2_HERE" ... }
+               {"id": "MSRUN_ID_1_HERE" ... },
+               {"id": "MSRUN_ID_2_HERE" ... }
        ]

        >>> seer_sdk.get_msruns(sample_ids, as_df=True)
@@ -1295,8 +1292,8 @@ class SeerSDK:

        >>> seer_sdk.find_msruns(sample_ids)
        >>> [
-               {"id": "SAMPLE_ID_1_HERE" ... },
-               {"id": "SAMPLE_ID_2_HERE" ... }
+               {"id": "MSRUN_ID_1_HERE" ... },
+               {"id": "MSRUN_ID_2_HERE" ... }
        ]

        >>> seer_sdk.find_msruns(sample_ids, as_df=True)
@@ -1310,25 +1307,34 @@ class SeerSDK:
        URL = f"{self._auth.url}api/v1/msdatas/items"

        res = []
-       for sample_id in sample_ids:

-           with self._get_auth_session("findmsdatas") as s:
+       params = {"all": "true"}

-               msdatas = s.post(URL, json={"sampleId": sample_id})
+       with self._get_auth_session("findmsdatas") as s:

-               if msdatas.status_code != 200 or not msdatas.json()["data"]:
-                   raise ValueError(
-                       f"Failed to fetch MS data for sample ID={sample_id}."
-                   )
+           msdatas = s.post(
+               URL, json={"sampleId": ",".join(sample_ids)}, params=params
+           )

-               res += [x for x in msdatas.json()["data"]]
+           if msdatas.status_code != 200 or not msdatas.json()["data"]:
+               raise ValueError(
+                   f"Failed to fetch MS data for sample IDs={sample_ids}."
+               )
+
+           res += [x for x in msdatas.json()["data"]]

        spaces = {x["id"]: x["usergroup_name"] for x in self.get_spaces()}
+
+       def filepath_to_msrunid(filepath):
+           return os.path.basename(filepath).split(".")[0]
+
        for entry in res:
            if "tenant_id" in entry:
                del entry["tenant_id"]

            if "raw_file_path" in entry:
+               # Provide a human-readable MS run id
+               entry["Run"] = filepath_to_msrunid(entry["raw_file_path"])
                # Simple lambda function to find the third occurrence of '/' in the raw file path
                location = lambda s: len(s) - len(s.split("/", 3)[-1])
                # Slicing the string from the location
@@ -1339,6 +1345,13 @@ class SeerSDK:
            entry["space"] = spaces.get(entry["user_group"], "General")
            del entry["user_group"]

+           # Rename the key sample_id to sample_uuid
+           if "sample_id" in entry:
+               entry["sample_uuid"] = entry.pop("sample_id")
+           # Rename the key sample_id_tracking to sample_id
+           if "sample_id_tracking" in entry:
+               entry["sample_id"] = entry.pop("sample_id_tracking")
+
        if not res and as_df:
            return pd.DataFrame(columns=MSRUN_COLUMNS)
        return res if not as_df else dict_to_df(res)
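
A quick sketch of the new call pattern (sample IDs are placeholders): all IDs now go out in a single POST instead of one request per sample, and each returned entry carries the renamed keys:

    msruns = seer_sdk.find_msruns(sample_ids=["SAMPLE_UUID_1", "SAMPLE_UUID_2"])
    for run in msruns:
        # "Run" is the raw file basename without extension (added when the
        # entry has raw_file_path); sample_uuid holds the former sample_id
        # value, and sample_id the former sample_id_tracking value
        print(run["Run"], run["sample_uuid"], run["sample_id"])
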
@@ -1853,7 +1866,7 @@ class SeerSDK:
                    )
                )
            except Exception as e:
-               print("Warning: Could not fetch fasta files.")
+               print("Error: Could not fetch fasta files.")
                res["fasta"] = None
        else:
            res["fasta"] = None
@@ -2066,7 +2079,7 @@ class SeerSDK:
                    )
            except:
                print(
-                   f"Warning: Could not fetch fasta files for analysis {res[entry].get('analysis_name')}."
+                   f"Error: Could not fetch fasta files for analysis {res[entry].get('analysis_name')}."
                )
            else:
                res[entry]["fasta"] = None
@@ -2382,7 +2395,11 @@ class SeerSDK:
        return files

    def get_search_result(
-       self, analysis_id: str, analyte_type: str, rollup: str
+       self,
+       analysis_id: str,
+       analyte_type: str,
+       rollup: str,
+       columns: _List[str] = None,
    ):
        """
        Load one of the files available via the "Download result files" button on the PAS UI.
@@ -2423,6 +2440,7 @@ class SeerSDK:
                        "npLink"
                    ]["url"],
                    dtype=dtype,
+                   usecols=columns,
                )
            elif rollup == "panel":
                return download_df(
@@ -2430,6 +2448,7 @@ class SeerSDK:
                        "panelLink"
                    ]["url"],
                    dtype=dtype,
+                   usecols=columns,
                )
        elif analyte_type == "peptide":
            if rollup == "np":
@@ -2438,6 +2457,7 @@ class SeerSDK:
                        "npLink"
                    ]["url"],
                    dtype=dtype,
+                   usecols=columns,
                )
            elif rollup == "panel":
                return download_df(
@@ -2445,12 +2465,14 @@ class SeerSDK:
                        "panelLink"
                    ]["url"],
                    dtype=dtype,
+                   usecols=columns,
                )
        else:
            return download_df(
                self.get_search_result_file_url(
                    analysis_id, filename="report.tsv"
-               )["url"]
+               )["url"],
+               usecols=columns,
            )

    def download_search_output_file(
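
A hedged usage sketch (the analysis id is a placeholder; the column names follow the DIA-NN report headers referenced elsewhere in this diff):

    # pull only a subset of report.tsv columns instead of the full file
    report = seer_sdk.get_search_result(
        analysis_id="ANALYSIS_ID_HERE",
        analyte_type="precursor",
        rollup="np",
        columns=["Run", "Protein.Group", "Precursor.Quantity"],
    )
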
seer_pas_sdk/core/unsupported.py CHANGED
@@ -1471,37 +1471,70 @@ class _UnsupportedSDK(_SeerSDK):
        Get analyte intensities data for a given PAS analysis.
        Args:
            analysis_id (str): ID of the analysis.
-           analyte_type (str): Type of the analyte. Must be either 'protein', 'peptide', precursor.
+           analyte_type (str): Type of the analyte. Must be either 'protein', 'peptide', 'precursor'.
            rollup (str): Intensities rollup method. Must be either 'np' or 'panel'.
-           norm_method (str): Search engine. Supported engines are: raw, engine, median, median80, pepcal. Default is 'pepcal'.
+           norm_method (str): Search engine. Supported engines are: raw, engine, median, median80, pepcal, pepcal_batch. Default is 'pepcal'.

        Returns:
            pd.DataFrame: A dataframe with each row containing the analyte intensity measurement:
            'msrun_id', 'sample_id', 'nanoparticle' (if rollup is 'np'), 'protein_group', 'peptide' (for 'peptide' and 'precursor' analyte types), 'charge' (for 'precursor' analyte type),
            'intensity_log10', 'protein_group_q_value', 'q_value' (for 'precursor' analyte type), 'rt' and 'irt' (for 'peptide' and 'precursor' analyte types)
        """
-       # 1. Get msrun data for analysis
+
+       def filepath_to_msrunid(filepath):
+           return os.path.basename(filepath).split(".")[0]
+
+       # 1. Get samples and msrun data for analysis
        samples = self.find_samples(analysis_id=analysis_id)
-       sample_name_to_id = {s["sample_name"]: s["id"] for s in samples}
+
        sample_uuid_to_id = {s["id"]: s["sample_id"] for s in samples}
-       # for np rollup, a row represents an msrun
-       msruns = self.find_msruns(sample_ids=sample_name_to_id.values())
-       file_to_msrun = {
-           os.path.basename(msrun["raw_file_path"]).split(".")[0]: msrun
-           for msrun in msruns
-       }
-       sample_to_msrun = {msrun["sample_id"]: msrun for msrun in msruns}
+       sample_id_to_uuid = {s["sample_id"]: s["id"] for s in samples}
+       # FIXME sample_name is not guaranteed to be unique (within PAS analysis)
+       sample_name_to_uuid = {s["sample_name"]: s["id"] for s in samples}

-       # for panel rollup, a row represents a sample
+       msruns = self.find_msruns(sample_ids=[s["id"] for s in samples])
+       msrunid_to_info = {msrun["Run"]: msrun for msrun in msruns}

        # 2. Get search results
-       # pull the np/panel file, or report.tsv for precursor mode
+       # pull the np/panel file, or the relevant columns from the report.tsv for precursor mode
+       columns = None
+       if analyte_type == "precursor" and rollup == "np":
+           columnsExperiment = ["Run"]
+           columnsProtein = [
+               "Protein.Group",
+           ]
+           columnsPeptide = [
+               "Stripped.Sequence",
+           ]
+           columnsPrecursor = [
+               "Precursor.Id",
+               "Precursor.Charge",
+               "Precursor.Quantity",
+               "RT",
+               "iRT",
+               "IM",
+               "iIM",
+           ]
+           columnsQValue = [
+               "Q.Value",
+               "Protein.Q.Value",
+           ]
+           columns = [
+               *columnsExperiment,
+               *columnsProtein,
+               *columnsPeptide,
+               *columnsPrecursor,
+               *columnsQValue,
+           ]
        search_results = self.get_search_result(
            analysis_id=analysis_id,
            analyte_type=analyte_type,
            rollup=rollup,
+           columns=columns,
        )
+
        if analyte_type in ["protein", "peptide"]:
+           # set the intensity column based on norm_method and PAS analysis protocol version
            intensity_column = None
            if norm_method == "raw":
                intensity_column = (
@@ -1543,139 +1576,171 @@ class _UnsupportedSDK(_SeerSDK):
                    raise ValueError(
                        "Pepcal normalized intensities not found in search results. This is only available with analyses processed with DIA-NN Seer Protocol v2.0 or later with the Seer Peptide Calibrant option enabled. \n Please retry using different norm_method, such as 'median'"
                    )
-
                intensity_column = "PepCal Intensities Log10"
-
+           elif norm_method == "pepcal_batch":
+               if not (
+                   "PepCal Batch Intensities Log10" in search_results.columns
+               ):
+                   raise ValueError(
+                       "Pepcal normalized batch corrected intensities not found in search results. This is only available with analyses processed with DIA-NN Seer Protocol v2.0 or later with the Seer Peptide Calibrant option enabled. \n Please retry using different norm_method, such as 'median'"
+                   )
+               intensity_column = "PepCal Batch Intensities Log10"
            else:
                raise ValueError(
                    f"norm_method = {norm_method} is not supported. Supported normalization methods are: raw, pepcal, engine, median, median80."
                )
-           if rollup == "panel":
-               search_results.fillna({"Sample Name": ""}, inplace=True)
-               search_results["File Name"] = search_results[
-                   "Sample Name"
-               ].apply(
-                   lambda x: (
-                       os.path.basename(
-                           sample_to_msrun[sample_name_to_id[x]][
-                               "raw_file_path"
-                           ]
-                       ).split(".")[0]
-                       if x
-                       else None
-                   )
-               )
-               search_results["File Name"] = search_results["File Name"].apply(
-                   lambda x: os.path.basename(x).split(".")[0] if x else None
-               )

            search_results["Intensity Log10"] = search_results[
                intensity_column
            ]

-           # 3. Merge report to search results to get Q value and other properties
-           report = self.get_search_result(
-               analysis_id=analysis_id,
-               analyte_type="precursor",
-               rollup="np",
-           )
-           report["File Name"] = report["Run"]
-           report["Protein Group"] = report["Protein.Group"]
-
-           if analyte_type == "protein":
-               report["Protein Q Value"] = report["Protein.Q.Value"]
-
-               report = report[
-                   ["File Name", "Protein Group", "Protein Q Value"]
-               ]
-               report.drop_duplicates(
-                   subset=["File Name", "Protein Group"], inplace=True
+           if rollup == "panel":
+               search_results.rename(
+                   columns={"Sample ID": "Sample UUID"}, inplace=True
                )
-               df = pd.merge(
-                   search_results,
-                   report,
-                   on=["File Name", "Protein Group"],
-                   how="left",
+               search_results["Sample UUID"] = search_results[
+                   "Sample Name"
+               ].map(sample_name_to_uuid)
+               search_results["Sample ID"] = search_results[
+                   "Sample UUID"
+               ].map(sample_uuid_to_id)
+               experiment_columns = ["Sample UUID", "Sample ID"]
+
+               # analyte info is limited to the id in the panel rollup
+               if analyte_type == "protein":
+                   analyte_id_column = "Protein Group"
+               else:
+                   analyte_id_column = "Peptide"
+
+               analyte_columns = [analyte_id_column]
+               df = search_results
+           else:
+               # np rollup, extract basename without extension
+               path_to_msrunid = {
+                   path: filepath_to_msrunid(path)
+                   for path in search_results["File Name"].unique()
+               }
+               # strip path from the filename to allow merging with the precursor report
+               search_results["Run"] = search_results["File Name"].map(
+                   path_to_msrunid
                )
-               included_columns = [
-                   "MsRun ID",
-                   "Sample ID",
-                   "Protein Group",
-                   "Intensity Log10",
-                   "Protein Q Value",
-               ]

-           else:
-               report["Peptide"] = report["Stripped.Sequence"]
-               # If analyte_type is peptide, attach retention time (RT, iRT)
-               report = report[["File Name", "Peptide", "RT", "iRT"]]
-               report.drop_duplicates(
-                   subset=["File Name", "Peptide"], inplace=True
+               search_results["MsRun UUID"] = search_results["Run"].map(
+                   {k: v["id"] for k, v in msrunid_to_info.items()}
                )
-               df = pd.merge(
-                   search_results,
-                   report,
-                   on=["File Name", "Peptide"],
-                   how="left",
+               search_results["Sample ID"] = search_results["Run"].map(
+                   {k: v["sample_id"] for k, v in msrunid_to_info.items()}
+               )
+               search_results["Sample UUID"] = search_results["Run"].map(
+                   {k: v["sample_uuid"] for k, v in msrunid_to_info.items()}
                )
-               included_columns = [
-                   "MsRun ID",
+               search_results["Nanoparticle"] = search_results["Run"].map(
+                   {k: v["nanoparticle"] for k, v in msrunid_to_info.items()}
+               )
+               experiment_columns = [
+                   "MsRun UUID",
+                   "Run",
+                   "Nanoparticle",
+                   "Sample UUID",
                    "Sample ID",
-                   "Peptide",
-                   "Protein Group",
-                   "Intensity Log10",
-                   "RT",
-                   "iRT",
                ]
-               # endif

-           if rollup == "np":
-               included_columns.insert(
-                   included_columns.index("Sample ID") + 1, "Nanoparticle"
+               # Merge report to search results to get Q value and other properties
+               if analyte_type == "protein":
+                   columns = ["Run", "Protein.Group", "Protein.Q.Value"]
+               elif analyte_type == "peptide":
+                   columns = ["Run", "Stripped.Sequence", "Protein.Q.Value"]
+               analytes = self.get_search_result(
+                   analysis_id=analysis_id,
+                   analyte_type="precursor",
+                   rollup="np",
+                   columns=columns,
+               )
+               # pandas Dataframe.rename() default behavior is to ignore the columns that do not exist in the data frame.
+               analytes.rename(
+                   columns={
+                       "Protein.Group": "Protein Group",
+                       "Protein.Q.Value": "Protein Q Value",
+                       "Stripped.Sequence": "Peptide",
+                   },
+                   inplace=True,
                )

-           df["MsRun ID"] = df["File Name"].apply(
-               lambda x: (
-                   file_to_msrun[x]["id"] if x in file_to_msrun else None
+               if analyte_type == "protein":
+                   analyte_id_column = "Protein Group"
+                   analyte_columns = [
+                       analyte_id_column,
+                       "Protein Q Value",
+                   ]
+
+               else:
+                   analyte_id_column = "Peptide"
+                   analyte_columns = [analyte_id_column]
+               # endif analyte_type
+
+               analytes.drop(
+                   columns=[
+                       col
+                       for col in analytes.columns
+                       if col != "Run" and col not in analyte_columns
+                   ],
+                   inplace=True,
                )
-           )
-           df["Sample ID"] = df["File Name"].apply(
-               lambda x: (
-                   file_to_msrun[x]["sample_id"]
-                   if x in file_to_msrun
-                   else None
+               analytes.drop_duplicates(
+                   subset=["Run", analyte_id_column], inplace=True
                )
-           )
-           df = df[included_columns]
+               df = pd.merge(
+                   search_results,
+                   analytes,
+                   on=["Run", analyte_id_column],
+                   how="left",
+                   validate="one_to_one",
+               )
+
+               df = df[experiment_columns + analyte_columns + ["Intensity Log10"]]

        else:
            # precursor
            # working only in report.tsv
-           search_results["Intensity"] = search_results["Precursor.Quantity"]
-           search_results["MsRun ID"] = search_results["Run"].apply(
-               lambda x: (
-                   file_to_msrun[x]["id"] if x in file_to_msrun else None
+           if norm_method != "raw":
+               raise ValueError(
+                   "For precursor analyte type, only 'raw' norm_method is supported."
                )
+
+           search_results["MsRun UUID"] = search_results["Run"].map(
+               {k: v["id"] for k, v in msrunid_to_info.items()}
            )
-           search_results["Sample ID"] = search_results["Run"].apply(
-               lambda x: (
-                   file_to_msrun[x]["sample_id"]
-                   if x in file_to_msrun
-                   else None
-               )
+           search_results["Sample ID"] = search_results["Run"].map(
+               {k: v["sample_id"] for k, v in msrunid_to_info.items()}
            )
-           search_results["Protein Group"] = search_results["Protein.Group"]
-           search_results["Peptide"] = search_results["Stripped.Sequence"]
-           search_results["Charge"] = search_results["Precursor.Charge"]
-           search_results["Precursor Id"] = search_results["Precursor.Id"]
-           search_results["Precursor Q Value"] = search_results["Q.Value"]
-           search_results["Protein Q Value"] = search_results[
-               "Protein.Q.Value"
+           search_results["Sample UUID"] = search_results["Sample ID"].map(
+               sample_id_to_uuid
+           )
+           search_results["Nanoparticle"] = search_results["Run"].map(
+               {k: v["nanoparticle"] for k, v in msrunid_to_info.items()}
+           )
+           experiment_columns = [
+               "MsRun UUID",
+               "Run",
+               "Nanoparticle",
+               "Sample UUID",
+               "Sample ID",
            ]

-           included_columns = [
-               "MsRun ID",
-               "Sample ID",
+           search_results.rename(
+               columns={
+                   "Protein.Group": "Protein Group",
+                   "Stripped.Sequence": "Peptide",
+                   "Precursor.Charge": "Charge",
+                   "Precursor.Id": "Precursor Id",
+                   "Q.Value": "Precursor Q Value",
+                   "Protein.Q.Value": "Protein Q Value",
+                   "Precursor.Quantity": "Intensity",
+               },
+               inplace=True,
+           )
+
+           analyte_columns = [
                "Protein Group",
                "Protein Q Value",
                "Peptide",
@@ -1688,16 +1753,12 @@ class _UnsupportedSDK(_SeerSDK):
                "IM",
                "iIM",
            ]
-           df = pd.DataFrame(search_results[included_columns])
+           df = pd.DataFrame(
+               search_results[experiment_columns + analyte_columns]
+           )

            df.columns = [title_case_to_snake_case(x) for x in df.columns]
-           df["sample_uuid"] = df["sample_id"]
-           df["sample_id"] = df["sample_uuid"].apply(
-               lambda x: sample_uuid_to_id.get(x)
-           )

-           if rollup == "panel":
-               df.drop(columns=["msrun_id"], inplace=True, errors="ignore")
        return df

    def get_search_data_analytes(self, analysis_id: str, analyte_type: str):
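
The refactor above replaces per-row apply lambdas with dictionary-backed Series.map lookups; a minimal standalone sketch of the idiom (toy data, not from this package):

    import pandas as pd

    # toy stand-in for the msrun records keyed by run id
    msrunid_to_info = {"run01": {"id": "uuid-a", "sample_id": "S1"}}
    df = pd.DataFrame({"Run": ["run01", "run02"]})
    # map each Run to one field of its record; unmatched runs become NaN
    df["MsRun UUID"] = df["Run"].map(
        {k: v["id"] for k, v in msrunid_to_info.items()}
    )
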
@@ -1714,10 +1775,6 @@ class _UnsupportedSDK(_SeerSDK):
            analysis_id=analysis_id, analyte_type="protein", rollup="np"
        )

-       report_results = self.get_search_result(
-           analysis_id=analysis_id, analyte_type="precursor", rollup="np"
-       )
-
        search_results = search_results[
            [
                "Protein Group",
@@ -1729,18 +1786,87 @@ class _UnsupportedSDK(_SeerSDK):
            ]
        ]
        search_results.drop_duplicates(subset=["Protein Group"], inplace=True)
-       report_results["Protein Group"] = report_results["Protein.Group"]
-       report_results["Peptide"] = report_results["Stripped.Sequence"]

-       if analyte_type == "protein":
-           report_results = report_results[
+       # 2. fetch precursor report to extract analyte-specific details
+       columnsPG = [
+           "Protein.Group",
+       ]
+       columnsPeptide = [
+           "Protein.Ids",
+           "Stripped.Sequence",
+           "Proteotypic",
+       ]
+       columnsPrecursor = [
+           "Precursor.Id",
+           "Precursor.Charge",
+           "Precursor.Quantity",
+           "Modified.Sequence",
+       ]
+       columnsPGQValue = [
+           "Global.PG.Q.Value",
+           "Lib.PG.Q.Value",
+       ]
+       columnsPrecursorQValue = [
+           "Global.Q.Value",
+           "Lib.Q.Value",
+       ]
+       columns = [
+           *columnsPG,
+           *columnsPGQValue,
+       ]
+       if analyte_type == "peptide":
+           columns += [*columnsPeptide]
+       elif analyte_type == "precursor":
+           columns += [
+               *columnsPeptide,
+               *columnsPrecursor,
+               *columnsPrecursorQValue,
+           ]
+       report_results = self.get_search_result(
+           analysis_id=analysis_id,
+           analyte_type="precursor",
+           rollup="np",
+           columns=columns,
+       )
+       report_results.rename(
+           columns={
+               "Protein.Group": "Protein Group",
+               "Stripped.Sequence": "Peptide",
+               "Modified.Sequence": "Modified.Peptide",
+           },
+           inplace=True,
+       )
+
+       # function to fix the potential bug, where different precursors
+       # of the same peptide map to different protein groups
+       def fix_peptide_to_protein_group_assignment(
+           df: pd.DataFrame,
+       ) -> pd.DataFrame:
+           # for each peptide, sort protein groups by confidence
+           df = df.sort_values(
                [
-                   "Protein Group",
-                   "Protein.Ids",
+                   "Peptide",
                    "Global.PG.Q.Value",
                    "Lib.PG.Q.Value",
+                   "Protein Group",
                ]
-           ]
+           )
+
+           # broadcast the best protein group across all rows with the same peptide
+           # to fix the potential bug, where different precursors of the same peptide
+           # map to different protein groups
+           for col in [
+               "Protein Group",
+               "Protein.Ids",
+               "Protein.Names",
+               "Genes",
+           ]:
+               if col in df.columns:
+                   df[col] = df.groupby("Peptide")[col].transform("first")
+
+           return df
+
+       if analyte_type == "protein":
            report_results.drop_duplicates(
                subset=["Protein Group"], inplace=True
            )
@@ -1751,41 +1877,18 @@ class _UnsupportedSDK(_SeerSDK):
                how="left",
            )
        elif analyte_type == "peptide":
-
-           # The below logic performs the following:
-           # 1. orders each peptide group by Global.PG.Q.Value, Lib.PG.Q.Value, and Protein Group (ascending)
-           # 2. for each peptide group, select the first row to find the precursor with the lowest Q values
-           # 3. broadcasts the associated protein group columns across all rows with the same peptide.
-           #
-           # This ensures that for each peptide, we retain consistent protein information while avoiding duplication.
-
-           report_results = report_results.sort_values(
+           search_results = search_results[
                [
-                   "Peptide",
-                   "Global.PG.Q.Value",
-                   "Lib.PG.Q.Value",
                    "Protein Group",
+                   "Protein Names",
+                   "Gene Names",
                ]
+           ]
+           report_results.drop_duplicates(inplace=True)
+           report_results = fix_peptide_to_protein_group_assignment(
+               report_results
            )
-
-           columns_to_broadcast = ["Protein Group", "Protein.Ids"]
-           broadcasted = (
-               report_results.groupby("Peptide")
-               .apply(
-                   lambda x: pd.Series(
-                       {
-                           col: x.iloc[0][col]
-                           for col in columns_to_broadcast + ["Peptide"]
-                       }
-                   )
-               )
-               .reset_index(drop=True)
-           )
-           report_results = (
-               report_results.drop(columns=columns_to_broadcast)
-               .merge(broadcasted, on="Peptide", how="left")
-               .drop_duplicates(subset=["Peptide"])
-           )
+           report_results.drop_duplicates(subset=["Peptide"], inplace=True)

            df = pd.merge(
                report_results,
@@ -1793,15 +1896,6 @@ class _UnsupportedSDK(_SeerSDK):
                on=["Protein Group"],
                how="left",
            )
-           df = df[
-               [
-                   "Peptide",
-                   "Protein Group",
-                   "Protein.Ids",
-                   "Protein Names",
-                   "Gene Names",
-               ]
-           ]
        else:
            # precursor
            search_results = search_results[
@@ -1811,91 +1905,23 @@ class _UnsupportedSDK(_SeerSDK):
                    "Gene Names",
                ]
            ]
-           search_results.drop_duplicates(
-               subset=["Protein Group"], inplace=True
-           )
-           report_results = report_results[
-               [
-                   "Precursor.Id",
-                   "Precursor.Charge",
-                   "Peptide",
-                   "Protein Group",
-                   "Protein.Ids",
-                   "Protein.Names",
-                   "Genes",
-                   "Modified.Sequence",
-                   "Proteotypic",
-                   "Global.Q.Value",
-                   "Global.PG.Q.Value",
-                   "Lib.Q.Value",
-                   "Lib.PG.Q.Value",
-               ]
-           ]
+           report_results.drop_duplicates(inplace=True)

-           # The below logic performs the following:
-           # 1. orders each peptide group by Global.PG.Q.Value, Lib.PG.Q.Value, and Protein Group (ascending)
-           # 2. for each peptide group, select the first row to find the precursor with the lowest Q values
-           # 3. broadcasts the associated protein group columns across all rows with the same peptide.
-           #
-           # This ensures that for each peptide, we retain consistent protein information while avoiding duplication.
-           columns_to_broadcast = [
-               "Protein Group",
-               "Protein.Ids",
-               "Protein.Names",
-               "Genes",
-           ]
-           report_results = report_results.sort_values(
-               [
-                   "Peptide",
-                   "Global.PG.Q.Value",
-                   "Lib.PG.Q.Value",
-                   "Protein Group",
-               ],
-           )
-           broadcasted = (
-               report_results.groupby("Peptide")
-               .apply(
-                   lambda x: pd.Series(
-                       {
-                           col: x.iloc[0][col]
-                           for col in columns_to_broadcast + ["Peptide"]
-                       }
-                   )
-               )
-               .reset_index(drop=True)
+           report_results = fix_peptide_to_protein_group_assignment(
+               report_results
            )
-           report_results = (
-               report_results.drop(columns=columns_to_broadcast)
-               .merge(broadcasted, on="Peptide", how="left")
-               .drop_duplicates(subset=["Peptide", "Precursor.Charge"])
+           report_results.drop_duplicates(
+               subset=["Peptide", "Modified.Peptide", "Precursor.Charge"],
+               inplace=True,
            )
+
            df = pd.merge(
                report_results,
                search_results,
                on=["Protein Group"],
                how="left",
            )
-           df = df[
-               [
-                   "Precursor.Id",
-                   "Precursor.Charge",
-                   "Peptide",
-                   "Protein Group",
-                   "Protein.Ids",
-                   "Protein.Names",
-                   "Genes",
-                   "Modified.Sequence",
-                   "Proteotypic",
-                   "Global.Q.Value",
-                   "Global.PG.Q.Value",
-                   "Lib.Q.Value",
-                   "Lib.PG.Q.Value",
-                   "Gene Names",
-               ]
-           ]
-           df.rename(
-               columns={"Modified.Sequence": "Modified.Peptide"}, inplace=True
-           )
        # endif
        df.columns = [title_case_to_snake_case(x) for x in df.columns]
+
        return df
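
The broadcast step inside fix_peptide_to_protein_group_assignment can be checked in isolation; a toy sketch of the sort-then-transform("first") idiom (fabricated rows, for illustration only):

    import pandas as pd

    df = pd.DataFrame({
        "Peptide": ["PEPTIDEK", "PEPTIDEK"],
        "Protein Group": ["PG2", "PG1"],
        "Global.PG.Q.Value": [0.010, 0.001],
        "Lib.PG.Q.Value": [0.010, 0.001],
    })
    # sort so the most confident protein group comes first within each peptide,
    df = df.sort_values(
        ["Peptide", "Global.PG.Q.Value", "Lib.PG.Q.Value", "Protein Group"]
    )
    # then broadcast that first value to every row of the same peptide
    df["Protein Group"] = df.groupby("Peptide")["Protein Group"].transform("first")
    # both rows now read "PG1"
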
seer_pas_sdk-1.2.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: seer-pas-sdk
-Version: 1.2.0
+Version: 1.2.2
 Summary: SDK for Seer Proteograph Analysis Suite (PAS)
 Author-email: Ryan Sun <rsun@seer.bio>
 License:
@@ -194,9 +194,10 @@ License-File: LICENSE.txt
 Requires-Dist: boto3>=1.26.152
 Requires-Dist: botocore>=1.29.152
 Requires-Dist: pandas>=2.0.1
+Requires-Dist: pyarrow>=17.0.0
 Requires-Dist: PyJWT>=2.8.0
 Requires-Dist: python-dotenv>=1.0.0
-Requires-Dist: Requests>=2.31.0
+Requires-Dist: requests>=2.31.0
 Requires-Dist: tqdm>=4.65.0
 Requires-Dist: deprecation
 Dynamic: license-file
seer_pas_sdk-1.2.2.dist-info/RECORD CHANGED
@@ -1,19 +1,19 @@
 seer_pas_sdk/__init__.py,sha256=Ie6atdmdBV-OmdHHXjhrGhdFGXiyP3JKhKrr3hyvSsA,563
 seer_pas_sdk/auth/__init__.py,sha256=e_eM4jJnnyKUdg4Nggzi9ypt2MLWcEJ8CmCPkUaQDSs,23
 seer_pas_sdk/auth/auth.py,sha256=_SI5CdEkfqfr4o5BQ79BuPbxGeI9p7tqxJd7mUqSAkI,8854
-seer_pas_sdk/common/__init__.py,sha256=LLfkbsZMXXty_T8xkOAws_WWBpbfwWZAdkNTduS8Abc,24443
+seer_pas_sdk/common/__init__.py,sha256=WrRwmSONUdFD0ysT5jHwG2zWDd-v2wverjXY7BWZhHU,24488
 seer_pas_sdk/common/errors.py,sha256=4HFORWnaQQCMXRE8kwdsJWvQRB_3KFEZ7yMb391e4gA,142
 seer_pas_sdk/common/groupanalysis.py,sha256=DxB-gbQfYzl7p9MTYWDIqghcH-IeakzdYdrRZrlIHek,1730
 seer_pas_sdk/core/__init__.py,sha256=rxbKgg-Qe24OaxX2zyHHYPYgDCTEKE_-41bB2wvpvL4,25
-seer_pas_sdk/core/sdk.py,sha256=yDml92xhZtWR54-MgWG3rYVVlcaaAl2i6EzlWgbit8Q,160705
-seer_pas_sdk/core/unsupported.py,sha256=SpxKQx_SN0o7SEBGXko_vmTQVxYDAvXEQGH2VWTK63M,71915
+seer_pas_sdk/core/sdk.py,sha256=0ukg287lsjlSNoV0WqFbiPMURhVogsy_sTR7gg1fr9Q,161512
+seer_pas_sdk/core/unsupported.py,sha256=WcF_Z6ZUpzOWkWQHaMtm9SnE2NveuRmljVfNe8QSbms,72732
 seer_pas_sdk/objects/__init__.py,sha256=r-lY7axLTzToAI-Dme019YfcJLDe2ok1f_e6OQx3j64,130
 seer_pas_sdk/objects/groupanalysis.py,sha256=x3D_5NmYBoPDilNCQqUoCFARIfIeUq4FBY3_N6u8tfM,994
 seer_pas_sdk/objects/headers.py,sha256=RilNzB_Nhid3U8j93BxJYcRrgDmd_1bAuI0P465xd0g,2727
 seer_pas_sdk/objects/platemap.py,sha256=8IvJPAecs_e_FyqibzhCw-O4zjCFnf-zMUp_5krTEsg,5864
 seer_pas_sdk/objects/volcanoplot.py,sha256=lTrTOVg74nT3uo-P1edQJC1ZbdoiLMtQ3VJd9CnzmoM,9396
-seer_pas_sdk-1.2.0.dist-info/licenses/LICENSE.txt,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
-seer_pas_sdk-1.2.0.dist-info/METADATA,sha256=6mQ4VXcrUHCVfn3PL0pD2j_u7yJNMmw1HRqtd_lATDg,13413
-seer_pas_sdk-1.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-seer_pas_sdk-1.2.0.dist-info/top_level.txt,sha256=-2kZ-KFMGtXwr8H1O5llMKlcJ8gRKohEmrIvazXB61s,13
-seer_pas_sdk-1.2.0.dist-info/RECORD,,
+seer_pas_sdk-1.2.2.dist-info/licenses/LICENSE.txt,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
+seer_pas_sdk-1.2.2.dist-info/METADATA,sha256=Lw-pb90n0vo7K4I-2wnQm_LvsrVzXpEYmj8t4vwToAA,13444
+seer_pas_sdk-1.2.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+seer_pas_sdk-1.2.2.dist-info/top_level.txt,sha256=-2kZ-KFMGtXwr8H1O5llMKlcJ8gRKohEmrIvazXB61s,13
+seer_pas_sdk-1.2.2.dist-info/RECORD,,
seer_pas_sdk-1.2.2.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any