seer-pas-sdk 1.1.1__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- seer_pas_sdk/common/__init__.py +53 -6
- seer_pas_sdk/core/sdk.py +244 -194
- seer_pas_sdk/core/unsupported.py +295 -195
- {seer_pas_sdk-1.1.1.dist-info → seer_pas_sdk-1.2.1.dist-info}/METADATA +1 -1
- {seer_pas_sdk-1.1.1.dist-info → seer_pas_sdk-1.2.1.dist-info}/RECORD +8 -8
- {seer_pas_sdk-1.1.1.dist-info → seer_pas_sdk-1.2.1.dist-info}/WHEEL +0 -0
- {seer_pas_sdk-1.1.1.dist-info → seer_pas_sdk-1.2.1.dist-info}/licenses/LICENSE.txt +0 -0
- {seer_pas_sdk-1.1.1.dist-info → seer_pas_sdk-1.2.1.dist-info}/top_level.txt +0 -0
seer_pas_sdk/core/unsupported.py
CHANGED
@@ -4,6 +4,7 @@ seer_pas_sdk.core.unsupported -- in development
 
 import os
 import shutil
+from pathlib import Path
 
 from typing import List as _List
 
@@ -827,20 +828,29 @@ class _UnsupportedSDK(_SeerSDK):
         )
 
         # Step 1: Check if paths and file extensions are valid.
+        invalid_d_zip_files = []
         for file in ms_data_files:
             if not valid_ms_data_file(file):
                 raise ValueError(
                     "Invalid file or file format. Please check your file."
                 )
+            if file.endswith(".d.zip") and (not validate_d_zip_file(file)):
+                invalid_d_zip_files.append(file)
+
+        if invalid_d_zip_files:
+            raise ValueError(
+                f"The following .d.zip files are invalid: {', '.join(invalid_d_zip_files)}. Please check your files."
+            )
 
         extensions = set(
-            [
+            ["".join(Path(file).suffixes) for file in ms_data_files]
         )
 
         if filenames and ".d.zip" in extensions:
             raise ValueError(
                 "Please leave the 'filenames' parameter empty when working with .d.zip files. SeerSDK.rename_d_zip_file() is available for this use case."
             )
+
         # Step 2: Use active tenant to fetch the tenant_id.
         tenant_id = self.get_active_tenant_id()
 
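The rewritten extension check above uses "".join(Path(file).suffixes), which keeps compound extensions such as .d.zip intact instead of reducing them to .zip. A minimal standalone sketch of that behavior (the file names below are made up for illustration):

from pathlib import Path

# Compound suffixes are preserved, which is what lets the check above detect
# ".d.zip" uploads and steer callers away from the 'filenames' parameter.
files = ["sample_01.d.zip", "sample_02.raw", "sample_03.mzML"]  # hypothetical names

extensions = set("".join(Path(f).suffixes) for f in files)
print(extensions)  # {'.d.zip', '.raw', '.mzML'} (set order may vary)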
@@ -1461,36 +1471,70 @@ class _UnsupportedSDK(_SeerSDK):
        Get analyte intensities data for a given PAS analysis.
        Args:
            analysis_id (str): ID of the analysis.
-            analyte_type (str): Type of the analyte. Must be either 'protein', 'peptide', precursor.
+            analyte_type (str): Type of the analyte. Must be either 'protein', 'peptide', 'precursor'.
            rollup (str): Intensities rollup method. Must be either 'np' or 'panel'.
-            norm_method (str): Search engine. Supported engines are: raw, engine, median, median80, pepcal. Default is 'pepcal'.
+            norm_method (str): Search engine. Supported engines are: raw, engine, median, median80, pepcal, pepcal_batch. Default is 'pepcal'.
 
        Returns:
            pd.DataFrame: A dataframe with each row containing the analyte intensity measurement:
            'msrun_id', 'sample_id', 'nanoparticle' (if rollup is 'np'), 'protein_group', 'peptide' (for 'peptide' and 'precursor' analyte types), 'charge' (for 'precursor' analyte type),
            'intensity_log10', 'protein_group_q_value', 'q_value' (for 'precursor' analyte type), 'rt' and 'irt' (for 'peptide' and 'precursor' analyte types)
        """
-
+
+        def filepath_to_msrunid(filepath):
+            return os.path.basename(filepath).split(".")[0]
+
+        # 1. Get samples and msrun data for analysis
         samples = self.find_samples(analysis_id=analysis_id)
-        sample_name_to_id = {s["sample_name"]: s["id"] for s in samples}
-        # for np rollup, a row represents an msrun
-        msruns = self.find_msruns(sample_ids=sample_name_to_id.values())
-        file_to_msrun = {
-            os.path.basename(msrun["raw_file_path"]).split(".")[0]: msrun
-            for msrun in msruns
-        }
-        sample_to_msrun = {msrun["sample_id"]: msrun for msrun in msruns}
 
-
+        sample_uuid_to_id = {s["id"]: s["sample_id"] for s in samples}
+        sample_id_to_uuid = {s["sample_id"]: s["id"] for s in samples}
+        # FIXME sample_name is not guaranteed to be unique (within PAS analysis)
+        sample_name_to_uuid = {s["sample_name"]: s["id"] for s in samples}
+
+        msruns = self.find_msruns(sample_ids=[s["id"] for s in samples])
+        msrunid_to_info = {msrun["Run"]: msrun for msrun in msruns}
 
         # 2. Get search results
-        # pull the np/panel file, or report.tsv for precursor mode
+        # pull the np/panel file, or the relevant columns from the report.tsv for precursor mode
+        columns = None
+        if analyte_type == "precursor" and rollup == "np":
+            columnsExperiment = ["Run"]
+            columnsProtein = [
+                "Protein.Group",
+            ]
+            columnsPeptide = [
+                "Stripped.Sequence",
+            ]
+            columnsPrecursor = [
+                "Precursor.Id",
+                "Precursor.Charge",
+                "Precursor.Quantity",
+                "RT",
+                "iRT",
+                "IM",
+                "iIM",
+            ]
+            columnsQValue = [
+                "Q.Value",
+                "Protein.Q.Value",
+            ]
+            columns = [
+                *columnsExperiment,
+                *columnsProtein,
+                *columnsPeptide,
+                *columnsPrecursor,
+                *columnsQValue,
+            ]
         search_results = self.get_search_result(
             analysis_id=analysis_id,
             analyte_type=analyte_type,
             rollup=rollup,
+            columns=columns,
         )
+
         if analyte_type in ["protein", "peptide"]:
+            # set the intensity column based on norm_method and PAS analysis protocol version
             intensity_column = None
             if norm_method == "raw":
                 intensity_column = (
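For orientation, a minimal usage sketch of the method whose docstring is updated in the hunk above. Everything outside the documented parameters is an assumption: the public entry point (SeerSDK), how it is authenticated, and the method name get_analyte_intensities (the diff only shows the docstring and body, which live in the in-development _UnsupportedSDK class).

from seer_pas_sdk import SeerSDK  # assumed import path

# Hypothetical session setup; real construction/authentication may differ.
sdk = SeerSDK()

# Parameters mirror the docstring: analyte_type in {'protein', 'peptide', 'precursor'},
# rollup in {'np', 'panel'}, norm_method in {'raw', 'engine', 'median', 'median80',
# 'pepcal', 'pepcal_batch'} (default 'pepcal').
df = sdk.get_analyte_intensities(  # hypothetical method name
    analysis_id="<analysis-id>",
    analyte_type="peptide",
    rollup="np",
    norm_method="pepcal",
)

# Per the docstring, rows carry snake_case columns such as
# msrun_id, sample_id, nanoparticle, protein_group, peptide, intensity_log10, rt, irt.
print(df.columns.tolist())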
@@ -1532,140 +1576,171 @@ class _UnsupportedSDK(_SeerSDK):
                     raise ValueError(
                         "Pepcal normalized intensities not found in search results. This is only available with analyses processed with DIA-NN Seer Protocol v2.0 or later with the Seer Peptide Calibrant option enabled. \n Please retry using different norm_method, such as 'median'"
                     )
-
                 intensity_column = "PepCal Intensities Log10"
-
+            elif norm_method == "pepcal_batch":
+                if not (
+                    "PepCal Batch Intensities Log10" in search_results.columns
+                ):
+                    raise ValueError(
+                        "Pepcal normalized batch corrected intensities not found in search results. This is only available with analyses processed with DIA-NN Seer Protocol v2.0 or later with the Seer Peptide Calibrant option enabled. \n Please retry using different norm_method, such as 'median'"
+                    )
+                intensity_column = "PepCal Batch Intensities Log10"
             else:
                 raise ValueError(
                     f"norm_method = {norm_method} is not supported. Supported normalization methods are: raw, pepcal, engine, median, median80."
                 )
-            if rollup == "panel":
-                search_results.fillna({"Sample Name": ""}, inplace=True)
-                search_results["File Name"] = search_results[
-                    "Sample Name"
-                ].apply(
-                    lambda x: (
-                        os.path.basename(
-                            sample_to_msrun[sample_name_to_id[x]][
-                                "raw_file_path"
-                            ]
-                        ).split(".")[0]
-                        if x
-                        else None
-                    )
-                )
-                search_results["File Name"] = search_results["File Name"].apply(
-                    lambda x: os.path.basename(x).split(".")[0] if x else None
-                )
 
             search_results["Intensity Log10"] = search_results[
                 intensity_column
             ]
 
-
-
-
-
-
-
-
-
-
-
-
+            if rollup == "panel":
+                search_results.rename(
+                    columns={"Sample ID": "Sample UUID"}, inplace=True
+                )
+                search_results["Sample UUID"] = search_results[
+                    "Sample Name"
+                ].map(sample_name_to_uuid)
+                search_results["Sample ID"] = search_results[
+                    "Sample UUID"
+                ].map(sample_uuid_to_id)
+                experiment_columns = ["Sample UUID", "Sample ID"]
+
+                # analyte info is limited to the id in the panel rollup
+                if analyte_type == "protein":
+                    analyte_id_column = "Protein Group"
+                else:
+                    analyte_id_column = "Peptide"
+
+                analyte_columns = [analyte_id_column]
+                df = search_results
+            else:
+                # np rollup, extract basename without extension
+                path_to_msrunid = {
+                    path: filepath_to_msrunid(path)
+                    for path in search_results["File Name"].unique()
+                }
+                # strip path from the filename to allow merging with the precursor report
+                search_results["Run"] = search_results["File Name"].map(
+                    path_to_msrunid
+                )
 
-
-                    ["
-                    ]
-                report.drop_duplicates(
-                    subset=["File Name", "Protein Group"], inplace=True
+                search_results["MsRun UUID"] = search_results["Run"].map(
+                    {k: v["id"] for k, v in msrunid_to_info.items()}
                 )
-
-
-
-
-
+                search_results["Sample ID"] = search_results["Run"].map(
+                    {k: v["sample_id"] for k, v in msrunid_to_info.items()}
+                )
+                search_results["Sample UUID"] = search_results["Run"].map(
+                    {k: v["sample_uuid"] for k, v in msrunid_to_info.items()}
                 )
-
-                "
+                search_results["Nanoparticle"] = search_results["Run"].map(
+                    {k: v["nanoparticle"] for k, v in msrunid_to_info.items()}
+                )
+                experiment_columns = [
+                    "MsRun UUID",
+                    "Run",
+                    "Nanoparticle",
+                    "Sample UUID",
                     "Sample ID",
-                    "Protein Group",
-                    "Intensity Log10",
-                    "Protein Q Value",
                 ]
 
-
-
-
-
-
-
+                # Merge report to search results to get Q value and other properties
+                if analyte_type == "protein":
+                    columns = ["Run", "Protein.Group", "Protein.Q.Value"]
+                elif analyte_type == "peptide":
+                    columns = ["Run", "Stripped.Sequence", "Protein.Q.Value"]
+                analytes = self.get_search_result(
+                    analysis_id=analysis_id,
+                    analyte_type="precursor",
+                    rollup="np",
+                    columns=columns,
+                )
+                # pandas Dataframe.rename() default behavior is to ignore the columns that do not exist in the data frame.
+                analytes.rename(
+                    columns={
+                        "Protein.Group": "Protein Group",
+                        "Protein.Q.Value": "Protein Q Value",
+                        "Stripped.Sequence": "Peptide",
+                    },
+                    inplace=True,
+                )
+
+                if analyte_type == "protein":
+                    analyte_id_column = "Protein Group"
+                    analyte_columns = [
+                        analyte_id_column,
+                        "Protein Q Value",
+                    ]
+
+                else:
+                    analyte_id_column = "Peptide"
+                    analyte_columns = [analyte_id_column]
+                # endif analyte_type
+
+                analytes.drop(
+                    columns=[
+                        col
+                        for col in analytes.columns
+                        if col != "Run" and col not in analyte_columns
+                    ],
+                    inplace=True,
+                )
+                analytes.drop_duplicates(
+                    subset=["Run", analyte_id_column], inplace=True
                 )
                 df = pd.merge(
                     search_results,
-
-                    on=["
+                    analytes,
+                    on=["Run", analyte_id_column],
                     how="left",
+                    validate="one_to_one",
                 )
-                included_columns = [
-                    "MsRun ID",
-                    "Sample ID",
-                    "Peptide",
-                    "Protein Group",
-                    "Intensity Log10",
-                    "RT",
-                    "iRT",
-                ]
-            # endif
 
-
-            included_columns.insert(
-                included_columns.index("Sample ID") + 1, "Nanoparticle"
-            )
+            df = df[experiment_columns + analyte_columns + ["Intensity Log10"]]
 
-            df["MsRun ID"] = df["File Name"].apply(
-                lambda x: (
-                    file_to_msrun[x]["id"] if x in file_to_msrun else None
-                )
-            )
-            df["Sample ID"] = df["File Name"].apply(
-                lambda x: (
-                    file_to_msrun[x]["sample_id"]
-                    if x in file_to_msrun
-                    else None
-                )
-            )
-            df = df[included_columns]
-            df.columns = [title_case_to_snake_case(x) for x in df.columns]
-            return df
         else:
             # precursor
             # working only in report.tsv
-
-
-
-                    file_to_msrun[x]["id"] if x in file_to_msrun else None
+            if norm_method != "raw":
+                raise ValueError(
+                    "For precursor analyte type, only 'raw' norm_method is supported."
                 )
+
+            search_results["MsRun UUID"] = search_results["Run"].map(
+                {k: v["id"] for k, v in msrunid_to_info.items()}
             )
-            search_results["Sample ID"] = search_results["Run"].
-
-
-
-
-            )
+            search_results["Sample ID"] = search_results["Run"].map(
+                {k: v["sample_id"] for k, v in msrunid_to_info.items()}
+            )
+            search_results["Sample UUID"] = search_results["Sample ID"].map(
+                sample_id_to_uuid
             )
-            search_results["
-
-
-
-
-
-                "
+            search_results["Nanoparticle"] = search_results["Run"].map(
+                {k: v["nanoparticle"] for k, v in msrunid_to_info.items()}
+            )
+            experiment_columns = [
+                "MsRun UUID",
+                "Run",
+                "Nanoparticle",
+                "Sample UUID",
+                "Sample ID",
             ]
 
-
-
-
+            search_results.rename(
+                columns={
+                    "Protein.Group": "Protein Group",
+                    "Stripped.Sequence": "Peptide",
+                    "Precursor.Charge": "Charge",
+                    "Precursor.Id": "Precursor Id",
+                    "Q.Value": "Precursor Q Value",
+                    "Protein.Q.Value": "Protein Q Value",
+                    "Precursor.Quantity": "Intensity",
+                },
+                inplace=True,
+            )
+
+            analyte_columns = [
                 "Protein Group",
                 "Protein Q Value",
                 "Peptide",
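Several of the added lines in this hunk derive per-run metadata columns by mapping the "Run" key through dictionaries built from msrunid_to_info. A small self-contained pandas sketch of that pattern (the run names and field values below are made up):

import pandas as pd

# Hypothetical msrun metadata keyed by run name, mimicking msrunid_to_info above.
msrunid_to_info = {
    "run_A": {"id": "uuid-a", "sample_id": "S1", "nanoparticle": "NP1"},
    "run_B": {"id": "uuid-b", "sample_id": "S2", "nanoparticle": "NP2"},
}

df = pd.DataFrame({"Run": ["run_A", "run_B", "run_A"]})

# Same pattern as in the diff: project one field out of the info dict per run.
df["MsRun UUID"] = df["Run"].map({k: v["id"] for k, v in msrunid_to_info.items()})
df["Nanoparticle"] = df["Run"].map(
    {k: v["nanoparticle"] for k, v in msrunid_to_info.items()}
)
print(df)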
@@ -1678,10 +1753,13 @@ class _UnsupportedSDK(_SeerSDK):
                 "IM",
                 "iIM",
             ]
-            df =
-
+            df = pd.DataFrame(
+                search_results[experiment_columns + analyte_columns]
+            )
 
-
+            df.columns = [title_case_to_snake_case(x) for x in df.columns]
+
+            return df
 
     def get_search_data_analytes(self, analysis_id: str, analyte_type: str):
         if analyte_type not in ["protein", "peptide", "precursor"]:
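The final rename step relies on the SDK's title_case_to_snake_case helper to turn display-style column names into the snake_case names listed in the docstring (for example 'Protein Group' becomes 'protein_group'). A rough stand-in with equivalent behavior for the simple cases shown here; the real helper lives in seer_pas_sdk and may handle more edge cases:

def title_case_to_snake_case(name: str) -> str:
    # Approximation of the SDK helper: join the words with underscores and lowercase.
    return "_".join(name.split()).lower()

for col in ["MsRun UUID", "Sample ID", "Protein Group", "Intensity Log10"]:
    print(col, "->", title_case_to_snake_case(col))
# e.g. "Protein Group" -> "protein_group", "Intensity Log10" -> "intensity_log10"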
@@ -1697,10 +1775,6 @@ class _UnsupportedSDK(_SeerSDK):
             analysis_id=analysis_id, analyte_type="protein", rollup="np"
         )
 
-        report_results = self.get_search_result(
-            analysis_id=analysis_id, analyte_type="precursor", rollup="np"
-        )
-
         search_results = search_results[
             [
                 "Protein Group",
@@ -1712,18 +1786,87 @@ class _UnsupportedSDK(_SeerSDK):
             ]
         ]
         search_results.drop_duplicates(subset=["Protein Group"], inplace=True)
-        report_results["Protein Group"] = report_results["Protein.Group"]
-        report_results["Peptide"] = report_results["Stripped.Sequence"]
 
-
-
+        # 2. fetch precursor report to extract analyte-specific details
+        columnsPG = [
+            "Protein.Group",
+        ]
+        columnsPeptide = [
+            "Protein.Ids",
+            "Stripped.Sequence",
+            "Proteotypic",
+        ]
+        columnsPrecursor = [
+            "Precursor.Id",
+            "Precursor.Charge",
+            "Precursor.Quantity",
+            "Modified.Sequence",
+        ]
+        columnsPGQValue = [
+            "Global.PG.Q.Value",
+            "Lib.PG.Q.Value",
+        ]
+        columnsPrecursorQValue = [
+            "Global.Q.Value",
+            "Lib.Q.Value",
+        ]
+        columns = [
+            *columnsPG,
+            *columnsPGQValue,
+        ]
+        if analyte_type == "peptide":
+            columns += [*columnsPeptide]
+        elif analyte_type == "precursor":
+            columns += [
+                *columnsPeptide,
+                *columnsPrecursor,
+                *columnsPrecursorQValue,
+            ]
+        report_results = self.get_search_result(
+            analysis_id=analysis_id,
+            analyte_type="precursor",
+            rollup="np",
+            columns=columns,
+        )
+        report_results.rename(
+            columns={
+                "Protein.Group": "Protein Group",
+                "Stripped.Sequence": "Peptide",
+                "Modified.Sequence": "Modified.Peptide",
+            },
+            inplace=True,
+        )
+
+        # function to fix the potential bug, where different precursors
+        # of the same peptide map to different protein groups
+        def fix_peptide_to_protein_group_assignment(
+            df: pd.DataFrame,
+        ) -> pd.DataFrame:
+            # for each peptide, sort protein groups by confidence
+            df = df.sort_values(
                 [
-                    "
-                    "Protein.Ids",
+                    "Peptide",
                     "Global.PG.Q.Value",
                     "Lib.PG.Q.Value",
+                    "Protein Group",
                 ]
-
+            )
+
+            # broadcast the best protein group across all rows with the same peptide
+            # to fix the potential bug, where different precursors of the same peptide
+            # map to different protein groups
+            for col in [
+                "Protein Group",
+                "Protein.Ids",
+                "Protein.Names",
+                "Genes",
+            ]:
+                if col in df.columns:
+                    df[col] = df.groupby("Peptide")[col].transform("first")
+
+            return df
+
+        if analyte_type == "protein":
             report_results.drop_duplicates(
                 subset=["Protein Group"], inplace=True
             )
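The fix_peptide_to_protein_group_assignment helper added in this hunk sorts rows by the PG q-value columns and then broadcasts the top-ranked protein group to every precursor row of the same peptide via groupby(...).transform("first"). A toy illustration of that broadcast (the values are fabricated):

import pandas as pd

# Two precursors of the same peptide initially point at different protein groups.
df = pd.DataFrame(
    {
        "Peptide": ["AAK", "AAK", "LLR"],
        "Protein Group": ["P1", "P2", "P3"],
        "Global.PG.Q.Value": [0.001, 0.01, 0.005],
    }
)

# Sort so the most confident assignment comes first within each peptide,
# then broadcast that first value to all rows of the peptide.
df = df.sort_values(["Peptide", "Global.PG.Q.Value", "Protein Group"])
df["Protein Group"] = df.groupby("Peptide")["Protein Group"].transform("first")
print(df)  # both "AAK" rows now report protein group "P1"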
@@ -1734,25 +1877,23 @@ class _UnsupportedSDK(_SeerSDK):
                 how="left",
             )
         elif analyte_type == "peptide":
-
-
-
-
-
-
-                search_results,
-                on=["Protein Group"],
-                how="left",
-            )
-
-            report_results = report_results[
-                ["Peptide", "Protein.Ids", "Protein.Group"]
+            search_results = search_results[
+                [
+                    "Protein Group",
+                    "Protein Names",
+                    "Gene Names",
+                ]
             ]
+            report_results.drop_duplicates(inplace=True)
+            report_results = fix_peptide_to_protein_group_assignment(
+                report_results
+            )
             report_results.drop_duplicates(subset=["Peptide"], inplace=True)
+
             df = pd.merge(
-                search_results,
                 report_results,
-
+                search_results,
+                on=["Protein Group"],
                 how="left",
             )
         else:
@@ -1762,66 +1903,25 @@ class _UnsupportedSDK(_SeerSDK):
                     "Protein Group",
                     "Protein Names",
                     "Gene Names",
-                    "Biological Process",
-                    "Molecular Function",
-                    "Cellular Component",
                 ]
             ]
-
-
+            report_results.drop_duplicates(inplace=True)
+
+            report_results = fix_peptide_to_protein_group_assignment(
+                report_results
             )
-            report_results = report_results[
-                [
-                    "Precursor.Id",
-                    "Precursor.Charge",
-                    "Peptide",
-                    "Protein Group",
-                    "Protein.Ids",
-                    "Protein.Names",
-                    "Genes",
-                    "First.Protein.Description",
-                    "Modified.Sequence",
-                    "Proteotypic",
-                    "Global.Q.Value",
-                    "Global.PG.Q.Value",
-                    "Lib.Q.Value",
-                    "Lib.PG.Q.Value",
-                ]
-            ]
             report_results.drop_duplicates(
-                subset=["
+                subset=["Peptide", "Modified.Peptide", "Precursor.Charge"],
+                inplace=True,
             )
+
             df = pd.merge(
                 report_results,
                 search_results,
                 on=["Protein Group"],
                 how="left",
             )
-            df = df[
-                [
-                    "Precursor.Id",
-                    "Precursor.Charge",
-                    "Peptide",
-                    "Protein Group",
-                    "Protein.Ids",
-                    "Protein.Names",
-                    "Genes",
-                    "First.Protein.Description",
-                    "Modified.Sequence",
-                    "Proteotypic",
-                    "Global.Q.Value",
-                    "Global.PG.Q.Value",
-                    "Lib.Q.Value",
-                    "Lib.PG.Q.Value",
-                    "Gene Names",
-                    "Biological Process",
-                    "Molecular Function",
-                    "Cellular Component",
-                ]
-            ]
-            df.rename(
-                columns={"Modified.Sequence": "Modified.Peptide"}, inplace=True
-            )
         # endif
         df.columns = [title_case_to_snake_case(x) for x in df.columns]
+
         return df
{seer_pas_sdk-1.1.1.dist-info → seer_pas_sdk-1.2.1.dist-info}/RECORD
CHANGED

@@ -1,19 +1,19 @@
 seer_pas_sdk/__init__.py,sha256=Ie6atdmdBV-OmdHHXjhrGhdFGXiyP3JKhKrr3hyvSsA,563
 seer_pas_sdk/auth/__init__.py,sha256=e_eM4jJnnyKUdg4Nggzi9ypt2MLWcEJ8CmCPkUaQDSs,23
 seer_pas_sdk/auth/auth.py,sha256=_SI5CdEkfqfr4o5BQ79BuPbxGeI9p7tqxJd7mUqSAkI,8854
-seer_pas_sdk/common/__init__.py,sha256=
+seer_pas_sdk/common/__init__.py,sha256=VbtcWOt9_jR5NfETtYTqcUXII98Jj5sphr3lLSdIzdg,24445
 seer_pas_sdk/common/errors.py,sha256=4HFORWnaQQCMXRE8kwdsJWvQRB_3KFEZ7yMb391e4gA,142
 seer_pas_sdk/common/groupanalysis.py,sha256=DxB-gbQfYzl7p9MTYWDIqghcH-IeakzdYdrRZrlIHek,1730
 seer_pas_sdk/core/__init__.py,sha256=rxbKgg-Qe24OaxX2zyHHYPYgDCTEKE_-41bB2wvpvL4,25
-seer_pas_sdk/core/sdk.py,sha256=
-seer_pas_sdk/core/unsupported.py,sha256=
+seer_pas_sdk/core/sdk.py,sha256=0ukg287lsjlSNoV0WqFbiPMURhVogsy_sTR7gg1fr9Q,161512
+seer_pas_sdk/core/unsupported.py,sha256=WcF_Z6ZUpzOWkWQHaMtm9SnE2NveuRmljVfNe8QSbms,72732
 seer_pas_sdk/objects/__init__.py,sha256=r-lY7axLTzToAI-Dme019YfcJLDe2ok1f_e6OQx3j64,130
 seer_pas_sdk/objects/groupanalysis.py,sha256=x3D_5NmYBoPDilNCQqUoCFARIfIeUq4FBY3_N6u8tfM,994
 seer_pas_sdk/objects/headers.py,sha256=RilNzB_Nhid3U8j93BxJYcRrgDmd_1bAuI0P465xd0g,2727
 seer_pas_sdk/objects/platemap.py,sha256=8IvJPAecs_e_FyqibzhCw-O4zjCFnf-zMUp_5krTEsg,5864
 seer_pas_sdk/objects/volcanoplot.py,sha256=lTrTOVg74nT3uo-P1edQJC1ZbdoiLMtQ3VJd9CnzmoM,9396
-seer_pas_sdk-1.
-seer_pas_sdk-1.
-seer_pas_sdk-1.
-seer_pas_sdk-1.
-seer_pas_sdk-1.
+seer_pas_sdk-1.2.1.dist-info/licenses/LICENSE.txt,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
+seer_pas_sdk-1.2.1.dist-info/METADATA,sha256=_zLtgk1zE8eWRPizPS9h2tEbfhJ3DOAH2ePNk4ptwvw,13413
+seer_pas_sdk-1.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+seer_pas_sdk-1.2.1.dist-info/top_level.txt,sha256=-2kZ-KFMGtXwr8H1O5llMKlcJ8gRKohEmrIvazXB61s,13
+seer_pas_sdk-1.2.1.dist-info/RECORD,,
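Each RECORD row above is a CSV triple of file path, hash, and size in bytes (the RECORD entry itself carries no hash or size). A tiny sketch of reading those fields, using two rows copied from the new RECORD:

import csv, io

record = """seer_pas_sdk/common/errors.py,sha256=4HFORWnaQQCMXRE8kwdsJWvQRB_3KFEZ7yMb391e4gA,142
seer_pas_sdk-1.2.1.dist-info/RECORD,,
"""

# Each row unpacks into path, digest, and size; empty fields mean "not recorded".
for path, digest, size in csv.reader(io.StringIO(record)):
    print(path, digest or "<no hash>", size or "<no size>")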