seer-pas-sdk 1.2.0__tar.gz → 1.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/PKG-INFO +3 -2
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/docs/index.qmd +2 -2
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/pyproject.toml +2 -1
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/common/__init__.py +8 -7
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/core/sdk.py +41 -19
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/core/unsupported.py +275 -249
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk.egg-info/PKG-INFO +3 -2
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk.egg-info/requires.txt +2 -1
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/.github/workflows/lint.yml +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/.github/workflows/publish.yml +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/.github/workflows/test.yml +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/.gitignore +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/.pre-commit-config.yaml +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/LICENSE.txt +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/README.md +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/docs/_quarto.yml +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/__init__.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/auth/__init__.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/auth/auth.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/common/errors.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/common/groupanalysis.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/core/__init__.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/objects/__init__.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/objects/groupanalysis.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/objects/headers.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/objects/platemap.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk/objects/volcanoplot.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk.egg-info/SOURCES.txt +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk.egg-info/dependency_links.txt +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/seer_pas_sdk.egg-info/top_level.txt +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/setup.cfg +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/tests/__init__.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/tests/conftest.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/tests/objects/__init__.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/tests/objects/test_platemap.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/tests/test_auth.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/tests/test_common.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/tests/test_objects.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/tests/test_sdk.py +0 -0
- {seer_pas_sdk-1.2.0 → seer_pas_sdk-1.2.2}/tests/unsupported_platemap.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: seer-pas-sdk
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.2.2
|
|
4
4
|
Summary: SDK for Seer Proteograph Analysis Suite (PAS)
|
|
5
5
|
Author-email: Ryan Sun <rsun@seer.bio>
|
|
6
6
|
License:
|
|
@@ -194,9 +194,10 @@ License-File: LICENSE.txt
|
|
|
194
194
|
Requires-Dist: boto3>=1.26.152
|
|
195
195
|
Requires-Dist: botocore>=1.29.152
|
|
196
196
|
Requires-Dist: pandas>=2.0.1
|
|
197
|
+
Requires-Dist: pyarrow>=17.0.0
|
|
197
198
|
Requires-Dist: PyJWT>=2.8.0
|
|
198
199
|
Requires-Dist: python-dotenv>=1.0.0
|
|
199
|
-
Requires-Dist:
|
|
200
|
+
Requires-Dist: requests>=2.31.0
|
|
200
201
|
Requires-Dist: tqdm>=4.65.0
|
|
201
202
|
Requires-Dist: deprecation
|
|
202
203
|
Dynamic: license-file
|
|
@@ -353,7 +353,7 @@ example = sdk.find_msruns(sample_ids)
|
|
|
353
353
|
log(example)
|
|
354
354
|
```
|
|
355
355
|
```
|
|
356
|
-
[{'id': '81c6a180-15e0-11ee-bdf1-bbaa73585acf', '
|
|
356
|
+
[{'id': '81c6a180-15e0-11ee-bdf1-bbaa73585acf', 'sample_uuid': '812139c0-15e0-11ee-bdf1-bbaa73585acf', 'raw_file_path': '7ec8cad0-15e0-11ee-bdf1-bbaa73585acf/20230628182044224/TestFile2.raw', 'well_location': 'D11', 'nanoparticle': '', 'instrument_name': '', 'created_by': '04936dea-d255-4130-8e82-2f28938a8f9a', 'created_timestamp': '2023-06-28T18:20:49.006Z', 'last_modified_by': '04936dea-d255-4130-8e82-2f28938a8f9a', 'last_modified_timestamp': '2023-06-28T18:20:49.006Z', 'user_group': None, 'sample_id': 'A112', 'nanoparticle_id': '', 'control': '', 'control_id': '', 'date_sample_prep': '', 'sample_volume': '', 'peptide_concentration': '', 'peptide_mass_sample': '', 'dilution_factor': '', 'kit_id': None, 'injection_timestamp': None, 'ms_instrument_sn': None, 'recon_volume': None, 'gradient': None}, {'id': '816a9ed0-15e0-11ee-bdf1-bbaa73585acf', 'sample_uuid': '803e05b0-15e0-11ee-bdf1-bbaa73585acf', 'raw_file_path': '7ec8cad0-15e0-11ee-bdf1-bbaa73585acf/20230628182044224/TestFile1.raw', 'well_location': 'C11', 'nanoparticle': 'NONE', 'instrument_name': '', 'created_by': '04936dea-d255-4130-8e82-2f28938a8f9a', 'created_timestamp': '2023-06-28T18:20:48.408Z', 'last_modified_by': '04936dea-d255-4130-8e82-2f28938a8f9a', 'last_modified_timestamp': '2023-06-28T18:20:48.408Z', 'user_group': None, 'sample_id': 'A111', 'nanoparticle_id': 'NONE', 'control': 'MPE Control', 'control_id': 'MPE Control', 'date_sample_prep': '', 'sample_volume': '20.0', 'peptide_concentration': '59.514', 'peptide_mass_sample': '8.57', 'dilution_factor': '1.0', 'kit_id': None, 'injection_timestamp': None, 'ms_instrument_sn': None, 'recon_volume': None, 'gradient': None}]
|
|
357
357
|
```
|
|
358
358
|
|
|
359
359
|
There is also an option to return everything as a DataFrame instead:
|
|
@@ -363,7 +363,7 @@ example = sdk.find_msruns(sample_ids, as_df=True)
|
|
|
363
363
|
log(example)
|
|
364
364
|
```
|
|
365
365
|
```
|
|
366
|
-
id
|
|
366
|
+
id sample_uuid raw_file_path well_location nanoparticle instrument_name created_by created_timestamp last_modified_by last_modified_timestamp space sample_id nanoparticle_id control control_id date_sample_prep sample_volume peptide_concentration peptide_mass_sample dilution_factor kit_id injection_timestamp ms_instrument_sn recon_volume gradient
|
|
367
367
|
0 81c6a180-15e0-11ee-bdf1-bbaa73585acf 812139c0-15e0-11ee-bdf1-bbaa73585acf 7ec8cad0-15e0-11ee-bdf1-bbaa73585acf/202306281... D11 04936dea-d255-4130-8e82-2f28938a8f9a 2023-06-28T18:20:49.006Z 04936dea-d255-4130-8e82-2f28938a8f9a 2023-06-28T18:20:49.006Z None A112 None None None None None
|
|
368
368
|
1 816a9ed0-15e0-11ee-bdf1-bbaa73585acf 803e05b0-15e0-11ee-bdf1-bbaa73585acf 7ec8cad0-15e0-11ee-bdf1-bbaa73585acf/202306281... C11 NONE 04936dea-d255-4130-8e82-2f28938a8f9a 2023-06-28T18:20:48.408Z 04936dea-d255-4130-8e82-2f28938a8f9a 2023-06-28T18:20:48.408Z None A111 NONE MPE Control MPE Control 20.0 59.514 8.57 1.0 None None None None None
|
|
369
369
|
```
|
|
@@ -99,7 +99,7 @@ def dict_to_df(data):
|
|
|
99
99
|
|
|
100
100
|
|
|
101
101
|
# Most cases appear to be a .tsv file.
|
|
102
|
-
def download_df(url, is_tsv=True, dtype={}):
|
|
102
|
+
def download_df(url, is_tsv=True, dtype={}, usecols=None):
|
|
103
103
|
"""
|
|
104
104
|
Fetches a TSV/CSV file from a URL and returns as a Pandas DataFrame.
|
|
105
105
|
|
|
@@ -114,6 +114,9 @@ def download_df(url, is_tsv=True, dtype={}):
|
|
|
114
114
|
dtype : dict
|
|
115
115
|
Data type conversion when intaking columns. e.g. {'a': str, 'b': np.float64}
|
|
116
116
|
|
|
117
|
+
usecols : list
|
|
118
|
+
Subset of columns to download. If not specified, downloads all columns.
|
|
119
|
+
|
|
117
120
|
Returns
|
|
118
121
|
-------
|
|
119
122
|
pandas.core.frame.DataFrame
|
|
@@ -139,12 +142,10 @@ def download_df(url, is_tsv=True, dtype={}):
|
|
|
139
142
|
|
|
140
143
|
if not url:
|
|
141
144
|
return pd.DataFrame()
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
else
|
|
146
|
-
csv = pd.read_csv(url_content, dtype=dtype)
|
|
147
|
-
return csv
|
|
145
|
+
csv = pd.read_csv(
|
|
146
|
+
url, sep="\t" if is_tsv else ",", usecols=usecols, engine="pyarrow"
|
|
147
|
+
)
|
|
148
|
+
return csv.astype(dtype=dtype) if dtype else csv
|
|
148
149
|
|
|
149
150
|
|
|
150
151
|
def get_sample_info(
|
|
@@ -7,7 +7,6 @@ import requests
|
|
|
7
7
|
import urllib.request
|
|
8
8
|
import ssl
|
|
9
9
|
|
|
10
|
-
|
|
11
10
|
from typing import List as _List, Tuple as _Tuple
|
|
12
11
|
|
|
13
12
|
from ..common import *
|
|
@@ -15,8 +14,6 @@ from ..auth import Auth
|
|
|
15
14
|
from ..objects.volcanoplot import VolcanoPlotBuilder
|
|
16
15
|
from ..objects.headers import *
|
|
17
16
|
|
|
18
|
-
import warnings
|
|
19
|
-
|
|
20
17
|
|
|
21
18
|
class SeerSDK:
|
|
22
19
|
"""
|
|
@@ -1228,8 +1225,8 @@ class SeerSDK:
|
|
|
1228
1225
|
|
|
1229
1226
|
>>> seer_sdk.get_msruns(sample_ids)
|
|
1230
1227
|
>>> [
|
|
1231
|
-
{"id": "
|
|
1232
|
-
{"id": "
|
|
1228
|
+
{"id": "MSRUN_ID_1_HERE" ... },
|
|
1229
|
+
{"id": "MSRUN_ID_2_HERE" ... }
|
|
1233
1230
|
]
|
|
1234
1231
|
|
|
1235
1232
|
>>> seer_sdk.get_msruns(sample_ids, as_df=True)
|
|
@@ -1295,8 +1292,8 @@ class SeerSDK:
|
|
|
1295
1292
|
|
|
1296
1293
|
>>> seer_sdk.find_msruns(sample_ids)
|
|
1297
1294
|
>>> [
|
|
1298
|
-
{"id": "
|
|
1299
|
-
{"id": "
|
|
1295
|
+
{"id": "MSRUN_ID_1_HERE" ... },
|
|
1296
|
+
{"id": "MSRUN_ID_2_HERE" ... }
|
|
1300
1297
|
]
|
|
1301
1298
|
|
|
1302
1299
|
>>> seer_sdk.find_msruns(sample_ids, as_df=True)
|
|
@@ -1310,25 +1307,34 @@ class SeerSDK:
|
|
|
1310
1307
|
URL = f"{self._auth.url}api/v1/msdatas/items"
|
|
1311
1308
|
|
|
1312
1309
|
res = []
|
|
1313
|
-
for sample_id in sample_ids:
|
|
1314
1310
|
|
|
1315
|
-
|
|
1311
|
+
params = {"all": "true"}
|
|
1316
1312
|
|
|
1317
|
-
|
|
1313
|
+
with self._get_auth_session("findmsdatas") as s:
|
|
1318
1314
|
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
)
|
|
1315
|
+
msdatas = s.post(
|
|
1316
|
+
URL, json={"sampleId": ",".join(sample_ids)}, params=params
|
|
1317
|
+
)
|
|
1323
1318
|
|
|
1324
|
-
|
|
1319
|
+
if msdatas.status_code != 200 or not msdatas.json()["data"]:
|
|
1320
|
+
raise ValueError(
|
|
1321
|
+
f"Failed to fetch MS data for sample IDs={sample_ids}."
|
|
1322
|
+
)
|
|
1323
|
+
|
|
1324
|
+
res += [x for x in msdatas.json()["data"]]
|
|
1325
1325
|
|
|
1326
1326
|
spaces = {x["id"]: x["usergroup_name"] for x in self.get_spaces()}
|
|
1327
|
+
|
|
1328
|
+
def filepath_to_msrunid(filepath):
|
|
1329
|
+
return os.path.basename(filepath).split(".")[0]
|
|
1330
|
+
|
|
1327
1331
|
for entry in res:
|
|
1328
1332
|
if "tenant_id" in entry:
|
|
1329
1333
|
del entry["tenant_id"]
|
|
1330
1334
|
|
|
1331
1335
|
if "raw_file_path" in entry:
|
|
1336
|
+
# Provide a human-readable MS run id
|
|
1337
|
+
entry["Run"] = filepath_to_msrunid(entry["raw_file_path"])
|
|
1332
1338
|
# Simple lambda function to find the third occurrence of '/' in the raw file path
|
|
1333
1339
|
location = lambda s: len(s) - len(s.split("/", 3)[-1])
|
|
1334
1340
|
# Slicing the string from the location
|
|
@@ -1339,6 +1345,13 @@ class SeerSDK:
|
|
|
1339
1345
|
entry["space"] = spaces.get(entry["user_group"], "General")
|
|
1340
1346
|
del entry["user_group"]
|
|
1341
1347
|
|
|
1348
|
+
# Rename the key sample_id to sample_uuid
|
|
1349
|
+
if "sample_id" in entry:
|
|
1350
|
+
entry["sample_uuid"] = entry.pop("sample_id")
|
|
1351
|
+
# Rename the key sample_id_tracking to sample_id
|
|
1352
|
+
if "sample_id_tracking" in entry:
|
|
1353
|
+
entry["sample_id"] = entry.pop("sample_id_tracking")
|
|
1354
|
+
|
|
1342
1355
|
if not res and as_df:
|
|
1343
1356
|
return pd.DataFrame(columns=MSRUN_COLUMNS)
|
|
1344
1357
|
return res if not as_df else dict_to_df(res)
|
|
@@ -1853,7 +1866,7 @@ class SeerSDK:
|
|
|
1853
1866
|
)
|
|
1854
1867
|
)
|
|
1855
1868
|
except Exception as e:
|
|
1856
|
-
print("
|
|
1869
|
+
print("Error: Could not fetch fasta files.")
|
|
1857
1870
|
res["fasta"] = None
|
|
1858
1871
|
else:
|
|
1859
1872
|
res["fasta"] = None
|
|
@@ -2066,7 +2079,7 @@ class SeerSDK:
|
|
|
2066
2079
|
)
|
|
2067
2080
|
except:
|
|
2068
2081
|
print(
|
|
2069
|
-
f"
|
|
2082
|
+
f"Error: Could not fetch fasta files for analysis {res[entry].get('analysis_name')}."
|
|
2070
2083
|
)
|
|
2071
2084
|
else:
|
|
2072
2085
|
res[entry]["fasta"] = None
|
|
@@ -2382,7 +2395,11 @@ class SeerSDK:
|
|
|
2382
2395
|
return files
|
|
2383
2396
|
|
|
2384
2397
|
def get_search_result(
|
|
2385
|
-
self,
|
|
2398
|
+
self,
|
|
2399
|
+
analysis_id: str,
|
|
2400
|
+
analyte_type: str,
|
|
2401
|
+
rollup: str,
|
|
2402
|
+
columns: _List[str] = None,
|
|
2386
2403
|
):
|
|
2387
2404
|
"""
|
|
2388
2405
|
Load one of the files available via the "Download result files" button on the PAS UI.
|
|
@@ -2423,6 +2440,7 @@ class SeerSDK:
|
|
|
2423
2440
|
"npLink"
|
|
2424
2441
|
]["url"],
|
|
2425
2442
|
dtype=dtype,
|
|
2443
|
+
usecols=columns,
|
|
2426
2444
|
)
|
|
2427
2445
|
elif rollup == "panel":
|
|
2428
2446
|
return download_df(
|
|
@@ -2430,6 +2448,7 @@ class SeerSDK:
|
|
|
2430
2448
|
"panelLink"
|
|
2431
2449
|
]["url"],
|
|
2432
2450
|
dtype=dtype,
|
|
2451
|
+
usecols=columns,
|
|
2433
2452
|
)
|
|
2434
2453
|
elif analyte_type == "peptide":
|
|
2435
2454
|
if rollup == "np":
|
|
@@ -2438,6 +2457,7 @@ class SeerSDK:
|
|
|
2438
2457
|
"npLink"
|
|
2439
2458
|
]["url"],
|
|
2440
2459
|
dtype=dtype,
|
|
2460
|
+
usecols=columns,
|
|
2441
2461
|
)
|
|
2442
2462
|
elif rollup == "panel":
|
|
2443
2463
|
return download_df(
|
|
@@ -2445,12 +2465,14 @@ class SeerSDK:
|
|
|
2445
2465
|
"panelLink"
|
|
2446
2466
|
]["url"],
|
|
2447
2467
|
dtype=dtype,
|
|
2468
|
+
usecols=columns,
|
|
2448
2469
|
)
|
|
2449
2470
|
else:
|
|
2450
2471
|
return download_df(
|
|
2451
2472
|
self.get_search_result_file_url(
|
|
2452
2473
|
analysis_id, filename="report.tsv"
|
|
2453
|
-
)["url"]
|
|
2474
|
+
)["url"],
|
|
2475
|
+
usecols=columns,
|
|
2454
2476
|
)
|
|
2455
2477
|
|
|
2456
2478
|
def download_search_output_file(
|
|
@@ -1471,37 +1471,70 @@ class _UnsupportedSDK(_SeerSDK):
|
|
|
1471
1471
|
Get analyte intensities data for a given PAS analysis.
|
|
1472
1472
|
Args:
|
|
1473
1473
|
analysis_id (str): ID of the analysis.
|
|
1474
|
-
analyte_type (str): Type of the analyte. Must be either 'protein', 'peptide', precursor.
|
|
1474
|
+
analyte_type (str): Type of the analyte. Must be either 'protein', 'peptide', 'precursor'.
|
|
1475
1475
|
rollup (str): Intensities rollup method. Must be either 'np' or 'panel'.
|
|
1476
|
-
norm_method (str): Search engine. Supported engines are: raw, engine, median, median80, pepcal. Default is 'pepcal'.
|
|
1476
|
+
norm_method (str): Search engine. Supported engines are: raw, engine, median, median80, pepcal, pepcal_batch. Default is 'pepcal'.
|
|
1477
1477
|
|
|
1478
1478
|
Returns:
|
|
1479
1479
|
pd.DataFrame: A dataframe with each row containing the analyte intensity measurement:
|
|
1480
1480
|
'msrun_id', 'sample_id', 'nanoparticle' (if rollup is 'np'), 'protein_group', 'peptide' (for 'peptide' and 'precursor' analyte types), 'charge' (for 'precursor' analyte type),
|
|
1481
1481
|
'intensity_log10', 'protein_group_q_value', 'q_value' (for 'precursor' analyte type), 'rt' and 'irt' (for 'peptide' and 'precursor' analyte types)
|
|
1482
1482
|
"""
|
|
1483
|
-
|
|
1483
|
+
|
|
1484
|
+
def filepath_to_msrunid(filepath):
|
|
1485
|
+
return os.path.basename(filepath).split(".")[0]
|
|
1486
|
+
|
|
1487
|
+
# 1. Get samples and msrun data for analysis
|
|
1484
1488
|
samples = self.find_samples(analysis_id=analysis_id)
|
|
1485
|
-
|
|
1489
|
+
|
|
1486
1490
|
sample_uuid_to_id = {s["id"]: s["sample_id"] for s in samples}
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
os.path.basename(msrun["raw_file_path"]).split(".")[0]: msrun
|
|
1491
|
-
for msrun in msruns
|
|
1492
|
-
}
|
|
1493
|
-
sample_to_msrun = {msrun["sample_id"]: msrun for msrun in msruns}
|
|
1491
|
+
sample_id_to_uuid = {s["sample_id"]: s["id"] for s in samples}
|
|
1492
|
+
# FIXME sample_name is not guaranteed to be unique (within PAS analysis)
|
|
1493
|
+
sample_name_to_uuid = {s["sample_name"]: s["id"] for s in samples}
|
|
1494
1494
|
|
|
1495
|
-
|
|
1495
|
+
msruns = self.find_msruns(sample_ids=[s["id"] for s in samples])
|
|
1496
|
+
msrunid_to_info = {msrun["Run"]: msrun for msrun in msruns}
|
|
1496
1497
|
|
|
1497
1498
|
# 2. Get search results
|
|
1498
|
-
# pull the np/panel file, or report.tsv for precursor mode
|
|
1499
|
+
# pull the np/panel file, or the relevant columns from the report.tsv for precursor mode
|
|
1500
|
+
columns = None
|
|
1501
|
+
if analyte_type == "precursor" and rollup == "np":
|
|
1502
|
+
columnsExperiment = ["Run"]
|
|
1503
|
+
columnsProtein = [
|
|
1504
|
+
"Protein.Group",
|
|
1505
|
+
]
|
|
1506
|
+
columnsPeptide = [
|
|
1507
|
+
"Stripped.Sequence",
|
|
1508
|
+
]
|
|
1509
|
+
columnsPrecursor = [
|
|
1510
|
+
"Precursor.Id",
|
|
1511
|
+
"Precursor.Charge",
|
|
1512
|
+
"Precursor.Quantity",
|
|
1513
|
+
"RT",
|
|
1514
|
+
"iRT",
|
|
1515
|
+
"IM",
|
|
1516
|
+
"iIM",
|
|
1517
|
+
]
|
|
1518
|
+
columnsQValue = [
|
|
1519
|
+
"Q.Value",
|
|
1520
|
+
"Protein.Q.Value",
|
|
1521
|
+
]
|
|
1522
|
+
columns = [
|
|
1523
|
+
*columnsExperiment,
|
|
1524
|
+
*columnsProtein,
|
|
1525
|
+
*columnsPeptide,
|
|
1526
|
+
*columnsPrecursor,
|
|
1527
|
+
*columnsQValue,
|
|
1528
|
+
]
|
|
1499
1529
|
search_results = self.get_search_result(
|
|
1500
1530
|
analysis_id=analysis_id,
|
|
1501
1531
|
analyte_type=analyte_type,
|
|
1502
1532
|
rollup=rollup,
|
|
1533
|
+
columns=columns,
|
|
1503
1534
|
)
|
|
1535
|
+
|
|
1504
1536
|
if analyte_type in ["protein", "peptide"]:
|
|
1537
|
+
# set the intensity column based on norm_method and PAS analysis protocol version
|
|
1505
1538
|
intensity_column = None
|
|
1506
1539
|
if norm_method == "raw":
|
|
1507
1540
|
intensity_column = (
|
|
@@ -1543,139 +1576,171 @@ class _UnsupportedSDK(_SeerSDK):
|
|
|
1543
1576
|
raise ValueError(
|
|
1544
1577
|
"Pepcal normalized intensities not found in search results. This is only available with analyses processed with DIA-NN Seer Protocol v2.0 or later with the Seer Peptide Calibrant option enabled. \n Please retry using different norm_method, such as 'median'"
|
|
1545
1578
|
)
|
|
1546
|
-
|
|
1547
1579
|
intensity_column = "PepCal Intensities Log10"
|
|
1548
|
-
|
|
1580
|
+
elif norm_method == "pepcal_batch":
|
|
1581
|
+
if not (
|
|
1582
|
+
"PepCal Batch Intensities Log10" in search_results.columns
|
|
1583
|
+
):
|
|
1584
|
+
raise ValueError(
|
|
1585
|
+
"Pepcal normalized batch corrected intensities not found in search results. This is only available with analyses processed with DIA-NN Seer Protocol v2.0 or later with the Seer Peptide Calibrant option enabled. \n Please retry using different norm_method, such as 'median'"
|
|
1586
|
+
)
|
|
1587
|
+
intensity_column = "PepCal Batch Intensities Log10"
|
|
1549
1588
|
else:
|
|
1550
1589
|
raise ValueError(
|
|
1551
1590
|
f"norm_method = {norm_method} is not supported. Supported normalization methods are: raw, pepcal, engine, median, median80."
|
|
1552
1591
|
)
|
|
1553
|
-
if rollup == "panel":
|
|
1554
|
-
search_results.fillna({"Sample Name": ""}, inplace=True)
|
|
1555
|
-
search_results["File Name"] = search_results[
|
|
1556
|
-
"Sample Name"
|
|
1557
|
-
].apply(
|
|
1558
|
-
lambda x: (
|
|
1559
|
-
os.path.basename(
|
|
1560
|
-
sample_to_msrun[sample_name_to_id[x]][
|
|
1561
|
-
"raw_file_path"
|
|
1562
|
-
]
|
|
1563
|
-
).split(".")[0]
|
|
1564
|
-
if x
|
|
1565
|
-
else None
|
|
1566
|
-
)
|
|
1567
|
-
)
|
|
1568
|
-
search_results["File Name"] = search_results["File Name"].apply(
|
|
1569
|
-
lambda x: os.path.basename(x).split(".")[0] if x else None
|
|
1570
|
-
)
|
|
1571
1592
|
|
|
1572
1593
|
search_results["Intensity Log10"] = search_results[
|
|
1573
1594
|
intensity_column
|
|
1574
1595
|
]
|
|
1575
1596
|
|
|
1576
|
-
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
analyte_type="precursor",
|
|
1580
|
-
rollup="np",
|
|
1581
|
-
)
|
|
1582
|
-
report["File Name"] = report["Run"]
|
|
1583
|
-
report["Protein Group"] = report["Protein.Group"]
|
|
1584
|
-
|
|
1585
|
-
if analyte_type == "protein":
|
|
1586
|
-
report["Protein Q Value"] = report["Protein.Q.Value"]
|
|
1587
|
-
|
|
1588
|
-
report = report[
|
|
1589
|
-
["File Name", "Protein Group", "Protein Q Value"]
|
|
1590
|
-
]
|
|
1591
|
-
report.drop_duplicates(
|
|
1592
|
-
subset=["File Name", "Protein Group"], inplace=True
|
|
1597
|
+
if rollup == "panel":
|
|
1598
|
+
search_results.rename(
|
|
1599
|
+
columns={"Sample ID": "Sample UUID"}, inplace=True
|
|
1593
1600
|
)
|
|
1594
|
-
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
|
|
1601
|
+
search_results["Sample UUID"] = search_results[
|
|
1602
|
+
"Sample Name"
|
|
1603
|
+
].map(sample_name_to_uuid)
|
|
1604
|
+
search_results["Sample ID"] = search_results[
|
|
1605
|
+
"Sample UUID"
|
|
1606
|
+
].map(sample_uuid_to_id)
|
|
1607
|
+
experiment_columns = ["Sample UUID", "Sample ID"]
|
|
1608
|
+
|
|
1609
|
+
# analyte info is limited to the id in the panel rollup
|
|
1610
|
+
if analyte_type == "protein":
|
|
1611
|
+
analyte_id_column = "Protein Group"
|
|
1612
|
+
else:
|
|
1613
|
+
analyte_id_column = "Peptide"
|
|
1614
|
+
|
|
1615
|
+
analyte_columns = [analyte_id_column]
|
|
1616
|
+
df = search_results
|
|
1617
|
+
else:
|
|
1618
|
+
# np rollup, extract basename without extension
|
|
1619
|
+
path_to_msrunid = {
|
|
1620
|
+
path: filepath_to_msrunid(path)
|
|
1621
|
+
for path in search_results["File Name"].unique()
|
|
1622
|
+
}
|
|
1623
|
+
# strip path from the filename to allow merging with the precursor report
|
|
1624
|
+
search_results["Run"] = search_results["File Name"].map(
|
|
1625
|
+
path_to_msrunid
|
|
1599
1626
|
)
|
|
1600
|
-
included_columns = [
|
|
1601
|
-
"MsRun ID",
|
|
1602
|
-
"Sample ID",
|
|
1603
|
-
"Protein Group",
|
|
1604
|
-
"Intensity Log10",
|
|
1605
|
-
"Protein Q Value",
|
|
1606
|
-
]
|
|
1607
1627
|
|
|
1608
|
-
|
|
1609
|
-
|
|
1610
|
-
# If analyte_type is peptide, attach retention time (RT, iRT)
|
|
1611
|
-
report = report[["File Name", "Peptide", "RT", "iRT"]]
|
|
1612
|
-
report.drop_duplicates(
|
|
1613
|
-
subset=["File Name", "Peptide"], inplace=True
|
|
1628
|
+
search_results["MsRun UUID"] = search_results["Run"].map(
|
|
1629
|
+
{k: v["id"] for k, v in msrunid_to_info.items()}
|
|
1614
1630
|
)
|
|
1615
|
-
|
|
1616
|
-
|
|
1617
|
-
|
|
1618
|
-
|
|
1619
|
-
|
|
1631
|
+
search_results["Sample ID"] = search_results["Run"].map(
|
|
1632
|
+
{k: v["sample_id"] for k, v in msrunid_to_info.items()}
|
|
1633
|
+
)
|
|
1634
|
+
search_results["Sample UUID"] = search_results["Run"].map(
|
|
1635
|
+
{k: v["sample_uuid"] for k, v in msrunid_to_info.items()}
|
|
1620
1636
|
)
|
|
1621
|
-
|
|
1622
|
-
"
|
|
1637
|
+
search_results["Nanoparticle"] = search_results["Run"].map(
|
|
1638
|
+
{k: v["nanoparticle"] for k, v in msrunid_to_info.items()}
|
|
1639
|
+
)
|
|
1640
|
+
experiment_columns = [
|
|
1641
|
+
"MsRun UUID",
|
|
1642
|
+
"Run",
|
|
1643
|
+
"Nanoparticle",
|
|
1644
|
+
"Sample UUID",
|
|
1623
1645
|
"Sample ID",
|
|
1624
|
-
"Peptide",
|
|
1625
|
-
"Protein Group",
|
|
1626
|
-
"Intensity Log10",
|
|
1627
|
-
"RT",
|
|
1628
|
-
"iRT",
|
|
1629
1646
|
]
|
|
1630
|
-
# endif
|
|
1631
1647
|
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
|
|
1648
|
+
# Merge report to search results to get Q value and other properties
|
|
1649
|
+
if analyte_type == "protein":
|
|
1650
|
+
columns = ["Run", "Protein.Group", "Protein.Q.Value"]
|
|
1651
|
+
elif analyte_type == "peptide":
|
|
1652
|
+
columns = ["Run", "Stripped.Sequence", "Protein.Q.Value"]
|
|
1653
|
+
analytes = self.get_search_result(
|
|
1654
|
+
analysis_id=analysis_id,
|
|
1655
|
+
analyte_type="precursor",
|
|
1656
|
+
rollup="np",
|
|
1657
|
+
columns=columns,
|
|
1658
|
+
)
|
|
1659
|
+
# pandas Dataframe.rename() default behavior is to ignore the columns that do not exist in the data frame.
|
|
1660
|
+
analytes.rename(
|
|
1661
|
+
columns={
|
|
1662
|
+
"Protein.Group": "Protein Group",
|
|
1663
|
+
"Protein.Q.Value": "Protein Q Value",
|
|
1664
|
+
"Stripped.Sequence": "Peptide",
|
|
1665
|
+
},
|
|
1666
|
+
inplace=True,
|
|
1635
1667
|
)
|
|
1636
1668
|
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
|
|
1669
|
+
if analyte_type == "protein":
|
|
1670
|
+
analyte_id_column = "Protein Group"
|
|
1671
|
+
analyte_columns = [
|
|
1672
|
+
analyte_id_column,
|
|
1673
|
+
"Protein Q Value",
|
|
1674
|
+
]
|
|
1675
|
+
|
|
1676
|
+
else:
|
|
1677
|
+
analyte_id_column = "Peptide"
|
|
1678
|
+
analyte_columns = [analyte_id_column]
|
|
1679
|
+
# endif analyte_type
|
|
1680
|
+
|
|
1681
|
+
analytes.drop(
|
|
1682
|
+
columns=[
|
|
1683
|
+
col
|
|
1684
|
+
for col in analytes.columns
|
|
1685
|
+
if col != "Run" and col not in analyte_columns
|
|
1686
|
+
],
|
|
1687
|
+
inplace=True,
|
|
1640
1688
|
)
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
lambda x: (
|
|
1644
|
-
file_to_msrun[x]["sample_id"]
|
|
1645
|
-
if x in file_to_msrun
|
|
1646
|
-
else None
|
|
1689
|
+
analytes.drop_duplicates(
|
|
1690
|
+
subset=["Run", analyte_id_column], inplace=True
|
|
1647
1691
|
)
|
|
1648
|
-
|
|
1649
|
-
|
|
1692
|
+
df = pd.merge(
|
|
1693
|
+
search_results,
|
|
1694
|
+
analytes,
|
|
1695
|
+
on=["Run", analyte_id_column],
|
|
1696
|
+
how="left",
|
|
1697
|
+
validate="one_to_one",
|
|
1698
|
+
)
|
|
1699
|
+
|
|
1700
|
+
df = df[experiment_columns + analyte_columns + ["Intensity Log10"]]
|
|
1650
1701
|
|
|
1651
1702
|
else:
|
|
1652
1703
|
# precursor
|
|
1653
1704
|
# working only in report.tsv
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
file_to_msrun[x]["id"] if x in file_to_msrun else None
|
|
1705
|
+
if norm_method != "raw":
|
|
1706
|
+
raise ValueError(
|
|
1707
|
+
"For precursor analyte type, only 'raw' norm_method is supported."
|
|
1658
1708
|
)
|
|
1709
|
+
|
|
1710
|
+
search_results["MsRun UUID"] = search_results["Run"].map(
|
|
1711
|
+
{k: v["id"] for k, v in msrunid_to_info.items()}
|
|
1659
1712
|
)
|
|
1660
|
-
search_results["Sample ID"] = search_results["Run"].
|
|
1661
|
-
|
|
1662
|
-
file_to_msrun[x]["sample_id"]
|
|
1663
|
-
if x in file_to_msrun
|
|
1664
|
-
else None
|
|
1665
|
-
)
|
|
1713
|
+
search_results["Sample ID"] = search_results["Run"].map(
|
|
1714
|
+
{k: v["sample_id"] for k, v in msrunid_to_info.items()}
|
|
1666
1715
|
)
|
|
1667
|
-
search_results["
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
search_results["
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1716
|
+
search_results["Sample UUID"] = search_results["Sample ID"].map(
|
|
1717
|
+
sample_id_to_uuid
|
|
1718
|
+
)
|
|
1719
|
+
search_results["Nanoparticle"] = search_results["Run"].map(
|
|
1720
|
+
{k: v["nanoparticle"] for k, v in msrunid_to_info.items()}
|
|
1721
|
+
)
|
|
1722
|
+
experiment_columns = [
|
|
1723
|
+
"MsRun UUID",
|
|
1724
|
+
"Run",
|
|
1725
|
+
"Nanoparticle",
|
|
1726
|
+
"Sample UUID",
|
|
1727
|
+
"Sample ID",
|
|
1674
1728
|
]
|
|
1675
1729
|
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1730
|
+
search_results.rename(
|
|
1731
|
+
columns={
|
|
1732
|
+
"Protein.Group": "Protein Group",
|
|
1733
|
+
"Stripped.Sequence": "Peptide",
|
|
1734
|
+
"Precursor.Charge": "Charge",
|
|
1735
|
+
"Precursor.Id": "Precursor Id",
|
|
1736
|
+
"Q.Value": "Precursor Q Value",
|
|
1737
|
+
"Protein.Q.Value": "Protein Q Value",
|
|
1738
|
+
"Precursor.Quantity": "Intensity",
|
|
1739
|
+
},
|
|
1740
|
+
inplace=True,
|
|
1741
|
+
)
|
|
1742
|
+
|
|
1743
|
+
analyte_columns = [
|
|
1679
1744
|
"Protein Group",
|
|
1680
1745
|
"Protein Q Value",
|
|
1681
1746
|
"Peptide",
|
|
@@ -1688,16 +1753,12 @@ class _UnsupportedSDK(_SeerSDK):
|
|
|
1688
1753
|
"IM",
|
|
1689
1754
|
"iIM",
|
|
1690
1755
|
]
|
|
1691
|
-
df = pd.DataFrame(
|
|
1756
|
+
df = pd.DataFrame(
|
|
1757
|
+
search_results[experiment_columns + analyte_columns]
|
|
1758
|
+
)
|
|
1692
1759
|
|
|
1693
1760
|
df.columns = [title_case_to_snake_case(x) for x in df.columns]
|
|
1694
|
-
df["sample_uuid"] = df["sample_id"]
|
|
1695
|
-
df["sample_id"] = df["sample_uuid"].apply(
|
|
1696
|
-
lambda x: sample_uuid_to_id.get(x)
|
|
1697
|
-
)
|
|
1698
1761
|
|
|
1699
|
-
if rollup == "panel":
|
|
1700
|
-
df.drop(columns=["msrun_id"], inplace=True, errors="ignore")
|
|
1701
1762
|
return df
|
|
1702
1763
|
|
|
1703
1764
|
def get_search_data_analytes(self, analysis_id: str, analyte_type: str):
|
|
@@ -1714,10 +1775,6 @@ class _UnsupportedSDK(_SeerSDK):
|
|
|
1714
1775
|
analysis_id=analysis_id, analyte_type="protein", rollup="np"
|
|
1715
1776
|
)
|
|
1716
1777
|
|
|
1717
|
-
report_results = self.get_search_result(
|
|
1718
|
-
analysis_id=analysis_id, analyte_type="precursor", rollup="np"
|
|
1719
|
-
)
|
|
1720
|
-
|
|
1721
1778
|
search_results = search_results[
|
|
1722
1779
|
[
|
|
1723
1780
|
"Protein Group",
|
|
@@ -1729,18 +1786,87 @@ class _UnsupportedSDK(_SeerSDK):
|
|
|
1729
1786
|
]
|
|
1730
1787
|
]
|
|
1731
1788
|
search_results.drop_duplicates(subset=["Protein Group"], inplace=True)
|
|
1732
|
-
report_results["Protein Group"] = report_results["Protein.Group"]
|
|
1733
|
-
report_results["Peptide"] = report_results["Stripped.Sequence"]
|
|
1734
1789
|
|
|
1735
|
-
|
|
1736
|
-
|
|
1790
|
+
# 2. fetch precursor report to extract analyte-specific details
|
|
1791
|
+
columnsPG = [
|
|
1792
|
+
"Protein.Group",
|
|
1793
|
+
]
|
|
1794
|
+
columnsPeptide = [
|
|
1795
|
+
"Protein.Ids",
|
|
1796
|
+
"Stripped.Sequence",
|
|
1797
|
+
"Proteotypic",
|
|
1798
|
+
]
|
|
1799
|
+
columnsPrecursor = [
|
|
1800
|
+
"Precursor.Id",
|
|
1801
|
+
"Precursor.Charge",
|
|
1802
|
+
"Precursor.Quantity",
|
|
1803
|
+
"Modified.Sequence",
|
|
1804
|
+
]
|
|
1805
|
+
columnsPGQValue = [
|
|
1806
|
+
"Global.PG.Q.Value",
|
|
1807
|
+
"Lib.PG.Q.Value",
|
|
1808
|
+
]
|
|
1809
|
+
columnsPrecursorQValue = [
|
|
1810
|
+
"Global.Q.Value",
|
|
1811
|
+
"Lib.Q.Value",
|
|
1812
|
+
]
|
|
1813
|
+
columns = [
|
|
1814
|
+
*columnsPG,
|
|
1815
|
+
*columnsPGQValue,
|
|
1816
|
+
]
|
|
1817
|
+
if analyte_type == "peptide":
|
|
1818
|
+
columns += [*columnsPeptide]
|
|
1819
|
+
elif analyte_type == "precursor":
|
|
1820
|
+
columns += [
|
|
1821
|
+
*columnsPeptide,
|
|
1822
|
+
*columnsPrecursor,
|
|
1823
|
+
*columnsPrecursorQValue,
|
|
1824
|
+
]
|
|
1825
|
+
report_results = self.get_search_result(
|
|
1826
|
+
analysis_id=analysis_id,
|
|
1827
|
+
analyte_type="precursor",
|
|
1828
|
+
rollup="np",
|
|
1829
|
+
columns=columns,
|
|
1830
|
+
)
|
|
1831
|
+
report_results.rename(
|
|
1832
|
+
columns={
|
|
1833
|
+
"Protein.Group": "Protein Group",
|
|
1834
|
+
"Stripped.Sequence": "Peptide",
|
|
1835
|
+
"Modified.Sequence": "Modified.Peptide",
|
|
1836
|
+
},
|
|
1837
|
+
inplace=True,
|
|
1838
|
+
)
|
|
1839
|
+
|
|
1840
|
+
# function to fix the potential bug, where different precursors
|
|
1841
|
+
# of the same peptide map to different protein groups
|
|
1842
|
+
def fix_peptide_to_protein_group_assignment(
|
|
1843
|
+
df: pd.DataFrame,
|
|
1844
|
+
) -> pd.DataFrame:
|
|
1845
|
+
# for each peptide, sort protein groups by confidence
|
|
1846
|
+
df = df.sort_values(
|
|
1737
1847
|
[
|
|
1738
|
-
"
|
|
1739
|
-
"Protein.Ids",
|
|
1848
|
+
"Peptide",
|
|
1740
1849
|
"Global.PG.Q.Value",
|
|
1741
1850
|
"Lib.PG.Q.Value",
|
|
1851
|
+
"Protein Group",
|
|
1742
1852
|
]
|
|
1743
|
-
|
|
1853
|
+
)
|
|
1854
|
+
|
|
1855
|
+
# broadcast the best protein group across all rows with the same peptide
|
|
1856
|
+
# to fix the potential bug, where different precursors of the same peptide
|
|
1857
|
+
# map to different protein groups
|
|
1858
|
+
for col in [
|
|
1859
|
+
"Protein Group",
|
|
1860
|
+
"Protein.Ids",
|
|
1861
|
+
"Protein.Names",
|
|
1862
|
+
"Genes",
|
|
1863
|
+
]:
|
|
1864
|
+
if col in df.columns:
|
|
1865
|
+
df[col] = df.groupby("Peptide")[col].transform("first")
|
|
1866
|
+
|
|
1867
|
+
return df
|
|
1868
|
+
|
|
1869
|
+
if analyte_type == "protein":
|
|
1744
1870
|
report_results.drop_duplicates(
|
|
1745
1871
|
subset=["Protein Group"], inplace=True
|
|
1746
1872
|
)
|
|
@@ -1751,41 +1877,18 @@ class _UnsupportedSDK(_SeerSDK):
|
|
|
1751
1877
|
how="left",
|
|
1752
1878
|
)
|
|
1753
1879
|
elif analyte_type == "peptide":
|
|
1754
|
-
|
|
1755
|
-
# The below logic performs the following:
|
|
1756
|
-
# 1. orders each peptide group by Global.PG.Q.Value, Lib.PG.Q.Value, and Protein Group (ascending)
|
|
1757
|
-
# 2. for each peptide group, select the first row to find the precursor with the lowest Q values
|
|
1758
|
-
# 3. broadcasts the associated protein group columns across all rows with the same peptide.
|
|
1759
|
-
#
|
|
1760
|
-
# This ensures that for each peptide, we retain consistent protein information while avoiding duplication.
|
|
1761
|
-
|
|
1762
|
-
report_results = report_results.sort_values(
|
|
1880
|
+
search_results = search_results[
|
|
1763
1881
|
[
|
|
1764
|
-
"Peptide",
|
|
1765
|
-
"Global.PG.Q.Value",
|
|
1766
|
-
"Lib.PG.Q.Value",
|
|
1767
1882
|
"Protein Group",
|
|
1883
|
+
"Protein Names",
|
|
1884
|
+
"Gene Names",
|
|
1768
1885
|
]
|
|
1886
|
+
]
|
|
1887
|
+
report_results.drop_duplicates(inplace=True)
|
|
1888
|
+
report_results = fix_peptide_to_protein_group_assignment(
|
|
1889
|
+
report_results
|
|
1769
1890
|
)
|
|
1770
|
-
|
|
1771
|
-
columns_to_broadcast = ["Protein Group", "Protein.Ids"]
|
|
1772
|
-
broadcasted = (
|
|
1773
|
-
report_results.groupby("Peptide")
|
|
1774
|
-
.apply(
|
|
1775
|
-
lambda x: pd.Series(
|
|
1776
|
-
{
|
|
1777
|
-
col: x.iloc[0][col]
|
|
1778
|
-
for col in columns_to_broadcast + ["Peptide"]
|
|
1779
|
-
}
|
|
1780
|
-
)
|
|
1781
|
-
)
|
|
1782
|
-
.reset_index(drop=True)
|
|
1783
|
-
)
|
|
1784
|
-
report_results = (
|
|
1785
|
-
report_results.drop(columns=columns_to_broadcast)
|
|
1786
|
-
.merge(broadcasted, on="Peptide", how="left")
|
|
1787
|
-
.drop_duplicates(subset=["Peptide"])
|
|
1788
|
-
)
|
|
1891
|
+
report_results.drop_duplicates(subset=["Peptide"], inplace=True)
|
|
1789
1892
|
|
|
1790
1893
|
df = pd.merge(
|
|
1791
1894
|
report_results,
|
|
@@ -1793,15 +1896,6 @@ class _UnsupportedSDK(_SeerSDK):
|
|
|
1793
1896
|
on=["Protein Group"],
|
|
1794
1897
|
how="left",
|
|
1795
1898
|
)
|
|
1796
|
-
df = df[
|
|
1797
|
-
[
|
|
1798
|
-
"Peptide",
|
|
1799
|
-
"Protein Group",
|
|
1800
|
-
"Protein.Ids",
|
|
1801
|
-
"Protein Names",
|
|
1802
|
-
"Gene Names",
|
|
1803
|
-
]
|
|
1804
|
-
]
|
|
1805
1899
|
else:
|
|
1806
1900
|
# precursor
|
|
1807
1901
|
search_results = search_results[
|
|
@@ -1811,91 +1905,23 @@ class _UnsupportedSDK(_SeerSDK):
|
|
|
1811
1905
|
"Gene Names",
|
|
1812
1906
|
]
|
|
1813
1907
|
]
|
|
1814
|
-
|
|
1815
|
-
subset=["Protein Group"], inplace=True
|
|
1816
|
-
)
|
|
1817
|
-
report_results = report_results[
|
|
1818
|
-
[
|
|
1819
|
-
"Precursor.Id",
|
|
1820
|
-
"Precursor.Charge",
|
|
1821
|
-
"Peptide",
|
|
1822
|
-
"Protein Group",
|
|
1823
|
-
"Protein.Ids",
|
|
1824
|
-
"Protein.Names",
|
|
1825
|
-
"Genes",
|
|
1826
|
-
"Modified.Sequence",
|
|
1827
|
-
"Proteotypic",
|
|
1828
|
-
"Global.Q.Value",
|
|
1829
|
-
"Global.PG.Q.Value",
|
|
1830
|
-
"Lib.Q.Value",
|
|
1831
|
-
"Lib.PG.Q.Value",
|
|
1832
|
-
]
|
|
1833
|
-
]
|
|
1908
|
+
report_results.drop_duplicates(inplace=True)
|
|
1834
1909
|
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
# 2. for each peptide group, select the first row to find the precursor with the lowest Q values
|
|
1838
|
-
# 3. broadcasts the associated protein group columns across all rows with the same peptide.
|
|
1839
|
-
#
|
|
1840
|
-
# This ensures that for each peptide, we retain consistent protein information while avoiding duplication.
|
|
1841
|
-
columns_to_broadcast = [
|
|
1842
|
-
"Protein Group",
|
|
1843
|
-
"Protein.Ids",
|
|
1844
|
-
"Protein.Names",
|
|
1845
|
-
"Genes",
|
|
1846
|
-
]
|
|
1847
|
-
report_results = report_results.sort_values(
|
|
1848
|
-
[
|
|
1849
|
-
"Peptide",
|
|
1850
|
-
"Global.PG.Q.Value",
|
|
1851
|
-
"Lib.PG.Q.Value",
|
|
1852
|
-
"Protein Group",
|
|
1853
|
-
],
|
|
1854
|
-
)
|
|
1855
|
-
broadcasted = (
|
|
1856
|
-
report_results.groupby("Peptide")
|
|
1857
|
-
.apply(
|
|
1858
|
-
lambda x: pd.Series(
|
|
1859
|
-
{
|
|
1860
|
-
col: x.iloc[0][col]
|
|
1861
|
-
for col in columns_to_broadcast + ["Peptide"]
|
|
1862
|
-
}
|
|
1863
|
-
)
|
|
1864
|
-
)
|
|
1865
|
-
.reset_index(drop=True)
|
|
1910
|
+
report_results = fix_peptide_to_protein_group_assignment(
|
|
1911
|
+
report_results
|
|
1866
1912
|
)
|
|
1867
|
-
report_results
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
.drop_duplicates(subset=["Peptide", "Precursor.Charge"])
|
|
1913
|
+
report_results.drop_duplicates(
|
|
1914
|
+
subset=["Peptide", "Modified.Peptide", "Precursor.Charge"],
|
|
1915
|
+
inplace=True,
|
|
1871
1916
|
)
|
|
1917
|
+
|
|
1872
1918
|
df = pd.merge(
|
|
1873
1919
|
report_results,
|
|
1874
1920
|
search_results,
|
|
1875
1921
|
on=["Protein Group"],
|
|
1876
1922
|
how="left",
|
|
1877
1923
|
)
|
|
1878
|
-
df = df[
|
|
1879
|
-
[
|
|
1880
|
-
"Precursor.Id",
|
|
1881
|
-
"Precursor.Charge",
|
|
1882
|
-
"Peptide",
|
|
1883
|
-
"Protein Group",
|
|
1884
|
-
"Protein.Ids",
|
|
1885
|
-
"Protein.Names",
|
|
1886
|
-
"Genes",
|
|
1887
|
-
"Modified.Sequence",
|
|
1888
|
-
"Proteotypic",
|
|
1889
|
-
"Global.Q.Value",
|
|
1890
|
-
"Global.PG.Q.Value",
|
|
1891
|
-
"Lib.Q.Value",
|
|
1892
|
-
"Lib.PG.Q.Value",
|
|
1893
|
-
"Gene Names",
|
|
1894
|
-
]
|
|
1895
|
-
]
|
|
1896
|
-
df.rename(
|
|
1897
|
-
columns={"Modified.Sequence": "Modified.Peptide"}, inplace=True
|
|
1898
|
-
)
|
|
1899
1924
|
# endif
|
|
1900
1925
|
df.columns = [title_case_to_snake_case(x) for x in df.columns]
|
|
1926
|
+
|
|
1901
1927
|
return df
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: seer-pas-sdk
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.2.2
|
|
4
4
|
Summary: SDK for Seer Proteograph Analysis Suite (PAS)
|
|
5
5
|
Author-email: Ryan Sun <rsun@seer.bio>
|
|
6
6
|
License:
|
|
@@ -194,9 +194,10 @@ License-File: LICENSE.txt
|
|
|
194
194
|
Requires-Dist: boto3>=1.26.152
|
|
195
195
|
Requires-Dist: botocore>=1.29.152
|
|
196
196
|
Requires-Dist: pandas>=2.0.1
|
|
197
|
+
Requires-Dist: pyarrow>=17.0.0
|
|
197
198
|
Requires-Dist: PyJWT>=2.8.0
|
|
198
199
|
Requires-Dist: python-dotenv>=1.0.0
|
|
199
|
-
Requires-Dist:
|
|
200
|
+
Requires-Dist: requests>=2.31.0
|
|
200
201
|
Requires-Dist: tqdm>=4.65.0
|
|
201
202
|
Requires-Dist: deprecation
|
|
202
203
|
Dynamic: license-file
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|