quantms-utils 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quantms_utils-0.0.2/quantms_utils.egg-info → quantms_utils-0.0.4}/PKG-INFO +12 -4
- {quantms_utils-0.0.2 → quantms_utils-0.0.4/quantms_utils.egg-info}/PKG-INFO +12 -4
- quantms_utils-0.0.4/quantms_utils.egg-info/requires.txt +16 -0
- quantms_utils-0.0.4/quantmsutils/__init__.py +1 -0
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/diann/diann2mztab.py +93 -94
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/mzml/mzml_statistics.py +20 -13
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/psm/psm_conversion.py +19 -12
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/quantmsutilsc.py +11 -9
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/rescoring/ms2rescore.py +56 -29
- quantms_utils-0.0.4/quantmsutils/sdrf/check_samplesheet.py +198 -0
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/sdrf/extract_sample.py +3 -2
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/setup.py +30 -9
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/tests/test_commands.py +28 -9
- quantms_utils-0.0.2/quantms_utils.egg-info/requires.txt +0 -8
- quantms_utils-0.0.2/quantmsutils/sdrf/__init__.py +0 -0
- quantms_utils-0.0.2/quantmsutils/sdrf/check_samplesheet.py +0 -134
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/LICENSE +0 -0
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/README.md +0 -0
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/pyproject.toml +0 -0
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantms_utils.egg-info/SOURCES.txt +0 -0
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantms_utils.egg-info/dependency_links.txt +0 -0
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantms_utils.egg-info/entry_points.txt +0 -0
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantms_utils.egg-info/top_level.txt +0 -0
- {quantms_utils-0.0.2/quantmsutils → quantms_utils-0.0.4/quantmsutils/diann}/__init__.py +0 -0
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/diann/dianncfg.py +0 -0
- {quantms_utils-0.0.2/quantmsutils/diann → quantms_utils-0.0.4/quantmsutils/features}/__init__.py +0 -0
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/features/sage_feature.py +1 -1
- {quantms_utils-0.0.2/quantmsutils/features → quantms_utils-0.0.4/quantmsutils/mzml}/__init__.py +0 -0
- {quantms_utils-0.0.2/quantmsutils/mzml → quantms_utils-0.0.4/quantmsutils/psm}/__init__.py +0 -0
- {quantms_utils-0.0.2/quantmsutils/psm → quantms_utils-0.0.4/quantmsutils/rescoring}/__init__.py +0 -0
- {quantms_utils-0.0.2/quantmsutils/rescoring → quantms_utils-0.0.4/quantmsutils/sdrf}/__init__.py +0 -0
- {quantms_utils-0.0.2 → quantms_utils-0.0.4}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: quantms-utils
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: Python package with scripts and helpers for the QuantMS workflow
|
|
5
5
|
Home-page: https://www.github.com/bigbio/pyquantms
|
|
6
6
|
Author: Yasset Perez-Riverol, Dai Chengxin
|
|
@@ -20,13 +20,21 @@ Requires-Python: >=3.8,<4
|
|
|
20
20
|
Description-Content-Type: text/markdown
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: click
|
|
23
|
-
Requires-Dist: sdrf-pipelines
|
|
23
|
+
Requires-Dist: sdrf-pipelines>=0.0.29
|
|
24
24
|
Requires-Dist: pyopenms
|
|
25
|
-
Requires-Dist: ms2rescore==3.0.
|
|
26
|
-
Requires-Dist:
|
|
25
|
+
Requires-Dist: ms2rescore==3.0.3
|
|
26
|
+
Requires-Dist: deeplc==2.2.27
|
|
27
|
+
Requires-Dist: ms2pip==4.0.0.dev8
|
|
28
|
+
Requires-Dist: psm-utils==0.8.2
|
|
29
|
+
Requires-Dist: deeplcretrainer==0.2.11
|
|
27
30
|
Requires-Dist: pydantic
|
|
28
31
|
Requires-Dist: pandas
|
|
32
|
+
Requires-Dist: protobuf<4,>=3.9.2
|
|
29
33
|
Requires-Dist: numpy
|
|
34
|
+
Requires-Dist: pyarrow
|
|
35
|
+
Requires-Dist: pygam
|
|
36
|
+
Requires-Dist: scipy
|
|
37
|
+
Requires-Dist: scikit-learn
|
|
30
38
|
|
|
31
39
|
# quantms-utils
|
|
32
40
|
[](https://github.com/bigbio/quantms-utils/actions/workflows/python-app.yml)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: quantms-utils
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: Python package with scripts and helpers for the QuantMS workflow
|
|
5
5
|
Home-page: https://www.github.com/bigbio/pyquantms
|
|
6
6
|
Author: Yasset Perez-Riverol, Dai Chengxin
|
|
@@ -20,13 +20,21 @@ Requires-Python: >=3.8,<4
|
|
|
20
20
|
Description-Content-Type: text/markdown
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: click
|
|
23
|
-
Requires-Dist: sdrf-pipelines
|
|
23
|
+
Requires-Dist: sdrf-pipelines>=0.0.29
|
|
24
24
|
Requires-Dist: pyopenms
|
|
25
|
-
Requires-Dist: ms2rescore==3.0.
|
|
26
|
-
Requires-Dist:
|
|
25
|
+
Requires-Dist: ms2rescore==3.0.3
|
|
26
|
+
Requires-Dist: deeplc==2.2.27
|
|
27
|
+
Requires-Dist: ms2pip==4.0.0.dev8
|
|
28
|
+
Requires-Dist: psm-utils==0.8.2
|
|
29
|
+
Requires-Dist: deeplcretrainer==0.2.11
|
|
27
30
|
Requires-Dist: pydantic
|
|
28
31
|
Requires-Dist: pandas
|
|
32
|
+
Requires-Dist: protobuf<4,>=3.9.2
|
|
29
33
|
Requires-Dist: numpy
|
|
34
|
+
Requires-Dist: pyarrow
|
|
35
|
+
Requires-Dist: pygam
|
|
36
|
+
Requires-Dist: scipy
|
|
37
|
+
Requires-Dist: scikit-learn
|
|
30
38
|
|
|
31
39
|
# quantms-utils
|
|
32
40
|
[](https://github.com/bigbio/quantms-utils/actions/workflows/python-app.yml)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.0.4"
|
|
@@ -11,7 +11,7 @@ import os
|
|
|
11
11
|
import re
|
|
12
12
|
import warnings
|
|
13
13
|
from pathlib import Path
|
|
14
|
-
from typing import Any,
|
|
14
|
+
from typing import Any, Dict, List, Set, Tuple, Union
|
|
15
15
|
|
|
16
16
|
import click
|
|
17
17
|
import numpy as np
|
|
@@ -44,14 +44,14 @@ logger = logging.getLogger(__name__)
|
|
|
44
44
|
@click.option("--qvalue_threshold", "-q", type=float)
|
|
45
45
|
@click.pass_context
|
|
46
46
|
def diann2mztab(
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
47
|
+
ctx,
|
|
48
|
+
folder,
|
|
49
|
+
exp_design,
|
|
50
|
+
dia_params,
|
|
51
|
+
diann_version,
|
|
52
|
+
charge,
|
|
53
|
+
missed_cleavages,
|
|
54
|
+
qvalue_threshold,
|
|
55
55
|
):
|
|
56
56
|
"""
|
|
57
57
|
Convert DIA-NN output to MSstats, Triqler or mzTab.
|
|
@@ -228,7 +228,7 @@ def get_exp_design_dfs(exp_design_file):
|
|
|
228
228
|
lambda x: _true_stem(x["Spectra_Filepath"]), axis=1
|
|
229
229
|
)
|
|
230
230
|
|
|
231
|
-
s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1:]][1:]
|
|
231
|
+
s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:]
|
|
232
232
|
s_header = data[empty_row + 1].replace("\n", "").split("\t")
|
|
233
233
|
s_data_frame = pd.DataFrame(s_table, columns=s_header)
|
|
234
234
|
|
|
@@ -265,31 +265,31 @@ def compute_mass_modified_peptide(peptide_seq: str) -> float:
|
|
|
265
265
|
if aa in aa_mass and not_mod:
|
|
266
266
|
aa = aa_mass[aa]
|
|
267
267
|
elif (
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
268
|
+
aa
|
|
269
|
+
not in [
|
|
270
|
+
"G",
|
|
271
|
+
"A",
|
|
272
|
+
"V",
|
|
273
|
+
"L",
|
|
274
|
+
"I",
|
|
275
|
+
"F",
|
|
276
|
+
"M",
|
|
277
|
+
"P",
|
|
278
|
+
"W",
|
|
279
|
+
"S",
|
|
280
|
+
"C",
|
|
281
|
+
"T",
|
|
282
|
+
"Y",
|
|
283
|
+
"N",
|
|
284
|
+
"Q",
|
|
285
|
+
"D",
|
|
286
|
+
"E",
|
|
287
|
+
"K",
|
|
288
|
+
"R",
|
|
289
|
+
"H",
|
|
290
|
+
]
|
|
291
|
+
and not_mod
|
|
292
|
+
and aa != ")"
|
|
293
293
|
):
|
|
294
294
|
logger.info(f"Unknown amino acid with mass not known:{aa}")
|
|
295
295
|
peptide_parts.append(aa)
|
|
@@ -362,18 +362,18 @@ class DiannDirectory:
|
|
|
362
362
|
return diann_version_id
|
|
363
363
|
|
|
364
364
|
def validate_diann_version(self) -> None:
|
|
365
|
-
supported_diann_versions = ["1.8.1"]
|
|
365
|
+
supported_diann_versions = ["1.8.1", "1.9.beta.1"]
|
|
366
366
|
if self.diann_version not in supported_diann_versions:
|
|
367
367
|
raise ValueError(f"Unsupported DIANN version {self.diann_version}")
|
|
368
368
|
|
|
369
369
|
def convert_to_mztab(
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
370
|
+
self,
|
|
371
|
+
report,
|
|
372
|
+
f_table,
|
|
373
|
+
charge: int,
|
|
374
|
+
missed_cleavages: int,
|
|
375
|
+
dia_params: List[Any],
|
|
376
|
+
out: Union[os.PathLike, str],
|
|
377
377
|
) -> None:
|
|
378
378
|
logger.info("Converting to mzTab")
|
|
379
379
|
self.validate_diann_version()
|
|
@@ -481,8 +481,8 @@ class DiannDirectory:
|
|
|
481
481
|
}
|
|
482
482
|
mass_vector = report["Modified.Sequence"].map(uniq_masses)
|
|
483
483
|
report["Calculate.Precursor.Mz"] = (
|
|
484
|
-
|
|
485
|
-
|
|
484
|
+
mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
|
|
485
|
+
) / report["Precursor.Charge"]
|
|
486
486
|
|
|
487
487
|
logger.debug("Indexing Precursors")
|
|
488
488
|
# Making the map is 1500x faster
|
|
@@ -589,16 +589,16 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
|
|
|
589
589
|
out_mztab_mtd.loc[1, "software[1]-setting[1]"] = fasta
|
|
590
590
|
out_mztab_mtd.loc[1, "software[1]-setting[2]"] = "db_version:null"
|
|
591
591
|
out_mztab_mtd.loc[1, "software[1]-setting[3]"] = (
|
|
592
|
-
|
|
592
|
+
"fragment_mass_tolerance:" + fragment_mass_tolerance
|
|
593
593
|
)
|
|
594
594
|
out_mztab_mtd.loc[1, "software[1]-setting[4]"] = (
|
|
595
|
-
|
|
595
|
+
"fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
|
|
596
596
|
)
|
|
597
597
|
out_mztab_mtd.loc[1, "software[1]-setting[5]"] = (
|
|
598
|
-
|
|
598
|
+
"precursor_mass_tolerance:" + precursor_mass_tolerance
|
|
599
599
|
)
|
|
600
600
|
out_mztab_mtd.loc[1, "software[1]-setting[6]"] = (
|
|
601
|
-
|
|
601
|
+
"precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
|
|
602
602
|
)
|
|
603
603
|
out_mztab_mtd.loc[1, "software[1]-setting[7]"] = "enzyme:" + enzyme
|
|
604
604
|
out_mztab_mtd.loc[1, "software[1]-setting[8]"] = "enzyme_term_specificity:full"
|
|
@@ -607,10 +607,10 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
|
|
|
607
607
|
missed_cleavages
|
|
608
608
|
)
|
|
609
609
|
out_mztab_mtd.loc[1, "software[1]-setting[11]"] = (
|
|
610
|
-
|
|
610
|
+
"fixed_modifications:" + fixed_modifications
|
|
611
611
|
)
|
|
612
612
|
out_mztab_mtd.loc[1, "software[1]-setting[12]"] = (
|
|
613
|
-
|
|
613
|
+
"variable_modifications:" + variable_modifications
|
|
614
614
|
)
|
|
615
615
|
|
|
616
616
|
(fixed_mods, variable_mods, fix_flag, var_flag) = mtd_mod_info(
|
|
@@ -633,7 +633,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
|
|
|
633
633
|
]
|
|
634
634
|
out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-site"] = variable_mods[
|
|
635
635
|
i - 1
|
|
636
|
-
|
|
636
|
+
][1]
|
|
637
637
|
out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-position"] = "Anywhere"
|
|
638
638
|
else:
|
|
639
639
|
out_mztab_mtd.loc[1, "variable_mod[1]"] = variable_mods[0]
|
|
@@ -649,8 +649,8 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
|
|
|
649
649
|
"[MS, MS:1000584, mzML file, ]"
|
|
650
650
|
)
|
|
651
651
|
out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-location"] = (
|
|
652
|
-
|
|
653
|
-
|
|
652
|
+
"file://"
|
|
653
|
+
+ index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
|
|
654
654
|
)
|
|
655
655
|
out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-id_format"] = (
|
|
656
656
|
"[MS, MS:1000777, spectrum identifier nativeID format, ]"
|
|
@@ -659,7 +659,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
|
|
|
659
659
|
"[MS, MS:1002038, unlabeled sample, ]"
|
|
660
660
|
)
|
|
661
661
|
out_mztab_mtd.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = (
|
|
662
|
-
|
|
662
|
+
"ms_run[" + str(i) + "]"
|
|
663
663
|
)
|
|
664
664
|
|
|
665
665
|
with warnings.catch_warnings():
|
|
@@ -723,16 +723,16 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
|
|
|
723
723
|
col = {}
|
|
724
724
|
for i in file:
|
|
725
725
|
col[i] = (
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
726
|
+
"protein_abundance_assay["
|
|
727
|
+
+ str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0])
|
|
728
|
+
+ "]"
|
|
729
729
|
)
|
|
730
730
|
|
|
731
731
|
pg.rename(columns=col, inplace=True)
|
|
732
732
|
|
|
733
733
|
logger.debug("Classifying results type ...")
|
|
734
734
|
pg["opt_global_result_type"] = "single_protein"
|
|
735
|
-
pg.loc[pg["Protein.
|
|
735
|
+
pg.loc[pg["Protein.Group"].str.contains(";"), "opt_global_result_type"] = (
|
|
736
736
|
"indistinguishable_protein_group"
|
|
737
737
|
)
|
|
738
738
|
|
|
@@ -741,7 +741,6 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
|
|
|
741
741
|
out_mztab_prh = out_mztab_prh.drop(["Protein.Names"], axis=1)
|
|
742
742
|
out_mztab_prh.rename(
|
|
743
743
|
columns={
|
|
744
|
-
"Protein.Group": "accession",
|
|
745
744
|
"First.Protein.Description": "description",
|
|
746
745
|
},
|
|
747
746
|
inplace=True,
|
|
@@ -762,14 +761,14 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
|
|
|
762
761
|
|
|
763
762
|
logger.debug("Extracting accession values (keeping first)...")
|
|
764
763
|
out_mztab_prh.loc[:, "accession"] = out_mztab_prh.apply(
|
|
765
|
-
lambda x: x["
|
|
764
|
+
lambda x: x["Protein.Group"].split(";")[0], axis=1
|
|
766
765
|
)
|
|
767
766
|
|
|
768
767
|
protein_details_df = out_mztab_prh[
|
|
769
768
|
out_mztab_prh["opt_global_result_type"] == "indistinguishable_protein_group"
|
|
770
|
-
|
|
769
|
+
]
|
|
771
770
|
prh_series = (
|
|
772
|
-
protein_details_df["Protein.
|
|
771
|
+
protein_details_df["Protein.Group"]
|
|
773
772
|
.str.split(";", expand=True)
|
|
774
773
|
.stack()
|
|
775
774
|
.reset_index(level=1, drop=True)
|
|
@@ -806,7 +805,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
|
|
|
806
805
|
# or out_mztab_PRH.loc[out_mztab_PRH["Protein.Ids"] == out_mztab_PRH["accession"], "ambiguity_members"] = "null"
|
|
807
806
|
out_mztab_prh.loc[:, "ambiguity_members"] = out_mztab_prh.apply(
|
|
808
807
|
lambda x: (
|
|
809
|
-
x["Protein.
|
|
808
|
+
x["Protein.Group"]
|
|
810
809
|
if x["opt_global_result_type"] == "indistinguishable_protein_group"
|
|
811
810
|
else "null"
|
|
812
811
|
),
|
|
@@ -817,7 +816,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
|
|
|
817
816
|
score_looker = ModScoreLooker(report)
|
|
818
817
|
out_mztab_prh[["modifiedSequence", "best_search_engine_score[1]"]] = (
|
|
819
818
|
out_mztab_prh.apply(
|
|
820
|
-
lambda x: score_looker.get_score(x["Protein.
|
|
819
|
+
lambda x: score_looker.get_score(x["Protein.Group"]),
|
|
821
820
|
axis=1,
|
|
822
821
|
result_type="expand",
|
|
823
822
|
)
|
|
@@ -833,11 +832,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
|
|
|
833
832
|
# This used to be a bottleneck in performance
|
|
834
833
|
# This implementation drops the run time from 57s to 25ms
|
|
835
834
|
protein_agg_report = (
|
|
836
|
-
report[["PG.MaxLFQ", "Protein.
|
|
837
|
-
.groupby(["study_variable", "Protein.
|
|
835
|
+
report[["PG.MaxLFQ", "Protein.Group", "study_variable"]]
|
|
836
|
+
.groupby(["study_variable", "Protein.Group"])
|
|
838
837
|
.agg({"PG.MaxLFQ": ["mean", "std", "sem"]})
|
|
839
838
|
.reset_index()
|
|
840
|
-
.pivot(columns=["study_variable"], index="Protein.
|
|
839
|
+
.pivot(columns=["study_variable"], index="Protein.Group")
|
|
841
840
|
.reset_index()
|
|
842
841
|
)
|
|
843
842
|
protein_agg_report.columns = [
|
|
@@ -845,7 +844,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
|
|
|
845
844
|
for col in protein_agg_report.columns.values
|
|
846
845
|
]
|
|
847
846
|
subname_mapper = {
|
|
848
|
-
"Protein.
|
|
847
|
+
"Protein.Group::::": "Protein.Group",
|
|
849
848
|
"PG.MaxLFQ::mean": "protein_abundance_study_variable",
|
|
850
849
|
"PG.MaxLFQ::std": "protein_abundance_stdev_study_variable",
|
|
851
850
|
"PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable",
|
|
@@ -858,7 +857,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
|
|
|
858
857
|
# to the Protein.Ids (A0A024RBG1;Q9NZJ9;Q9NZJ9-2), leading to A LOT of missing values.
|
|
859
858
|
out_mztab_prh = out_mztab_prh.merge(
|
|
860
859
|
protein_agg_report,
|
|
861
|
-
on="Protein.
|
|
860
|
+
on="Protein.Group",
|
|
862
861
|
how="left",
|
|
863
862
|
validate="many_to_one",
|
|
864
863
|
copy=True,
|
|
@@ -871,7 +870,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
|
|
|
871
870
|
out_mztab_prh.loc[:, "PRH"] = "PRT"
|
|
872
871
|
index = out_mztab_prh.loc[:, "PRH"]
|
|
873
872
|
out_mztab_prh.drop(
|
|
874
|
-
["PRH", "Genes", "modifiedSequence", "Protein.
|
|
873
|
+
["PRH", "Genes", "modifiedSequence", "Protein.Group"], axis=1, inplace=True
|
|
875
874
|
)
|
|
876
875
|
out_mztab_prh.insert(0, "PRH", index)
|
|
877
876
|
out_mztab_prh.fillna("null", inplace=True)
|
|
@@ -884,11 +883,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
|
|
|
884
883
|
|
|
885
884
|
|
|
886
885
|
def mztab_peh(
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
886
|
+
report: pd.DataFrame,
|
|
887
|
+
pr: pd.DataFrame,
|
|
888
|
+
precursor_list: List[str],
|
|
889
|
+
index_ref: pd.DataFrame,
|
|
890
|
+
database: os.PathLike,
|
|
892
891
|
) -> pd.DataFrame:
|
|
893
892
|
"""
|
|
894
893
|
Construct PEH sub-table.
|
|
@@ -916,14 +915,14 @@ def mztab_peh(
|
|
|
916
915
|
out_mztab_peh = pd.DataFrame()
|
|
917
916
|
out_mztab_peh = pr.iloc[:, 0:10]
|
|
918
917
|
out_mztab_peh.drop(
|
|
919
|
-
["Protein.
|
|
918
|
+
["Protein.Ids", "Protein.Names", "First.Protein.Description", "Proteotypic"],
|
|
920
919
|
axis=1,
|
|
921
920
|
inplace=True,
|
|
922
921
|
)
|
|
923
922
|
out_mztab_peh.rename(
|
|
924
923
|
columns={
|
|
925
924
|
"Stripped.Sequence": "sequence",
|
|
926
|
-
"Protein.
|
|
925
|
+
"Protein.Group": "accession",
|
|
927
926
|
"Modified.Sequence": "opt_global_cv_MS:1000889_peptidoform_sequence",
|
|
928
927
|
"Precursor.Charge": "charge",
|
|
929
928
|
},
|
|
@@ -1106,8 +1105,8 @@ def mztab_psh(report, folder, database):
|
|
|
1106
1105
|
# Standardize spectrum identifier format for bruker data
|
|
1107
1106
|
if not isinstance(target.loc[0, "opt_global_spectrum_reference"], str):
|
|
1108
1107
|
target.loc[:, "opt_global_spectrum_reference"] = "scan=" + target.loc[
|
|
1109
|
-
|
|
1110
|
-
|
|
1108
|
+
:, "opt_global_spectrum_reference"
|
|
1109
|
+
].astype(str)
|
|
1111
1110
|
|
|
1112
1111
|
# TODO seconds returned from precursor.getRT()
|
|
1113
1112
|
target.loc[:, "RT.Start"] = target.apply(lambda x: x["RT.Start"] / 60, axis=1)
|
|
@@ -1123,7 +1122,7 @@ def mztab_psh(report, folder, database):
|
|
|
1123
1122
|
out_mztab_psh = out_mztab_psh[
|
|
1124
1123
|
[
|
|
1125
1124
|
"Stripped.Sequence",
|
|
1126
|
-
"Protein.
|
|
1125
|
+
"Protein.Group",
|
|
1127
1126
|
"Q.Value",
|
|
1128
1127
|
"RT.Start",
|
|
1129
1128
|
"Precursor.Charge",
|
|
@@ -1184,7 +1183,7 @@ def mztab_psh(report, folder, database):
|
|
|
1184
1183
|
|
|
1185
1184
|
out_mztab_psh.loc[:, "spectra_ref"] = out_mztab_psh.apply(
|
|
1186
1185
|
lambda x: "ms_run[{}]:".format(x["ms_run"])
|
|
1187
|
-
|
|
1186
|
+
+ x["opt_global_spectrum_reference"],
|
|
1188
1187
|
axis=1,
|
|
1189
1188
|
result_type="expand",
|
|
1190
1189
|
)
|
|
@@ -1239,7 +1238,7 @@ def classify_result_type(target):
|
|
|
1239
1238
|
:return: A string implys protein type
|
|
1240
1239
|
:rtype: str
|
|
1241
1240
|
"""
|
|
1242
|
-
if ";" in target["Protein.
|
|
1241
|
+
if ";" in target["Protein.Group"]:
|
|
1243
1242
|
return "indistinguishable_protein_group"
|
|
1244
1243
|
return "single_protein"
|
|
1245
1244
|
|
|
@@ -1293,7 +1292,7 @@ def match_in_report(report, target, max_, flag, level):
|
|
|
1293
1292
|
return tuple(q_value)
|
|
1294
1293
|
|
|
1295
1294
|
if flag == 1 and level == "protein":
|
|
1296
|
-
result = report[report["Protein.
|
|
1295
|
+
result = report[report["Protein.Group"] == target]
|
|
1297
1296
|
prh_params = []
|
|
1298
1297
|
for i in range(1, max_ + 1):
|
|
1299
1298
|
match = result[result["study_variable"] == i]
|
|
@@ -1320,9 +1319,9 @@ class ModScoreLooker:
|
|
|
1320
1319
|
|
|
1321
1320
|
def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]:
|
|
1322
1321
|
grouped_df = (
|
|
1323
|
-
report[["Modified.Sequence", "Protein.
|
|
1322
|
+
report[["Modified.Sequence", "Protein.Group", "Global.PG.Q.Value"]]
|
|
1324
1323
|
.sort_values("Global.PG.Q.Value", ascending=True)
|
|
1325
|
-
.groupby(["Protein.
|
|
1324
|
+
.groupby(["Protein.Group"])
|
|
1326
1325
|
.head(1)
|
|
1327
1326
|
)
|
|
1328
1327
|
# Modified.Sequence Protein.Ids Global.PG.Q.Value
|
|
@@ -1332,7 +1331,7 @@ class ModScoreLooker:
|
|
|
1332
1331
|
# 103588 NPVGYPLAWQFLR Q9NZ08;Q9NZ08-2 0.000252
|
|
1333
1332
|
|
|
1334
1333
|
out = {
|
|
1335
|
-
row["Protein.
|
|
1334
|
+
row["Protein.Group"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"])
|
|
1336
1335
|
for _, row in grouped_df.iterrows()
|
|
1337
1336
|
}
|
|
1338
1337
|
return out
|
|
@@ -1556,8 +1555,8 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):
|
|
|
1556
1555
|
for start, length in sorted(zip(starts, lengths)):
|
|
1557
1556
|
if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start:
|
|
1558
1557
|
merged_lengths[-1] = (
|
|
1559
|
-
|
|
1560
|
-
|
|
1558
|
+
max(merged_starts[-1] + merged_lengths[-1], start + length)
|
|
1559
|
+
- merged_starts[-1]
|
|
1561
1560
|
)
|
|
1562
1561
|
else:
|
|
1563
1562
|
merged_starts.append(start)
|
|
@@ -1569,7 +1568,7 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):
|
|
|
1569
1568
|
|
|
1570
1569
|
|
|
1571
1570
|
def calculate_protein_coverages(
|
|
1572
|
-
|
|
1571
|
+
report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
|
|
1573
1572
|
) -> List[str]:
|
|
1574
1573
|
"""Calculates protein coverages for the PRH table.
|
|
1575
1574
|
|
|
@@ -1578,8 +1577,8 @@ def calculate_protein_coverages(
|
|
|
1578
1577
|
protein in the PRH table (defined by accession, not protein.ids).
|
|
1579
1578
|
"""
|
|
1580
1579
|
nested_df = (
|
|
1581
|
-
report[["Protein.
|
|
1582
|
-
.groupby("Protein.
|
|
1580
|
+
report[["Protein.Group", "Stripped.Sequence"]]
|
|
1581
|
+
.groupby("Protein.Group")
|
|
1583
1582
|
.agg({"Stripped.Sequence": set})
|
|
1584
1583
|
.reset_index()
|
|
1585
1584
|
)
|
|
@@ -1587,8 +1586,8 @@ def calculate_protein_coverages(
|
|
|
1587
1586
|
# 0 A0A024RBG1;Q9NZJ9;Q9NZJ9-2 {SEQEDEVLLVSSSR}
|
|
1588
1587
|
# 1 A0A096LP49;A0A096LP49-2 {SPWAMTERKHSSLER}
|
|
1589
1588
|
# 2 A0AVT1;A0AVT1-2 {EDFTLLDFINAVK, KPDHVPISSEDER, QDVIITALDNVEAR,...
|
|
1590
|
-
ids_to_seqs = dict(zip(nested_df["Protein.
|
|
1591
|
-
acc_to_ids = dict(zip(out_mztab_prh["accession"], out_mztab_prh["Protein.
|
|
1589
|
+
ids_to_seqs = dict(zip(nested_df["Protein.Group"], nested_df["Stripped.Sequence"]))
|
|
1590
|
+
acc_to_ids = dict(zip(out_mztab_prh["accession"], out_mztab_prh["Protein.Group"]))
|
|
1592
1591
|
fasta_id_to_seqs = dict(zip(fasta_df["id"], fasta_df["seq"]))
|
|
1593
1592
|
acc_to_fasta_ids: dict = {}
|
|
1594
1593
|
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
import sqlite3
|
|
3
1
|
import re
|
|
2
|
+
import sqlite3
|
|
3
|
+
from pathlib import Path
|
|
4
4
|
|
|
5
5
|
import click
|
|
6
6
|
import pandas as pd
|
|
7
|
+
import pyarrow
|
|
7
8
|
from pyopenms import MSExperiment, MzMLFile
|
|
8
9
|
|
|
9
10
|
|
|
@@ -41,6 +42,14 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
|
|
|
41
42
|
]
|
|
42
43
|
|
|
43
44
|
def parse_mzml(file_name: str, file_columns: list, id_only: bool = False):
|
|
45
|
+
"""
|
|
46
|
+
Parse mzML file and return a pandas DataFrame with the information. If id_only is True, it will also save a csv.
|
|
47
|
+
@param file_name: The file name of the mzML file
|
|
48
|
+
@param file_columns: The columns of the DataFrame
|
|
49
|
+
@param id_only: If True, it will save a csv with the spectrum id, mz and intensity
|
|
50
|
+
@return: A pandas DataFrame with the information of the mzML file
|
|
51
|
+
"""
|
|
52
|
+
|
|
44
53
|
info = []
|
|
45
54
|
psm_part_info = []
|
|
46
55
|
exp = MSExperiment()
|
|
@@ -123,11 +132,10 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
|
|
|
123
132
|
if id_only and len(psm_part_info) > 0:
|
|
124
133
|
pd.DataFrame(
|
|
125
134
|
psm_part_info, columns=["scan", "ms_level", "mz", "intensity"]
|
|
126
|
-
).
|
|
127
|
-
f"{Path(ms_path).stem}_spectrum_df.
|
|
128
|
-
mode="w",
|
|
135
|
+
).to_parquet(
|
|
136
|
+
f"{Path(ms_path).stem}_spectrum_df.parquet",
|
|
129
137
|
index=False,
|
|
130
|
-
|
|
138
|
+
compression="gzip",
|
|
131
139
|
)
|
|
132
140
|
|
|
133
141
|
return pd.DataFrame(info, columns=file_columns)
|
|
@@ -168,7 +176,7 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
|
|
|
168
176
|
except sqlite3.OperationalError as e:
|
|
169
177
|
if "no such table: Precursors" in str(e):
|
|
170
178
|
print(
|
|
171
|
-
f"No
|
|
179
|
+
f"No precursors recorded in {file_name}, This is normal for DIA data."
|
|
172
180
|
)
|
|
173
181
|
precursor_df = pd.DataFrame()
|
|
174
182
|
else:
|
|
@@ -219,13 +227,12 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
|
|
|
219
227
|
elif Path(ms_path).suffix in [".mzML", ".mzml"]:
|
|
220
228
|
ms_df = parse_mzml(ms_path, file_columns, id_only)
|
|
221
229
|
else:
|
|
222
|
-
msg = f"Unrecognized or
|
|
230
|
+
msg = f"Unrecognized or the mass spec file '{ms_path}' do not exist"
|
|
223
231
|
raise RuntimeError(msg)
|
|
224
232
|
|
|
225
|
-
ms_df.
|
|
226
|
-
f"{Path(ms_path).stem}_ms_info.
|
|
227
|
-
|
|
228
|
-
sep="\t",
|
|
233
|
+
ms_df.to_parquet(
|
|
234
|
+
f"{Path(ms_path).stem}_ms_info.parquet",
|
|
235
|
+
engine="pyarrow",
|
|
229
236
|
index=False,
|
|
230
|
-
|
|
237
|
+
compression="gzip",
|
|
231
238
|
)
|