quantms-utils 0.0.2__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {quantms_utils-0.0.2/quantms_utils.egg-info → quantms_utils-0.0.4}/PKG-INFO +12 -4
  2. {quantms_utils-0.0.2 → quantms_utils-0.0.4/quantms_utils.egg-info}/PKG-INFO +12 -4
  3. quantms_utils-0.0.4/quantms_utils.egg-info/requires.txt +16 -0
  4. quantms_utils-0.0.4/quantmsutils/__init__.py +1 -0
  5. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/diann/diann2mztab.py +93 -94
  6. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/mzml/mzml_statistics.py +20 -13
  7. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/psm/psm_conversion.py +19 -12
  8. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/quantmsutilsc.py +11 -9
  9. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/rescoring/ms2rescore.py +56 -29
  10. quantms_utils-0.0.4/quantmsutils/sdrf/check_samplesheet.py +198 -0
  11. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/sdrf/extract_sample.py +3 -2
  12. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/setup.py +30 -9
  13. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/tests/test_commands.py +28 -9
  14. quantms_utils-0.0.2/quantms_utils.egg-info/requires.txt +0 -8
  15. quantms_utils-0.0.2/quantmsutils/sdrf/__init__.py +0 -0
  16. quantms_utils-0.0.2/quantmsutils/sdrf/check_samplesheet.py +0 -134
  17. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/LICENSE +0 -0
  18. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/README.md +0 -0
  19. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/pyproject.toml +0 -0
  20. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantms_utils.egg-info/SOURCES.txt +0 -0
  21. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantms_utils.egg-info/dependency_links.txt +0 -0
  22. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantms_utils.egg-info/entry_points.txt +0 -0
  23. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantms_utils.egg-info/top_level.txt +0 -0
  24. {quantms_utils-0.0.2/quantmsutils → quantms_utils-0.0.4/quantmsutils/diann}/__init__.py +0 -0
  25. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/diann/dianncfg.py +0 -0
  26. {quantms_utils-0.0.2/quantmsutils/diann → quantms_utils-0.0.4/quantmsutils/features}/__init__.py +0 -0
  27. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/features/sage_feature.py +1 -1
  28. {quantms_utils-0.0.2/quantmsutils/features → quantms_utils-0.0.4/quantmsutils/mzml}/__init__.py +0 -0
  29. {quantms_utils-0.0.2/quantmsutils/mzml → quantms_utils-0.0.4/quantmsutils/psm}/__init__.py +0 -0
  30. {quantms_utils-0.0.2/quantmsutils/psm → quantms_utils-0.0.4/quantmsutils/rescoring}/__init__.py +0 -0
  31. {quantms_utils-0.0.2/quantmsutils/rescoring → quantms_utils-0.0.4/quantmsutils/sdrf}/__init__.py +0 -0
  32. {quantms_utils-0.0.2 → quantms_utils-0.0.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: quantms-utils
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: Python package with scripts and helpers for the QuantMS workflow
5
5
  Home-page: https://www.github.com/bigbio/pyquantms
6
6
  Author: Yasset Perez-Riverol, Dai Chengxin
@@ -20,13 +20,21 @@ Requires-Python: >=3.8,<4
20
20
  Description-Content-Type: text/markdown
21
21
  License-File: LICENSE
22
22
  Requires-Dist: click
23
- Requires-Dist: sdrf-pipelines
23
+ Requires-Dist: sdrf-pipelines>=0.0.29
24
24
  Requires-Dist: pyopenms
25
- Requires-Dist: ms2rescore==3.0.2
26
- Requires-Dist: psm-utils==0.8.0
25
+ Requires-Dist: ms2rescore==3.0.3
26
+ Requires-Dist: deeplc==2.2.27
27
+ Requires-Dist: ms2pip==4.0.0.dev8
28
+ Requires-Dist: psm-utils==0.8.2
29
+ Requires-Dist: deeplcretrainer==0.2.11
27
30
  Requires-Dist: pydantic
28
31
  Requires-Dist: pandas
32
+ Requires-Dist: protobuf<4,>=3.9.2
29
33
  Requires-Dist: numpy
34
+ Requires-Dist: pyarrow
35
+ Requires-Dist: pygam
36
+ Requires-Dist: scipy
37
+ Requires-Dist: scikit-learn
30
38
 
31
39
  # quantms-utils
32
40
  [![Python application](https://github.com/bigbio/quantms-utils/actions/workflows/python-app.yml/badge.svg)](https://github.com/bigbio/quantms-utils/actions/workflows/python-app.yml)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: quantms-utils
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: Python package with scripts and helpers for the QuantMS workflow
5
5
  Home-page: https://www.github.com/bigbio/pyquantms
6
6
  Author: Yasset Perez-Riverol, Dai Chengxin
@@ -20,13 +20,21 @@ Requires-Python: >=3.8,<4
20
20
  Description-Content-Type: text/markdown
21
21
  License-File: LICENSE
22
22
  Requires-Dist: click
23
- Requires-Dist: sdrf-pipelines
23
+ Requires-Dist: sdrf-pipelines>=0.0.29
24
24
  Requires-Dist: pyopenms
25
- Requires-Dist: ms2rescore==3.0.2
26
- Requires-Dist: psm-utils==0.8.0
25
+ Requires-Dist: ms2rescore==3.0.3
26
+ Requires-Dist: deeplc==2.2.27
27
+ Requires-Dist: ms2pip==4.0.0.dev8
28
+ Requires-Dist: psm-utils==0.8.2
29
+ Requires-Dist: deeplcretrainer==0.2.11
27
30
  Requires-Dist: pydantic
28
31
  Requires-Dist: pandas
32
+ Requires-Dist: protobuf<4,>=3.9.2
29
33
  Requires-Dist: numpy
34
+ Requires-Dist: pyarrow
35
+ Requires-Dist: pygam
36
+ Requires-Dist: scipy
37
+ Requires-Dist: scikit-learn
30
38
 
31
39
  # quantms-utils
32
40
  [![Python application](https://github.com/bigbio/quantms-utils/actions/workflows/python-app.yml/badge.svg)](https://github.com/bigbio/quantms-utils/actions/workflows/python-app.yml)
@@ -0,0 +1,16 @@
1
+ click
2
+ sdrf-pipelines>=0.0.29
3
+ pyopenms
4
+ ms2rescore==3.0.3
5
+ deeplc==2.2.27
6
+ ms2pip==4.0.0.dev8
7
+ psm-utils==0.8.2
8
+ deeplcretrainer==0.2.11
9
+ pydantic
10
+ pandas
11
+ protobuf<4,>=3.9.2
12
+ numpy
13
+ pyarrow
14
+ pygam
15
+ scipy
16
+ scikit-learn
@@ -0,0 +1 @@
1
+ __version__ = "0.0.4"
@@ -11,7 +11,7 @@ import os
11
11
  import re
12
12
  import warnings
13
13
  from pathlib import Path
14
- from typing import Any, List, Tuple, Dict, Set, Union
14
+ from typing import Any, Dict, List, Set, Tuple, Union
15
15
 
16
16
  import click
17
17
  import numpy as np
@@ -44,14 +44,14 @@ logger = logging.getLogger(__name__)
44
44
  @click.option("--qvalue_threshold", "-q", type=float)
45
45
  @click.pass_context
46
46
  def diann2mztab(
47
- ctx,
48
- folder,
49
- exp_design,
50
- dia_params,
51
- diann_version,
52
- charge,
53
- missed_cleavages,
54
- qvalue_threshold,
47
+ ctx,
48
+ folder,
49
+ exp_design,
50
+ dia_params,
51
+ diann_version,
52
+ charge,
53
+ missed_cleavages,
54
+ qvalue_threshold,
55
55
  ):
56
56
  """
57
57
  Convert DIA-NN output to MSstats, Triqler or mzTab.
@@ -228,7 +228,7 @@ def get_exp_design_dfs(exp_design_file):
228
228
  lambda x: _true_stem(x["Spectra_Filepath"]), axis=1
229
229
  )
230
230
 
231
- s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1:]][1:]
231
+ s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:]
232
232
  s_header = data[empty_row + 1].replace("\n", "").split("\t")
233
233
  s_data_frame = pd.DataFrame(s_table, columns=s_header)
234
234
 
@@ -265,31 +265,31 @@ def compute_mass_modified_peptide(peptide_seq: str) -> float:
265
265
  if aa in aa_mass and not_mod:
266
266
  aa = aa_mass[aa]
267
267
  elif (
268
- aa
269
- not in [
270
- "G",
271
- "A",
272
- "V",
273
- "L",
274
- "I",
275
- "F",
276
- "M",
277
- "P",
278
- "W",
279
- "S",
280
- "C",
281
- "T",
282
- "Y",
283
- "N",
284
- "Q",
285
- "D",
286
- "E",
287
- "K",
288
- "R",
289
- "H",
290
- ]
291
- and not_mod
292
- and aa != ")"
268
+ aa
269
+ not in [
270
+ "G",
271
+ "A",
272
+ "V",
273
+ "L",
274
+ "I",
275
+ "F",
276
+ "M",
277
+ "P",
278
+ "W",
279
+ "S",
280
+ "C",
281
+ "T",
282
+ "Y",
283
+ "N",
284
+ "Q",
285
+ "D",
286
+ "E",
287
+ "K",
288
+ "R",
289
+ "H",
290
+ ]
291
+ and not_mod
292
+ and aa != ")"
293
293
  ):
294
294
  logger.info(f"Unknown amino acid with mass not known:{aa}")
295
295
  peptide_parts.append(aa)
@@ -362,18 +362,18 @@ class DiannDirectory:
362
362
  return diann_version_id
363
363
 
364
364
  def validate_diann_version(self) -> None:
365
- supported_diann_versions = ["1.8.1"]
365
+ supported_diann_versions = ["1.8.1", "1.9.beta.1"]
366
366
  if self.diann_version not in supported_diann_versions:
367
367
  raise ValueError(f"Unsupported DIANN version {self.diann_version}")
368
368
 
369
369
  def convert_to_mztab(
370
- self,
371
- report,
372
- f_table,
373
- charge: int,
374
- missed_cleavages: int,
375
- dia_params: List[Any],
376
- out: Union[os.PathLike, str],
370
+ self,
371
+ report,
372
+ f_table,
373
+ charge: int,
374
+ missed_cleavages: int,
375
+ dia_params: List[Any],
376
+ out: Union[os.PathLike, str],
377
377
  ) -> None:
378
378
  logger.info("Converting to mzTab")
379
379
  self.validate_diann_version()
@@ -481,8 +481,8 @@ class DiannDirectory:
481
481
  }
482
482
  mass_vector = report["Modified.Sequence"].map(uniq_masses)
483
483
  report["Calculate.Precursor.Mz"] = (
484
- mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
485
- ) / report["Precursor.Charge"]
484
+ mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
485
+ ) / report["Precursor.Charge"]
486
486
 
487
487
  logger.debug("Indexing Precursors")
488
488
  # Making the map is 1500x faster
@@ -589,16 +589,16 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
589
589
  out_mztab_mtd.loc[1, "software[1]-setting[1]"] = fasta
590
590
  out_mztab_mtd.loc[1, "software[1]-setting[2]"] = "db_version:null"
591
591
  out_mztab_mtd.loc[1, "software[1]-setting[3]"] = (
592
- "fragment_mass_tolerance:" + fragment_mass_tolerance
592
+ "fragment_mass_tolerance:" + fragment_mass_tolerance
593
593
  )
594
594
  out_mztab_mtd.loc[1, "software[1]-setting[4]"] = (
595
- "fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
595
+ "fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
596
596
  )
597
597
  out_mztab_mtd.loc[1, "software[1]-setting[5]"] = (
598
- "precursor_mass_tolerance:" + precursor_mass_tolerance
598
+ "precursor_mass_tolerance:" + precursor_mass_tolerance
599
599
  )
600
600
  out_mztab_mtd.loc[1, "software[1]-setting[6]"] = (
601
- "precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
601
+ "precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
602
602
  )
603
603
  out_mztab_mtd.loc[1, "software[1]-setting[7]"] = "enzyme:" + enzyme
604
604
  out_mztab_mtd.loc[1, "software[1]-setting[8]"] = "enzyme_term_specificity:full"
@@ -607,10 +607,10 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
607
607
  missed_cleavages
608
608
  )
609
609
  out_mztab_mtd.loc[1, "software[1]-setting[11]"] = (
610
- "fixed_modifications:" + fixed_modifications
610
+ "fixed_modifications:" + fixed_modifications
611
611
  )
612
612
  out_mztab_mtd.loc[1, "software[1]-setting[12]"] = (
613
- "variable_modifications:" + variable_modifications
613
+ "variable_modifications:" + variable_modifications
614
614
  )
615
615
 
616
616
  (fixed_mods, variable_mods, fix_flag, var_flag) = mtd_mod_info(
@@ -633,7 +633,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
633
633
  ]
634
634
  out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-site"] = variable_mods[
635
635
  i - 1
636
- ][1]
636
+ ][1]
637
637
  out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-position"] = "Anywhere"
638
638
  else:
639
639
  out_mztab_mtd.loc[1, "variable_mod[1]"] = variable_mods[0]
@@ -649,8 +649,8 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
649
649
  "[MS, MS:1000584, mzML file, ]"
650
650
  )
651
651
  out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-location"] = (
652
- "file://"
653
- + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
652
+ "file://"
653
+ + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
654
654
  )
655
655
  out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-id_format"] = (
656
656
  "[MS, MS:1000777, spectrum identifier nativeID format, ]"
@@ -659,7 +659,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
659
659
  "[MS, MS:1002038, unlabeled sample, ]"
660
660
  )
661
661
  out_mztab_mtd.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = (
662
- "ms_run[" + str(i) + "]"
662
+ "ms_run[" + str(i) + "]"
663
663
  )
664
664
 
665
665
  with warnings.catch_warnings():
@@ -723,16 +723,16 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
723
723
  col = {}
724
724
  for i in file:
725
725
  col[i] = (
726
- "protein_abundance_assay["
727
- + str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0])
728
- + "]"
726
+ "protein_abundance_assay["
727
+ + str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0])
728
+ + "]"
729
729
  )
730
730
 
731
731
  pg.rename(columns=col, inplace=True)
732
732
 
733
733
  logger.debug("Classifying results type ...")
734
734
  pg["opt_global_result_type"] = "single_protein"
735
- pg.loc[pg["Protein.Ids"].str.contains(";"), "opt_global_result_type"] = (
735
+ pg.loc[pg["Protein.Group"].str.contains(";"), "opt_global_result_type"] = (
736
736
  "indistinguishable_protein_group"
737
737
  )
738
738
 
@@ -741,7 +741,6 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
741
741
  out_mztab_prh = out_mztab_prh.drop(["Protein.Names"], axis=1)
742
742
  out_mztab_prh.rename(
743
743
  columns={
744
- "Protein.Group": "accession",
745
744
  "First.Protein.Description": "description",
746
745
  },
747
746
  inplace=True,
@@ -762,14 +761,14 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
762
761
 
763
762
  logger.debug("Extracting accession values (keeping first)...")
764
763
  out_mztab_prh.loc[:, "accession"] = out_mztab_prh.apply(
765
- lambda x: x["accession"].split(";")[0], axis=1
764
+ lambda x: x["Protein.Group"].split(";")[0], axis=1
766
765
  )
767
766
 
768
767
  protein_details_df = out_mztab_prh[
769
768
  out_mztab_prh["opt_global_result_type"] == "indistinguishable_protein_group"
770
- ]
769
+ ]
771
770
  prh_series = (
772
- protein_details_df["Protein.Ids"]
771
+ protein_details_df["Protein.Group"]
773
772
  .str.split(";", expand=True)
774
773
  .stack()
775
774
  .reset_index(level=1, drop=True)
@@ -806,7 +805,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
806
805
  # or out_mztab_PRH.loc[out_mztab_PRH["Protein.Ids"] == out_mztab_PRH["accession"], "ambiguity_members"] = "null"
807
806
  out_mztab_prh.loc[:, "ambiguity_members"] = out_mztab_prh.apply(
808
807
  lambda x: (
809
- x["Protein.Ids"]
808
+ x["Protein.Group"]
810
809
  if x["opt_global_result_type"] == "indistinguishable_protein_group"
811
810
  else "null"
812
811
  ),
@@ -817,7 +816,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
817
816
  score_looker = ModScoreLooker(report)
818
817
  out_mztab_prh[["modifiedSequence", "best_search_engine_score[1]"]] = (
819
818
  out_mztab_prh.apply(
820
- lambda x: score_looker.get_score(x["Protein.Ids"]),
819
+ lambda x: score_looker.get_score(x["Protein.Group"]),
821
820
  axis=1,
822
821
  result_type="expand",
823
822
  )
@@ -833,11 +832,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
833
832
  # This used to be a bottleneck in performance
834
833
  # This implementation drops the run time from 57s to 25ms
835
834
  protein_agg_report = (
836
- report[["PG.MaxLFQ", "Protein.Ids", "study_variable"]]
837
- .groupby(["study_variable", "Protein.Ids"])
835
+ report[["PG.MaxLFQ", "Protein.Group", "study_variable"]]
836
+ .groupby(["study_variable", "Protein.Group"])
838
837
  .agg({"PG.MaxLFQ": ["mean", "std", "sem"]})
839
838
  .reset_index()
840
- .pivot(columns=["study_variable"], index="Protein.Ids")
839
+ .pivot(columns=["study_variable"], index="Protein.Group")
841
840
  .reset_index()
842
841
  )
843
842
  protein_agg_report.columns = [
@@ -845,7 +844,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
845
844
  for col in protein_agg_report.columns.values
846
845
  ]
847
846
  subname_mapper = {
848
- "Protein.Ids::::": "Protein.Ids",
847
+ "Protein.Group::::": "Protein.Group",
849
848
  "PG.MaxLFQ::mean": "protein_abundance_study_variable",
850
849
  "PG.MaxLFQ::std": "protein_abundance_stdev_study_variable",
851
850
  "PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable",
@@ -858,7 +857,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
858
857
  # to the Protein.Ids (A0A024RBG1;Q9NZJ9;Q9NZJ9-2), leading to A LOT of missing values.
859
858
  out_mztab_prh = out_mztab_prh.merge(
860
859
  protein_agg_report,
861
- on="Protein.Ids",
860
+ on="Protein.Group",
862
861
  how="left",
863
862
  validate="many_to_one",
864
863
  copy=True,
@@ -871,7 +870,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
871
870
  out_mztab_prh.loc[:, "PRH"] = "PRT"
872
871
  index = out_mztab_prh.loc[:, "PRH"]
873
872
  out_mztab_prh.drop(
874
- ["PRH", "Genes", "modifiedSequence", "Protein.Ids"], axis=1, inplace=True
873
+ ["PRH", "Genes", "modifiedSequence", "Protein.Group"], axis=1, inplace=True
875
874
  )
876
875
  out_mztab_prh.insert(0, "PRH", index)
877
876
  out_mztab_prh.fillna("null", inplace=True)
@@ -884,11 +883,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
884
883
 
885
884
 
886
885
  def mztab_peh(
887
- report: pd.DataFrame,
888
- pr: pd.DataFrame,
889
- precursor_list: List[str],
890
- index_ref: pd.DataFrame,
891
- database: os.PathLike,
886
+ report: pd.DataFrame,
887
+ pr: pd.DataFrame,
888
+ precursor_list: List[str],
889
+ index_ref: pd.DataFrame,
890
+ database: os.PathLike,
892
891
  ) -> pd.DataFrame:
893
892
  """
894
893
  Construct PEH sub-table.
@@ -916,14 +915,14 @@ def mztab_peh(
916
915
  out_mztab_peh = pd.DataFrame()
917
916
  out_mztab_peh = pr.iloc[:, 0:10]
918
917
  out_mztab_peh.drop(
919
- ["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"],
918
+ ["Protein.Ids", "Protein.Names", "First.Protein.Description", "Proteotypic"],
920
919
  axis=1,
921
920
  inplace=True,
922
921
  )
923
922
  out_mztab_peh.rename(
924
923
  columns={
925
924
  "Stripped.Sequence": "sequence",
926
- "Protein.Ids": "accession",
925
+ "Protein.Group": "accession",
927
926
  "Modified.Sequence": "opt_global_cv_MS:1000889_peptidoform_sequence",
928
927
  "Precursor.Charge": "charge",
929
928
  },
@@ -1106,8 +1105,8 @@ def mztab_psh(report, folder, database):
1106
1105
  # Standardize spectrum identifier format for bruker data
1107
1106
  if not isinstance(target.loc[0, "opt_global_spectrum_reference"], str):
1108
1107
  target.loc[:, "opt_global_spectrum_reference"] = "scan=" + target.loc[
1109
- :, "opt_global_spectrum_reference"
1110
- ].astype(str)
1108
+ :, "opt_global_spectrum_reference"
1109
+ ].astype(str)
1111
1110
 
1112
1111
  # TODO seconds returned from precursor.getRT()
1113
1112
  target.loc[:, "RT.Start"] = target.apply(lambda x: x["RT.Start"] / 60, axis=1)
@@ -1123,7 +1122,7 @@ def mztab_psh(report, folder, database):
1123
1122
  out_mztab_psh = out_mztab_psh[
1124
1123
  [
1125
1124
  "Stripped.Sequence",
1126
- "Protein.Ids",
1125
+ "Protein.Group",
1127
1126
  "Q.Value",
1128
1127
  "RT.Start",
1129
1128
  "Precursor.Charge",
@@ -1184,7 +1183,7 @@ def mztab_psh(report, folder, database):
1184
1183
 
1185
1184
  out_mztab_psh.loc[:, "spectra_ref"] = out_mztab_psh.apply(
1186
1185
  lambda x: "ms_run[{}]:".format(x["ms_run"])
1187
- + x["opt_global_spectrum_reference"],
1186
+ + x["opt_global_spectrum_reference"],
1188
1187
  axis=1,
1189
1188
  result_type="expand",
1190
1189
  )
@@ -1239,7 +1238,7 @@ def classify_result_type(target):
1239
1238
  :return: A string implys protein type
1240
1239
  :rtype: str
1241
1240
  """
1242
- if ";" in target["Protein.Ids"]:
1241
+ if ";" in target["Protein.Group"]:
1243
1242
  return "indistinguishable_protein_group"
1244
1243
  return "single_protein"
1245
1244
 
@@ -1293,7 +1292,7 @@ def match_in_report(report, target, max_, flag, level):
1293
1292
  return tuple(q_value)
1294
1293
 
1295
1294
  if flag == 1 and level == "protein":
1296
- result = report[report["Protein.Ids"] == target]
1295
+ result = report[report["Protein.Group"] == target]
1297
1296
  prh_params = []
1298
1297
  for i in range(1, max_ + 1):
1299
1298
  match = result[result["study_variable"] == i]
@@ -1320,9 +1319,9 @@ class ModScoreLooker:
1320
1319
 
1321
1320
  def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]:
1322
1321
  grouped_df = (
1323
- report[["Modified.Sequence", "Protein.Ids", "Global.PG.Q.Value"]]
1322
+ report[["Modified.Sequence", "Protein.Group", "Global.PG.Q.Value"]]
1324
1323
  .sort_values("Global.PG.Q.Value", ascending=True)
1325
- .groupby(["Protein.Ids"])
1324
+ .groupby(["Protein.Group"])
1326
1325
  .head(1)
1327
1326
  )
1328
1327
  # Modified.Sequence Protein.Ids Global.PG.Q.Value
@@ -1332,7 +1331,7 @@ class ModScoreLooker:
1332
1331
  # 103588 NPVGYPLAWQFLR Q9NZ08;Q9NZ08-2 0.000252
1333
1332
 
1334
1333
  out = {
1335
- row["Protein.Ids"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"])
1334
+ row["Protein.Group"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"])
1336
1335
  for _, row in grouped_df.iterrows()
1337
1336
  }
1338
1337
  return out
@@ -1556,8 +1555,8 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):
1556
1555
  for start, length in sorted(zip(starts, lengths)):
1557
1556
  if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start:
1558
1557
  merged_lengths[-1] = (
1559
- max(merged_starts[-1] + merged_lengths[-1], start + length)
1560
- - merged_starts[-1]
1558
+ max(merged_starts[-1] + merged_lengths[-1], start + length)
1559
+ - merged_starts[-1]
1561
1560
  )
1562
1561
  else:
1563
1562
  merged_starts.append(start)
@@ -1569,7 +1568,7 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):
1569
1568
 
1570
1569
 
1571
1570
  def calculate_protein_coverages(
1572
- report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
1571
+ report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
1573
1572
  ) -> List[str]:
1574
1573
  """Calculates protein coverages for the PRH table.
1575
1574
 
@@ -1578,8 +1577,8 @@ def calculate_protein_coverages(
1578
1577
  protein in the PRH table (defined by accession, not protein.ids).
1579
1578
  """
1580
1579
  nested_df = (
1581
- report[["Protein.Ids", "Stripped.Sequence"]]
1582
- .groupby("Protein.Ids")
1580
+ report[["Protein.Group", "Stripped.Sequence"]]
1581
+ .groupby("Protein.Group")
1583
1582
  .agg({"Stripped.Sequence": set})
1584
1583
  .reset_index()
1585
1584
  )
@@ -1587,8 +1586,8 @@ def calculate_protein_coverages(
1587
1586
  # 0 A0A024RBG1;Q9NZJ9;Q9NZJ9-2 {SEQEDEVLLVSSSR}
1588
1587
  # 1 A0A096LP49;A0A096LP49-2 {SPWAMTERKHSSLER}
1589
1588
  # 2 A0AVT1;A0AVT1-2 {EDFTLLDFINAVK, KPDHVPISSEDER, QDVIITALDNVEAR,...
1590
- ids_to_seqs = dict(zip(nested_df["Protein.Ids"], nested_df["Stripped.Sequence"]))
1591
- acc_to_ids = dict(zip(out_mztab_prh["accession"], out_mztab_prh["Protein.Ids"]))
1589
+ ids_to_seqs = dict(zip(nested_df["Protein.Group"], nested_df["Stripped.Sequence"]))
1590
+ acc_to_ids = dict(zip(out_mztab_prh["accession"], out_mztab_prh["Protein.Group"]))
1592
1591
  fasta_id_to_seqs = dict(zip(fasta_df["id"], fasta_df["seq"]))
1593
1592
  acc_to_fasta_ids: dict = {}
1594
1593
 
@@ -1,9 +1,10 @@
1
- from pathlib import Path
2
- import sqlite3
3
1
  import re
2
+ import sqlite3
3
+ from pathlib import Path
4
4
 
5
5
  import click
6
6
  import pandas as pd
7
+ import pyarrow
7
8
  from pyopenms import MSExperiment, MzMLFile
8
9
 
9
10
 
@@ -41,6 +42,14 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
41
42
  ]
42
43
 
43
44
  def parse_mzml(file_name: str, file_columns: list, id_only: bool = False):
45
+ """
46
+ Parse mzML file and return a pandas DataFrame with the information. If id_only is True, it will also save a csv.
47
+ @param file_name: The file name of the mzML file
48
+ @param file_columns: The columns of the DataFrame
49
+ @param id_only: If True, it will save a csv with the spectrum id, mz and intensity
50
+ @return: A pandas DataFrame with the information of the mzML file
51
+ """
52
+
44
53
  info = []
45
54
  psm_part_info = []
46
55
  exp = MSExperiment()
@@ -123,11 +132,10 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
123
132
  if id_only and len(psm_part_info) > 0:
124
133
  pd.DataFrame(
125
134
  psm_part_info, columns=["scan", "ms_level", "mz", "intensity"]
126
- ).to_csv(
127
- f"{Path(ms_path).stem}_spectrum_df.csv",
128
- mode="w",
135
+ ).to_parquet(
136
+ f"{Path(ms_path).stem}_spectrum_df.parquet",
129
137
  index=False,
130
- header=True,
138
+ compression="gzip",
131
139
  )
132
140
 
133
141
  return pd.DataFrame(info, columns=file_columns)
@@ -168,7 +176,7 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
168
176
  except sqlite3.OperationalError as e:
169
177
  if "no such table: Precursors" in str(e):
170
178
  print(
171
- f"No precursers recorded in {file_name}, This is normal for DIA data."
179
+ f"No precursors recorded in {file_name}, This is normal for DIA data."
172
180
  )
173
181
  precursor_df = pd.DataFrame()
174
182
  else:
@@ -219,13 +227,12 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
219
227
  elif Path(ms_path).suffix in [".mzML", ".mzml"]:
220
228
  ms_df = parse_mzml(ms_path, file_columns, id_only)
221
229
  else:
222
- msg = f"Unrecognized or inexistent mass spec file '{ms_path}'"
230
+ msg = f"Unrecognized or the mass spec file '{ms_path}' do not exist"
223
231
  raise RuntimeError(msg)
224
232
 
225
- ms_df.to_csv(
226
- f"{Path(ms_path).stem}_ms_info.tsv",
227
- mode="w",
228
- sep="\t",
233
+ ms_df.to_parquet(
234
+ f"{Path(ms_path).stem}_ms_info.parquet",
235
+ engine="pyarrow",
229
236
  index=False,
230
- header=True,
237
+ compression="gzip",
231
238
  )