PyPI - quantms-utils - Versions diffs - 0.0.2__tar.gz → 0.0.4__tar.gz - Mend

quantms-utils 0.0.2 (tar.gz) → 0.0.4 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{quantms_utils-0.0.2/quantms_utils.egg-info → quantms_utils-0.0.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: quantms-utils
-Version: 0.0.2
+Version: 0.0.4
 Summary: Python package with scripts and helpers for the QuantMS workflow
 Home-page: https://www.github.com/bigbio/pyquantms
 Author: Yasset Perez-Riverol, Dai Chengxin
@@ -20,13 +20,21 @@ Requires-Python: >=3.8,<4
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: click
-Requires-Dist: sdrf-pipelines
+Requires-Dist: sdrf-pipelines>=0.0.29
 Requires-Dist: pyopenms
-Requires-Dist: ms2rescore==3.0.2
-Requires-Dist: psm-utils==0.8.0
+Requires-Dist: ms2rescore==3.0.3
+Requires-Dist: deeplc==2.2.27
+Requires-Dist: ms2pip==4.0.0.dev8
+Requires-Dist: psm-utils==0.8.2
+Requires-Dist: deeplcretrainer==0.2.11
 Requires-Dist: pydantic
 Requires-Dist: pandas
+Requires-Dist: protobuf<4,>=3.9.2
 Requires-Dist: numpy
+Requires-Dist: pyarrow
+Requires-Dist: pygam
+Requires-Dist: scipy
+Requires-Dist: scikit-learn
 # quantms-utils
 [![Python application](https://github.com/bigbio/quantms-utils/actions/workflows/python-app.yml/badge.svg)](https://github.com/bigbio/quantms-utils/actions/workflows/python-app.yml)

{quantms_utils-0.0.2 → quantms_utils-0.0.4/quantms_utils.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: quantms-utils
-Version: 0.0.2
+Version: 0.0.4
 Summary: Python package with scripts and helpers for the QuantMS workflow
 Home-page: https://www.github.com/bigbio/pyquantms
 Author: Yasset Perez-Riverol, Dai Chengxin
@@ -20,13 +20,21 @@ Requires-Python: >=3.8,<4
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: click
-Requires-Dist: sdrf-pipelines
+Requires-Dist: sdrf-pipelines>=0.0.29
 Requires-Dist: pyopenms
-Requires-Dist: ms2rescore==3.0.2
-Requires-Dist: psm-utils==0.8.0
+Requires-Dist: ms2rescore==3.0.3
+Requires-Dist: deeplc==2.2.27
+Requires-Dist: ms2pip==4.0.0.dev8
+Requires-Dist: psm-utils==0.8.2
+Requires-Dist: deeplcretrainer==0.2.11
 Requires-Dist: pydantic
 Requires-Dist: pandas
+Requires-Dist: protobuf<4,>=3.9.2
 Requires-Dist: numpy
+Requires-Dist: pyarrow
+Requires-Dist: pygam
+Requires-Dist: scipy
+Requires-Dist: scikit-learn
 # quantms-utils
 [![Python application](https://github.com/bigbio/quantms-utils/actions/workflows/python-app.yml/badge.svg)](https://github.com/bigbio/quantms-utils/actions/workflows/python-app.yml)

quantms_utils-0.0.4/quantms_utils.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,16 @@
+click
+sdrf-pipelines>=0.0.29
+pyopenms
+ms2rescore==3.0.3
+deeplc==2.2.27
+ms2pip==4.0.0.dev8
+psm-utils==0.8.2
+deeplcretrainer==0.2.11
+pydantic
+pandas
+protobuf<4,>=3.9.2
+numpy
+pyarrow
+pygam
+scipy
+scikit-learn

quantms_utils-0.0.4/quantmsutils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.0.4"

{quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/diann/diann2mztab.py RENAMED Viewed

@@ -11,7 +11,7 @@ import os
 import re
 import warnings
 from pathlib import Path
-from typing import Any, List, Tuple, Dict, Set, Union
+from typing import Any, Dict, List, Set, Tuple, Union
 import click
 import numpy as np
@@ -44,14 +44,14 @@ logger = logging.getLogger(__name__)
 @click.option("--qvalue_threshold", "-q", type=float)
 @click.pass_context
 def diann2mztab(
-        ctx,
-        folder,
-        exp_design,
-        dia_params,
-        diann_version,
-        charge,
-        missed_cleavages,
-        qvalue_threshold,
+    ctx,
+    folder,
+    exp_design,
+    dia_params,
+    diann_version,
+    charge,
+    missed_cleavages,
+    qvalue_threshold,
 ):
     """
     Convert DIA-NN output to MSstats, Triqler or mzTab.
@@ -228,7 +228,7 @@ def get_exp_design_dfs(exp_design_file):
             lambda x: _true_stem(x["Spectra_Filepath"]), axis=1
         )
-        s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1:]][1:]
+        s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:]
         s_header = data[empty_row + 1].replace("\n", "").split("\t")
         s_data_frame = pd.DataFrame(s_table, columns=s_header)
@@ -265,31 +265,31 @@ def compute_mass_modified_peptide(peptide_seq: str) -> float:
         if aa in aa_mass and not_mod:
             aa = aa_mass[aa]
         elif (
-                aa
-                not in [
-                    "G",
-                    "A",
-                    "V",
-                    "L",
-                    "I",
-                    "F",
-                    "M",
-                    "P",
-                    "W",
-                    "S",
-                    "C",
-                    "T",
-                    "Y",
-                    "N",
-                    "Q",
-                    "D",
-                    "E",
-                    "K",
-                    "R",
-                    "H",
-                ]
-                and not_mod
-                and aa != ")"
+            aa
+            not in [
+                "G",
+                "A",
+                "V",
+                "L",
+                "I",
+                "F",
+                "M",
+                "P",
+                "W",
+                "S",
+                "C",
+                "T",
+                "Y",
+                "N",
+                "Q",
+                "D",
+                "E",
+                "K",
+                "R",
+                "H",
+            ]
+            and not_mod
+            and aa != ")"
         ):
             logger.info(f"Unknown amino acid with mass not known:{aa}")
         peptide_parts.append(aa)
@@ -362,18 +362,18 @@ class DiannDirectory:
         return diann_version_id
     def validate_diann_version(self) -> None:
-        supported_diann_versions = ["1.8.1"]
+        supported_diann_versions = ["1.8.1", "1.9.beta.1"]
         if self.diann_version not in supported_diann_versions:
             raise ValueError(f"Unsupported DIANN version {self.diann_version}")
     def convert_to_mztab(
-            self,
-            report,
-            f_table,
-            charge: int,
-            missed_cleavages: int,
-            dia_params: List[Any],
-            out: Union[os.PathLike, str],
+        self,
+        report,
+        f_table,
+        charge: int,
+        missed_cleavages: int,
+        dia_params: List[Any],
+        out: Union[os.PathLike, str],
     ) -> None:
         logger.info("Converting to mzTab")
         self.validate_diann_version()
@@ -481,8 +481,8 @@ class DiannDirectory:
         }
         mass_vector = report["Modified.Sequence"].map(uniq_masses)
         report["Calculate.Precursor.Mz"] = (
-                                                   mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
-                                           ) / report["Precursor.Charge"]
+            mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
+        ) / report["Precursor.Charge"]
         logger.debug("Indexing Precursors")
         # Making the map is 1500x faster
@@ -589,16 +589,16 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
     out_mztab_mtd.loc[1, "software[1]-setting[1]"] = fasta
     out_mztab_mtd.loc[1, "software[1]-setting[2]"] = "db_version:null"
     out_mztab_mtd.loc[1, "software[1]-setting[3]"] = (
-            "fragment_mass_tolerance:" + fragment_mass_tolerance
+        "fragment_mass_tolerance:" + fragment_mass_tolerance
     )
     out_mztab_mtd.loc[1, "software[1]-setting[4]"] = (
-            "fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
+        "fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
     )
     out_mztab_mtd.loc[1, "software[1]-setting[5]"] = (
-            "precursor_mass_tolerance:" + precursor_mass_tolerance
+        "precursor_mass_tolerance:" + precursor_mass_tolerance
     )
     out_mztab_mtd.loc[1, "software[1]-setting[6]"] = (
-            "precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
+        "precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
     )
     out_mztab_mtd.loc[1, "software[1]-setting[7]"] = "enzyme:" + enzyme
     out_mztab_mtd.loc[1, "software[1]-setting[8]"] = "enzyme_term_specificity:full"
@@ -607,10 +607,10 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
         missed_cleavages
     )
     out_mztab_mtd.loc[1, "software[1]-setting[11]"] = (
-            "fixed_modifications:" + fixed_modifications
+        "fixed_modifications:" + fixed_modifications
     )
     out_mztab_mtd.loc[1, "software[1]-setting[12]"] = (
-            "variable_modifications:" + variable_modifications
+        "variable_modifications:" + variable_modifications
     )
     (fixed_mods, variable_mods, fix_flag, var_flag) = mtd_mod_info(
@@ -633,7 +633,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
             ]
             out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-site"] = variable_mods[
                 i - 1
-                ][1]
+            ][1]
             out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-position"] = "Anywhere"
     else:
         out_mztab_mtd.loc[1, "variable_mod[1]"] = variable_mods[0]
@@ -649,8 +649,8 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
             "[MS, MS:1000584, mzML file, ]"
         )
         out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-location"] = (
-                "file://"
-                + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
+            "file://"
+            + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
         )
         out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-id_format"] = (
             "[MS, MS:1000777, spectrum identifier nativeID format, ]"
@@ -659,7 +659,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
             "[MS, MS:1002038, unlabeled sample, ]"
         )
         out_mztab_mtd.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = (
-                "ms_run[" + str(i) + "]"
+            "ms_run[" + str(i) + "]"
         )
     with warnings.catch_warnings():
@@ -723,16 +723,16 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     col = {}
     for i in file:
         col[i] = (
-                "protein_abundance_assay["
-                + str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0])
-                + "]"
+            "protein_abundance_assay["
+            + str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0])
+            + "]"
         )
     pg.rename(columns=col, inplace=True)
     logger.debug("Classifying results type ...")
     pg["opt_global_result_type"] = "single_protein"
-    pg.loc[pg["Protein.Ids"].str.contains(";"), "opt_global_result_type"] = (
+    pg.loc[pg["Protein.Group"].str.contains(";"), "opt_global_result_type"] = (
         "indistinguishable_protein_group"
     )
@@ -741,7 +741,6 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     out_mztab_prh = out_mztab_prh.drop(["Protein.Names"], axis=1)
     out_mztab_prh.rename(
         columns={
-            "Protein.Group": "accession",
             "First.Protein.Description": "description",
         },
         inplace=True,
@@ -762,14 +761,14 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     logger.debug("Extracting accession values (keeping first)...")
     out_mztab_prh.loc[:, "accession"] = out_mztab_prh.apply(
-        lambda x: x["accession"].split(";")[0], axis=1
+        lambda x: x["Protein.Group"].split(";")[0], axis=1
     )
     protein_details_df = out_mztab_prh[
         out_mztab_prh["opt_global_result_type"] == "indistinguishable_protein_group"
-        ]
+    ]
     prh_series = (
-        protein_details_df["Protein.Ids"]
+        protein_details_df["Protein.Group"]
         .str.split(";", expand=True)
         .stack()
         .reset_index(level=1, drop=True)
@@ -806,7 +805,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     # or out_mztab_PRH.loc[out_mztab_PRH["Protein.Ids"] == out_mztab_PRH["accession"], "ambiguity_members"] = "null"
     out_mztab_prh.loc[:, "ambiguity_members"] = out_mztab_prh.apply(
         lambda x: (
-            x["Protein.Ids"]
+            x["Protein.Group"]
             if x["opt_global_result_type"] == "indistinguishable_protein_group"
             else "null"
         ),
@@ -817,7 +816,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     score_looker = ModScoreLooker(report)
     out_mztab_prh[["modifiedSequence", "best_search_engine_score[1]"]] = (
         out_mztab_prh.apply(
-            lambda x: score_looker.get_score(x["Protein.Ids"]),
+            lambda x: score_looker.get_score(x["Protein.Group"]),
             axis=1,
             result_type="expand",
         )
@@ -833,11 +832,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     # This used to be a bottleneck in performance
     # This implementation drops the run time from 57s to 25ms
     protein_agg_report = (
-        report[["PG.MaxLFQ", "Protein.Ids", "study_variable"]]
-        .groupby(["study_variable", "Protein.Ids"])
+        report[["PG.MaxLFQ", "Protein.Group", "study_variable"]]
+        .groupby(["study_variable", "Protein.Group"])
         .agg({"PG.MaxLFQ": ["mean", "std", "sem"]})
         .reset_index()
-        .pivot(columns=["study_variable"], index="Protein.Ids")
+        .pivot(columns=["study_variable"], index="Protein.Group")
         .reset_index()
     )
     protein_agg_report.columns = [
@@ -845,7 +844,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
         for col in protein_agg_report.columns.values
     ]
     subname_mapper = {
-        "Protein.Ids::::": "Protein.Ids",
+        "Protein.Group::::": "Protein.Group",
         "PG.MaxLFQ::mean": "protein_abundance_study_variable",
         "PG.MaxLFQ::std": "protein_abundance_stdev_study_variable",
         "PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable",
@@ -858,7 +857,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     # to the Protein.Ids (A0A024RBG1;Q9NZJ9;Q9NZJ9-2), leading to A LOT of missing values.
     out_mztab_prh = out_mztab_prh.merge(
         protein_agg_report,
-        on="Protein.Ids",
+        on="Protein.Group",
         how="left",
         validate="many_to_one",
         copy=True,
@@ -871,7 +870,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     out_mztab_prh.loc[:, "PRH"] = "PRT"
     index = out_mztab_prh.loc[:, "PRH"]
     out_mztab_prh.drop(
-        ["PRH", "Genes", "modifiedSequence", "Protein.Ids"], axis=1, inplace=True
+        ["PRH", "Genes", "modifiedSequence", "Protein.Group"], axis=1, inplace=True
     )
     out_mztab_prh.insert(0, "PRH", index)
     out_mztab_prh.fillna("null", inplace=True)
@@ -884,11 +883,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
 def mztab_peh(
-        report: pd.DataFrame,
-        pr: pd.DataFrame,
-        precursor_list: List[str],
-        index_ref: pd.DataFrame,
-        database: os.PathLike,
+    report: pd.DataFrame,
+    pr: pd.DataFrame,
+    precursor_list: List[str],
+    index_ref: pd.DataFrame,
+    database: os.PathLike,
 ) -> pd.DataFrame:
     """
     Construct PEH sub-table.
@@ -916,14 +915,14 @@ def mztab_peh(
     out_mztab_peh = pd.DataFrame()
     out_mztab_peh = pr.iloc[:, 0:10]
     out_mztab_peh.drop(
-        ["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"],
+        ["Protein.Ids", "Protein.Names", "First.Protein.Description", "Proteotypic"],
         axis=1,
         inplace=True,
     )
     out_mztab_peh.rename(
         columns={
             "Stripped.Sequence": "sequence",
-            "Protein.Ids": "accession",
+            "Protein.Group": "accession",
             "Modified.Sequence": "opt_global_cv_MS:1000889_peptidoform_sequence",
             "Precursor.Charge": "charge",
         },
@@ -1106,8 +1105,8 @@ def mztab_psh(report, folder, database):
         # Standardize spectrum identifier format for bruker data
         if not isinstance(target.loc[0, "opt_global_spectrum_reference"], str):
             target.loc[:, "opt_global_spectrum_reference"] = "scan=" + target.loc[
-                                                                       :, "opt_global_spectrum_reference"
-                                                                       ].astype(str)
+                :, "opt_global_spectrum_reference"
+            ].astype(str)
         # TODO seconds returned from precursor.getRT()
         target.loc[:, "RT.Start"] = target.apply(lambda x: x["RT.Start"] / 60, axis=1)
@@ -1123,7 +1122,7 @@ def mztab_psh(report, folder, database):
     out_mztab_psh = out_mztab_psh[
         [
             "Stripped.Sequence",
-            "Protein.Ids",
+            "Protein.Group",
             "Q.Value",
             "RT.Start",
             "Precursor.Charge",
@@ -1184,7 +1183,7 @@ def mztab_psh(report, folder, database):
     out_mztab_psh.loc[:, "spectra_ref"] = out_mztab_psh.apply(
         lambda x: "ms_run[{}]:".format(x["ms_run"])
-                  + x["opt_global_spectrum_reference"],
+        + x["opt_global_spectrum_reference"],
         axis=1,
         result_type="expand",
     )
@@ -1239,7 +1238,7 @@ def classify_result_type(target):
     :return: A string implys protein type
     :rtype: str
     """
-    if ";" in target["Protein.Ids"]:
+    if ";" in target["Protein.Group"]:
         return "indistinguishable_protein_group"
     return "single_protein"
@@ -1293,7 +1292,7 @@ def match_in_report(report, target, max_, flag, level):
         return tuple(q_value)
     if flag == 1 and level == "protein":
-        result = report[report["Protein.Ids"] == target]
+        result = report[report["Protein.Group"] == target]
         prh_params = []
         for i in range(1, max_ + 1):
             match = result[result["study_variable"] == i]
@@ -1320,9 +1319,9 @@ class ModScoreLooker:
     def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]:
         grouped_df = (
-            report[["Modified.Sequence", "Protein.Ids", "Global.PG.Q.Value"]]
+            report[["Modified.Sequence", "Protein.Group", "Global.PG.Q.Value"]]
             .sort_values("Global.PG.Q.Value", ascending=True)
-            .groupby(["Protein.Ids"])
+            .groupby(["Protein.Group"])
             .head(1)
         )
         #        Modified.Sequence               Protein.Ids  Global.PG.Q.Value
@@ -1332,7 +1331,7 @@ class ModScoreLooker:
         # 103588      NPVGYPLAWQFLR           Q9NZ08;Q9NZ08-2           0.000252
         out = {
-            row["Protein.Ids"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"])
+            row["Protein.Group"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"])
             for _, row in grouped_df.iterrows()
         }
         return out
@@ -1556,8 +1555,8 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):
     for start, length in sorted(zip(starts, lengths)):
         if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start:
             merged_lengths[-1] = (
-                    max(merged_starts[-1] + merged_lengths[-1], start + length)
-                    - merged_starts[-1]
+                max(merged_starts[-1] + merged_lengths[-1], start + length)
+                - merged_starts[-1]
             )
         else:
             merged_starts.append(start)
@@ -1569,7 +1568,7 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):
 def calculate_protein_coverages(
-        report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
+    report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
 ) -> List[str]:
     """Calculates protein coverages for the PRH table.
@@ -1578,8 +1577,8 @@ def calculate_protein_coverages(
     protein in the PRH table (defined by accession, not protein.ids).
     """
     nested_df = (
-        report[["Protein.Ids", "Stripped.Sequence"]]
-        .groupby("Protein.Ids")
+        report[["Protein.Group", "Stripped.Sequence"]]
+        .groupby("Protein.Group")
         .agg({"Stripped.Sequence": set})
         .reset_index()
     )
@@ -1587,8 +1586,8 @@ def calculate_protein_coverages(
     # 0     A0A024RBG1;Q9NZJ9;Q9NZJ9-2                                   {SEQEDEVLLVSSSR}
     # 1        A0A096LP49;A0A096LP49-2                                  {SPWAMTERKHSSLER}
     # 2                A0AVT1;A0AVT1-2  {EDFTLLDFINAVK, KPDHVPISSEDER, QDVIITALDNVEAR,...
-    ids_to_seqs = dict(zip(nested_df["Protein.Ids"], nested_df["Stripped.Sequence"]))
-    acc_to_ids = dict(zip(out_mztab_prh["accession"], out_mztab_prh["Protein.Ids"]))
+    ids_to_seqs = dict(zip(nested_df["Protein.Group"], nested_df["Stripped.Sequence"]))
+    acc_to_ids = dict(zip(out_mztab_prh["accession"], out_mztab_prh["Protein.Group"]))
     fasta_id_to_seqs = dict(zip(fasta_df["id"], fasta_df["seq"]))
     acc_to_fasta_ids: dict = {}

{quantms_utils-0.0.2 → quantms_utils-0.0.4}/quantmsutils/mzml/mzml_statistics.py RENAMED Viewed

@@ -1,9 +1,10 @@
-from pathlib import Path
-import sqlite3
 import re
+import sqlite3
+from pathlib import Path
 import click
 import pandas as pd
+import pyarrow
 from pyopenms import MSExperiment, MzMLFile
@@ -41,6 +42,14 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
     ]
     def parse_mzml(file_name: str, file_columns: list, id_only: bool = False):
+        """
+        Parse mzML file and return a pandas DataFrame with the information. If id_only is True, it will also save a csv.
+        @param file_name: The file name of the mzML file
+        @param file_columns: The columns of the DataFrame
+        @param id_only: If True, it will save a csv with the spectrum id, mz and intensity
+        @return: A pandas DataFrame with the information of the mzML file
+        """
         info = []
         psm_part_info = []
         exp = MSExperiment()
@@ -123,11 +132,10 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
         if id_only and len(psm_part_info) > 0:
             pd.DataFrame(
                 psm_part_info, columns=["scan", "ms_level", "mz", "intensity"]
-            ).to_csv(
-                f"{Path(ms_path).stem}_spectrum_df.csv",
-                mode="w",
+            ).to_parquet(
+                f"{Path(ms_path).stem}_spectrum_df.parquet",
                 index=False,
-                header=True,
+                compression="gzip",
             )
         return pd.DataFrame(info, columns=file_columns)
@@ -168,7 +176,7 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
         except sqlite3.OperationalError as e:
             if "no such table: Precursors" in str(e):
                 print(
-                    f"No precursers recorded in {file_name}, This is normal for DIA data."
+                    f"No precursors recorded in {file_name}, This is normal for DIA data."
                 )
                 precursor_df = pd.DataFrame()
             else:
@@ -219,13 +227,12 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
     elif Path(ms_path).suffix in [".mzML", ".mzml"]:
         ms_df = parse_mzml(ms_path, file_columns, id_only)
     else:
-        msg = f"Unrecognized or inexistent mass spec file '{ms_path}'"
+        msg = f"Unrecognized or the mass spec file '{ms_path}' do not exist"
         raise RuntimeError(msg)
-    ms_df.to_csv(
-        f"{Path(ms_path).stem}_ms_info.tsv",
-        mode="w",
-        sep="\t",
+    ms_df.to_parquet(
+        f"{Path(ms_path).stem}_ms_info.parquet",
+        engine="pyarrow",
         index=False,
-        header=True,
+        compression="gzip",
     )

quantms-utils 0.0.2__tar.gz → 0.0.4__tar.gz

quantms-utils 0.0.2 (tar.gz) → 0.0.4 (tar.gz)