msreport 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/__init__.py ADDED
@@ -0,0 +1,13 @@
+ from msreport.qtable import Qtable
+ from msreport.reader import MaxQuantReader, FragPipeReader, SpectronautReader
+
+ from msreport.fasta import import_protein_database
+
+ import msreport.analyze
+ import msreport.export
+ import msreport.impute
+ import msreport.normalize
+ import msreport.plot
+ import msreport.reader
+
+ __version__ = "0.0.24"
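
The package root re-exports the main entry points (Qtable, the reader classes, and the FASTA importer), so downstream scripts can work from a single import. A minimal smoke test, assuming msreport 0.0.24 is installed; only names exposed by the __init__.py above are used:

    import msreport
    from msreport import Qtable, MaxQuantReader, FragPipeReader, SpectronautReader

    # The re-exported names and the version string come straight from __init__.py.
    print(msreport.__version__)  # -> 0.0.24

The next hunk's file header is not preserved in this diff; judging by how the later files import it, it is the module available as msreport.aggregate.condense.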
@@ -0,0 +1,163 @@
+ import numpy as np
+
+ import msreport.helper.maxlfq as MAXLFQ
+
+
+ def join_str(array: np.ndarray, sep: str = ";") -> str:
+     """Returns a joined string of sorted values from the array.
+
+     Note that empty strings or np.nan are not included in the joined string.
+     """
+     elements = []
+     for value in array.flatten():
+         if value != "" and not (isinstance(value, float) and np.isnan(value)):
+             elements.append(str(value))
+     return sep.join(sorted(elements))
+
+
+ def join_str_per_column(array: np.ndarray, sep: str = ";") -> np.ndarray:
+     """Returns for each column a joined string of sorted values.
+
+     Note that empty strings or np.nan are not included in the joined string.
+     """
+     return np.array([join_str(i, sep=sep) for i in array.transpose()])
+
+
+ def join_unique_str(array: np.ndarray, sep: str = ";") -> str:
+     """Returns a joined string of unique sorted values from the array."""
+     elements = []
+     for value in array.flatten():
+         if value != "" and not (isinstance(value, float) and np.isnan(value)):
+             elements.append(str(value))
+     return sep.join(sorted(set(elements)))
+
+
+ def join_unique_str_per_column(array: np.ndarray, sep: str = ";") -> np.ndarray:
+     """Returns for each column a joined string of unique sorted values."""
+     return np.array([join_unique_str(i, sep=sep) for i in array.transpose()])
+
+
+ def sum(array: np.ndarray) -> float:
+     """Returns the sum of values from one or multiple columns.
+
+     Note that if no finite values are present in the array np.nan is returned.
+     """
+     array = array.flatten()
+     if np.isfinite(array).any():
+         return np.nansum(array)
+     else:
+         return np.nan
+
+
+ def sum_per_column(array: np.ndarray) -> np.ndarray:
+     """Returns for each column the sum of values.
+
+     Note that if no finite values are present in a column np.nan is returned.
+     """
+     return np.array([sum(i) for i in array.transpose()])
+
+
+ def maximum(array: np.ndarray) -> float:
+     """Returns the highest finite value from one or multiple columns."""
+     array = array.flatten()
+     if np.isfinite(array).any():
+         return np.nanmax(array)
+     else:
+         return np.nan
+
+
+ def maximum_per_column(array: np.ndarray) -> np.ndarray:
+     """Returns for each column the highest finite value."""
+     return np.array([maximum(i) for i in array.transpose()])
+
+
+ def minimum(array: np.ndarray) -> float:
+     """Returns the lowest finite value from one or multiple columns."""
+     array = array.flatten()
+     if np.isfinite(array).any():
+         return np.nanmin(array)
+     else:
+         return np.nan
+
+
+ def minimum_per_column(array: np.ndarray) -> np.ndarray:
+     """Returns for each column the lowest finite value."""
+     return np.array([minimum(i) for i in array.transpose()])
+
+
+ def count_unique(array: np.ndarray) -> int:
+     """Returns the number of unique values from one or multiple columns.
+
+     Note that empty strings or np.nan are not counted as unique values.
+     """
+     unique_elements = {
+         x for x in array.flatten() if not (isinstance(x, float) and np.isnan(x))
+     }
+     unique_elements.discard("")
+
+     return len(unique_elements)
+
+
+ def count_unique_per_column(array: np.ndarray) -> np.ndarray:
+     """Returns for each column the number of unique values.
+
+     Note that empty strings or np.nan are not counted as unique values.
+     """
+     if array.size > 0:
+         return np.array([count_unique(i) for i in array.transpose()])
+     else:
+         return np.full(array.shape[1], 0)  # one zero per column
+
+
+ def profile_by_median_ratio_regression(array: np.ndarray) -> np.ndarray:
+     """Calculates abundance profiles by lstsq regression of pair-wise median ratios.
+
+     The function performs a least squares regression of pair-wise median ratios to
+     calculate estimated abundance profiles.
+
+     Args:
+         array: A two-dimensional array containing abundance values, with the first
+             dimension corresponding to rows and the second dimension to columns.
+             Abundance values must not be log transformed.
+
+     Returns:
+         An array containing estimated abundance profiles, with length equal to the
+         number of columns in the input array.
+     """
+     ratio_matrix = MAXLFQ.calculate_pairwise_median_log_ratio_matrix(
+         array, log_transformed=False
+     )
+     coef_matrix, ratio_array, initial_rows = MAXLFQ.prepare_coefficient_matrix(
+         ratio_matrix
+     )
+     log_profile = MAXLFQ.log_profiles_by_lstsq(coef_matrix, ratio_array)
+     profile = np.power(2, log_profile)
+     return profile
+
+
+ def sum_by_median_ratio_regression(array: np.ndarray) -> np.ndarray:
+     """Calculates summed abundance by lstsq regression of pair-wise median ratios.
+
+     The function performs a least squares regression of pair-wise median ratios to
+     calculate estimated abundance profiles. These profiles are then scaled so
+     that, over the columns with finite profile values, their sum matches the
+     sum of the corresponding input array columns.
+
+     Args:
+         array: A two-dimensional array containing abundance values, with the first
+             dimension corresponding to rows and the second dimension to columns.
+             Abundance values must not be log transformed.
+
+     Returns:
+         An array containing summed abundance estimates, with length equal to the
+         number of columns in the input array.
+     """
+     profile = profile_by_median_ratio_regression(array)
+     scaled_profile = profile
+     if np.isfinite(profile).any():
+         profile_mask = np.isfinite(profile)
+         scaled_profile[profile_mask] = profile[profile_mask] * (
+             np.nansum(array[:, profile_mask]) / np.nansum(profile[profile_mask])
+         )
+
+     return scaled_profile
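
The condenser semantics are easiest to see on small arrays. The sketch below assumes this hunk is msreport/aggregate/condense.py (the following files import it under exactly that path) and that msreport 0.0.24 and numpy are installed; expected outputs are annotated as comments:

    import numpy as np
    import msreport.aggregate.condense as CONDENSE

    quant = np.array([[1.0, 2.0], [1.0, 2.0], [np.nan, 2.0]])
    CONDENSE.sum_per_column(quant)      # -> array([2., 6.])
    CONDENSE.maximum_per_column(quant)  # -> array([1., 2.])

    names = np.array([["b", ""], ["a", "a"]], dtype=object)
    CONDENSE.join_unique_str(names)     # -> 'a;b' (empty strings are dropped)
    CONDENSE.count_unique(names)        # -> 2

    # For a matrix with perfectly consistent 1:2 column ratios, the regression-
    # based summation should reproduce the naive column sums (expected behavior,
    # not verified against the msreport.helper.maxlfq internals).
    full = np.array([[1.0, 2.0], [1.0, 2.0], [1.0, 2.0]])
    CONDENSE.sum_by_median_ratio_regression(full)  # expected: array([3., 6.])

The next hunk (file header likewise not preserved) contains the pivoting helpers built on top of these condensers.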
@@ -0,0 +1,132 @@
+ from typing import Iterable, Union
+
+ import pandas as pd
+ import msreport.aggregate.condense as CONDENSE
+ import msreport.helper
+
+
+ def pivot_table(
+     long_table: pd.DataFrame,
+     index: str,
+     group_by: str,
+     annotation_columns: Iterable[str],
+     pivoting_columns: Iterable[str],
+ ) -> pd.DataFrame:
+     """Generates a pivoted table in wide format.
+
+     Args:
+         long_table: Dataframe in long format used to generate the wide-format table.
+         index: One or multiple column names that are used to group the table for
+             pivoting.
+         group_by: Column that is used to split the table on its unique entries.
+         annotation_columns: Each column generates a new column in the pivoted table.
+             Entries from each annotation column are aggregated for each group created
+             by the column(s) specified by 'index', and unique values are joined with
+             ";" as separator.
+         pivoting_columns: Columns that are combined with unique entries from
+             'group_by' to generate new columns in the pivoted table.
+
+     Returns:
+         A reshaped, pivoted table with length equal to the number of unique values
+         in the 'index' column.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "B", "C", "B", "C", "D"],
+         ...         "Sample": ["S1", "S1", "S1", "S2", "S2", "S2"],
+         ...         "Annotation": ["A", "B", "C", "B", "C", "D"],
+         ...         "Quant": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
+         ...     }
+         ... )
+         >>> pivot_table(table, "ID", "Sample", ["Annotation"], ["Quant"])
+           ID Annotation  Quant S1  Quant S2
+         0  A          A       1.0       NaN
+         1  B          B       1.0       2.0
+         2  C          C       1.0       2.0
+         3  D          D       NaN       2.0
+     """
+     sub_tables = []
+     for column in annotation_columns:
+         sub_tables.append(join_unique(long_table, index, column))
+     for column in pivoting_columns:
+         sub_tables.append(pivot_column(long_table, index, group_by, column))
+
+     wide_table = msreport.helper.join_tables(sub_tables, reset_index=True)
+     return wide_table
+
+
+ def pivot_column(
+     table: pd.DataFrame, index: Union[str, Iterable], group_by: str, values: str
+ ) -> pd.DataFrame:
+     """Returns a reshaped dataframe, generated by pivoting the table on one column.
+
+     Uses unique values from the specified 'index' to form the index axis of the new
+     dataframe. Unique values from the 'group_by' column are used to split the data
+     and generate new columns that are filled with values from the 'values' column.
+     The column names are composed of the 'values' column and the unique entries
+     from 'group_by'.
+
+     Args:
+         table: Dataframe that is used to generate the pivoted table.
+         index: One or multiple column names that are used as the new index.
+         group_by: Column that is used to split the table; each unique entry from
+             this column generates a new column in the pivoted table.
+         values: Column whose values are used to populate the pivoted table.
+
+     Returns:
+         The pivoted dataframe.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "A", "B", "B"],
+         ...         "Sample": ["S1", "S2", "S1", "S2"],
+         ...         "Entries": [1.0, 2.0, 1.0, 2.0],
+         ...     }
+         ... )
+         >>> pivot_column(table, "ID", "Sample", "Entries")
+             Entries S1  Entries S2
+         ID
+         A          1.0         2.0
+         B          1.0         2.0
+     """
+     pivot = table.pivot(index=index, columns=group_by, values=values)
+     pivot.columns = [f"{values} {sample_column}" for sample_column in pivot.columns]
+     return pivot
+
+
+ def join_unique(
+     table: pd.DataFrame, index: Union[str, Iterable], values: str
+ ) -> pd.DataFrame:
+     """Returns a new dataframe with unique values from a column, grouped by 'index'.
+
+     Args:
+         table: Input dataframe from which to generate the new dataframe.
+         index: One or multiple column names to group the table by.
+         values: Column from which unique values are extracted.
+
+     Returns:
+         A dataframe with a single column named after 'values', where the unique
+         values of the column specified by 'values' are joined together with ";" for
+         each group created by the column(s) specified by 'index'.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "A", "B", "B"],
+         ...         "Annotation": ["A1", "A1", "B1", "B1"],
+         ...     }
+         ... )
+         >>> join_unique(table, "ID", "Annotation")
+            Annotation
+         ID
+         A          A1
+         B          B1
+     """
+     series = table.groupby(index)[values].agg(
+         lambda x: CONDENSE.join_unique_str(x.to_numpy())
+     )
+     new_df = pd.DataFrame(series)
+     new_df.columns = [values]
+     return new_df
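
pivot_table composes the other two helpers: one join_unique block per annotation column, one pivot_column block per pivoting column, merged via msreport.helper.join_tables. A sketch with two pivoting columns; the import path is hypothetical, since this hunk's file name is not shown in the diff:

    import pandas as pd
    from msreport.aggregate.pivot import pivot_table  # hypothetical module path

    long_table = pd.DataFrame(
        {
            "ID": ["A", "A", "B", "B"],
            "Sample": ["S1", "S2", "S1", "S2"],
            "Gene": ["gA", "gA", "gB", "gB"],
            "Quant": [1.0, 2.0, 3.0, 4.0],
            "Count": [5.0, 6.0, 7.0, 8.0],
        }
    )
    pivot_table(long_table, "ID", "Sample", ["Gene"], ["Quant", "Count"])
    #   ID Gene  Quant S1  Quant S2  Count S1  Count S2
    # 0  A   gA       1.0       2.0       5.0       6.0
    # 1  B   gB       3.0       4.0       7.0       8.0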
@@ -0,0 +1,281 @@
+ from typing import Callable, Iterable, Optional, Union
+
+ import numpy as np
+ import pandas as pd
+
+ import msreport.aggregate.condense as CONDENSE
+ from msreport.helper import find_sample_columns
+
+
+ def count_unique(
+     table: pd.DataFrame,
+     group_by: str,
+     input_column: Union[str, Iterable],
+     output_column: str = "Unique counts",
+     is_sorted: bool = False,
+ ) -> pd.DataFrame:
+     """Aggregates column(s) by counting unique values for each unique group.
+
+     Note that empty strings and np.nan do not contribute to the unique value count.
+
+     Args:
+         table: The input DataFrame used for aggregating on unique groups.
+         group_by: The name of the column used to determine unique groups for
+             aggregation.
+         input_column: A column or a list of columns, whose unique values will be
+             counted for each unique group during aggregation.
+         output_column: The name of the column containing the aggregation results. By
+             default "Unique counts" is used as the name of the output column.
+         is_sorted: Indicates whether the input dataframe is already sorted with
+             respect to the 'group_by' column.
+
+     Returns:
+         A dataframe with unique 'group_by' values as index and a unique counts
+         column containing the number of unique values per group.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "A", "B", "C", "C", "C"],
+         ...         "Peptide sequence": ["a", "a", "b", "c1", "c2", "c2"],
+         ...     }
+         ... )
+         >>> count_unique(table, group_by="ID", input_column="Peptide sequence")
+            Unique counts
+         A              1
+         B              1
+         C              2
+     """
+     aggregation, groups = aggregate_unique_groups(
+         table, group_by, input_column, CONDENSE.count_unique, is_sorted
+     )
+     return pd.DataFrame(columns=[output_column], data=aggregation, index=groups)
+
+
+ def join_unique(
+     table: pd.DataFrame,
+     group_by: str,
+     input_column: Union[str, Iterable],
+     output_column: str = "Unique values",
+     sep: str = ";",
+     is_sorted: bool = False,
+ ) -> pd.DataFrame:
+     """Aggregates column(s) by concatenating unique values for each unique group.
+
+     Note that empty strings and np.nan are not included in the joined values.
+
+     Args:
+         table: The input DataFrame used for aggregating on unique groups.
+         group_by: The name of the column used to determine unique groups for
+             aggregation.
+         input_column: A column or a list of columns, whose unique values will be
+             joined into a single string for each unique group.
+         output_column: The name of the column containing the aggregation results. By
+             default "Unique values" is used as the name of the output column.
+         sep: The separator string used to join multiple unique values together.
+             Default is ";".
+         is_sorted: Indicates whether the input dataframe is already sorted with
+             respect to the 'group_by' column.
+
+     Returns:
+         A dataframe with unique 'group_by' values as index and a unique values
+         column containing the joined unique values per group. Unique values are
+         sorted and joined with the specified separator.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "A", "B", "C", "C", "C"],
+         ...         "Peptide sequence": ["a", "", "b", "c1", "c2", "c2"],
+         ...     }
+         ... )
+         >>> join_unique(table, group_by="ID", input_column="Peptide sequence")
+           Unique values
+         A             a
+         B             b
+         C         c1;c2
+     """
+     aggregation, groups = aggregate_unique_groups(
+         table,
+         group_by,
+         input_column,
+         lambda x: CONDENSE.join_unique_str(x, sep=sep),
+         is_sorted,
+     )
+     return pd.DataFrame(columns=[output_column], data=aggregation, index=groups)
+
+
+ def sum_columns(
+     table: pd.DataFrame,
+     group_by: str,
+     samples: Iterable[str],
+     input_tag: str,
+     output_tag: Optional[str] = None,
+     is_sorted: bool = False,
+ ) -> pd.DataFrame:
+     """Aggregates column(s) by summing up values for each unique group.
+
+     Args:
+         table: The input DataFrame used for aggregating on unique groups.
+         group_by: The name of the column used to determine unique groups for
+             aggregation.
+         samples: List of sample names that appear as substrings in the table columns.
+         input_tag: Substring of column names, which is used together with the sample
+             names to determine the columns whose values will be summed for each
+             unique group.
+         output_tag: Optional, allows changing the output column names by replacing
+             the 'input_tag' with the 'output_tag'. If not specified, the names of the
+             columns that were used for aggregation are kept in the returned dataframe.
+         is_sorted: Indicates whether the input dataframe is already sorted with
+             respect to the 'group_by' column.
+
+     Returns:
+         A dataframe with unique 'group_by' values as index and one column per
+         sample. The columns contain the summed group values per sample.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "A", "B", "C", "C", "C"],
+         ...         "Col S1": [1, 1, 1, 1, 1, 1],
+         ...         "Col S2": [2, 2, 2, 2, 2, 2],
+         ...     }
+         ... )
+         >>> sum_columns(table, "ID", samples=["S1", "S2"], input_tag="Col")
+            Col S1  Col S2
+         A       2       4
+         B       1       2
+         C       3       6
+     """
+     output_tag = input_tag if output_tag is None else output_tag
+     columns = find_sample_columns(table, input_tag, samples)
+     aggregation, groups = aggregate_unique_groups(
+         table, group_by, columns, CONDENSE.sum_per_column, is_sorted
+     )
+     output_columns = [column.replace(input_tag, output_tag) for column in columns]
+     return pd.DataFrame(columns=output_columns, data=aggregation, index=groups)
+
+
+ def sum_columns_maxlfq(
+     table: pd.DataFrame,
+     group_by: str,
+     samples: Iterable[str],
+     input_tag: str,
+     output_tag: Optional[str] = None,
+     is_sorted: bool = False,
+ ) -> pd.DataFrame:
+     """Aggregates column(s) by applying the MaxLFQ summation approach per group.
+
+     This function estimates abundance profiles from sample columns using pairwise
+     median ratios and least squares regression. It then selects abundance profiles
+     with finite values and the corresponding input columns, and scales the abundance
+     profiles so that their total sum equals the total sum of those input columns.
+
+     Args:
+         table: The input DataFrame used for aggregating on unique groups.
+         group_by: The name of the column used to determine unique groups for
+             aggregation.
+         samples: List of sample names that appear as substrings in the table columns.
+         input_tag: Substring of column names, which is used together with the sample
+             names to determine the columns whose values will be summed for each
+             unique group.
+         output_tag: Optional, allows changing the output column names by replacing
+             the 'input_tag' with the 'output_tag'. If not specified, the names of the
+             columns that were used for aggregation are kept in the returned dataframe.
+         is_sorted: Indicates whether the input dataframe is already sorted with
+             respect to the 'group_by' column.
+
+     Returns:
+         A dataframe with unique 'group_by' values as index and one column per
+         sample. The columns contain the summed group values per sample.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "A", "B", "C", "C", "C"],
+         ...         "Col S1": [1, 1, 1, 1, 1, 1],
+         ...         "Col S2": [2, 2, 2, 2, 2, 2],
+         ...     }
+         ... )
+         >>> sum_columns_maxlfq(table, "ID", samples=["S1", "S2"], input_tag="Col")
+            Col S1  Col S2
+         A     2.0     4.0
+         B     1.0     2.0
+         C     3.0     6.0
+     """
+     output_tag = input_tag if output_tag is None else output_tag
+     columns = find_sample_columns(table, input_tag, samples)
+     aggregation, groups = aggregate_unique_groups(
+         table, group_by, columns, CONDENSE.sum_by_median_ratio_regression, is_sorted
+     )
+     output_columns = [column.replace(input_tag, output_tag) for column in columns]
+     return pd.DataFrame(columns=output_columns, data=aggregation, index=groups)
+
+
+ def aggregate_unique_groups(
+     table: pd.DataFrame,
+     group_by: str,
+     columns_to_aggregate: Union[str, Iterable],
+     condenser: Callable,
+     is_sorted: bool,
+ ) -> tuple[np.ndarray, np.ndarray]:
+     """Aggregates column(s) by applying a condenser function to unique groups.
+
+     The function returns two arrays containing the aggregated values and the
+     corresponding group names. This function can be used, for example, to summarize
+     data from an ion table to a peptide, protein or modification table. Suitable
+     condenser functions can be found in the module msreport.aggregate.condense.
+
+     Args:
+         table: The input dataframe used for aggregating on unique groups.
+         group_by: The name of the column used to determine unique groups for
+             aggregation.
+         columns_to_aggregate: A column or a list of columns, which will be passed to
+             the condenser function for applying an aggregation to each unique group.
+         condenser: Function that is applied to each group for generating the
+             aggregation result. If multiple columns are specified for aggregation,
+             the input array for the condenser function will be two-dimensional, with
+             the first dimension corresponding to rows and the second to columns, e.g.
+             an array with 3 rows and 2 columns: np.array([[1, 'a'], [2, 'b'], [3, 'c']])
+         is_sorted: Indicates whether the input dataframe is already sorted with
+             respect to the 'group_by' column.
+
+     Returns:
+         Two numpy arrays; the first contains the aggregation results of each unique
+         group and the second contains the corresponding group names.
+     """
+     group_start_indices, group_names, table = _prepare_grouping_indices(
+         table, group_by, is_sorted
+     )
+     array = table[columns_to_aggregate].to_numpy()
+     aggregation_result = np.array(
+         [condenser(i) for i in np.split(array, group_start_indices[1:])]
+     )
+     return aggregation_result, group_names
+
+
+ def _prepare_grouping_indices(
+     table: pd.DataFrame, group_by: str, is_sorted: bool
+ ) -> tuple[np.ndarray, np.ndarray, pd.DataFrame]:
+     """Prepares start indices and names of unique groups from a sorted dataframe.
+
+     Args:
+         table: The input DataFrame used for generating unique groups.
+         group_by: The name of the column used to determine unique groups.
+         is_sorted: If True, the input DataFrame is assumed to be already sorted with
+             respect to the 'group_by' column. Otherwise, the input DataFrame is
+             sorted by the 'group_by' column and the sorted DataFrame is returned.
+
+     Returns:
+         A tuple containing the following three elements:
+         - A numpy array containing the start indices of each unique group
+         - A numpy array containing the names of each unique group
+         - The input DataFrame sorted by the 'group_by' column, if it was not already
+           sorted.
+     """
+     if not is_sorted:
+         table = table.sort_values(by=group_by)
+     group_names, group_start_indices, group_lengths = np.unique(
+         table[group_by], return_counts=True, return_index=True
+     )
+     return group_start_indices, group_names, table
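
aggregate_unique_groups also accepts arbitrary condenser callables, which is how custom per-group summaries beyond the wrappers above can be built. A hedged sketch using np.nanmean as the condenser (any function mapping a group's value array to a result works), assuming msreport 0.0.24 is installed:

    import numpy as np
    import pandas as pd

    table = pd.DataFrame(
        {
            "ID": ["B", "A", "A", "B"],
            "Intensity": [1.0, 2.0, 4.0, 3.0],
        }
    )
    # With a single input column the condenser receives a 1-D array per group;
    # the table is sorted by "ID" internally because is_sorted=False.
    values, groups = aggregate_unique_groups(
        table,
        group_by="ID",
        columns_to_aggregate="Intensity",
        condenser=np.nanmean,
        is_sorted=False,
    )
    # groups -> array(['A', 'B'], dtype=object); values -> array([3., 2.])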