msreport 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/__init__.py ADDED
@@ -0,0 +1,13 @@
+ from msreport.qtable import Qtable
+ from msreport.reader import MaxQuantReader, FragPipeReader, SpectronautReader
+
+ from msreport.fasta import import_protein_database
+
+ import msreport.analyze
+ import msreport.export
+ import msreport.impute
+ import msreport.normalize
+ import msreport.plot
+ import msreport.reader
+
+ __version__ = "0.0.24"
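
The package root re-exports the main entry points (Qtable, the reader classes, and the FASTA importer), so downstream scripts can work from a single import. A minimal smoke test, assuming msreport 0.0.24 is installed; only names exposed by the __init__.py above are used:

    import msreport
    from msreport import Qtable, MaxQuantReader, FragPipeReader, SpectronautReader

    # The re-exported names and the version string come straight from __init__.py.
    print(msreport.__version__)  # -> 0.0.24

The next hunk's file header is not preserved in this diff; judging by how the later files import it, it is the module available as msreport.aggregate.condense.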
@@ -0,0 +1,163 @@
+ import numpy as np
+
+ import msreport.helper.maxlfq as MAXLFQ
+
+
+ def join_str(array: np.ndarray, sep: str = ";") -> str:
+     """Returns a joined string of sorted values from the array.
+
+     Note that empty strings or np.nan are not included in the joined string.
+     """
+     elements = []
+     for value in array.flatten():
+         if value != "" and not (isinstance(value, float) and np.isnan(value)):
+             elements.append(str(value))
+     return sep.join(sorted(elements))
+
+
+ def join_str_per_column(array: np.ndarray, sep: str = ";") -> np.ndarray:
+     """Returns for each column a joined string of sorted values.
+
+     Note that empty strings or np.nan are not included in the joined string.
+     """
+     return np.array([join_str(i, sep=sep) for i in array.transpose()])
+
+
+ def join_unique_str(array: np.ndarray, sep: str = ";") -> str:
+     """Returns a joined string of unique sorted values from the array."""
+     elements = []
+     for value in array.flatten():
+         if value != "" and not (isinstance(value, float) and np.isnan(value)):
+             elements.append(str(value))
+     return sep.join(sorted(set(elements)))
+
+
+ def join_unique_str_per_column(array: np.ndarray, sep: str = ";") -> np.ndarray:
+     """Returns for each column a joined string of unique sorted values."""
+     return np.array([join_unique_str(i, sep=sep) for i in array.transpose()])
+
+
+ def sum(array: np.ndarray) -> float:
+     """Returns the sum of values from one or multiple columns.
+
+     Note that if no finite values are present in the array np.nan is returned.
+     """
+     array = array.flatten()
+     if np.isfinite(array).any():
+         return np.nansum(array)
+     else:
+         return np.nan
+
+
+ def sum_per_column(array: np.ndarray) -> np.ndarray:
+     """Returns for each column the sum of values.
+
+     Note that if no finite values are present in a column np.nan is returned.
+     """
+     return np.array([sum(i) for i in array.transpose()])
+
+
+ def maximum(array: np.ndarray) -> float:
+     """Returns the highest finite value from one or multiple columns."""
+     array = array.flatten()
+     if np.isfinite(array).any():
+         return np.nanmax(array)
+     else:
+         return np.nan
+
+
+ def maximum_per_column(array: np.ndarray) -> np.ndarray:
+     """Returns for each column the highest finite value."""
+     return np.array([maximum(i) for i in array.transpose()])
+
+
+ def minimum(array: np.ndarray) -> float:
+     """Returns the lowest finite value from one or multiple columns."""
+     array = array.flatten()
+     if np.isfinite(array).any():
+         return np.nanmin(array)
+     else:
+         return np.nan
+
+
+ def minimum_per_column(array: np.ndarray) -> np.ndarray:
+     """Returns for each column the lowest finite value."""
+     return np.array([minimum(i) for i in array.transpose()])
+
+
+ def count_unique(array: np.ndarray) -> int:
+     """Returns the number of unique values from one or multiple columns.
+
+     Note that empty strings or np.nan are not counted as unique values.
+     """
+     unique_elements = {
+         x for x in array.flatten() if not (isinstance(x, float) and np.isnan(x))
+     }
+     unique_elements.discard("")
+
+     return len(unique_elements)
+
+
+ def count_unique_per_column(array: np.ndarray) -> np.ndarray:
+     """Returns for each column the number of unique values.
+
+     Note that empty strings or np.nan are not counted as unique values.
+     """
+     if array.size > 0:
+         return np.array([count_unique(i) for i in array.transpose()])
+     else:
+         return np.full(array.shape[1], 0)  # one zero per column
+
+
+ def profile_by_median_ratio_regression(array: np.ndarray) -> np.ndarray:
+     """Calculates abundance profiles by lstsq regression of pair-wise median ratios.
+
+     The function performs a least squares regression of pair-wise median ratios to
+     calculate estimated abundance profiles.
+
+     Args:
+         array: A two-dimensional array containing abundance values, with the first
+             dimension corresponding to rows and the second dimension to columns.
+             Abundance values must not be log transformed.
+
+     Returns:
+         An array containing estimated abundance profiles, with length equal to the
+         number of columns in the input array.
+     """
+     ratio_matrix = MAXLFQ.calculate_pairwise_median_log_ratio_matrix(
+         array, log_transformed=False
+     )
+     coef_matrix, ratio_array, initial_rows = MAXLFQ.prepare_coefficient_matrix(
+         ratio_matrix
+     )
+     log_profile = MAXLFQ.log_profiles_by_lstsq(coef_matrix, ratio_array)
+     profile = np.power(2, log_profile)
+     return profile
+
+
+ def sum_by_median_ratio_regression(array: np.ndarray) -> np.ndarray:
+     """Calculates summed abundance by lstsq regression of pair-wise median ratios.
+
+     The function performs a least squares regression of pair-wise median ratios to
+     calculate estimated abundance profiles. These profiles are then scaled so
+     that, over the columns with finite profile values, their sum matches the
+     sum of the corresponding input array columns.
+
+     Args:
+         array: A two-dimensional array containing abundance values, with the first
+             dimension corresponding to rows and the second dimension to columns.
+             Abundance values must not be log transformed.
+
+     Returns:
+         An array containing summed abundance estimates, with length equal to the
+         number of columns in the input array.
+     """
+     profile = profile_by_median_ratio_regression(array)
+     scaled_profile = profile
+     if np.isfinite(profile).any():
+         profile_mask = np.isfinite(profile)
+         scaled_profile[profile_mask] = profile[profile_mask] * (
+             np.nansum(array[:, profile_mask]) / np.nansum(profile[profile_mask])
+         )
+
+     return scaled_profile
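
The condenser semantics are easiest to see on small arrays. The sketch below assumes this hunk is msreport/aggregate/condense.py (the following files import it under exactly that path) and that msreport 0.0.24 and numpy are installed; expected outputs are annotated as comments:

    import numpy as np
    import msreport.aggregate.condense as CONDENSE

    quant = np.array([[1.0, 2.0], [1.0, 2.0], [np.nan, 2.0]])
    CONDENSE.sum_per_column(quant)      # -> array([2., 6.])
    CONDENSE.maximum_per_column(quant)  # -> array([1., 2.])

    names = np.array([["b", ""], ["a", "a"]], dtype=object)
    CONDENSE.join_unique_str(names)     # -> 'a;b' (empty strings are dropped)
    CONDENSE.count_unique(names)        # -> 2

    # For a matrix with perfectly consistent 1:2 column ratios, the regression-
    # based summation should reproduce the naive column sums (expected behavior,
    # not verified against the msreport.helper.maxlfq internals).
    full = np.array([[1.0, 2.0], [1.0, 2.0], [1.0, 2.0]])
    CONDENSE.sum_by_median_ratio_regression(full)  # expected: array([3., 6.])

The next hunk (file header likewise not preserved) contains the pivoting helpers built on top of these condensers.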
@@ -0,0 +1,132 @@
+ from typing import Iterable, Union
+
+ import pandas as pd
+ import msreport.aggregate.condense as CONDENSE
+ import msreport.helper
+
+
+ def pivot_table(
+     long_table: pd.DataFrame,
+     index: str,
+     group_by: str,
+     annotation_columns: Iterable[str],
+     pivoting_columns: Iterable[str],
+ ) -> pd.DataFrame:
+     """Generates a pivoted table in wide format.
+
+     Args:
+         long_table: Dataframe in long format used to generate the wide-format table.
+         index: One or multiple column names that are used to group the table for
+             pivoting.
+         group_by: Column that is used to split the table on its unique entries.
+         annotation_columns: Each column generates a new column in the pivoted table.
+             Entries from each annotation column are aggregated for each group created
+             by the column(s) specified by 'index', and unique values are joined with
+             ";" as separator.
+         pivoting_columns: Columns that are combined with unique entries from
+             'group_by' to generate new columns in the pivoted table.
+
+     Returns:
+         A reshaped, pivoted table with length equal to the number of unique values
+         in the 'index' column.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "B", "C", "B", "C", "D"],
+         ...         "Sample": ["S1", "S1", "S1", "S2", "S2", "S2"],
+         ...         "Annotation": ["A", "B", "C", "B", "C", "D"],
+         ...         "Quant": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
+         ...     }
+         ... )
+         >>> pivot_table(table, "ID", "Sample", ["Annotation"], ["Quant"])
+           ID Annotation  Quant S1  Quant S2
+         0  A          A       1.0       NaN
+         1  B          B       1.0       2.0
+         2  C          C       1.0       2.0
+         3  D          D       NaN       2.0
+     """
+     sub_tables = []
+     for column in annotation_columns:
+         sub_tables.append(join_unique(long_table, index, column))
+     for column in pivoting_columns:
+         sub_tables.append(pivot_column(long_table, index, group_by, column))
+
+     wide_table = msreport.helper.join_tables(sub_tables, reset_index=True)
+     return wide_table
+
+
+ def pivot_column(
+     table: pd.DataFrame, index: Union[str, Iterable], group_by: str, values: str
+ ) -> pd.DataFrame:
+     """Returns a reshaped dataframe, generated by pivoting the table on one column.
+
+     Uses unique values from the specified 'index' to form the index axis of the new
+     dataframe. Unique values from the 'group_by' column are used to split the data
+     and generate new columns that are filled with values from the 'values' column.
+     The column names are composed of the 'values' column and the unique entries
+     from 'group_by'.
+
+     Args:
+         table: Dataframe that is used to generate the pivoted table.
+         index: One or multiple column names that are used as the new index.
+         group_by: Column that is used to split the table; each unique entry from
+             this column generates a new column in the pivoted table.
+         values: Column whose values are used to populate the pivoted table.
+
+     Returns:
+         The pivoted dataframe.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "A", "B", "B"],
+         ...         "Sample": ["S1", "S2", "S1", "S2"],
+         ...         "Entries": [1.0, 2.0, 1.0, 2.0],
+         ...     }
+         ... )
+         >>> pivot_column(table, "ID", "Sample", "Entries")
+             Entries S1  Entries S2
+         ID
+         A          1.0         2.0
+         B          1.0         2.0
+     """
+     pivot = table.pivot(index=index, columns=group_by, values=values)
+     pivot.columns = [f"{values} {sample_column}" for sample_column in pivot.columns]
+     return pivot
+
+
+ def join_unique(
+     table: pd.DataFrame, index: Union[str, Iterable], values: str
+ ) -> pd.DataFrame:
+     """Returns a new dataframe with unique values from a column, grouped by 'index'.
+
+     Args:
+         table: Input dataframe from which to generate the new dataframe.
+         index: One or multiple column names to group the table by.
+         values: Column from which unique values are extracted.
+
+     Returns:
+         A dataframe with a single column named after 'values', where the unique
+         values of the column specified by 'values' are joined together with ";" for
+         each group created by the column(s) specified by 'index'.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "A", "B", "B"],
+         ...         "Annotation": ["A1", "A1", "B1", "B1"],
+         ...     }
+         ... )
+         >>> join_unique(table, "ID", "Annotation")
+            Annotation
+         ID
+         A          A1
+         B          B1
+     """
+     series = table.groupby(index)[values].agg(
+         lambda x: CONDENSE.join_unique_str(x.to_numpy())
+     )
+     new_df = pd.DataFrame(series)
+     new_df.columns = [values]
+     return new_df
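
pivot_table composes the other two helpers: one join_unique block per annotation column, one pivot_column block per pivoting column, merged via msreport.helper.join_tables. A sketch with two pivoting columns; the import path is hypothetical, since this hunk's file name is not shown in the diff:

    import pandas as pd
    from msreport.aggregate.pivot import pivot_table  # hypothetical module path

    long_table = pd.DataFrame(
        {
            "ID": ["A", "A", "B", "B"],
            "Sample": ["S1", "S2", "S1", "S2"],
            "Gene": ["gA", "gA", "gB", "gB"],
            "Quant": [1.0, 2.0, 3.0, 4.0],
            "Count": [5.0, 6.0, 7.0, 8.0],
        }
    )
    pivot_table(long_table, "ID", "Sample", ["Gene"], ["Quant", "Count"])
    #   ID Gene  Quant S1  Quant S2  Count S1  Count S2
    # 0  A   gA       1.0       2.0       5.0       6.0
    # 1  B   gB       3.0       4.0       7.0       8.0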
@@ -0,0 +1,281 @@
+ from typing import Callable, Iterable, Optional, Union
+
+ import numpy as np
+ import pandas as pd
+
+ import msreport.aggregate.condense as CONDENSE
+ from msreport.helper import find_sample_columns
+
+
+ def count_unique(
+     table: pd.DataFrame,
+     group_by: str,
+     input_column: Union[str, Iterable],
+     output_column: str = "Unique counts",
+     is_sorted: bool = False,
+ ) -> pd.DataFrame:
+     """Aggregates column(s) by counting unique values for each unique group.
+
+     Note that empty strings and np.nan do not contribute to the unique value count.
+
+     Args:
+         table: The input DataFrame used for aggregating on unique groups.
+         group_by: The name of the column used to determine unique groups for
+             aggregation.
+         input_column: A column or a list of columns, whose unique values will be
+             counted for each unique group during aggregation.
+         output_column: The name of the column containing the aggregation results. By
+             default "Unique counts" is used as the name of the output column.
+         is_sorted: Indicates whether the input dataframe is already sorted with
+             respect to the 'group_by' column.
+
+     Returns:
+         A dataframe with unique 'group_by' values as index and a unique counts
+         column containing the number of unique values per group.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "A", "B", "C", "C", "C"],
+         ...         "Peptide sequence": ["a", "a", "b", "c1", "c2", "c2"],
+         ...     }
+         ... )
+         >>> count_unique(table, group_by="ID", input_column="Peptide sequence")
+            Unique counts
+         A              1
+         B              1
+         C              2
+     """
+     aggregation, groups = aggregate_unique_groups(
+         table, group_by, input_column, CONDENSE.count_unique, is_sorted
+     )
+     return pd.DataFrame(columns=[output_column], data=aggregation, index=groups)
+
+
+ def join_unique(
+     table: pd.DataFrame,
+     group_by: str,
+     input_column: Union[str, Iterable],
+     output_column: str = "Unique values",
+     sep: str = ";",
+     is_sorted: bool = False,
+ ) -> pd.DataFrame:
+     """Aggregates column(s) by concatenating unique values for each unique group.
+
+     Note that empty strings and np.nan are not included in the joined values.
+
+     Args:
+         table: The input DataFrame used for aggregating on unique groups.
+         group_by: The name of the column used to determine unique groups for
+             aggregation.
+         input_column: A column or a list of columns, whose unique values will be
+             joined into a single string for each unique group.
+         output_column: The name of the column containing the aggregation results. By
+             default "Unique values" is used as the name of the output column.
+         sep: The separator string used to join multiple unique values together.
+             Default is ";".
+         is_sorted: Indicates whether the input dataframe is already sorted with
+             respect to the 'group_by' column.
+
+     Returns:
+         A dataframe with unique 'group_by' values as index and a unique values
+         column containing the joined unique values per group. Unique values are
+         sorted and joined with the specified separator.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "A", "B", "C", "C", "C"],
+         ...         "Peptide sequence": ["a", "", "b", "c1", "c2", "c2"],
+         ...     }
+         ... )
+         >>> join_unique(table, group_by="ID", input_column="Peptide sequence")
+           Unique values
+         A             a
+         B             b
+         C         c1;c2
+     """
+     aggregation, groups = aggregate_unique_groups(
+         table,
+         group_by,
+         input_column,
+         lambda x: CONDENSE.join_unique_str(x, sep=sep),
+         is_sorted,
+     )
+     return pd.DataFrame(columns=[output_column], data=aggregation, index=groups)
+
+
+ def sum_columns(
+     table: pd.DataFrame,
+     group_by: str,
+     samples: Iterable[str],
+     input_tag: str,
+     output_tag: Optional[str] = None,
+     is_sorted: bool = False,
+ ) -> pd.DataFrame:
+     """Aggregates column(s) by summing up values for each unique group.
+
+     Args:
+         table: The input DataFrame used for aggregating on unique groups.
+         group_by: The name of the column used to determine unique groups for
+             aggregation.
+         samples: List of sample names that appear as substrings in the table columns.
+         input_tag: Substring of column names, which is used together with the sample
+             names to determine the columns whose values will be summed for each
+             unique group.
+         output_tag: Optional, allows changing the output column names by replacing
+             the 'input_tag' with the 'output_tag'. If not specified, the names of the
+             columns that were used for aggregation are kept in the returned dataframe.
+         is_sorted: Indicates whether the input dataframe is already sorted with
+             respect to the 'group_by' column.
+
+     Returns:
+         A dataframe with unique 'group_by' values as index and one column per
+         sample. The columns contain the summed group values per sample.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "A", "B", "C", "C", "C"],
+         ...         "Col S1": [1, 1, 1, 1, 1, 1],
+         ...         "Col S2": [2, 2, 2, 2, 2, 2],
+         ...     }
+         ... )
+         >>> sum_columns(table, "ID", samples=["S1", "S2"], input_tag="Col")
+            Col S1  Col S2
+         A       2       4
+         B       1       2
+         C       3       6
+     """
+     output_tag = input_tag if output_tag is None else output_tag
+     columns = find_sample_columns(table, input_tag, samples)
+     aggregation, groups = aggregate_unique_groups(
+         table, group_by, columns, CONDENSE.sum_per_column, is_sorted
+     )
+     output_columns = [column.replace(input_tag, output_tag) for column in columns]
+     return pd.DataFrame(columns=output_columns, data=aggregation, index=groups)
+
+
+ def sum_columns_maxlfq(
+     table: pd.DataFrame,
+     group_by: str,
+     samples: Iterable[str],
+     input_tag: str,
+     output_tag: Optional[str] = None,
+     is_sorted: bool = False,
+ ) -> pd.DataFrame:
+     """Aggregates column(s) by applying the MaxLFQ summation approach per group.
+
+     This function estimates abundance profiles from sample columns using pairwise
+     median ratios and least squares regression. It then selects abundance profiles
+     with finite values and the corresponding input columns, and scales the abundance
+     profiles so that their total sum equals the total sum of those input columns.
+
+     Args:
+         table: The input DataFrame used for aggregating on unique groups.
+         group_by: The name of the column used to determine unique groups for
+             aggregation.
+         samples: List of sample names that appear as substrings in the table columns.
+         input_tag: Substring of column names, which is used together with the sample
+             names to determine the columns whose values will be summed for each
+             unique group.
+         output_tag: Optional, allows changing the output column names by replacing
+             the 'input_tag' with the 'output_tag'. If not specified, the names of the
+             columns that were used for aggregation are kept in the returned dataframe.
+         is_sorted: Indicates whether the input dataframe is already sorted with
+             respect to the 'group_by' column.
+
+     Returns:
+         A dataframe with unique 'group_by' values as index and one column per
+         sample. The columns contain the summed group values per sample.
+
+     Example:
+         >>> table = pd.DataFrame(
+         ...     {
+         ...         "ID": ["A", "A", "B", "C", "C", "C"],
+         ...         "Col S1": [1, 1, 1, 1, 1, 1],
+         ...         "Col S2": [2, 2, 2, 2, 2, 2],
+         ...     }
+         ... )
+         >>> sum_columns_maxlfq(table, "ID", samples=["S1", "S2"], input_tag="Col")
+            Col S1  Col S2
+         A     2.0     4.0
+         B     1.0     2.0
+         C     3.0     6.0
+     """
+     output_tag = input_tag if output_tag is None else output_tag
+     columns = find_sample_columns(table, input_tag, samples)
+     aggregation, groups = aggregate_unique_groups(
+         table, group_by, columns, CONDENSE.sum_by_median_ratio_regression, is_sorted
+     )
+     output_columns = [column.replace(input_tag, output_tag) for column in columns]
+     return pd.DataFrame(columns=output_columns, data=aggregation, index=groups)
+
+
+ def aggregate_unique_groups(
+     table: pd.DataFrame,
+     group_by: str,
+     columns_to_aggregate: Union[str, Iterable],
+     condenser: Callable,
+     is_sorted: bool,
+ ) -> tuple[np.ndarray, np.ndarray]:
+     """Aggregates column(s) by applying a condenser function to unique groups.
+
+     The function returns two arrays containing the aggregated values and the
+     corresponding group names. This function can be used, for example, to summarize
+     data from an ion table to a peptide, protein or modification table. Suitable
+     condenser functions can be found in the module msreport.aggregate.condense.
+
+     Args:
+         table: The input dataframe used for aggregating on unique groups.
+         group_by: The name of the column used to determine unique groups for
+             aggregation.
+         columns_to_aggregate: A column or a list of columns, which will be passed to
+             the condenser function for applying an aggregation to each unique group.
+         condenser: Function that is applied to each group for generating the
+             aggregation result. If multiple columns are specified for aggregation,
+             the input array for the condenser function will be two-dimensional, with
+             the first dimension corresponding to rows and the second to columns, e.g.
+             an array with 3 rows and 2 columns: np.array([[1, 'a'], [2, 'b'], [3, 'c']])
+         is_sorted: Indicates whether the input dataframe is already sorted with
+             respect to the 'group_by' column.
+
+     Returns:
+         Two numpy arrays; the first contains the aggregation results of each unique
+         group and the second contains the corresponding group names.
+     """
+     group_start_indices, group_names, table = _prepare_grouping_indices(
+         table, group_by, is_sorted
+     )
+     array = table[columns_to_aggregate].to_numpy()
+     aggregation_result = np.array(
+         [condenser(i) for i in np.split(array, group_start_indices[1:])]
+     )
+     return aggregation_result, group_names
+
+
+ def _prepare_grouping_indices(
+     table: pd.DataFrame, group_by: str, is_sorted: bool
+ ) -> tuple[np.ndarray, np.ndarray, pd.DataFrame]:
+     """Prepares start indices and names of unique groups from a sorted dataframe.
+
+     Args:
+         table: The input DataFrame used for generating unique groups.
+         group_by: The name of the column used to determine unique groups.
+         is_sorted: If True, the input DataFrame is assumed to be already sorted with
+             respect to the 'group_by' column. Otherwise, the input DataFrame is
+             sorted by the 'group_by' column and the sorted DataFrame is returned.
+
+     Returns:
+         A tuple containing the following three elements:
+         - A numpy array containing the start indices of each unique group
+         - A numpy array containing the names of each unique group
+         - The input DataFrame sorted by the 'group_by' column, if it was not already
+           sorted.
+     """
+     if not is_sorted:
+         table = table.sort_values(by=group_by)
+     group_names, group_start_indices, group_lengths = np.unique(
+         table[group_by], return_counts=True, return_index=True
+     )
+     return group_start_indices, group_names, table
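
aggregate_unique_groups also accepts arbitrary condenser callables, which is how custom per-group summaries beyond the wrappers above can be built. A hedged sketch using np.nanmean as the condenser (any function mapping a group's value array to a result works), assuming msreport 0.0.24 is installed:

    import numpy as np
    import pandas as pd

    table = pd.DataFrame(
        {
            "ID": ["B", "A", "A", "B"],
            "Intensity": [1.0, 2.0, 4.0, 3.0],
        }
    )
    # With a single input column the condenser receives a 1-D array per group;
    # the table is sorted by "ID" internally because is_sorted=False.
    values, groups = aggregate_unique_groups(
        table,
        group_by="ID",
        columns_to_aggregate="Intensity",
        condenser=np.nanmean,
        is_sorted=False,
    )
    # groups -> array(['A', 'B'], dtype=object); values -> array([3., 2.])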