msreport 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,339 @@
1
+ import itertools
2
+ from typing import Callable
3
+ import warnings
4
+
5
+ import numpy as np
6
+
7
+ import msreport.helper
8
+
9
+
10
def calculate_pairwise_log_ratio_matrix(
    array: np.ndarray, log_transformed: bool = False
) -> np.ndarray:
    """Builds, for every row, the log ratios between all pairs of columns.

    Args:
        array: A two-dimensional numpy array, with the first dimension corresponding
            to rows and the second dimension to columns.
        log_transformed: If True, 'array' is used as is and is expected to already
            contain log transformed intensities. If False, zeros are replaced by NaN
            and the values are log2 transformed before ratios are calculated.

    Returns:
        A 3-dimensional numpy array of shape (n, i, i), where n is the number of rows
        and i the number of columns of the input array. The entry at position
        [n, a, b] is 'log value of column a - log value of column b' for row n.
        Entries that are not finite are reported as NaN.

    Example:
        >>> array = np.array(
        ...     [
        ...         [4.0, 4.0, 8.0],
        ...         [8.0, 9.0, np.nan],
        ...     ]
        ... )
        >>> calculate_pairwise_log_ratio_matrix(array)
        array([[[ 0.      ,  0.      , -1.      ],
                [ 0.      ,  0.      , -1.      ],
                [ 1.      ,  1.      ,  0.      ]],
        <BLANKLINE>
               [[ 0.      , -0.169925,       nan],
                [ 0.169925,  0.      ,       nan],
                [      nan,       nan,       nan]]])
    """
    # Integer input is promoted to float so that NaN can be stored; any other
    # dtype is copied so that the caller's array is never modified.
    is_integer_input = np.issubdtype(array.dtype, np.integer)
    values = array.astype(float) if is_integer_input else array.copy()

    if not log_transformed:
        # A zero intensity is treated as missing instead of producing -inf.
        values[values == 0] = np.nan
        values = np.log2(values)

    # Broadcasting yields pairwise[n, a, b] = values[n, a] - values[n, b].
    pairwise = values[:, :, np.newaxis] - values[:, np.newaxis, :]
    pairwise[~np.isfinite(pairwise)] = np.nan
    return pairwise
55
+
56
+
57
def calculate_pairwise_median_log_ratio_matrix(
    array: np.ndarray, log_transformed: bool = False
) -> np.ndarray:
    """Calculates the median log ratio for every pair of columns.

    Thin wrapper that delegates to `_calculate_pairwise_centered_log_ratio_matrix`
    with `np.median` as the centering function.

    Args:
        array: A two-dimensional numpy array, with the first dimension corresponding
            to rows and the second dimension to columns.
        log_transformed: If True, the 'array' is expected to contain log transformed
            intensity values. If False, the array is expected to contain
            non-transformed intensity values, which are log2 transformed for the
            calculation of ratios.

    Returns:
        A square 2-dimensional numpy array of shape (i, i), where i is the number of
        columns of the input array, containing the pair-wise median log ratios.

    Example:
        >>> array = np.array(
        ...     [
        ...         [4.0, 4.0, 8.0],
        ...         [8.0, 9.0, np.nan],
        ...     ]
        ... )
        >>> calculate_pairwise_median_log_ratio_matrix(array)
        array([[ 0.       , -0.0849625, -1.       ],
               [ 0.0849625,  0.       , -1.       ],
               [ 1.       ,  1.       ,  0.       ]])
    """
    return _calculate_pairwise_centered_log_ratio_matrix(
        array, center_function=np.median, log_transformed=log_transformed
    )
90
+
91
+
92
def calculate_pairwise_mode_log_ratio_matrix(
    array: np.ndarray, log_transformed: bool = False
) -> np.ndarray:
    """Calculates the mode log ratio for every pair of columns.

    Thin wrapper that delegates to `_calculate_pairwise_centered_log_ratio_matrix`
    with `msreport.helper.mode` as the centering function.

    Args:
        array: A two-dimensional numpy array, with the first dimension corresponding
            to rows and the second dimension to columns.
        log_transformed: If True, the 'array' is expected to contain log transformed
            intensity values. If False, the array is expected to contain
            non-transformed intensity values, which are log2 transformed for the
            calculation of ratios.

    Returns:
        A square 2-dimensional numpy array of shape (i, i), where i is the number of
        columns of the input array, containing the pair-wise mode log ratios.

    Example:
        NOTE(review): this example is carried over unchanged from the previous
        documentation; its output depends on `msreport.helper.mode`, which is
        defined outside of this module and should be confirmed against it.

        >>> array = np.array(
        ...     [
        ...         [4.0, 4.0, 8.0],
        ...         [8.0, 9.0, np.nan],
        ...     ]
        ... )
        >>> calculate_pairwise_mode_log_ratio_matrix(array)
        array([[ 0.       , -0.0849625, -1.       ],
               [ 0.0849625,  0.       , -1.       ],
               [ 1.       ,  1.       ,  0.       ]])
    """
    return _calculate_pairwise_centered_log_ratio_matrix(
        array, center_function=msreport.helper.mode, log_transformed=log_transformed
    )
124
+
125
+
126
def prepare_coefficient_matrix(
    ratio_matrix: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Prepares coefficients, ratios, and initial row indices from a log ratio matrix.

    Dispatches on the number of dimensions of 'ratio_matrix': a 2-dimensional array
    is handled as one single ratio matrix, anything else as a stack of ratio
    matrices.

    Args:
        ratio_matrix: A numpy array containing one or multiple pair-wise ratio
            matrices. Each ratio matrix must be a square array, with a ratio at
            position (i, j) being calculated from an abundance table as
            'column i - column j'. Only the upper triangular part of each ratio
            matrix is used to generate the coefficient matrix. If 'ratio_matrix'
            contains multiple ratio matrices, the shape of the array has to be
            (n, i, i), where n is the number of ratio matrices and i is the number
            of rows and columns per ratio matrix. If only one ratio matrix is
            provided, the shape of the array has to be (i, i).

    Returns:
        A tuple containing the following three elements:
        - A coefficient matrix. 2d integer array with one row per pair-wise column
          combination of each ratio matrix and one column per column of the ratio
          matrix. Each row contains a 1 for the first and a -1 for the second column
          of the combination.
        - Ratios: 1d array containing the ratios from the ratio matrix, each entry
          corresponds to a row in the coefficient matrix.
        - Ratio matrix row indices: 1d array containing row indices that refer to
          the index of the first dimension of the 'ratio_matrix', and thus to row
          indices from the original table that was used to generate the
          'ratio_matrix'. If the 'ratio_matrix' has only 2 dimensions all values
          are zero.

    Example:
        >>> ratio_matrix = np.array(
        ...     [
        ...         [0.0, -0.1, -1.0],
        ...         [0.1, 0.0, -1.0],
        ...         [1.0, 1.0, 0.0],
        ...     ]
        ... )
        >>> prepare_coefficient_matrix(ratio_matrix)
        (array([[ 1, -1,  0],
               [ 1,  0, -1],
               [ 0,  1, -1]]),
         array([-0.1, -1. , -1. ]),
         array([0, 0, 0]))

    """
    if ratio_matrix.ndim == 2:
        return _coefficients_from_single_row_matrix(ratio_matrix)
    return _coefficients_from_multi_row_matrix(ratio_matrix)
177
+
178
+
179
def log_profiles_by_lstsq(
    coef_matrix: np.ndarray, ratio_array: np.ndarray
) -> np.ndarray:
    """Calculates estimated log abundance profiles by least-squares fitting.

    Rows that belong to a non-finite ratio are ignored. Coefficients (columns) that
    are not used by any of the remaining rows cannot be estimated and are reported
    as NaN.

    Args:
        coef_matrix: Two-dimensional numpy array representing the coefficients.
        ratio_array: One-dimensional numpy array representing the ratios, each entry
            corresponds to a row in the coefficient matrix.

    Returns:
        One-dimensional numpy array containing the estimated least-squares profile,
        with one entry per column of 'coef_matrix'. If no coefficient can be
        estimated, all entries are NaN.

    Example:
        >>> coef_matrix = np.array(
        ...     [
        ...         [1, -1, 0],
        ...         [1, 0, -1],
        ...         [0, 1, -1],
        ...     ]
        ... )
        >>> ratio_array = np.array([-0.1, -1.0, -1.0])
        >>> log_profiles_by_lstsq(coef_matrix, ratio_array)
        array([-0.36666667, -0.3       ,  0.66666667])
    """
    finite_rows = np.isfinite(ratio_array)
    coef_matrix = coef_matrix[finite_rows]
    ratio_array = ratio_array[finite_rows]

    # Start from an all-NaN profile; only coefficients that take part in at least
    # one remaining equation are overwritten with an estimate.
    log_profile = np.full(coef_matrix.shape[1], np.nan)
    present_coef = np.abs(coef_matrix).sum(axis=0) != 0
    if not present_coef.any():
        # Nothing to fit, avoid handing an empty system to the solver.
        return log_profile

    coef_estimates, *_ = np.linalg.lstsq(
        coef_matrix[:, present_coef], ratio_array, rcond=None
    )
    log_profile[present_coef] = coef_estimates
    return log_profile
214
+
215
+
216
def _calculate_pairwise_centered_log_ratio_matrix(
    array: np.ndarray, center_function: Callable, log_transformed: bool = False
) -> np.ndarray:
    """Calculates a pairwise, centered log2 ratio matrix from an intensity array.

    Args:
        array: A two-dimensional numpy array, with the first dimension corresponding
            to rows and the second dimension to columns.
        center_function: Function that is applied to the finite ratios of each
            pair-wise comparison of columns in the input array to calculate the
            centered ratio. It is called with a one-dimensional array, which can be
            empty if two columns share no row with a finite ratio.
        log_transformed: If True, the 'array' is expected to contain log transformed
            intensity values. If False, the array is expected to contain
            non-transformed intensity values, which are log2 transformed for the
            calculation of ratios.

    Returns:
        A square 2-dimensional numpy array of shape (i, i), where i is the number of
        columns of the input array. The entry at position (a, b) is the centered
        ratio of 'column a - column b', the entry at (b, a) is its negative, and the
        diagonal is zero.
    """
    # Note: Is currently tested only via the calculate_pairwise_median_log_ratio_matrix
    # and calculate_pairwise_mode_log_ratio_matrix functions.
    if np.issubdtype(array.dtype, np.integer):
        log_array = array.astype(float)
    else:
        log_array = array.copy()

    if not log_transformed:
        # A zero intensity is treated as missing instead of producing -inf.
        log_array[log_array == 0] = np.nan
        log_array = np.log2(log_array)

    num_cols = log_array.shape[1]
    # Must be initialised with zeros (not NaN): the lower triangle and the diagonal
    # stay zero so that the mirroring below works by plain subtraction.
    upper_triangle = np.zeros((num_cols, num_cols))
    for i, j in itertools.combinations(range(num_cols), 2):
        ratios = log_array[:, i] - log_array[:, j]
        with warnings.catch_warnings():
            # Silences RuntimeWarnings raised by the center function, for example
            # when it is applied to an empty array.
            warnings.simplefilter("ignore", category=RuntimeWarning)
            centered_ratio = center_function(ratios[np.isfinite(ratios)])
        upper_triangle[i, j] = centered_ratio

    # Generate a full, mirrored matrix where the lower triangle is the negated upper
    # triangle. The diagonal of 'upper_triangle' is never written and stays zero.
    return upper_triangle - upper_triangle.T
258
+
259
+
260
def _coefficients_from_single_row_matrix(ratio_matrix):
    """Calculates coefficients, ratios, and initial row indices for a single row matrix.

    Args:
        ratio_matrix: A numpy array containing one single pair-wise ratio matrix. The
            ratio matrix must be a square array, with a ratio at position (i, j)
            being calculated from an abundance table as 'column i - column j'. Only
            the part above the diagonal of the ratio matrix is used to generate the
            coefficient matrix.

    Returns:
        A tuple containing the following three elements:
        - A coefficient matrix. 2d integer array with one row per pair-wise column
          combination (i, j) with i < j and one column per column of the
          'ratio_matrix'. Each row contains a 1 at position i and a -1 at position j.
        - Ratios: 1d array containing the ratios from the ratio matrix, each entry
          corresponds to a row in the coefficient matrix.
        - Ratio matrix row indices: 1d array with equal length as the ratios array,
          containing all zero values. Is returned for consistency with the function
          `_coefficients_from_multi_row_matrix`.
    """
    num_coef = ratio_matrix.shape[1]
    # Index pairs above the diagonal in row-major order: (0, 1), (0, 2), ..., (1, 2)
    first_cols, second_cols = np.triu_indices(num_coef, k=1)
    num_pairs = first_cols.size
    pair_rows = np.arange(num_pairs)

    coef_matrix = np.zeros((num_pairs, num_coef), dtype=int)
    coef_matrix[pair_rows, first_cols] = 1
    coef_matrix[pair_rows, second_cols] = -1

    # Assign into a preallocated float array so the ratios are always float64.
    ratio_array = np.zeros(num_pairs)
    ratio_array[:] = ratio_matrix[first_cols, second_cols]

    idx_ratio_matrix_first_dimension = np.zeros(num_pairs, dtype=int)
    return coef_matrix, ratio_array, idx_ratio_matrix_first_dimension
295
+
296
+
297
def _coefficients_from_multi_row_matrix(ratio_matrix):
    """Calculates coefficients, ratios, and initial row indices for a multi row matrix.

    Args:
        ratio_matrix: A numpy array containing multiple pair-wise ratio matrices.
            Each ratio matrix must be a square array, with a ratio at position (i, j)
            being calculated from an abundance table as 'column i - column j'. Only
            the part above the diagonal of each ratio matrix is used to generate the
            coefficient matrix. The shape of 'ratio_matrix' must be (n, i, i), where
            n is the number of ratio matrices and i is the number of rows and columns
            per ratio matrix.

    Returns:
        A tuple containing the following three elements:
        - A coefficient matrix. 2d integer array with one row per ratio matrix and
          pair-wise column combination (i, j) with i < j, and one column per column
          of a ratio matrix. Rows are ordered by ratio matrix first and by column
          combination second. Each row contains a 1 at position i and a -1 at
          position j.
        - Ratios: 1d array containing the ratios from the ratio matrix, each entry
          corresponds to a row in the coefficient matrix.
        - Ratio matrix row indices: 1d array containing row indices that refer to
          the index of the first dimension of the 'ratio_matrix', and thus to row
          indices from the original table that was used to generate the
          'ratio_matrix'.
    """
    num_matrices = ratio_matrix.shape[0]
    num_coef = ratio_matrix.shape[1]
    # Index pairs above the diagonal in row-major order: (0, 1), (0, 2), ..., (1, 2)
    first_cols, second_cols = np.triu_indices(num_coef, k=1)
    num_pairs = first_cols.size
    pair_rows = np.arange(num_pairs)

    # The coefficient pattern is identical for every ratio matrix, so it is built
    # once and then stacked 'num_matrices' times.
    single_coef_matrix = np.zeros((num_pairs, num_coef), dtype=int)
    single_coef_matrix[pair_rows, first_cols] = 1
    single_coef_matrix[pair_rows, second_cols] = -1
    coef_matrix = np.tile(single_coef_matrix, (num_matrices, 1))

    # Selecting the pairs from all matrices gives shape (n, pairs); flattening in
    # row-major order keeps the ratios of one matrix together.
    ratio_array = np.zeros(num_pairs * num_matrices)
    ratio_array[:] = ratio_matrix[:, first_cols, second_cols].reshape(-1)

    idx_ratio_matrix_first_dimension = np.repeat(np.arange(num_matrices), num_pairs)
    return coef_matrix, ratio_array, idx_ratio_matrix_first_dimension
@@ -0,0 +1,267 @@
1
+ import re
2
+ from typing import Iterable, Union
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+
8
def guess_design(table: pd.DataFrame, tag: str) -> pd.DataFrame:
    """Extracts sample name, experiment, and replicate from specified sample columns.

    "Total" and "Combined", and their lower case variants, are not allowed as sample
    names and will be ignored.

    First a subset of columns containing a column tag are identified. Then sample names
    are extracted by removing the column tag from each column name. And finally, sample
    names are split into experiment and replicate at the last underscore.

    This requires that the naming of samples follows a specific convention. Sample names
    must begin with the experiment name, followed by an underscore and a unique
    identifier of the sample, for example the replicate number. The experiment name can
    also contain underscores, as it is split only by the last underscore.

    For example "ExpA_r1" would be split into experiment "ExpA" and replicate "r1",
    "Exp_A_1" would be experiment "Exp_A" and replicate "1". A sample name without any
    underscore is used as the experiment name and gets the replicate "-1".

    Args:
        table: Dataframe which columns are used for extracting sample names.
        tag: Column names containing the 'tag' are selected for sample extraction.
            Columns that are exactly equal to the 'tag' are not used.

    Returns:
        A dataframe containing the columns "Sample", "Experiment", and "Replicate"
    """
    sample_entries = []
    for column in find_columns(table, tag, must_be_substring=True):
        sample = column.replace(tag, "").strip()
        if sample.lower() in ("total", "combined"):
            continue

        experiment, separator, replicate = sample.rpartition("_")
        if not separator:
            # No underscore present: the whole name is the experiment and there
            # is no replicate identifier to extract.
            experiment, replicate = sample, "-1"
        elif not experiment:
            # Nothing in front of the last underscore, fall back to the sample name.
            experiment = sample
        sample_entries.append([sample, experiment, replicate])
    design = pd.DataFrame(sample_entries, columns=["Sample", "Experiment", "Replicate"])
    return design
45
+
46
+
47
def intensities_in_logspace(data: Union[pd.DataFrame, np.ndarray, Iterable]) -> bool:
    """Evaluates whether intensities are likely to be log transformed.

    Assumes that intensities are log transformed if all values are smaller or equal to
    64. Intensities values (and intensity peak areas) reported by tandem mass
    spectrometry typically range from 10^3 to 10^12. To reach log2 transformed values
    greater than 64, intensities would need to be higher than 10^19, which seems to be
    very unlikely to be ever encountered.

    Values that are not finite (NaN and infinity) are ignored. If no finite value is
    present, True is returned.

    Args:
        data: Dataset that contains only intensity values, can be any iterable,
            a numpy.array or a pandas.DataFrame, multiple dimensions or columns
            are allowed.

    Returns:
        True if intensity values in 'data' appear to be log transformed.
    """
    values = np.asarray(data, dtype=float)
    # Boolean indexing already returns a flat array of the finite values.
    finite_values = values[np.isfinite(values)]
    # Convert numpy.bool_ to a builtin bool to match the annotated return type.
    return bool(np.all(finite_values <= 64))
67
+
68
+
69
def rename_sample_columns(table: pd.DataFrame, mapping: dict[str, str]) -> pd.DataFrame:
    """Renames sample names in column names according to the mapping.

    For each column, the mapping keys are tried from the longest to the shortest and
    only the first key that is contained in the column name is replaced. This allows
    the use of a 'mapping' with keys that are substrings of other keys, as well as
    values that are substrings of any of the keys.

    Importantly, if the mapping keys (sample names) are substrings of other column names
    within the table, unintended renaming of those columns will occur. For instance,
    when renaming columns ["Abundance", "Intensity A"] with the mapping
    {"A": "Sample Alpha"}, the columns will be renamed to ["Sample Alphabundance",
    "Intensity Sample Alpha"].

    Args:
        table: Dataframe which columns will be renamed.
        mapping: A mapping of old to new sample names that will be used to replace
            matching substrings in the columns from table.

    Returns:
        A copy of the table with renamed columns.
    """
    keys_longest_first = sorted(mapping, key=len, reverse=True)

    def _rename(column_name):
        # Only the first (longest) matching key is applied to a column name.
        for old_name in keys_longest_first:
            if old_name in column_name:
                return column_name.replace(old_name, mapping[old_name])
        return column_name

    renamed_table = table.copy()
    renamed_table.columns = [_rename(column) for column in table.columns]
    return renamed_table
102
+
103
+
104
def rename_mq_reporter_channels(
    table: pd.DataFrame, channel_names: Iterable[str]
) -> None:
    """Renames reporter channel numbers with sample names.

    MaxQuant writes reporter channel names either in the format "Reporter intensity 1"
    or "Reporter intensity 1 Experiment Name", dependent on whether an experiment name
    was specified. Renames "Reporter intensity", "Reporter intensity count", and
    "Reporter intensity corrected" columns. The table is modified in place.

    NOTE: This might not work for the peptides.txt table, as there are columns present
    with the experiment name and also without it.

    Args:
        table: Dataframe which reporter intensity columns are renamed in place.
        channel_names: Sample names that replace the channel numbers. The names are
            assigned to the "Reporter intensity <number>" columns in the order in
            which these columns appear in the table.

    Raises:
        AssertionError: If the number of channel names does not equal the number of
            "Reporter intensity <number>" columns in the table.
    """
    # Materialize once, so that one-shot iterables can be counted and iterated.
    channel_names = list(channel_names)
    pattern = re.compile(r"Reporter intensity [0-9]+")
    reporter_columns = [col for col in table.columns if pattern.match(col)]
    if len(reporter_columns) != len(channel_names):
        # Raised explicitly instead of using an assert statement so that the check
        # is not removed when Python runs with -O; the exception type is kept for
        # backwards compatibility.
        raise AssertionError(
            f"Found {len(reporter_columns)} reporter intensity columns, but "
            f"{len(channel_names)} channel names were specified."
        )

    column_mapping = {}
    base_name = "Reporter intensity "
    for column, channel_name in zip(reporter_columns, channel_names):
        for tag in ["", "count ", "corrected "]:
            old_column = column.replace(base_name, f"{base_name}{tag}")
            new_column = f"{base_name}{tag}{channel_name}"
            column_mapping[old_column] = new_column
    table.rename(columns=column_mapping, inplace=True)
129
+
130
+
131
def apply_intensity_cutoff(
    table: pd.DataFrame, column_tag: str, threshold: float
) -> None:
    """Sets values below the threshold to NA.

    The table is modified in place.

    Args:
        table: Dataframe which intensity columns are modified.
        column_tag: Substring used to identify intensity columns from the 'table' to
            which the intensity cutoff is applied.
        threshold: Values below the threshold will be set to NA.
    """
    for intensity_column in find_columns(table, column_tag):
        below_threshold = table[intensity_column] < threshold
        table.loc[below_threshold, intensity_column] = np.nan
144
+
145
+
146
def find_columns(
    table: pd.DataFrame, substring: str, must_be_substring: bool = False
) -> list[str]:
    """Returns a list of column names containing the substring.

    Args:
        table: Columns of this dataframe are queried.
        substring: String that must be part of column names.
        must_be_substring: If True, column names that are exactly equal to the
            'substring' are not reported.

    Returns:
        A list of column names, in the order in which they appear in the table.
    """
    matched_columns = [column for column in table.columns if substring in column]
    if not must_be_substring:
        return matched_columns
    return [column for column in matched_columns if column != substring]
165
+
166
+
167
def find_sample_columns(
    table: pd.DataFrame, substring: str, samples: Iterable[str]
) -> list[str]:
    """Returns column names that contain the substring and any entry of 'samples'.

    Args:
        table: Columns of this dataframe are queried.
        substring: String that must be part of column names.
        samples: Strings from which at least one must be present in matched
            columns.

    Returns:
        A list of sample column names, in the order in which they appear in the
        table.
    """
    # Materialize once, 'samples' is checked against every candidate column and a
    # one-shot iterable would be exhausted after the first column.
    sample_names = list(samples)
    return [
        column
        for column in find_columns(table, substring)
        if any(sample in column for sample in sample_names)
    ]
186
+
187
+
188
def keep_rows_by_partial_match(
    table: pd.DataFrame, column: str, values: Iterable[str]
) -> pd.DataFrame:
    """Filter a table to keep only rows partially matching any of the specified values.

    Args:
        table: The input table that will be filtered.
        column: The name of the column in the 'table' which entries are checked for
            partial matches to the values. This column must have the datatype 'str'.
            Missing entries never match.
        values: An iterable of strings that are used to filter the table. Any of
            the specified values must have at least a partial match to an entry from
            the specified 'column' for a row to be kept in the filtered table. If
            'values' is empty, no row is kept.

    Returns:
        A new DataFrame containing only the rows that have a partial or complete match
        with any of the specified 'values'.

    Example:
        >>> df = pd.DataFrame({"Modifications": ["phos", "acetyl;phos", "acetyl"]})
        >>> keep_rows_by_partial_match(df, "Modifications", ["phos"])
          Modifications
        0          phos
        1   acetyl;phos
    """
    entries = table[column]
    # Start with "no row matches" and switch on every row that matches a value;
    # this also gives a well defined mask when 'values' is empty.
    target_mask = np.zeros(len(table), dtype=bool)
    for value in values:
        # na=False: missing entries count as "no match" instead of yielding NA.
        matches = entries.str.contains(value, regex=False, na=False)
        target_mask |= matches.to_numpy(dtype=bool)
    filtered_table = table[target_mask].copy()
    return filtered_table
216
+
217
+
218
def remove_rows_by_partial_match(
    table: pd.DataFrame, column: str, values: Iterable[str]
) -> pd.DataFrame:
    """Filter a table to remove rows partially matching any of the specified values.

    Args:
        table: The input table that will be filtered.
        column: The name of the column in the 'table' which entries are checked for
            partial matches to the values. This column must have the datatype 'str'.
            Missing entries never match and are therefore kept.
        values: An iterable of strings that are used to filter the table. Any of
            the specified values must have at least a partial match to an entry from
            the specified 'column' for a row to be removed in the filtered table. If
            'values' is empty, all rows are kept.

    Returns:
        A new DataFrame containing no rows that have a partial or complete match with
        any of the specified 'values'.

    Example:
        >>> df = pd.DataFrame({"Modifications": ["phos", "acetyl;phos", "acetyl"]})
        >>> remove_rows_by_partial_match(df, "Modifications", ["phos"])
          Modifications
        2        acetyl
    """
    entries = table[column]
    # Start with "no row matches" and switch on every row that matches a value;
    # this also gives a well defined mask when 'values' is empty.
    matched_mask = np.zeros(len(table), dtype=bool)
    for value in values:
        # na=False: missing entries count as "no match" instead of yielding NA.
        matches = entries.str.contains(value, regex=False, na=False)
        matched_mask |= matches.to_numpy(dtype=bool)
    filtered_table = table[~matched_mask].copy()
    return filtered_table
245
+
246
+
247
def join_tables(
    tables: Iterable[pd.DataFrame], reset_index: bool = False
) -> pd.DataFrame:
    """Returns a joined dataframe.

    Dataframes are merged iteratively on their index using an outer join, beginning with
    the first entry from 'tables'. Can only join dataframes with different columns.
    None of the dataframes in 'tables' is modified.

    Args:
        tables: Dataframes that will be merged together, must contain at least one
            dataframe.
        reset_index: If True, the index of the joined dataframe is reset.

    Returns:
        A merged dataframe. If 'tables' contains only one dataframe and 'reset_index'
        is False, this dataframe itself is returned.

    Raises:
        IndexError: If 'tables' is empty.
    """
    # Materialize once, so that any iterable can be indexed and sliced.
    tables = list(tables)
    merged_table = tables[0]
    for table in tables[1:]:
        merged_table = merged_table.join(table, how="outer")
    if reset_index:
        # Not done in place: with a single input table 'merged_table' is the
        # caller's dataframe, which must not be altered.
        merged_table = merged_table.reset_index()
    return merged_table