msreport 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msreport/qtable.py ADDED
@@ -0,0 +1,537 @@
1
+ from __future__ import annotations
2
+ from typing import Any, Optional
3
+ import os
4
+ import warnings
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ import yaml
9
+
10
+ import msreport.helper as helper
11
+
12
+
13
class Qtable:
    """Stores and provides access to quantitative proteomics data in a tabular form.

    Qtable contains proteomics data in a tabular form, which is stored as 'qtable.data',
    and an experimental design table, stored in 'qtable.design'. Columns from
    'qtable.data' can directly be accessed by indexing with [], column values can be set
    with [], and the 'in' operator can be used to check whether a column is present in
    'qtable.data', e.g. 'qtable[key]', 'qtable[key] = value', 'key in qtable'.

    Attributes:
        data: A pandas.DataFrame containing quantitative proteomics data.
        design: A pandas.DataFrame describing the experimental design.
    """

    # Columns every design table must provide; enforced by add_design().
    _REQUIRED_DESIGN_COLUMNS = ("Experiment", "Sample", "Replicate")

    def __init__(self, data: pd.DataFrame, design: Optional[pd.DataFrame] = None):
        """Initializes the Qtable.

        If data does not contain a "Valid" column, this column is added and all its row
        values are set to True. If no design is specified, an empty design table with
        the required columns is created, so that all design-dependent methods remain
        usable instead of failing on a missing attribute.

        Args:
            data: A dataframe containing quantitative proteomics data in a wide format.
                The dataframe is copied, the passed object is not modified.
            design: A dataframe describing the experimental design that must at least
                contain the columns "Experiment", "Sample", and "Replicate". The
                "Sample" entries should correspond to the sample names present in the
                quantitative columns of the data.

        Raises:
            ValueError: If a design is specified that lacks a required column.
        """
        self.design: pd.DataFrame
        self.data: pd.DataFrame

        self.data = data.copy()
        if "Valid" not in self.data.columns:
            self.data["Valid"] = True

        self._expression_columns: list[str] = []
        self._expression_features: list[str] = []
        self._expression_sample_mapping: dict[str, str] = {}

        if design is not None:
            self.add_design(design)
        else:
            # Guarantee that 'self.design' always exists as a dataframe.
            self.design = pd.DataFrame(columns=list(self._REQUIRED_DESIGN_COLUMNS))

    def __getitem__(self, key: Any) -> pd.DataFrame:
        """Evaluation of self.data[key]"""
        return self.data[key]

    def __setitem__(self, key: Any, value: Any):
        """Item assignment of self.data[key]"""
        self.data[key] = value

    def __contains__(self, key: Any) -> bool:
        """True if key is in the info axis of self.data"""
        return key in self.data

    def add_design(self, design: pd.DataFrame) -> None:
        """Adds an experimental design table.

        Args:
            design: A dataframe describing the experimental design that must at least
                contain the columns "Experiment", "Sample", and "Replicate". The
                "Sample" entries should correspond to the sample names present in the
                quantitative columns of the table. The dataframe is copied.

        Raises:
            ValueError: If one of the required columns is missing from 'design'.
        """
        columns = design.columns.tolist()
        required_columns = list(self._REQUIRED_DESIGN_COLUMNS)
        if not all(c in columns for c in required_columns):
            exception_message = "".join(
                [
                    "The design table must at least contain the columns: ",
                    ", ".join(f'"{c}"' for c in required_columns),
                    ". It only contains the columns: ",
                    ", ".join(f'"{c}"' for c in columns),
                    ".",
                ]
            )
            raise ValueError(exception_message)
        self.design = design.copy()

    def get_data(self, exclude_invalid: bool = False) -> pd.DataFrame:
        """Returns a copy of the data table.

        Args:
            exclude_invalid: Optional, if true the returned dataframe is filtered by
                the "Valid" column. Default false.

        Returns:
            A copy of the qtable.data dataframe.
        """
        data = self.data.copy()
        if exclude_invalid:
            data = _exclude_invalid(data)
        return data

    def get_design(self) -> pd.DataFrame:
        """Returns a copy of the design table."""
        return self.design.copy()

    def get_samples(self, experiment: Optional[str] = None) -> list[str]:
        """Returns a list of samples present in the design table.

        Args:
            experiment: If specified, only samples from this experiment are returned.

        Returns:
            A list of sample names.
        """
        # Read-only access; filtering and tolist() create new objects, so copying
        # the whole design table first is not necessary.
        design = self.design
        if experiment is not None:
            samples = design[design["Experiment"] == experiment]["Sample"]
        else:
            samples = design["Sample"]
        return samples.tolist()

    def get_experiment(self, sample: str) -> str:
        """Looks up the experiment of the specified sample from the design table.

        Args:
            sample: A sample name.

        Returns:
            An experiment name. If the sample occurs multiple times in the design,
            the experiment of the first matching row is returned.

        Raises:
            IndexError: If the sample is not present in the design table.
        """
        design = self.design
        experiment = design[design["Sample"] == sample]["Experiment"].values[0]
        return experiment

    def get_experiments(self, samples: Optional[list[str]] = None) -> list[str]:
        """Returns a list of experiments present in the design table.

        Args:
            samples: If specified, the experiment of each of these samples is returned,
                in the order of 'samples' and without removing duplicates.

        Returns:
            A list of experiments names. If 'samples' is not specified, the unique
            experiment names from the design table.
        """
        if samples is not None:
            return [self.get_experiment(sample) for sample in samples]
        return self.design["Experiment"].unique().tolist()

    def get_expression_column(self, sample: str) -> str:
        """Returns the expression column associated with a sample.

        Args:
            sample: A sample name.

        Returns:
            The name of the expression column associated with the sample, or an empty
            string if no expression column is registered for the sample.
        """
        # The mapping is stored as column -> sample, invert it for the lookup.
        sample_to_column = {
            mapped_sample: column
            for column, mapped_sample in self._expression_sample_mapping.items()
        }
        return sample_to_column.get(sample, "")

    def make_sample_table(
        self,
        tag: str,
        samples_as_columns: bool = False,
        exclude_invalid: bool = False,
    ) -> pd.DataFrame:
        """Returns a new dataframe with sample columns containing the 'tag'.

        Args:
            tag: Substring that must be present in selected columns.
            samples_as_columns: If true, replaces expression column names with
                sample names. Requires that the experimental design is set.
            exclude_invalid: Optional, if true the returned dataframe is filtered by
                the "Valid" column. Default false.

        Returns:
            A new dataframe generated from self.data, containing only the sample
            columns that also contain the specified 'tag'.
        """
        samples = self.get_samples()
        columns = helper.find_sample_columns(self.data, tag, samples)
        table = self.get_data(exclude_invalid=exclude_invalid)[columns]
        if samples_as_columns:
            mapping = _str_to_substr_mapping(columns, samples)
            table = table.rename(columns=mapping)
        return table

    def make_expression_table(
        self,
        samples_as_columns: bool = False,
        features: Optional[list[str]] = None,
        exclude_invalid: bool = False,
    ) -> pd.DataFrame:
        """Returns a new dataframe containing the expression columns.

        Args:
            samples_as_columns: If true, replaces expression column names with
                sample names. Requires that the experimental design is set.
            features: A list of additional columns that will be added from qtable.data
                to the newly generated dataframe.
            exclude_invalid: Optional, if true the returned dataframe is filtered by
                the "Valid" column. Default false.

        Returns:
            A copy of the qtable.data dataframe that only contains expression columns
            and additionally specified columns.
        """
        columns = list(self._expression_columns)
        if features is not None:
            columns.extend(features)

        table = self.get_data(exclude_invalid=exclude_invalid)[columns]
        if samples_as_columns:
            table = table.rename(columns=self._expression_sample_mapping)
        return table

    def set_expression_by_tag(
        self, tag: str, zerotonan: bool = False, log2: bool = False
    ) -> None:
        """Selects and sets expression columns from those that contain the 'tag'.

        A copy of all identified expression columns is generated and columns are renamed
        to "Expression sample_name". Only columns containing a sample name that is
        present in qtable.design are selected as expression columns. For all samples
        present in qtable.design an expression column must be present in qtable.data.
        When this method is called, previously generated expression columns and
        expression features are deleted.

        Args:
            tag: Columns that contain 'tag' as a substring are selected as potential
                expression columns.
            zerotonan: If true, zeros in expression columns are replaced by NaN.
            log2: If true, expression column values are log2 transformed and zeros are
                replaced by NaN. Evaluates whether intensities are likely to be already
                in log-space, which prevents another log2 transformation.

        Raises:
            KeyError: If no column could be assigned to a sample from the design.
            ValueError: If not every sample from the design has an expression column.
        """
        columns = helper.find_columns(self.data, tag, must_be_substring=True)
        samples_from_design = self.get_samples()
        column_mapping = {}
        for column in columns:
            # What remains after removing the tag is interpreted as the sample name.
            sample = column.replace(tag, "").strip()
            if sample in samples_from_design:
                column_mapping[column] = sample
        self._set_expression(column_mapping, zerotonan=zerotonan, log2=log2)

    def set_expression_by_column(
        self,
        columns_to_samples: dict[str, str],
        zerotonan: bool = False,
        log2: bool = False,
    ) -> None:
        """Sets as expression columns by using the keys from 'columns_to_samples'.

        Generates a copy of all specified expression columns and renames them to
        "Expression sample_name", according to the 'columns_to_samples' mapping. When
        this method is called, previously generated expression columns and expression
        features are deleted.

        Args:
            columns_to_samples: Mapping of expression columns to sample names. The keys
                of the dictionary must correspond to columns of the proteomics data and
                are used to identify expression columns. The value of each expression
                column specifies the sample name and must correspond to an entry of the
                experimental design table.
            zerotonan: If true, zeros in expression columns are replaced by NaN.
            log2: If true, expression column values are log2 transformed and zeros are
                replaced by NaN. Evaluates whether intensities are likely to be already
                in log-space, which prevents another log2 transformation.

        Raises:
            KeyError: If the mapping is empty or contains columns missing in the data.
            ValueError: If the specified samples and the samples from the design table
                do not match.
        """
        self._set_expression(columns_to_samples, zerotonan=zerotonan, log2=log2)

    def add_expression_features(self, expression_features: pd.DataFrame) -> None:
        """Adds expression features as new columns to the proteomics data.

        Columns of qtable.data that have the same name as a column of
        'expression_features' are replaced. The passed object is not modified.

        Args:
            expression_features: dataframe or Series that will be added to qtable.data
                as new columns, column names are added to the list of expression
                features. The number and order of rows in 'expression_features' must
                correspond to qtable.data.
        """
        assert isinstance(expression_features, (pd.DataFrame, pd.Series))
        assert self.data.shape[0] == expression_features.shape[0]

        if isinstance(expression_features, pd.Series):
            expression_features = expression_features.to_frame()
        else:
            # Work on a copy, the index is replaced below and this must not leak
            # back into the dataframe owned by the caller.
            expression_features = expression_features.copy()

        # Drop columns that will be replaced while keeping the original column order.
        old_columns = self.data.columns.difference(expression_features.columns)
        old_columns = self.data.columns[self.data.columns.isin(old_columns)]
        self.data = self.data[old_columns]

        # Adopt index to assure row by row joining, assumes identical order of entries
        expression_features.index = self.data.index
        self.data = self.data.join(expression_features, how="left")

        self._expression_features.extend(
            expression_features.columns.difference(self._expression_features)
        )

    def save(self, directory: str, basename: str):
        """Save qtable to disk, creating a data, design, and config file.

        Saving the qtable will generate three files, each starting with the specified
        basename, followed by an individual extension. The generated files are:
        "basename.data.tsv", "basename.design.tsv" and "basename.config.yaml"

        Args:
            directory: The path of the directory where to save the generated files.
            basename: Basename of files that will be generated.
        """
        filepaths = _get_qtable_export_filepaths(directory, basename)

        config_data = {
            "Expression columns": self._expression_columns,
            "Expression features": self._expression_features,
            "Expression sample mapping": self._expression_sample_mapping,
            # Column dtypes are stored to allow restoring them when loading.
            "Data dtypes": self.data.dtypes.astype(str).to_dict(),
        }
        with open(filepaths["config"], "w") as openfile:
            yaml.safe_dump(config_data, openfile)
        self.data.to_csv(filepaths["data"], sep="\t", index=True)
        self.design.to_csv(filepaths["design"], sep="\t", index=True)

    @classmethod
    def load(cls, directory: str, basename: str) -> "Qtable":
        """Load a qtable from disk by reading a data, design, and config file.

        Loading a qtable will first import the three files generated during saving, then
        create and configure a new qtable instance. Each of the filename starts with the
        specified basename, followed by an individual extension. The loaded files are:
        "basename.data.tsv", "basename.design.tsv" and "basename.config.yaml"

        Args:
            directory: The path of the directory where saved qtable files are located.
            basename: Basename of saved files.

        Returns:
            An instance of the class this method is called on, loaded from the
            specified files.
        """
        filepaths = _get_qtable_export_filepaths(directory, basename)
        with open(filepaths["config"]) as openfile:
            config_data = yaml.safe_load(openfile)

        dtypes = config_data["Data dtypes"]
        data = _read_csv_str_safe(filepaths["data"], dtypes, sep="\t", index_col=0)
        design = pd.read_csv(
            filepaths["design"], sep="\t", index_col=0, keep_default_na=True
        )

        # Instantiate via 'cls' so that subclasses are restored as their own type.
        qtable = cls(data, design)
        qtable._expression_columns = config_data["Expression columns"]
        qtable._expression_features = config_data["Expression features"]
        qtable._expression_sample_mapping = config_data["Expression sample mapping"]
        return qtable

    def to_tsv(self, path: str, index: bool = False):
        """Writes the data table to a .tsv (tab-separated values) file."""
        warnings.warn(
            "This function is deprecated, use Qtable.save() instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        self.data.to_csv(path, sep="\t", index=index)

    def to_clipboard(self, index: bool = False):
        """Writes the data table to the system clipboard."""
        self.data.to_clipboard(sep="\t", index=index)

    def copy(self) -> "Qtable":
        """Returns a copy of this Qtable instance."""
        # not tested #
        return self.__copy__()

    def _set_expression(
        self,
        columns_to_samples: dict[str, str],
        zerotonan: bool = False,
        log2: bool = False,
    ) -> None:
        """Defines expression columns and deletes previous expression features.

        Generates a copy of all specified expression columns and renames them to
        "Expression sample_name", according to the 'columns_to_samples' mapping.
        The new expression data is prepared completely before previous expression
        columns are removed; this allows using existing expression columns as the
        source and leaves the table untouched if the preparation fails.

        Args:
            columns_to_samples: Mapping of expression columns to sample names. The keys
                of the dictionary must correspond to columns of self.data, the values
                specify the sample name and must correspond to entries in
                self.design["Sample"].
            zerotonan: If true, zeros in expression columns are replaced by NaN.
            log2: If true, expression column values are log2 transformed and zeros are
                replaced by NaN. Evaluates whether intensities are likely to be already
                in log-space, which prevents another log2 transformation.

        Raises:
            KeyError: If the mapping is empty or if not all specified columns are
                present in self.data.
            ValueError: If the specified samples and the samples from the design table
                do not match.
        """
        expression_columns = list(columns_to_samples.keys())
        samples = list(columns_to_samples.values())
        data_columns = self.data.columns.tolist()
        # Look up the design samples only once, each call builds a new list.
        design_samples = self.get_samples()

        if not expression_columns:
            raise KeyError("No expression columns matched in qtable")
        if not all(e in data_columns for e in expression_columns):
            exception_message = (
                f"Not all specified columns {expression_columns} are present in the"
                " qtable"
            )
            raise KeyError(exception_message)
        if not all(s in design_samples for s in samples):
            exception_message = (
                f"Not all specified samples {samples} are present in the qtable.design"
            )
            raise ValueError(exception_message)
        if not all(s in samples for s in design_samples):
            exception_message = (
                "Not all samples from qtable.design are also present in the specified"
                " samples."
            )
            raise ValueError(exception_message)

        new_column_names = [f"Expression {sample}" for sample in samples]
        new_sample_mapping = dict(zip(new_column_names, samples))

        # Extract the source columns before resetting, they might be previously
        # generated expression columns that are removed by the reset.
        expression_data = self.data[expression_columns].copy()
        expression_data.columns = new_column_names

        if zerotonan or log2:
            expression_data = expression_data.replace({0: np.nan})
        if log2:
            if helper.intensities_in_logspace(expression_data):
                warnings.warn(
                    (
                        "Prevented log2 transformation of intensities that "
                        "appear to be already in log space."
                    ),
                    UserWarning,
                    stacklevel=2,
                )
            else:
                expression_data = np.log2(expression_data)

        self._reset_expression()
        self._expression_columns = new_column_names
        self._expression_sample_mapping = new_sample_mapping
        self.data[new_column_names] = expression_data

    def _reset_expression(self) -> None:
        """Removes previously added expression and expression feature columns."""
        registered = set(self._expression_columns) | set(self._expression_features)
        no_expression_columns = [
            col for col in self.data.columns if col not in registered
        ]
        self.data = self.data[no_expression_columns]
        self._expression_columns = []
        self._expression_features = []
        self._expression_sample_mapping = {}

    def __copy__(self) -> "Qtable":
        """Returns a new instance with copied data, design, and private attributes."""
        # not tested #
        # The constructor already copies the data and the design table.
        new_instance = type(self)(self.data, self.design)
        # Copy all private, non-callable instance attributes
        for attr, value in vars(self).items():
            if (
                not callable(value)
                and attr.startswith("_")
                and not attr.startswith("__")
            ):
                setattr(new_instance, attr, value.copy())
        return new_instance
491
+
492
+
493
+ def _exclude_invalid(df: pd.DataFrame) -> pd.DataFrame:
494
+ """Returns a filterd dataframe only containing valid entries.
495
+
496
+ Returns:
497
+ A copy of the dataframe that is filtered according to the boolean values in the
498
+ column "Valid".
499
+ """
500
+ if "Valid" not in df:
501
+ raise KeyError("'Valid' column not present in qtable")
502
+ return df[df["Valid"]].copy()
503
+
504
+
505
+ def _str_to_substr_mapping(strings, substrings) -> dict[str, str]:
506
+ """Mapping of strings to substrings.
507
+
508
+ Strings point to a matching substring. If multiple substrings are found in a string,
509
+ only one is reported.
510
+ """
511
+ mapping = dict()
512
+ for sub in substrings:
513
+ mapping.update({s: sub for s in strings if sub in s})
514
+ return mapping
515
+
516
+
517
+ def _get_qtable_export_filepaths(directory: str, name: str):
518
+ """Returns a dictionary of standard filepaths for loading and saving a qtable."""
519
+ filenames = {
520
+ "data": f"{name}.data.tsv",
521
+ "design": f"{name}.design.tsv",
522
+ "config": f"{name}.config.yaml",
523
+ }
524
+ filepaths = {k: os.path.join(directory, fn) for k, fn in filenames.items()}
525
+ return filepaths
526
+
527
+
528
+ def _read_csv_str_safe(filepath: str, dtypes: dict[str, str], **kwargs):
529
+ """Uses pands.read_csv to read a csv file and preserves empty strings."""
530
+ converters = {}
531
+ dtypes_used = {}
532
+ for column, dtype in dtypes.items():
533
+ if dtype in ["object", "o"]:
534
+ converters[column] = lambda x: "" if x == "" else x
535
+ else:
536
+ dtypes_used[column] = dtype
537
+ return pd.read_csv(filepath, dtype=dtypes_used, converters=converters, **kwargs)