seer-pas-sdk 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,129 +1,3 @@
1
- import pandas as pd
2
-
3
-
4
class PlateMap:
    """
    Plate map object containing information about samples and corresponding MS data files.

    Each per-sample parameter is a sequence aligned by position with
    ``ms_file_name``. Parameters shorter than ``ms_file_name`` are padded
    with ``None``; longer ones are rejected.
    """

    # Attribute name -> column header. Insertion order is the column order
    # used by to_dict() / to_df() / to_csv().
    _COLUMNS = {
        "ms_file_name": "MS file name",
        "sample_name": "Sample name",
        "sample_id": "Sample ID",
        "well_location": "Well location",
        "nanoparticle": "Nanoparticle",
        "nanoparticle_id": "Nanoparticle ID",
        "control": "Control",
        "control_id": "Control ID",
        "instrument_name": "Instrument name",
        "date_sample_preparation": "Date sample preparation",
        "sample_volume": "Sample volume",
        "peptide_concentration": "Peptide concentration",
        "peptide_mass_sample": "Peptide mass sample",
        "dilution_factor": "Dilution factor",
        "kit_id": "Kit ID",
        "plate_id": "Plate ID",
        "plate_name": "Plate Name",
    }

    def __init__(
        self,
        ms_file_name=None,
        sample_name=None,
        sample_id=None,
        well_location=None,
        nanoparticle=None,
        nanoparticle_id=None,
        control=None,
        control_id=None,
        instrument_name=None,
        date_sample_preparation=None,
        sample_volume=None,
        peptide_concentration=None,
        peptide_mass_sample=None,
        dilution_factor=None,
        kit_id=None,
        plate_id=None,
        plate_name=None,
    ):
        """Store the per-sample parameters, padded to the number of MS files.

        Args:
            ms_file_name: Non-empty sequence of MS file names (required).
            Other parameters: Optional sequences, each no longer than
                ``ms_file_name``. Falsy values are treated as empty.

        Raises:
            ValueError: If ``ms_file_name`` is falsy, or if any parameter
                has more entries than ``ms_file_name``.
        """
        if not ms_file_name:
            raise ValueError("MS file name(s) must be provided.")

        self.length = len(ms_file_name)

        supplied = {
            "ms_file_name": ms_file_name,
            "sample_name": sample_name,
            "sample_id": sample_id,
            "well_location": well_location,
            "nanoparticle": nanoparticle,
            "nanoparticle_id": nanoparticle_id,
            "control": control,
            "control_id": control_id,
            "instrument_name": instrument_name,
            "date_sample_preparation": date_sample_preparation,
            "sample_volume": sample_volume,
            "peptide_concentration": peptide_concentration,
            "peptide_mass_sample": peptide_mass_sample,
            "dilution_factor": dilution_factor,
            "kit_id": kit_id,
            "plate_id": plate_id,
            "plate_name": plate_name,
        }

        self.__attrs = list(self._COLUMNS)
        self.__cols = list(self._COLUMNS.values())

        for attr in self.__attrs:
            # Copy into a fresh list so padding never mutates the caller's
            # object and so tuples (or other iterables) are accepted.
            values = list(supplied[attr] or [])

            if len(values) > self.length:
                raise ValueError(
                    "Parameter lengths must not exceed the number of MS files."
                )

            values.extend([None] * (self.length - len(values)))
            setattr(self, attr, values)

    def to_dict(self):
        """Return ``{column header: {row index: value}}`` for every column."""
        return {
            column: dict(enumerate(getattr(self, attr)))
            for attr, column in zip(self.__attrs, self.__cols)
        }

    def to_df(self):
        """Return the plate map as a pandas DataFrame (one row per MS file)."""
        return pd.DataFrame(self.to_dict())

    def to_csv(self, path=None):
        """Serialize the plate map as CSV without the index column.

        Args:
            path: Destination path. When falsy, the CSV text is returned
                instead of being written.
        """
        if not path:
            return self.to_df().to_csv(index=False)
        return self.to_df().to_csv(path_or_buf=path, index=False)

    def __repr__(self):
        return str(self.to_dict())
1
+ from .platemap import PlateMap
2
+ from .groupanalysis import GroupAnalysisPostData
3
+ from .volcanoplot import *
@@ -0,0 +1,30 @@
1
+ # Filter group analysis data for only the POST
2
class GroupAnalysisPostData:
    """DTO for Group Analysis Saved Results.

    Reads the ``"post"`` section of a saved group analysis result and keeps
    the section for whichever feature type has features: ``"protein"`` is
    preferred, ``"peptide"`` is the fallback.

    Attributes set on success:
        type: ``"protein"`` or ``"peptide"``.
        data: ``data["post"][type]["mergedStats"]``.
        stat_test: ``data["post"][type]["parameters"]["statTest"]``.
    """

    def __init__(self, data):
        """Select the populated feature section of ``data["post"]``.

        Args:
            data: Mapping containing a ``"post"`` key.

        Raises:
            ValueError: If ``"post"`` is missing, or if neither the protein
                nor the peptide section reports a positive ``totalFeature``.
            KeyError: If the selected section lacks ``"mergedStats"`` or
                ``"parameters"``/``"statTest"``.
        """
        if "post" not in data:
            raise ValueError('Invalid data format. Missing "post" key')

        # A null "post" (e.g. JSON null) is treated like an empty section so
        # it is reported as "no features" rather than an AttributeError.
        post = data["post"] or {}

        num_proteins = self._feature_count(post, "protein")
        num_peptides = self._feature_count(post, "peptide")

        if num_proteins > 0:
            self.type = "protein"
        elif num_peptides > 0:
            self.type = "peptide"
        else:
            raise ValueError(
                "Invalid data format. No features found in post data"
            )

        section = post[self.type]
        self.data = section["mergedStats"]
        self.stat_test = section["parameters"]["statTest"]

    @staticmethod
    def _feature_count(post, feature_type):
        """Return ``post[feature_type]["totalFeature"]``, or 0 when the
        section or the count is missing or null."""
        section = post.get(feature_type) or {}
        return section.get("totalFeature", 0) or 0
@@ -0,0 +1,174 @@
1
+ import pandas as pd
2
+
3
+
4
class PlateMap:
    """
    Plate map object containing information about samples and corresponding MS data files.

    Each per-sample parameter is a sequence aligned by position with
    ``ms_file_name``. Parameters shorter than ``ms_file_name`` are padded
    with ``None``; longer ones are rejected. ``product`` selects which
    attributes are exported and under which column headers.
    """

    # Attribute name -> column header used when product == "XT".
    _XT_COLUMNS = {
        "ms_file_name": "MS file name",
        "sample_name": "Sample name",
        "sample_id": "Sample ID",
        "well_location": "Well location",
        "nanoparticle": "Nanoparticle",
        "nanoparticle_id": "Nanoparticle ID",
        "control": "Control",
        "control_id": "Control ID",
        "instrument_name": "Instrument name",
        "date_sample_preparation": "Date sample preparation",
        "sample_volume": "Sample volume",
        "peptide_concentration": "Peptide concentration",
        "peptide_mass_sample": "Peptide mass sample",
        "recon_volume": "Recon volume",
        "dilution_factor": "Dilution factor",
        "kit_id": "Kit ID",
        "plate_id": "Plate ID",
        "plate_name": "Plate Name",
        "assay_version": "Assay",
    }

    # Attribute name -> column header used for every other product value.
    _OTHER_COLUMNS = {
        "ms_file_name": "MS file name",
        "sample_name": "Sample name",
        "sample_id": "Sample ID",
        "well_location": "Well location",
        "nanoparticle": "Nanoparticle set",
        "nanoparticle_id": "Nanoparticle set ID",
        "control_id": "Control ID",
        "instrument_name": "Instrument ID",
        "date_sample_preparation": "Date assay initiated",
        "sample_volume": "Sample volume",
        "peptide_concentration": "Reconstituted peptide concentration",
        "peptide_mass_sample": "Recovered peptide mass",
        "recon_volume": "Reconstitution volume",
        "plate_id": "Plate ID",
        "plate_name": "Plate Name",
        "assay_version": "Assay product",
        "sample_tube_id": "Sample tube ID",
        "method_set_id": "Method set ID",
        "assay_method_id": "Assay method ID",
    }

    # All per-sample attributes, in export order.
    _ATTRS = (
        "ms_file_name",
        "sample_name",
        "sample_id",
        "well_location",
        "nanoparticle",
        "nanoparticle_id",
        "control",
        "control_id",
        "instrument_name",
        "date_sample_preparation",
        "sample_volume",
        "peptide_concentration",
        "peptide_mass_sample",
        "recon_volume",
        "dilution_factor",
        "kit_id",
        "plate_id",
        "plate_name",
        "assay_version",
        "sample_tube_id",
        "method_set_id",
        "assay_method_id",
    )

    def __init__(
        self,
        ms_file_name=None,
        sample_name=None,
        sample_id=None,
        well_location=None,
        nanoparticle=None,
        nanoparticle_id=None,
        control=None,
        control_id=None,
        instrument_name=None,
        date_sample_preparation=None,
        sample_volume=None,
        peptide_concentration=None,
        peptide_mass_sample=None,
        recon_volume=None,
        dilution_factor=None,
        kit_id=None,
        plate_id=None,
        plate_name=None,
        assay_version=None,
        sample_tube_id=None,
        method_set_id=None,
        assay_method_id=None,
        product="XT",
    ):
        """Store the per-sample parameters, padded to the number of MS files.

        Args:
            ms_file_name: Non-empty sequence of MS file names (required).
            product: ``"XT"`` (default) selects the XT column set; any other
                value selects the alternative column set.
            Other parameters: Optional sequences, each no longer than
                ``ms_file_name``. Falsy values are treated as empty.

        Raises:
            ValueError: If ``ms_file_name`` is falsy, or if any parameter
                has more entries than ``ms_file_name``.
        """
        if not ms_file_name:
            raise ValueError("MS file name(s) must be provided.")

        self.length = len(ms_file_name)
        self.product = product

        supplied = {
            "ms_file_name": ms_file_name,
            "sample_name": sample_name,
            "sample_id": sample_id,
            "well_location": well_location,
            "nanoparticle": nanoparticle,
            "nanoparticle_id": nanoparticle_id,
            "control": control,
            "control_id": control_id,
            "instrument_name": instrument_name,
            "date_sample_preparation": date_sample_preparation,
            "sample_volume": sample_volume,
            "peptide_concentration": peptide_concentration,
            "peptide_mass_sample": peptide_mass_sample,
            "recon_volume": recon_volume,
            "dilution_factor": dilution_factor,
            "kit_id": kit_id,
            "plate_id": plate_id,
            "plate_name": plate_name,
            "assay_version": assay_version,
            "sample_tube_id": sample_tube_id,
            "method_set_id": method_set_id,
            "assay_method_id": assay_method_id,
        }

        # Per-instance copies so instances never share mutable state.
        if self.product == "XT":
            self.__map = dict(self._XT_COLUMNS)
        else:
            self.__map = dict(self._OTHER_COLUMNS)
        self.__attrs = list(self._ATTRS)

        for attr in self.__attrs:
            # Copy into a fresh list so padding never mutates the caller's
            # object and so tuples (or other iterables) are accepted.
            values = list(supplied[attr] or [])

            if len(values) > self.length:
                raise ValueError(
                    "Parameter lengths must not exceed the number of MS files."
                )

            values.extend([None] * (self.length - len(values)))
            setattr(self, attr, values)

    def to_dict(self):
        """Return ``{column header: {row index: value}}``.

        Only attributes present in the column map chosen by ``product`` are
        included, in attribute order.
        """
        return {
            self.__map[attr]: dict(enumerate(getattr(self, attr)))
            for attr in self.__attrs
            if attr in self.__map
        }

    def to_df(self):
        """Return the plate map as a pandas DataFrame (one row per MS file)."""
        return pd.DataFrame(self.to_dict())

    def to_csv(self, path=None):
        """Serialize the plate map as CSV without the index column.

        Args:
            path: Destination path. When falsy, the CSV text is returned
                instead of being written.
        """
        if not path:
            return self.to_df().to_csv(index=False)
        return self.to_df().to_csv(path_or_buf=path, index=False)

    def __repr__(self):
        return str(self.to_dict())
@@ -0,0 +1,290 @@
1
+ import numpy as np
2
+ import json
3
+ import pandas as pd
4
+ from typing import List as _List, Dict as _Dict
5
+ from .groupanalysis import GroupAnalysisPostData
6
+
7
+
8
class VolcanoPlotSettings:
    """Value holder for the three user-tunable Volcano Plot settings."""

    def __init__(
        self,
        significance_threshold: float = 0.05,
        fold_change_threshold: float = 1,
        label_by: str = "fold_change",
    ):
        """Validate ``label_by`` and store the settings.

        Args:
            significance_threshold (float, optional): p-value cutoff used to decide significance. Defaults to 0.05.
            fold_change_threshold (float, optional): Fold change cutoff used to decide significance. Defaults to 1.
            label_by (str, optional): Metric used to sort result data. Defaults to "fold_change".

        Raises:
            ValueError: When ``label_by`` is not 'euclidean', 'fold_change' or 'significance'.
        """
        allowed_labels = ("euclidean", "fold_change", "significance")
        label_is_valid = label_by in allowed_labels
        if not label_is_valid:
            raise ValueError(
                "Invalid label_by value, must be one of ['euclidean', 'fold_change', 'significance']"
            )

        self.label_by = label_by
        self.fold_change_threshold = fold_change_threshold
        self.significance_threshold = significance_threshold

    @property
    def values(self):
        """Current settings as a ``{setting name: value}`` dict."""
        return dict(
            significance_threshold=self.significance_threshold,
            fold_change_threshold=self.fold_change_threshold,
            label_by=self.label_by,
        )

    @classmethod
    def get_settings(cls):
        """Names of the supported settings, in declaration order."""
        names = ["significance_threshold", "fold_change_threshold"]
        names.append("label_by")
        return names

    @classmethod
    def get_label_by_map(cls):
        """Map each ``label_by`` option to the row key it sorts on."""
        return {
            "euclidean": "euclideanDistance",
            "fold_change": "logFD",
            "significance": "negativeLog10P",
        }
58
+
59
+
60
class VolcanoPlotBuilder:
    """
    Builder class for the Volcano Plot
    Can be used to reuse the same GroupAnalysisResults data to build multiple Volcano Plots with different settings.

    After construction (and after every ``update``), ``volcano_plot`` holds
    the sorted list of row dicts produced by ``build``.
    """

    PROTEIN_GROUP_INDEX = "pg"
    PEPTIDE_INDEX = "peptide"

    def __init__(
        self,
        data: _List[_Dict],
        significance_threshold: float = 0.05,
        fold_change_threshold: float = 1,
        label_by: str = "fold_change",
    ):
        """Initialize the VolcanoPlotBuilder object

        Args:
            data: The complete set of group analysis result data. It is passed
                unchanged to ``GroupAnalysisPostData``, which extracts the
                feature type, the statistical test and the result rows.
            significance_threshold (float, optional): Cutoff value for the p-value to determine significance. Defaults to 0.05.
            fold_change_threshold (float, optional): Cutoff value for the fold change to determine significance. Defaults to 1.
            label_by (str, optional): Metric to sort result data. Defaults to "fold_change".

        Raises:
            ValueError: "Invalid label_by value, must be one of ['euclidean', 'fold_change', 'significance']"

        Returns:
            None
        """

        self.settings = VolcanoPlotSettings(
            significance_threshold=significance_threshold,
            fold_change_threshold=fold_change_threshold,
            label_by=label_by,
        )

        parsed_data = GroupAnalysisPostData(data)

        self.type = parsed_data.type
        self.stat_test = parsed_data.stat_test
        self.data = parsed_data.data
        # Significance cutoff expressed on the -log10(p) axis of the plot.
        self.minusLog10PSigValue = -np.log10(
            self.settings.significance_threshold
        )
        self.sort_param = VolcanoPlotSettings.get_label_by_map()[
            self.settings.label_by
        ]
        self.max_logFD, self.max_negative_log10_p = self._get_max_values(
            self.data
        )
        # Filled as a side effect of build_row: feature id -> gene.
        self.protein_gene_map = dict()
        self.feature_type_index = (
            self.PROTEIN_GROUP_INDEX
            if self.type == "protein"
            else self.PEPTIDE_INDEX
        )
        self.volcano_plot = self.build()

    def build(self):
        """Build the volcano plot

        Returns:
            list[dict]: volcano plot rows sorted in descending order of the
            ``label_by`` metric (absolute value when sorting by ``logFD``).
        """
        rows = [self.build_row(i, row) for i, row in enumerate(self.data)]

        sort_param = self.sort_param
        if sort_param == "logFD":
            # Fold change is ranked by magnitude, regardless of direction.
            def sort_key(row):
                return np.abs(row[sort_param])

        else:

            def sort_key(row):
                return row[sort_param]

        return sorted(rows, key=sort_key, reverse=True)

    def build_row(self, i, data):
        """Build a row for the volcano plot

        Also records ``data["gene"]`` in ``protein_gene_map`` under the
        row's feature identifier.

        Args:
            i (int): The index of the row
            data (dict): a group analysis result entry

        Returns:
            dict: The row data
        """
        feature_id = data[self.feature_type_index]
        self.protein_gene_map[feature_id] = data["gene"]

        row = dict(
            logFD=data["logFD"],
            negativeLog10P=data["negativeLog10P"],
            dataIndex=i,
            rowID=json.dumps(data),
            gene=data["gene"],
            group=self.get_contrast_group_string(data),
            significant=self.get_significance_class(data),
            euclideanDistance=self.calculate_euclidean_distance(
                self._normalize(data["logFD"], self.max_logFD),
                self._normalize(
                    data["negativeLog10P"], self.max_negative_log10_p
                ),
            ),
        )
        row[self.type] = feature_id
        return row

    @staticmethod
    def _normalize(value, maximum):
        """Scale ``value`` by ``maximum``; a zero maximum yields 0.0 instead
        of raising ZeroDivisionError."""
        if maximum == 0:
            return 0.0
        return value / maximum

    def is_significant_point(self, data):
        """Return whether a row passes both the p-value and fold change cutoffs."""
        return (
            data["negativeLog10P"] >= self.minusLog10PSigValue
            and np.abs(data["logFD"]) >= self.settings.fold_change_threshold
        )

    def get_significance_class(self, data):
        """Get the significance class

        Args:
            data (dict): the row data

        Returns:
            int: 0 if not significant; otherwise 1 for a positive logFD and
            -1 for a negative logFD.
        """
        if not self.is_significant_point(data):
            return 0
        # The direction comes from the sign of logFD. The magnitude test
        # against the configured fold change threshold was already done in
        # is_significant_point, so a fixed +/-1 cutoff must not be re-applied
        # here (it would return None for thresholds below 1).
        if data["logFD"] > 0:
            return 1
        if data["logFD"] < 0:
            return -1
        return 0

    def get_contrast_group_string(self, obj):
        """Get the contrast group string

        Args:
            obj (dict): The row data

        Returns:
            str: ``"<G1>/<G2>"`` from ``obj["contrastGroup"]``, or None when
            the contrast group or either of its members is missing or empty.
        """
        contrast_group = obj.get("contrastGroup", None) if obj else None
        if not contrast_group:
            return None

        g1 = contrast_group.get("G1", None)
        g2 = contrast_group.get("G2", None)
        if g1 and g2:
            return "/".join([g1, g2])
        return None

    def calculate_euclidean_distance(self, x, y):
        """Calculate the euclidean distance

        Args:
            x (float): The x value
            y (float): The y value

        Returns:
            float: The distance of the point (x, y) from the origin
        """
        return np.sqrt(x**2 + y**2)

    def _get_max_values(self, data):
        """For euclidean distance, get the max logFD and negativeLog10P values to normalize the data

        Args:
            data (list[dict]): The complete set of group analysis result data

        Returns:
            tuple: The max logFD and negativeLog10P values (both ``-inf``
            when ``data`` is empty).
        """
        # NOTE(review): the signed logFD maximum is used, not the maximum
        # magnitude; confirm this is the intended normalization when the
        # largest change is negative.
        max_logFD = -np.inf
        max_negative_log10_p = -np.inf
        for row in data:
            max_logFD = max(max_logFD, row["logFD"])
            max_negative_log10_p = max(
                max_negative_log10_p, row["negativeLog10P"]
            )
        return max_logFD, max_negative_log10_p

    def update(
        self,
        significance_threshold=None,
        fold_change_threshold=None,
        label_by=None,
    ):
        """Updates the settings and recalculates the volcano plot

        Arguments left as None keep their current value.

        Args:
            significance_threshold (float, optional): Cutoff value for the p-value to determine significance
            fold_change_threshold (float, optional): Cutoff value for the fold change to determine significance
            label_by (str, optional): Metric to sort result data

        Raises:
            ValueError: "Invalid label_by value, must be one of ['euclidean', 'fold_change', 'significance']"

        Returns:
            None
        """
        # Compare against None rather than truthiness so that an explicit
        # threshold of 0 is applied instead of being silently ignored.
        if significance_threshold is None:
            significance_threshold = self.settings.significance_threshold
        if fold_change_threshold is None:
            fold_change_threshold = self.settings.fold_change_threshold
        if not label_by:
            label_by = self.settings.label_by

        self.settings = VolcanoPlotSettings(
            significance_threshold=significance_threshold,
            fold_change_threshold=fold_change_threshold,
            label_by=label_by,
        )
        self.minusLog10PSigValue = -np.log10(
            self.settings.significance_threshold
        )
        self.sort_param = VolcanoPlotSettings.get_label_by_map()[
            self.settings.label_by
        ]
        self.volcano_plot = self.build()

    def to_df(self):
        """Convert the volcano plot data to a DataFrame"""
        return pd.DataFrame(self.volcano_plot)

    def get_significant_rows(self):
        """Get the significant features

        Returns:
            List: The feature identifiers (protein group or peptide,
            depending on ``type``) of rows whose significance class is
            non-zero.
        """
        return [
            row[self.type] for row in self.volcano_plot if row["significant"]
        ]