metahq-core 0.1.2__py3-none-any.whl → 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metahq_core/__init__.py +1 -1
- metahq_core/curations/annotation_converter.py +5 -5
- metahq_core/curations/annotations.py +361 -151
- metahq_core/curations/index.py +104 -43
- metahq_core/curations/labels.py +259 -128
- metahq_core/curations/propagator.py +62 -85
- metahq_core/export/__init__.py +0 -0
- metahq_core/export/annotations.py +125 -59
- metahq_core/export/labels.py +128 -70
- metahq_core/logger.py +11 -18
- metahq_core/query.py +346 -241
- metahq_core/{ontology/loader.py → relations_loader.py} +2 -1
- metahq_core/search.py +37 -14
- metahq_core/util/io.py +109 -46
- metahq_core/util/supported.py +16 -5
- {metahq_core-0.1.2.dist-info → metahq_core-1.0.0rc1.dist-info}/METADATA +13 -6
- metahq_core-1.0.0rc1.dist-info/RECORD +30 -0
- {metahq_core-0.1.2.dist-info → metahq_core-1.0.0rc1.dist-info}/WHEEL +1 -1
- metahq_core-1.0.0rc1.dist-info/licenses/LICENSE +28 -0
- metahq_core/ontology/base.py +0 -376
- metahq_core/ontology/graph.py +0 -252
- metahq_core-0.1.2.dist-info/RECORD +0 -30
- /metahq_core/{ontology → curations}/__init__.py +0 -0
|
@@ -4,7 +4,7 @@ Class for storing and mutating annotation collections.
|
|
|
4
4
|
Author: Parker Hicks
|
|
5
5
|
Date: 2025-04-14
|
|
6
6
|
|
|
7
|
-
Last updated:
|
|
7
|
+
Last updated: 2026-02-02 by Parker Hicks
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
10
|
from __future__ import annotations
|
|
@@ -20,7 +20,7 @@ from metahq_core.curations.index import Ids
|
|
|
20
20
|
from metahq_core.curations.labels import Labels
|
|
21
21
|
from metahq_core.export.annotations import AnnotationsExporter
|
|
22
22
|
from metahq_core.logger import setup_logger
|
|
23
|
-
from metahq_core.util.
|
|
23
|
+
from metahq_core.util.supported import get_default_log_dir
|
|
24
24
|
|
|
25
25
|
if TYPE_CHECKING:
|
|
26
26
|
import logging
|
|
@@ -31,87 +31,24 @@ class Annotations(BaseCuration):
|
|
|
31
31
|
Class to store and mutate annotations of samples to various attributes
|
|
32
32
|
like tissues, diseases, sexes, ages, etc.
|
|
33
33
|
|
|
34
|
-
Attributes
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
attribute entity for each index (e.g. male or female, tissues, diseases, etc).
|
|
34
|
+
Attributes:
|
|
35
|
+
data (pl.DataFrame):
|
|
36
|
+
Polars DataFrame with index and group ID columns and columns for each
|
|
37
|
+
attribute entity for each index (e.g. male or female, tissues, diseases, etc).
|
|
39
38
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
39
|
+
disease (bool):
|
|
40
|
+
Indicates if the annotations are disease based. Used to account for control samples
|
|
41
|
+
when converting annotations to labels.
|
|
43
42
|
|
|
44
|
-
|
|
45
|
-
|
|
43
|
+
index_col (str):
|
|
44
|
+
Name of the column of data that contains the index IDs.
|
|
46
45
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
collapsed: bool
|
|
52
|
-
Indicates if the annotations have already been collapsed.
|
|
53
|
-
|
|
54
|
-
Methods
|
|
55
|
-
-------
|
|
56
|
-
collapse()
|
|
57
|
-
Collapses index annotations to group annotations.
|
|
58
|
-
|
|
59
|
-
drop()
|
|
60
|
-
Wrapper for polars `drop`.
|
|
61
|
-
|
|
62
|
-
filter()
|
|
63
|
-
Wrapper for polars `filter`.
|
|
64
|
-
|
|
65
|
-
from_df()
|
|
66
|
-
Creates an Annotations object from a polars DataFrame or LazyFrame.
|
|
67
|
-
|
|
68
|
-
head()
|
|
69
|
-
Wrapper for polars `head`.
|
|
70
|
-
|
|
71
|
-
propagate_controls()
|
|
72
|
-
Propagates control samples to diseases that other samples in the same
|
|
73
|
-
dataset are annotated to.
|
|
74
|
-
|
|
75
|
-
select()
|
|
76
|
-
Wrapper for polars `select`.
|
|
77
|
-
|
|
78
|
-
slice()
|
|
79
|
-
Wrapper for polars `slice`.
|
|
80
|
-
|
|
81
|
-
to_labels()
|
|
82
|
-
Propagates annotations to labels for an annotations matrix, given a reference
|
|
83
|
-
ontology.
|
|
84
|
-
|
|
85
|
-
to_numpy()
|
|
86
|
-
Returns the annotations frame as a numpy 2D array.
|
|
87
|
-
|
|
88
|
-
to_parquet()
|
|
89
|
-
Saves the annotations frame and IDs to a .parquet file.
|
|
90
|
-
|
|
91
|
-
Properties
|
|
92
|
-
---------
|
|
93
|
-
entities: list[str]
|
|
94
|
-
columns of the annotations frame of ontology terms.
|
|
95
|
-
|
|
96
|
-
groups: list[str]
|
|
97
|
-
Groups associated with each index of the annotations curation.
|
|
98
|
-
Note that groups are not unique.
|
|
99
|
-
|
|
100
|
-
ids: pl.DataFrame
|
|
101
|
-
The frame of all IDs within the annotations curation.
|
|
102
|
-
|
|
103
|
-
index
|
|
104
|
-
The index IDs of the annotations frame.
|
|
105
|
-
|
|
106
|
-
n_entities: int
|
|
107
|
-
Number of unique entities.
|
|
108
|
-
|
|
109
|
-
n_index: int
|
|
110
|
-
Number of indices.
|
|
111
|
-
|
|
112
|
-
unique_groups: list[str]
|
|
113
|
-
Unique groups in the annotations curation.
|
|
46
|
+
group_cols (tuple[str, ...]):
|
|
47
|
+
Names of columns of data that contain an ID for each index indicating if it belongs
|
|
48
|
+
to a particular group (e.g. dataset, sex, platform, etc.).
|
|
114
49
|
|
|
50
|
+
collapsed (bool):
|
|
51
|
+
Indicates if the annotations have already been collapsed.
|
|
115
52
|
"""
|
|
116
53
|
|
|
117
54
|
def __init__(
|
|
@@ -123,7 +60,7 @@ class Annotations(BaseCuration):
|
|
|
123
60
|
collapsed: bool = False,
|
|
124
61
|
logger=None,
|
|
125
62
|
loglevel=20,
|
|
126
|
-
logdir=
|
|
63
|
+
logdir=get_default_log_dir(),
|
|
127
64
|
verbose=True,
|
|
128
65
|
):
|
|
129
66
|
self.data = data
|
|
@@ -139,14 +76,21 @@ class Annotations(BaseCuration):
|
|
|
139
76
|
self.verbose: bool = verbose
|
|
140
77
|
|
|
141
78
|
def add_ids(self, new: pl.DataFrame) -> Annotations:
|
|
142
|
-
"""
|
|
143
|
-
Append new group ID columns to the IDs of an Annotations object. The new
|
|
79
|
+
"""Append new group ID columns to the IDs of an Annotations object. The new
|
|
144
80
|
IDs must have a matching index.
|
|
81
|
+
|
|
82
|
+
Arguments:
|
|
83
|
+
new (pl.DataFrame):
|
|
84
|
+
A DataFrame of additional IDs to join with the current index column of `data`.
|
|
85
|
+
Must have a matching index column as the original `data`.
|
|
86
|
+
|
|
87
|
+
Returns:
|
|
88
|
+
A new Annotations object including the new ID columns.
|
|
145
89
|
"""
|
|
146
90
|
new_ids = new.join(
|
|
147
91
|
self.ids, on=self.index_col, how="inner", maintain_order="right"
|
|
148
92
|
)
|
|
149
|
-
new_groups = tuple(
|
|
93
|
+
new_groups = tuple(col for col in new_ids.columns if col != self.index_col)
|
|
150
94
|
assert new_ids.height == self.ids.height, "SRA IDs height mismatch."
|
|
151
95
|
assert (
|
|
152
96
|
new_ids[self.index_col].to_list() == self.index
|
|
@@ -157,15 +101,13 @@ class Annotations(BaseCuration):
|
|
|
157
101
|
)
|
|
158
102
|
|
|
159
103
|
def collapse(self, on: str, inplace: bool = True):
|
|
160
|
-
"""
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
inplace: bool
|
|
168
|
-
If True, updates this object and returns self. If False, returns new object.
|
|
104
|
+
"""Collapses annotations on the specified grouping column.
|
|
105
|
+
|
|
106
|
+
Arguments:
|
|
107
|
+
on (str):
|
|
108
|
+
The column to collapse on. This should be one of the columns in `group_cols`.
|
|
109
|
+
inplace (bool):
|
|
110
|
+
If True, updates this object and returns self. Otherwise, returns new object.
|
|
169
111
|
"""
|
|
170
112
|
params = self._collapse(on)
|
|
171
113
|
|
|
@@ -180,7 +122,9 @@ class Annotations(BaseCuration):
|
|
|
180
122
|
return self.__class__(**params)
|
|
181
123
|
|
|
182
124
|
def drop(self, *args, **kwargs) -> Annotations:
|
|
183
|
-
"""Wrapper for polars drop. Drops any of the term columns.
|
|
125
|
+
"""Wrapper for polars drop. Drops any of the term columns.
|
|
126
|
+
ID columns are not dropped through this method.
|
|
127
|
+
"""
|
|
184
128
|
return self.__class__(
|
|
185
129
|
data=self.data.drop(*args, **kwargs),
|
|
186
130
|
ids=self.ids,
|
|
@@ -192,7 +136,31 @@ class Annotations(BaseCuration):
|
|
|
192
136
|
)
|
|
193
137
|
|
|
194
138
|
def filter(self, condition: pl.Expr) -> Annotations:
|
|
195
|
-
"""Filter both data and ids simultaneously using a mask.
|
|
139
|
+
"""Filter both data and ids simultaneously using a mask.
|
|
140
|
+
|
|
141
|
+
Arguments:
|
|
142
|
+
condition (pl.Expr):
|
|
143
|
+
Polars expression for filtering columns.
|
|
144
|
+
|
|
145
|
+
Examples:
|
|
146
|
+
>>> from metahq_core.curations.annotations import Annotations
|
|
147
|
+
>>> anno = {
|
|
148
|
+
'sample': ['GSM1', 'GSM2', 'GSM3'],
|
|
149
|
+
'series': ['GSE1', 'GSE1', 'GSE2'],
|
|
150
|
+
'UBERON:0000948': [1, 0, 0],
|
|
151
|
+
'UBERON:0002113': [0, 1, 0],
|
|
152
|
+
'UBERON:0000955': [0, 0, 1],
|
|
153
|
+
}
|
|
154
|
+
>>> anno = Annotations.from_df(anno, index_col="sample", group_cols=["series"])
|
|
155
|
+
>>> anno.filter(pl.col("UBERON:0000948") == 1)
|
|
156
|
+
┌────────┬────────┬────────────────┬────────────────┬────────────────┐
|
|
157
|
+
│ sample ┆ series ┆ UBERON:0000948 ┆ UBERON:0002113 ┆ UBERON:0000955 │
|
|
158
|
+
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
|
159
|
+
│ str ┆ str ┆ i32 ┆ i32 ┆ i32 │
|
|
160
|
+
╞════════╪════════╪════════════════╪════════════════╪════════════════╡
|
|
161
|
+
│ GSM1 ┆ GSE1 ┆ 1 ┆ 0 ┆ 0 │
|
|
162
|
+
└────────┴────────┴────────────────┴────────────────┴────────────────┘
|
|
163
|
+
"""
|
|
196
164
|
mask = self.data.select(condition.arg_true()).to_numpy().reshape(-1)
|
|
197
165
|
|
|
198
166
|
filtered_data = (
|
|
@@ -216,29 +184,58 @@ class Annotations(BaseCuration):
|
|
|
216
184
|
|
|
217
185
|
def save(
|
|
218
186
|
self,
|
|
219
|
-
outfile:
|
|
187
|
+
outfile: str | Path,
|
|
220
188
|
fmt: Literal["json", "parquet", "csv", "tsv"],
|
|
189
|
+
attribute: str,
|
|
190
|
+
level: str,
|
|
221
191
|
metadata: str | None = None,
|
|
222
192
|
):
|
|
223
|
-
"""
|
|
224
|
-
Save annotations curation to json. Keys are terms and values are
|
|
225
|
-
positively annotated indices.
|
|
193
|
+
"""Save the annotations curation.
|
|
226
194
|
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
Path to outfile.json.
|
|
195
|
+
Arguments:
|
|
196
|
+
outfile (str | Path):
|
|
197
|
+
Path to outfile.json.
|
|
231
198
|
|
|
232
|
-
|
|
233
|
-
|
|
199
|
+
fmt (Literal["json", "parquet", "csv", "tsv"]):
|
|
200
|
+
File format to save to.
|
|
234
201
|
|
|
202
|
+
attribute (str):
|
|
203
|
+
A supported MetaHQ annotated attribute.
|
|
204
|
+
|
|
205
|
+
level (str):
|
|
206
|
+
An index level supported by MetaHQ.
|
|
207
|
+
|
|
208
|
+
metadata (bool):
|
|
209
|
+
If True, will add index titles to each entry.
|
|
235
210
|
"""
|
|
236
|
-
AnnotationsExporter(
|
|
237
|
-
|
|
238
|
-
)
|
|
211
|
+
AnnotationsExporter(
|
|
212
|
+
attribute, level, logger=self.log, verbose=self.verbose
|
|
213
|
+
).save(self, fmt, outfile, metadata)
|
|
239
214
|
|
|
240
215
|
def sort_columns(self):
|
|
241
|
-
"""Sorts term columns.
|
|
216
|
+
"""Sorts term columns.
|
|
217
|
+
|
|
218
|
+
Examples:
|
|
219
|
+
>>> from metahq_core.curations.annotations import Annotations
|
|
220
|
+
>>> anno = {
|
|
221
|
+
'sample': ['GSM1', 'GSM2', 'GSM3'],
|
|
222
|
+
'series': ['GSE1', 'GSE1', 'GSE2'],
|
|
223
|
+
'UBERON:0000948': [1, 0, 0],
|
|
224
|
+
'UBERON:0002113': [0, 1, 0],
|
|
225
|
+
'UBERON:0000955': [0, 0, 1],
|
|
226
|
+
}
|
|
227
|
+
>>> anno = Annotations.from_df(anno, index_col="sample", group_cols=["series"])
|
|
228
|
+
>>> anno.sort_columns()
|
|
229
|
+
┌────────┬────────┬────────────────┬────────────────┬────────────────┐
|
|
230
|
+
│ series ┆ sample ┆ UBERON:0000948 ┆ UBERON:0000955 ┆ UBERON:0002113 │
|
|
231
|
+
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
|
232
|
+
│ str ┆ str ┆ i32 ┆ i32 ┆ i32 │
|
|
233
|
+
╞════════╪════════╪════════════════╪════════════════╪════════════════╡
|
|
234
|
+
│ GSE1 ┆ GSM1 ┆ 1 ┆ 0 ┆ 0 │
|
|
235
|
+
│ GSE1 ┆ GSM2 ┆ 0 ┆ 0 ┆ 1 │
|
|
236
|
+
│ GSE2 ┆ GSM3 ┆ 0 ┆ 1 ┆ 0 │
|
|
237
|
+
└────────┴────────┴────────────────┴────────────────┴────────────────┘
|
|
238
|
+
"""
|
|
242
239
|
return self.__class__(
|
|
243
240
|
data=self.data.select(sorted(self.data.columns)),
|
|
244
241
|
ids=self.ids,
|
|
@@ -260,35 +257,72 @@ class Annotations(BaseCuration):
|
|
|
260
257
|
|
|
261
258
|
Assigns propagated labels to terms given their annotations.
|
|
262
259
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
260
|
+
Arguments:
|
|
261
|
+
to_terms (list[str]):
|
|
262
|
+
Array of terms to generate labels for, or "union"/"all".
|
|
263
|
+
|
|
264
|
+
ontology (str):
|
|
265
|
+
The name of an ontology to reference for annotation propagation.
|
|
266
|
+
|
|
267
|
+
mode (Literal[0, 1]):
|
|
268
|
+
Mode of propagation.
|
|
269
|
+
|
|
270
|
+
If mode is 0, this will propagate any positive annotations
|
|
271
|
+
from any descendants of the to_terms up to the to_terms.
|
|
272
|
+
|
|
273
|
+
If mode is 1, this will convert annotations to -1, 0, +1 labels
|
|
274
|
+
where for a particular term, if an index is annotated to that term or
|
|
275
|
+
any of its descendants, it receives a +1 label. If it is annotated to an
|
|
276
|
+
ancestor of that term, it receives a 0 (unsure) label. If it is not annotated
|
|
277
|
+
to an ancestor or a descendant of that term, it receives a -1 label.
|
|
278
|
+
Any indices annotated to the control column are assigned a label of 2 for any
|
|
279
|
+
terms that other indices within the same group are positively labeled to.
|
|
280
|
+
|
|
281
|
+
control_col (str):
|
|
282
|
+
Column name for control annotations.
|
|
283
|
+
|
|
284
|
+
Returns:
|
|
285
|
+
A Labels curation object with propagated -1, 0, +1 labels (and 2 if controls are
|
|
286
|
+
present). Any entries in `index_col` that have a 0 annotation/label across all
|
|
287
|
+
entity columns are dropped.
|
|
288
|
+
|
|
289
|
+
Examples:
|
|
290
|
+
|
|
291
|
+
With `mode=0`:
|
|
292
|
+
|
|
293
|
+
>>> anno = pl.DataFrame(
|
|
294
|
+
{
|
|
295
|
+
"series": ["GSE1", "GSE1", "GSE2"],
|
|
296
|
+
"sample": ["GSM1", "GSM2", "GSM3"],
|
|
297
|
+
"UBERON:0000948": [1, 0, 0],
|
|
298
|
+
"UBERON:0002349": [1, 1, 0],
|
|
299
|
+
"UBERON:0002113": [0, 0, 0],
|
|
300
|
+
"UBERON:0000955": [0, 0, 1],
|
|
301
|
+
}
|
|
302
|
+
)
|
|
303
|
+
>>> anno = Annotations.from_df(anno, index_col="sample", group_cols=["series"])
|
|
304
|
+
>>> anno.propagate(to_terms=["UBERON:0000948"], ontology="uberon", mode=0)
|
|
305
|
+
┌────────┬────────┬────────────────┐
|
|
306
|
+
│ sample ┆ series ┆ UBERON:0000948 │
|
|
307
|
+
│ --- ┆ --- ┆ --- │
|
|
308
|
+
│ str ┆ str ┆ i32 │
|
|
309
|
+
╞════════╪════════╪════════════════╡
|
|
310
|
+
│ GSM1 ┆ GSE1 ┆ 1 │
|
|
311
|
+
│ GSM2 ┆ GSE1 ┆ 1 │
|
|
312
|
+
└────────┴────────┴────────────────┘
|
|
313
|
+
|
|
314
|
+
With `mode=1`:
|
|
315
|
+
|
|
316
|
+
>>> anno.propagate(to_terms=["UBERON:0000948"], ontology="uberon", mode=1)
|
|
317
|
+
┌────────┬────────┬────────────────┐
|
|
318
|
+
│ sample ┆ series ┆ UBERON:0000948 │
|
|
319
|
+
│ --- ┆ --- ┆ --- │
|
|
320
|
+
│ str ┆ str ┆ i32 │
|
|
321
|
+
╞════════╪════════╪════════════════╡
|
|
322
|
+
│ GSM1 ┆ GSE1 ┆ 1 │
|
|
323
|
+
│ GSM2 ┆ GSE1 ┆ 1 │
|
|
324
|
+
│ GSM3 ┆ GSE2 ┆ -1 │
|
|
325
|
+
└────────┴────────┴────────────────┘
|
|
292
326
|
"""
|
|
293
327
|
converter = AnnotationsConverter(
|
|
294
328
|
self,
|
|
@@ -334,7 +368,18 @@ class Annotations(BaseCuration):
|
|
|
334
368
|
)
|
|
335
369
|
|
|
336
370
|
def slice(self, offset: int, length: int | None = None) -> Annotations:
|
|
337
|
-
"""Slice both data and ids simultaneously using polars slice.
|
|
371
|
+
"""Slice both data and ids simultaneously using `polars` slice.
|
|
372
|
+
|
|
373
|
+
Arguments:
|
|
374
|
+
offset (int):
|
|
375
|
+
Index position to begin the slice.
|
|
376
|
+
|
|
377
|
+
length (int | None):
|
|
378
|
+
Number of indices past `offset` to slice out.
|
|
379
|
+
|
|
380
|
+
Returns:
|
|
381
|
+
Sliced Annotations object as a subset of the original Annotations.
|
|
382
|
+
"""
|
|
338
383
|
sliced_data = self.data.slice(offset, length)
|
|
339
384
|
sliced_ids_data = self._ids.data.slice(offset, length)
|
|
340
385
|
|
|
@@ -349,7 +394,9 @@ class Annotations(BaseCuration):
|
|
|
349
394
|
)
|
|
350
395
|
|
|
351
396
|
def _collapse(self, on: str):
|
|
352
|
-
"""Collapses index-level annotations to group-level.
|
|
397
|
+
"""Collapses index-level annotations to group-level. Helper function
|
|
398
|
+
for `collapse`.
|
|
399
|
+
"""
|
|
353
400
|
index_anno = self.data.with_columns(self.ids[on])
|
|
354
401
|
agg_anno = index_anno.group_by(on).agg(pl.col("*").sum()).sort(on)
|
|
355
402
|
new_ids = self._collapse_ids(on, keep=agg_anno[on].to_list())
|
|
@@ -376,7 +423,9 @@ class Annotations(BaseCuration):
|
|
|
376
423
|
return params
|
|
377
424
|
|
|
378
425
|
def _collapse_ids(self, on: str, keep: list[str]):
|
|
379
|
-
"""Group IDs to keep in the new collapsed frame.
|
|
426
|
+
"""Group IDs to keep in the new collapsed frame. Helper function
|
|
427
|
+
for `collapse`.
|
|
428
|
+
"""
|
|
380
429
|
return (
|
|
381
430
|
self.ids.drop(self.index_col)
|
|
382
431
|
.unique()
|
|
@@ -389,11 +438,49 @@ class Annotations(BaseCuration):
|
|
|
389
438
|
cls,
|
|
390
439
|
df: pl.DataFrame,
|
|
391
440
|
index_col: str,
|
|
392
|
-
group_cols: tuple[str, ...] | list[str]
|
|
441
|
+
group_cols: tuple[str, ...] | list[str],
|
|
393
442
|
**kwargs,
|
|
394
443
|
) -> Annotations:
|
|
395
|
-
"""Creates an Annotations object from a combined DataFrame.
|
|
396
|
-
|
|
444
|
+
"""Creates an Annotations object from a combined DataFrame.
|
|
445
|
+
|
|
446
|
+
Arguments:
|
|
447
|
+
df (pl.DataFrame):
|
|
448
|
+
Polars DataFrame with index and group ID columns and columns for each
|
|
449
|
+
attribute entity for each index (e.g. male or female, tissues, diseases, etc).
|
|
450
|
+
|
|
451
|
+
index_col (str):
|
|
452
|
+
Name of the column of data that contains the index IDs.
|
|
453
|
+
|
|
454
|
+
group_cols (tuple[str, ...]):
|
|
455
|
+
Names of columns of data that contain an ID for each index indicating if it belongs
|
|
456
|
+
to a particular group (e.g. dataset, sex, platform, etc.).
|
|
457
|
+
|
|
458
|
+
Returns:
|
|
459
|
+
An Annotations object constructed from `df`.
|
|
460
|
+
|
|
461
|
+
Examples:
|
|
462
|
+
>>> from metahq_core.curations.annotations import Annotations
|
|
463
|
+
>>> anno = pl.DataFrame(
|
|
464
|
+
{
|
|
465
|
+
"series": ["GSE1", "GSE1", "GSE2"],
|
|
466
|
+
"sample": ["GSM1", "GSM2", "GSM3"],
|
|
467
|
+
"UBERON:0000948": [1, 0, 0],
|
|
468
|
+
"UBERON:0002349": [1, 1, 0],
|
|
469
|
+
"UBERON:0002113": [0, 0, 0],
|
|
470
|
+
"UBERON:0000955": [0, 0, 1],
|
|
471
|
+
}
|
|
472
|
+
)
|
|
473
|
+
>>> anno = Annotations.from_df(anno, index_col="sample", group_cols=["series"])
|
|
474
|
+
┌────────┬────────┬────────────────┬────────────────┬────────────────┬────────────────┐
|
|
475
|
+
│ sample ┆ series ┆ UBERON:0000948 ┆ UBERON:0002349 ┆ UBERON:0002113 ┆ UBERON:0000955 │
|
|
476
|
+
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
|
477
|
+
│ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
|
|
478
|
+
╞════════╪════════╪════════════════╪════════════════╪════════════════╪════════════════╡
|
|
479
|
+
│ GSM1 ┆ GSE1 ┆ 1 ┆ 1 ┆ 0 ┆ 0 │
|
|
480
|
+
│ GSM2 ┆ GSE1 ┆ 0 ┆ 1 ┆ 0 ┆ 0 │
|
|
481
|
+
│ GSM3 ┆ GSE2 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │
|
|
482
|
+
└────────┴────────┴────────────────┴────────────────┴────────────────┴────────────────┘
|
|
483
|
+
"""
|
|
397
484
|
group_cols = tuple(group_cols)
|
|
398
485
|
id_columns = [index_col] + list(group_cols)
|
|
399
486
|
ids_data = df.select(id_columns)
|
|
@@ -409,37 +496,160 @@ class Annotations(BaseCuration):
|
|
|
409
496
|
|
|
410
497
|
@property
|
|
411
498
|
def entities(self) -> list[str]:
|
|
412
|
-
"""Returns term names of the Annotations frame.
|
|
499
|
+
"""Returns term names of the Annotations frame.
|
|
500
|
+
|
|
501
|
+
Examples:
|
|
502
|
+
>>> anno = pl.DataFrame(
|
|
503
|
+
{
|
|
504
|
+
"series": ["GSE1", "GSE1", "GSE2"],
|
|
505
|
+
"sample": ["GSM1", "GSM2", "GSM3"],
|
|
506
|
+
"UBERON:0000948": [1, 0, 0],
|
|
507
|
+
"UBERON:0002349": [1, 1, 0],
|
|
508
|
+
"UBERON:0002113": [0, 0, 0],
|
|
509
|
+
"UBERON:0000955": [0, 0, 1],
|
|
510
|
+
}
|
|
511
|
+
)
|
|
512
|
+
>>> anno = Annotations.from_df(anno, index_col="sample", group_cols=["series"])
|
|
513
|
+
>>> anno.entities
|
|
514
|
+
['UBERON:0000955', 'UBERON:0002349', 'UBERON:0000948', 'UBERON:0002113']
|
|
515
|
+
"""
|
|
413
516
|
return list(set(self.data.columns) - set(self.ids.columns))
|
|
414
517
|
|
|
415
518
|
@property
|
|
416
519
|
def groups(self) -> list[str]:
|
|
417
|
-
"""Returns the groups column of the Annotations curation.
|
|
520
|
+
"""Returns the groups column of the Annotations curation.
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
Examples:
|
|
524
|
+
>>> anno = pl.DataFrame(
|
|
525
|
+
{
|
|
526
|
+
"series": ["GSE1", "GSE1", "GSE2"],
|
|
527
|
+
"sample": ["GSM1", "GSM2", "GSM3"],
|
|
528
|
+
"UBERON:0000948": [1, 0, 0],
|
|
529
|
+
"UBERON:0002349": [1, 1, 0],
|
|
530
|
+
"UBERON:0002113": [0, 0, 0],
|
|
531
|
+
"UBERON:0000955": [0, 0, 1],
|
|
532
|
+
}
|
|
533
|
+
)
|
|
534
|
+
>>> anno = Annotations.from_df(anno, index_col="sample", group_cols=["series"])
|
|
535
|
+
>>> anno.groups
|
|
536
|
+
['GSE1', 'GSE1', 'GSE2']
|
|
537
|
+
|
|
538
|
+
"""
|
|
418
539
|
return self.ids["series"].to_list()
|
|
419
540
|
|
|
420
541
|
@property
|
|
421
542
|
def ids(self) -> pl.DataFrame:
|
|
422
|
-
"""Return the IDs dataframe.
|
|
543
|
+
"""Return the IDs dataframe.
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
Examples:
|
|
547
|
+
>>> anno = pl.DataFrame(
|
|
548
|
+
{
|
|
549
|
+
"series": ["GSE1", "GSE1", "GSE2"],
|
|
550
|
+
"sample": ["GSM1", "GSM2", "GSM3"],
|
|
551
|
+
"UBERON:0000948": [1, 0, 0],
|
|
552
|
+
"UBERON:0002349": [1, 1, 0],
|
|
553
|
+
"UBERON:0002113": [0, 0, 0],
|
|
554
|
+
"UBERON:0000955": [0, 0, 1],
|
|
555
|
+
}
|
|
556
|
+
)
|
|
557
|
+
>>> anno = Annotations.from_df(anno, index_col="sample", group_cols=["series"])
|
|
558
|
+
>>> anno.ids
|
|
559
|
+
┌────────┬────────┐
|
|
560
|
+
│ sample ┆ series │
|
|
561
|
+
│ --- ┆ --- │
|
|
562
|
+
│ str ┆ str │
|
|
563
|
+
╞════════╪════════╡
|
|
564
|
+
│ GSM1 ┆ GSE1 │
|
|
565
|
+
│ GSM2 ┆ GSE1 │
|
|
566
|
+
│ GSM3 ┆ GSE2 │
|
|
567
|
+
└────────┴────────┘
|
|
568
|
+
"""
|
|
423
569
|
return self._ids.data
|
|
424
570
|
|
|
425
571
|
@property
|
|
426
|
-
def index(self) -> list:
|
|
427
|
-
"""Return the index column as a list.
|
|
572
|
+
def index(self) -> list[str]:
|
|
573
|
+
"""Return the index column as a list.
|
|
574
|
+
|
|
575
|
+
Examples:
|
|
576
|
+
>>> anno = pl.DataFrame(
|
|
577
|
+
{
|
|
578
|
+
"series": ["GSE1", "GSE1", "GSE2"],
|
|
579
|
+
"sample": ["GSM1", "GSM2", "GSM3"],
|
|
580
|
+
"UBERON:0000948": [1, 0, 0],
|
|
581
|
+
"UBERON:0002349": [1, 1, 0],
|
|
582
|
+
"UBERON:0002113": [0, 0, 0],
|
|
583
|
+
"UBERON:0000955": [0, 0, 1],
|
|
584
|
+
}
|
|
585
|
+
)
|
|
586
|
+
>>> anno = Annotations.from_df(anno, index_col="sample", group_cols=["series"])
|
|
587
|
+
>>> anno.index
|
|
588
|
+
['GSM1', 'GSM2', 'GSM3']
|
|
589
|
+
"""
|
|
428
590
|
return self._ids.index.to_list()
|
|
429
591
|
|
|
430
592
|
@property
|
|
431
593
|
def n_indices(self) -> int:
|
|
432
|
-
"""Returns number of indices.
|
|
594
|
+
"""Returns number of indices.
|
|
595
|
+
|
|
596
|
+
Examples:
|
|
597
|
+
>>> anno = pl.DataFrame(
|
|
598
|
+
{
|
|
599
|
+
"series": ["GSE1", "GSE1", "GSE2"],
|
|
600
|
+
"sample": ["GSM1", "GSM2", "GSM3"],
|
|
601
|
+
"UBERON:0000948": [1, 0, 0],
|
|
602
|
+
"UBERON:0002349": [1, 1, 0],
|
|
603
|
+
"UBERON:0002113": [0, 0, 0],
|
|
604
|
+
"UBERON:0000955": [0, 0, 1],
|
|
605
|
+
}
|
|
606
|
+
)
|
|
607
|
+
>>> anno = Annotations.from_df(anno, index_col="sample", group_cols=["series"])
|
|
608
|
+
>>> anno.n_indices
|
|
609
|
+
3
|
|
610
|
+
"""
|
|
433
611
|
return self.data.height
|
|
434
612
|
|
|
435
613
|
@property
|
|
436
614
|
def n_entities(self) -> int:
|
|
437
|
-
"""Returns number of entities.
|
|
615
|
+
"""Returns number of entities.
|
|
616
|
+
|
|
617
|
+
Examples:
|
|
618
|
+
>>> anno = pl.DataFrame(
|
|
619
|
+
{
|
|
620
|
+
"series": ["GSE1", "GSE1", "GSE2"],
|
|
621
|
+
"sample": ["GSM1", "GSM2", "GSM3"],
|
|
622
|
+
"UBERON:0000948": [1, 0, 0],
|
|
623
|
+
"UBERON:0002349": [1, 1, 0],
|
|
624
|
+
"UBERON:0002113": [0, 0, 0],
|
|
625
|
+
"UBERON:0000955": [0, 0, 1],
|
|
626
|
+
}
|
|
627
|
+
)
|
|
628
|
+
>>> anno = Annotations.from_df(anno, index_col="sample", group_cols=["series"])
|
|
629
|
+
>>> anno.n_entities
|
|
630
|
+
4
|
|
631
|
+
"""
|
|
438
632
|
return len(self.entities)
|
|
439
633
|
|
|
440
634
|
@property
|
|
441
635
|
def unique_groups(self) -> list[str]:
|
|
442
|
-
"""Returns unique groups.
|
|
636
|
+
"""Returns unique groups.
|
|
637
|
+
|
|
638
|
+
Examples:
|
|
639
|
+
>>> anno = pl.DataFrame(
|
|
640
|
+
{
|
|
641
|
+
"series": ["GSE1", "GSE1", "GSE2"],
|
|
642
|
+
"sample": ["GSM1", "GSM2", "GSM3"],
|
|
643
|
+
"UBERON:0000948": [1, 0, 0],
|
|
644
|
+
"UBERON:0002349": [1, 1, 0],
|
|
645
|
+
"UBERON:0002113": [0, 0, 0],
|
|
646
|
+
"UBERON:0000955": [0, 0, 1],
|
|
647
|
+
}
|
|
648
|
+
)
|
|
649
|
+
>>> anno = Annotations.from_df(anno, index_col="sample", group_cols=["series"])
|
|
650
|
+
>>> anno.unique_groups
|
|
651
|
+
['GSE2', 'GSE1']
|
|
652
|
+
"""
|
|
443
653
|
return list(set(self.groups))
|
|
444
654
|
|
|
445
655
|
def __repr__(self):
|