metahq-core 0.1.2__py3-none-any.whl → 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metahq_core/__init__.py +1 -1
- metahq_core/curations/annotation_converter.py +5 -5
- metahq_core/curations/annotations.py +361 -151
- metahq_core/curations/index.py +104 -43
- metahq_core/curations/labels.py +259 -128
- metahq_core/curations/propagator.py +62 -85
- metahq_core/export/__init__.py +0 -0
- metahq_core/export/annotations.py +125 -59
- metahq_core/export/labels.py +128 -70
- metahq_core/logger.py +11 -18
- metahq_core/query.py +346 -241
- metahq_core/{ontology/loader.py → relations_loader.py} +2 -1
- metahq_core/search.py +37 -14
- metahq_core/util/io.py +109 -46
- metahq_core/util/supported.py +16 -5
- {metahq_core-0.1.2.dist-info → metahq_core-1.0.0rc1.dist-info}/METADATA +13 -6
- metahq_core-1.0.0rc1.dist-info/RECORD +30 -0
- {metahq_core-0.1.2.dist-info → metahq_core-1.0.0rc1.dist-info}/WHEEL +1 -1
- metahq_core-1.0.0rc1.dist-info/licenses/LICENSE +28 -0
- metahq_core/ontology/base.py +0 -376
- metahq_core/ontology/graph.py +0 -252
- metahq_core-0.1.2.dist-info/RECORD +0 -30
- /metahq_core/{ontology → curations}/__init__.py +0 -0
metahq_core/query.py
CHANGED
|
@@ -4,7 +4,7 @@ Class to query the annotations dictionary.
|
|
|
4
4
|
Author: Parker Hicks
|
|
5
5
|
Date: 2025-03
|
|
6
6
|
|
|
7
|
-
Last updated:
|
|
7
|
+
Last updated: 2026-02-02 by Parker Hicks
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
10
|
from pathlib import Path
|
|
@@ -14,15 +14,17 @@ import polars as pl
|
|
|
14
14
|
|
|
15
15
|
from metahq_core.curations.annotations import Annotations
|
|
16
16
|
from metahq_core.logger import setup_logger
|
|
17
|
-
from metahq_core.util.
|
|
17
|
+
from metahq_core.util.exceptions import NoResultsFound
|
|
18
18
|
from metahq_core.util.io import load_bson
|
|
19
19
|
from metahq_core.util.supported import (
|
|
20
20
|
_ecodes,
|
|
21
21
|
attributes,
|
|
22
22
|
get_annotations,
|
|
23
|
+
get_default_log_dir,
|
|
23
24
|
get_technologies,
|
|
24
25
|
na_entities,
|
|
25
26
|
species_map,
|
|
27
|
+
supported,
|
|
26
28
|
technologies,
|
|
27
29
|
)
|
|
28
30
|
|
|
@@ -73,100 +75,135 @@ class ParsedEntries:
|
|
|
73
75
|
|
|
74
76
|
|
|
75
77
|
class LongAnnotations:
|
|
76
|
-
"""
|
|
77
|
-
Annotations in long format.
|
|
78
|
-
Exists to support modularity and readibility within the Query class.
|
|
79
|
-
|
|
80
|
-
Attributes
|
|
81
|
-
----------
|
|
82
|
-
annotations: pl.DataFrame
|
|
83
|
-
DataFrame with columns storing accession IDs with an `id` and `value` column storing
|
|
84
|
-
multiple annotations for a single entry.
|
|
85
|
-
|
|
86
|
-
Methods
|
|
87
|
-
-------
|
|
88
|
-
column_intersection_with()
|
|
89
|
-
Finds the intersection between a list of strings and the annotations columns.
|
|
78
|
+
"""Annotations in long format.
|
|
90
79
|
|
|
91
|
-
|
|
92
|
-
Removes rows that contain NA values.
|
|
80
|
+
Exists to support modularity and readability within the Query class.
|
|
93
81
|
|
|
94
|
-
|
|
95
|
-
|
|
82
|
+
Attributes:
|
|
83
|
+
annotations (pl.DataFrame):
|
|
84
|
+
DataFrame with columns storing accession IDs with an `id` and `value` column storing
|
|
85
|
+
multiple annotations for a single entry.
|
|
86
|
+
"""
|
|
96
87
|
|
|
97
|
-
|
|
98
|
-
|
|
88
|
+
def __init__(self, annotations):
|
|
89
|
+
self.annotations: pl.DataFrame = annotations
|
|
99
90
|
|
|
100
|
-
|
|
101
|
-
|
|
91
|
+
def column_intersection_with(self, columns: list[str]) -> list[str]:
|
|
92
|
+
"""Find intersection between `columns` and the columns in the `annotations` attribute.
|
|
102
93
|
|
|
103
|
-
|
|
104
|
-
|
|
94
|
+
Arguments:
|
|
95
|
+
columns (list[str]):
|
|
96
|
+
Any list of potential columns in the DataFrame.
|
|
105
97
|
|
|
106
|
-
|
|
98
|
+
Returns:
|
|
99
|
+
The intersection of columns.
|
|
100
|
+
"""
|
|
101
|
+
return list(set(columns) & set(self.annotations.columns))
|
|
107
102
|
|
|
108
|
-
def
|
|
109
|
-
|
|
110
|
-
|
|
103
|
+
def filter_na(self, column: str):
|
|
104
|
+
"""Removes entries in a column that are NA-like values (e.g., 'NA' or 'none').
|
|
105
|
+
Updates the annotations attribute in place.
|
|
111
106
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
107
|
+
Arguments:
|
|
108
|
+
column (str):
|
|
109
|
+
The name of a column in the DataFrame.
|
|
110
|
+
"""
|
|
111
|
+
self.annotations = self.annotations.filter(~pl.col(column).is_in(na_entities()))
|
|
115
112
|
|
|
116
|
-
def
|
|
117
|
-
"""
|
|
118
|
-
self.annotations = self.annotations.filter(~pl.col(col).is_in(na_entities()))
|
|
113
|
+
def stage_anchor(self, anchor: Literal["id", "value"]):
|
|
114
|
+
"""Filters NA values from the anchor annotations column.
|
|
119
115
|
|
|
120
|
-
|
|
121
|
-
|
|
116
|
+
Arguments:
|
|
117
|
+
anchor (Literal["id", "value"]):
|
|
118
|
+
The column storing desired format of annotations.
|
|
119
|
+
"""
|
|
122
120
|
self.filter_na(anchor)
|
|
123
121
|
|
|
124
|
-
def stage_level(self, level:
|
|
125
|
-
"""
|
|
126
|
-
Filters NA values from the specified ID level column. If level
|
|
122
|
+
def stage_level(self, level: Literal["sample", "series"]):
|
|
123
|
+
"""Filters NA values from the specified ID level column. If level
|
|
127
124
|
is 'series', rows whose series is 'NA' are removed and the `sample` column is dropped.
|
|
125
|
+
|
|
126
|
+
Arguments:
|
|
127
|
+
level (Literal['sample', 'series']):
|
|
128
|
+
Annotation level.
|
|
128
129
|
"""
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
raise ValueError(f"Expected level in {supported}, got {level}.")
|
|
130
|
+
if not level in supported("levels"):
|
|
131
|
+
raise ValueError(f"Expected level in {supported("levels")}, got {level}.")
|
|
132
132
|
|
|
133
|
-
if level == "
|
|
134
|
-
self.annotations = self.annotations.filter(pl.col(
|
|
135
|
-
|
|
136
|
-
|
|
133
|
+
if level == "series":
|
|
134
|
+
self.annotations = self.annotations.filter(pl.col(level) != "NA")
|
|
135
|
+
|
|
136
|
+
if "sample" in self.annotations.columns:
|
|
137
|
+
self.annotations = self.annotations.drop("sample")
|
|
137
138
|
|
|
138
139
|
self.filter_na(level)
|
|
139
140
|
|
|
140
|
-
def stage(self, level:
|
|
141
|
-
"""Stages the annotations DataFrame to be converted to wide format.
|
|
141
|
+
def stage(self, level: Literal["sample", "series"], anchor: Literal["id", "value"]):
|
|
142
|
+
"""Stages the annotations DataFrame to be converted to wide format. Mutates the
|
|
143
|
+
annotations attribute in place.
|
|
144
|
+
|
|
145
|
+
Arguments:
|
|
146
|
+
level (Literal['sample', 'series']):
|
|
147
|
+
Annotation level.
|
|
148
|
+
|
|
149
|
+
anchor (Literal["id", "value"]):
|
|
150
|
+
The column storing desired format of annotations.
|
|
151
|
+
|
|
152
|
+
"""
|
|
142
153
|
self.stage_level(level)
|
|
143
154
|
self.stage_anchor(anchor)
|
|
144
155
|
|
|
145
|
-
def pivot_wide(
|
|
146
|
-
|
|
147
|
-
|
|
156
|
+
def pivot_wide(
|
|
157
|
+
self,
|
|
158
|
+
level: Literal["sample", "series"],
|
|
159
|
+
anchor: Literal["id", "value"],
|
|
160
|
+
id_cols: list[str],
|
|
161
|
+
) -> pl.DataFrame:
|
|
162
|
+
Pivots the annotations to wide format with one-hot-encoded binary entries for
|
|
148
163
|
each annotation.
|
|
149
164
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
Returns
|
|
161
|
-
|
|
162
|
-
|
|
165
|
+
Arguments:
|
|
166
|
+
level (Literal['sample', 'series']):
|
|
167
|
+
Annotation level.
|
|
168
|
+
|
|
169
|
+
anchor (Literal["id", "value"]):
|
|
170
|
+
The column storing desired format of annotations.
|
|
171
|
+
|
|
172
|
+
id_cols (list[str]):
|
|
173
|
+
Columns to keep as IDs when pivoting.
|
|
174
|
+
|
|
175
|
+
Returns:
|
|
176
|
+
Annotations in one-hot-encoded wide format with the accession IDs for each annotation.
|
|
177
|
+
|
|
178
|
+
Examples:
|
|
179
|
+
>>> from metahq_core.query import LongAnnotations
|
|
180
|
+
>>> anno = pl.DataFrame({
|
|
181
|
+
'sample': ['GSM1', 'GSM2', 'GSM3'],
|
|
182
|
+
'series': ['GSE1', 'GSE1', 'GSE2'],
|
|
183
|
+
'platform': ['GPL1', 'GPL2', 'GPL2'],
|
|
184
|
+
'id': ['UBERON:0000948|UBERON:0002349', 'UBERON:0002113', 'UBERON:0000955'],
|
|
185
|
+
'value': ['heart|myocardium', 'kidney', 'brain'],
|
|
186
|
+
})
|
|
187
|
+
>>> anno = LongAnnotations(anno)
|
|
188
|
+
>>> anno.pivot_wide(
|
|
189
|
+
level='sample', anchor='id', id_cols=['sample', 'series']
|
|
190
|
+
)
|
|
191
|
+
┌────────┬────────┬────────────────┬────────────────┬────────────────┬────────────────┐
|
|
192
|
+
│ series ┆ sample ┆ UBERON:0000948 ┆ UBERON:0002349 ┆ UBERON:0002113 ┆ UBERON:0000955 │
|
|
193
|
+
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
|
194
|
+
│ str ┆ str ┆ i32 ┆ i32 ┆ i32 ┆ i32 │
|
|
195
|
+
╞════════╪════════╪════════════════╪════════════════╪════════════════╪════════════════╡
|
|
196
|
+
│ GSE1 ┆ GSM1 ┆ 1 ┆ 1 ┆ 0 ┆ 0 │
|
|
197
|
+
│ GSE1 ┆ GSM2 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │
|
|
198
|
+
│ GSE2 ┆ GSM3 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │
|
|
199
|
+
└────────┴────────┴────────────────┴────────────────┴────────────────┴────────────────┘
|
|
163
200
|
|
|
164
201
|
"""
|
|
165
202
|
# remove unused entries
|
|
166
203
|
self.stage(level, anchor)
|
|
167
204
|
|
|
168
205
|
# prepare accession IDs DataFrame
|
|
169
|
-
id_cols = self.column_intersection_with(
|
|
206
|
+
id_cols = self.column_intersection_with(id_cols)
|
|
170
207
|
ids = self.annotations.select(id_cols)
|
|
171
208
|
|
|
172
209
|
# remove unused columns for pivoting
|
|
@@ -195,55 +232,78 @@ class LongAnnotations:
|
|
|
195
232
|
|
|
196
233
|
|
|
197
234
|
class UnParsedEntry:
|
|
198
|
-
"""
|
|
199
|
-
Stores and extracts items from a single annotation entry of the annotations dictionary.
|
|
200
|
-
Exists to support modularity and readibility within the Query class.
|
|
201
|
-
|
|
202
|
-
Attrubtes
|
|
203
|
-
---------
|
|
204
|
-
entry: dict[str, dict[str, dict[str, str]]]
|
|
205
|
-
Nested dictionary of annotations in the following structure:
|
|
206
|
-
ID: {
|
|
207
|
-
attribute: {
|
|
208
|
-
source: {
|
|
209
|
-
id: "standardized ID",
|
|
210
|
-
"value": "common name",
|
|
211
|
-
} ...
|
|
212
|
-
} ...
|
|
213
|
-
|
|
214
|
-
attribute: str
|
|
215
|
-
Attribute to extract annotations for.
|
|
216
|
-
|
|
217
|
-
ecodes: str
|
|
218
|
-
Permitted evidence codes for annotations.
|
|
235
|
+
"""Stores and extracts items from a single annotation entry of the annotations dictionary.
|
|
219
236
|
|
|
220
|
-
|
|
221
|
-
-------
|
|
222
|
-
get_annotations():
|
|
223
|
-
Retrieves all available annotations that match the specified parameters.
|
|
224
|
-
|
|
225
|
-
is_acceptable():
|
|
226
|
-
Determines if an entry has annotations available for the attribute.
|
|
227
|
-
|
|
228
|
-
get_id_value():
|
|
229
|
-
Extracts ID and value entries for an individual source within a single entry.
|
|
237
|
+
Exists to support modularity and readability within the Query class.
|
|
230
238
|
|
|
239
|
+
Attributes:
|
|
240
|
+
entry (dict[str, dict[str, dict[str, str] | str]]):
|
|
241
|
+
Annotations for a single entry in the database.
|
|
242
|
+
|
|
243
|
+
attribute (str):
|
|
244
|
+
Attribute to extract annotations for.
|
|
245
|
+
|
|
246
|
+
ecodes (list[str]):
|
|
247
|
+
Permitted evidence codes for annotations.
|
|
248
|
+
|
|
249
|
+
species (str):
|
|
250
|
+
Species for which to extract annotations.
|
|
251
|
+
|
|
252
|
+
Examples:
|
|
253
|
+
>>> from metahq_core.query import UnParsedEntry
|
|
254
|
+
>>> entry = {
|
|
255
|
+
'GSM281311': {
|
|
256
|
+
'organism': 'homo sapiens',
|
|
257
|
+
'tissue': {
|
|
258
|
+
'ursa': {
|
|
259
|
+
'id': 'UBERON:0002113', 'value': 'kidney', 'ecode': 'expert-curated'
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
>>> UnParsedEntry(
|
|
265
|
+
entry,
|
|
266
|
+
attribute='tissue',
|
|
267
|
+
ecodes=['expert-curated'],
|
|
268
|
+
species='homo sapiens'
|
|
269
|
+
)
|
|
231
270
|
"""
|
|
232
271
|
|
|
233
|
-
def __init__(self,
|
|
234
|
-
self.entry: dict[str, dict[str, dict[str, str]]] =
|
|
235
|
-
self.attribute: str =
|
|
236
|
-
self.ecodes: list[str] =
|
|
237
|
-
self.species: str =
|
|
272
|
+
def __init__(self, entry, attribute, ecodes, species):
|
|
273
|
+
self.entry: dict[str, dict[str, dict[str, str]]] = entry
|
|
274
|
+
self.attribute: str = attribute
|
|
275
|
+
self.ecodes: list[str] = ecodes
|
|
276
|
+
self.species: str = species
|
|
238
277
|
|
|
239
278
|
def get_annotations(self) -> tuple[str, str]:
|
|
240
279
|
"""
|
|
241
280
|
Retrieves the ID and value annotations for a single entry.
|
|
242
281
|
|
|
243
|
-
Returns
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
282
|
+
Returns:
|
|
283
|
+
ID and value annotations for a given attribute. If there are multiple annotations
|
|
284
|
+
across sources, then they are concatenated with a `|` delimiter. If no ID or
|
|
285
|
+
value annotations exist, `NA` is returned.
|
|
286
|
+
|
|
287
|
+
Examples:
|
|
288
|
+
>>> from metahq_core.query import UnParsedEntry
|
|
289
|
+
>>> entry = {
|
|
290
|
+
'GSM281311': {
|
|
291
|
+
'organism': 'homo sapiens',
|
|
292
|
+
'tissue': {
|
|
293
|
+
'ursa': {
|
|
294
|
+
'id': 'UBERON:0002113', 'value': 'kidney', 'ecode': 'expert-curated'
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
>>> unparsed = UnParsedEntry(
|
|
300
|
+
entry,
|
|
301
|
+
attribute='tissue',
|
|
302
|
+
ecodes=['expert-curated'],
|
|
303
|
+
species='homo sapiens'
|
|
304
|
+
)
|
|
305
|
+
>>> unparsed.get_annotations()
|
|
306
|
+
('UBERON:0002113', 'kidney')
|
|
247
307
|
|
|
248
308
|
"""
|
|
249
309
|
if not self.is_acceptable():
|
|
@@ -263,26 +323,74 @@ class UnParsedEntry:
|
|
|
263
323
|
return "|".join(ids), "|".join(values)
|
|
264
324
|
|
|
265
325
|
def is_acceptable(self) -> bool:
|
|
266
|
-
"""Checks if an
|
|
326
|
+
"""Checks if the entry is not empty and is an acceptable annotation given the
|
|
327
|
+
passed attribute, ecode, and species.
|
|
328
|
+
|
|
329
|
+
Returns:
|
|
330
|
+
True or False given the specified attributes.
|
|
331
|
+
|
|
332
|
+
Examples:
|
|
333
|
+
>>> from metahq_core.query import UnParsedEntry
|
|
334
|
+
>>> entry = {
|
|
335
|
+
'GSM281311': {
|
|
336
|
+
'organism': 'homo sapiens',
|
|
337
|
+
'tissue': {
|
|
338
|
+
'ursa': {
|
|
339
|
+
'id': 'UBERON:0002113', 'value': 'kidney', 'ecode': 'expert-curated'
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
>>> unparsed = UnParsedEntry(
|
|
345
|
+
entry,
|
|
346
|
+
attribute='tissue',
|
|
347
|
+
ecodes=['expert-curated'],
|
|
348
|
+
species='homo sapiens'
|
|
349
|
+
)
|
|
350
|
+
>>> unparsed.is_acceptable()
|
|
351
|
+
True
|
|
352
|
+
|
|
353
|
+
If an attribute doesn't exist, it will return False.
|
|
354
|
+
|
|
355
|
+
>>> entry = {
|
|
356
|
+
'GSM315993': {
|
|
357
|
+
'organism': 'homo sapiens',
|
|
358
|
+
'sex': {
|
|
359
|
+
'Johnson 2023': {
|
|
360
|
+
'id': 'F', 'ecode': 'expert-curated'
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
}
|
|
364
|
+
}
|
|
365
|
+
>>> unparsed = UnParsedEntry(
|
|
366
|
+
entry,
|
|
367
|
+
attribute='tissue',
|
|
368
|
+
ecodes=['expert-curated'],
|
|
369
|
+
species='homo sapiens'
|
|
370
|
+
)
|
|
371
|
+
>>> unparsed.is_acceptable()
|
|
372
|
+
False
|
|
373
|
+
"""
|
|
267
374
|
attr_exists = self.attribute in self.entry
|
|
268
|
-
is_correct_species = self.entry["organism"] == self.species
|
|
269
375
|
is_populated = len(self.entry) > 0
|
|
270
376
|
|
|
377
|
+
if "organism" in self.entry:
|
|
378
|
+
is_correct_species = self.entry["organism"] == self.species
|
|
379
|
+
else:
|
|
380
|
+
is_correct_species = False
|
|
381
|
+
|
|
271
382
|
return attr_exists and is_populated and is_correct_species
|
|
272
383
|
|
|
273
384
|
@staticmethod
|
|
274
|
-
def get_id_value(source_anno):
|
|
275
|
-
"""
|
|
276
|
-
Extracts the ID and value for an annotation.
|
|
385
|
+
def get_id_value(source_anno) -> tuple[str, str]:
|
|
386
|
+
"""Extracts the ID and value for an annotation.
|
|
277
387
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
Annotations from a single source. Has keys ['id', 'value', 'ecode'].
|
|
388
|
+
Arguments:
|
|
389
|
+
source_anno (dict[str, str]):
|
|
390
|
+
Annotations from a single source. Has keys ['id', 'value', 'ecode'].
|
|
282
391
|
|
|
283
|
-
Returns
|
|
284
|
-
|
|
285
|
-
Tuple of the ID and value for the attribute annotation from a single source.
|
|
392
|
+
Returns:
|
|
393
|
+
Tuple of the ID and value for the attribute annotation from a single source.
|
|
286
394
|
|
|
287
395
|
"""
|
|
288
396
|
if "id" in source_anno:
|
|
@@ -294,63 +402,54 @@ class UnParsedEntry:
|
|
|
294
402
|
value = source_anno["value"]
|
|
295
403
|
else:
|
|
296
404
|
value = "NA"
|
|
405
|
+
|
|
297
406
|
return id_, value
|
|
298
407
|
|
|
299
408
|
|
|
300
409
|
class Query:
|
|
301
|
-
"""
|
|
302
|
-
Class to query the annotations dictionary.
|
|
303
|
-
|
|
304
|
-
Attributes
|
|
305
|
-
----------
|
|
306
|
-
database: str
|
|
307
|
-
Database to query annotations.
|
|
308
|
-
|
|
309
|
-
_annotations: dict
|
|
310
|
-
Nested dictionary of annotations.
|
|
410
|
+
"""Class to query the MetaHQ database.
|
|
311
411
|
|
|
312
|
-
|
|
313
|
-
|
|
412
|
+
Attributes:
|
|
413
|
+
attribute (str):
|
|
414
|
+
Attribute to collect annotations for (e.g., tissue, disease, sex, age)
|
|
314
415
|
|
|
315
|
-
|
|
316
|
-
|
|
416
|
+
level (Literal['sample', 'series']):
|
|
417
|
+
Level of annotations to query.
|
|
317
418
|
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
annotations()
|
|
321
|
-
Primary function to extract formatted annotations from the annotations dictionary.
|
|
322
|
-
Can be propagated to labels if in wide format.
|
|
419
|
+
ecodes (list[str]):
|
|
420
|
+
Acceptable evidence codes for annotations.
|
|
323
421
|
|
|
324
|
-
|
|
325
|
-
|
|
422
|
+
species (str):
|
|
423
|
+
Species for which to query annotations.
|
|
326
424
|
|
|
327
|
-
|
|
328
|
-
|
|
425
|
+
technology (str):
|
|
426
|
+
Technology of the queried samples.
|
|
329
427
|
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
_load_database()
|
|
334
|
-
Loads the annotations dictionary.
|
|
335
|
-
|
|
336
|
-
Example
|
|
337
|
-
-------
|
|
338
|
-
>>> from metahq import Query
|
|
339
|
-
>>> query = Query("geo", "tissue", "expert-curated")
|
|
428
|
+
_annotations (dict):
|
|
429
|
+
Nested dictionary of annotations.
|
|
340
430
|
|
|
431
|
+
Examples:
|
|
432
|
+
>>> from metahq_core.query import Query
|
|
433
|
+
>>> query = Query(
|
|
434
|
+
"tissue",
|
|
435
|
+
level="sample",
|
|
436
|
+
ecodes=["expert-curated"],
|
|
437
|
+
species="homo sapiens",
|
|
438
|
+
technology="rnaseq",
|
|
439
|
+
)
|
|
341
440
|
"""
|
|
342
441
|
|
|
343
442
|
def __init__(
|
|
344
443
|
self,
|
|
345
444
|
database,
|
|
346
445
|
attribute,
|
|
347
|
-
level
|
|
348
|
-
ecode
|
|
349
|
-
species
|
|
350
|
-
technology
|
|
446
|
+
level,
|
|
447
|
+
ecode,
|
|
448
|
+
species,
|
|
449
|
+
technology,
|
|
351
450
|
logger=None,
|
|
352
451
|
loglevel=20,
|
|
353
|
-
logdir=
|
|
452
|
+
logdir=get_default_log_dir(),
|
|
354
453
|
verbose=True,
|
|
355
454
|
):
|
|
356
455
|
self.database: str = database
|
|
@@ -367,52 +466,38 @@ class Query:
|
|
|
367
466
|
self.log: logging.Logger = logger
|
|
368
467
|
self.verbose: bool = verbose
|
|
369
468
|
|
|
370
|
-
def annotations(self, anchor:
|
|
469
|
+
def annotations(self, anchor: Literal["id", "value"] = "id") -> Annotations:
|
|
470
|
+
"""Retrieve annotations from the MetaHQ database.
|
|
471
|
+
|
|
472
|
+
Arguments:
|
|
473
|
+
anchor (Literal['id', 'value']):
|
|
474
|
+
Base of the annotations. Either `id` or `value`. Using `id` will return annotations
|
|
475
|
+
to ontology terms for tissue and disease attributes, M/F for the sex attribute, or
|
|
476
|
+
predetermined age groups for the age attribute. Using `value` will return
|
|
477
|
+
annotations with the free text names for each id.
|
|
478
|
+
|
|
479
|
+
Returns:
|
|
480
|
+
An `Annotations` object with one-hot-encoded annotations to the specified attribute.
|
|
481
|
+
|
|
482
|
+
Examples:
|
|
483
|
+
>>> from metahq_core.query import Query
|
|
484
|
+
>>> query = Query(
|
|
485
|
+
"tissue",
|
|
486
|
+
level="sample",
|
|
487
|
+
ecodes=["expert-curated"],
|
|
488
|
+
species="homo sapiens",
|
|
489
|
+
technology="rnaseq",
|
|
490
|
+
)
|
|
491
|
+
>>> query.annotations(anchor='id')
|
|
371
492
|
"""
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
anchor: str
|
|
380
|
-
Base of the annotations. Either `id` or `value`. Using `id` will return annotations
|
|
381
|
-
to ontology terms for tissue and disease attributes, M/F for the sex attribute, or
|
|
382
|
-
predetermined age groups for the age attribute. Using `value` will return annotations
|
|
383
|
-
with the free text names noted by the annotators.
|
|
384
|
-
|
|
385
|
-
Returns
|
|
386
|
-
-------
|
|
387
|
-
An `Annotations` object with one-hot-encoded annotations to the specified attribute.
|
|
388
|
-
|
|
389
|
-
Example
|
|
390
|
-
-------
|
|
391
|
-
>>> from metahq import Query
|
|
392
|
-
>>> query = Query('geo', 'tissue', 'expert-curated', 'homo-sapiens')
|
|
393
|
-
>>> query.annotations(level='index', anchor='id')
|
|
394
|
-
┌──────────┬───────────┬──────────┬────────────────┬───┬────────────────┐
|
|
395
|
-
│ group ┆ index ┆ platform ┆ UBERON:0002113 ┆ … ┆ UBERON_0000057 │
|
|
396
|
-
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- │
|
|
397
|
-
│ str ┆ str ┆ str ┆ i32 ┆ ┆ i32 │
|
|
398
|
-
╞══════════╪═══════════╪══════════╪════════════════╪═══╪════════════════╡
|
|
399
|
-
│ GSE11151 ┆ GSM281311 ┆ GPL570 ┆ 1 ┆ … ┆ 0 │
|
|
400
|
-
│ GSE11151 ┆ GSM281312 ┆ GPL570 ┆ 1 ┆ … ┆ 0 │
|
|
401
|
-
│ GSE18969 ┆ GSM469548 ┆ NA ┆ 1 ┆ … ┆ 0 │
|
|
402
|
-
│ GSE18969 ┆ GSM469549 ┆ NA ┆ 1 ┆ … ┆ 0 │
|
|
403
|
-
│ GSE18969 ┆ GSM469550 ┆ NA ┆ 1 ┆ … ┆ 0 │
|
|
404
|
-
│ … ┆ … ┆ … ┆ … ┆ … ┆ … │
|
|
405
|
-
│ GSE2109 ┆ GSM152666 ┆ NA ┆ 0 ┆ … ┆ 0 │
|
|
406
|
-
│ GSE2109 ┆ GSM179804 ┆ NA ┆ 0 ┆ … ┆ 0 │
|
|
407
|
-
│ GSE2109 ┆ GSM353890 ┆ NA ┆ 0 ┆ … ┆ 0 │
|
|
408
|
-
│ GSE2109 ┆ GSM102435 ┆ NA ┆ 0 ┆ … ┆ 0 │
|
|
409
|
-
│ GSE2109 ┆ GSM353891 ┆ NA ┆ 0 ┆ … ┆ 0 │
|
|
410
|
-
└──────────┴───────────┴──────────┴────────────────┴───┴────────────────┘
|
|
493
|
+
# get ID column names
|
|
494
|
+
index, groups = self._assign_index_groups()
|
|
495
|
+
id_cols = [index] + list(groups)
|
|
496
|
+
|
|
497
|
+
# construct the annotations
|
|
498
|
+
attr_anno = self.compile_annotations(id_cols)
|
|
499
|
+
attr_anno = LongAnnotations(attr_anno).pivot_wide(self.level, anchor, id_cols)
|
|
411
500
|
|
|
412
|
-
"""
|
|
413
|
-
index, groups = self.assign_index_groups()
|
|
414
|
-
fields = [index] + list(groups)
|
|
415
|
-
attr_anno = self.compile_annotations(fields).pivot_wide(self.level, anchor)
|
|
416
501
|
na_cols = list(set(attr_anno.columns) & set(na_entities()))
|
|
417
502
|
|
|
418
503
|
return Annotations.from_df(
|
|
@@ -423,26 +508,22 @@ class Query:
|
|
|
423
508
|
verbose=self.verbose,
|
|
424
509
|
)
|
|
425
510
|
|
|
426
|
-
def
|
|
427
|
-
|
|
428
|
-
return "series", tuple(["platform"])
|
|
429
|
-
|
|
430
|
-
if self.level == "sample":
|
|
431
|
-
return "sample", tuple(["series", "platform"])
|
|
511
|
+
def compile_annotations(self, id_cols: list[str]) -> pl.DataFrame:
|
|
512
|
+
"""Extract attribute annotations and accession IDs from the database.
|
|
432
513
|
|
|
433
|
-
|
|
514
|
+
Arguments:
|
|
515
|
+
id_cols (list[str]):
|
|
516
|
+
Accession IDs
|
|
434
517
|
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
518
|
+
Returns:
|
|
519
|
+
Polars DataFrame of all annotations in the annotations dictionary for a single
|
|
520
|
+
attribute.
|
|
438
521
|
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
Polars DataFrame of all annotations in the annotations dictionary for a single
|
|
442
|
-
attribute.
|
|
522
|
+
Raises:
|
|
523
|
+
NoResultsFound: If no attribute annotations can be found.
|
|
443
524
|
|
|
444
525
|
"""
|
|
445
|
-
parsed = ParsedEntries(
|
|
526
|
+
parsed = ParsedEntries(id_cols)
|
|
446
527
|
for entry in self._annotations:
|
|
447
528
|
accessions = self.get_accession_ids(entry)
|
|
448
529
|
id_, value = self.get_valid_annotations(entry)
|
|
@@ -454,27 +535,45 @@ class Query:
|
|
|
454
535
|
) # filter platforms just once for speed
|
|
455
536
|
|
|
456
537
|
if parsed.height == 0:
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
SPECIES:
|
|
538
|
+
msg = (
|
|
539
|
+
"""Unable to identify with provided parameters: [ATTRIBUTE: %s,
|
|
540
|
+
SPECIES: %s, ECODES: %s, TECHNOLOGY: %s]""",
|
|
541
|
+
self.attribute,
|
|
542
|
+
self.species,
|
|
543
|
+
self.ecodes,
|
|
544
|
+
self.technology,
|
|
460
545
|
)
|
|
546
|
+
if self.verbose:
|
|
547
|
+
self.log.error(msg)
|
|
548
|
+
raise NoResultsFound(msg)
|
|
461
549
|
|
|
462
|
-
return
|
|
550
|
+
return parsed
|
|
463
551
|
|
|
464
552
|
def get_accession_ids(self, entry: str) -> dict[str, str]:
|
|
465
|
-
"""
|
|
466
|
-
Updates an AccessionIDs object with index, group, and platform
|
|
553
|
+
"""Updates an AccessionIDs object with index, group, and platform
|
|
467
554
|
IDs from an annotations entry.
|
|
468
555
|
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
556
|
+
Arguments:
|
|
557
|
+
entry (str):
|
|
558
|
+
An ID with annotations in the database (i.e., one of the top level keys of
|
|
559
|
+
the database.)
|
|
560
|
+
|
|
561
|
+
Returns:
|
|
562
|
+
accessions (dict[str, str]):
|
|
563
|
+
A populated dictionary of accession IDs and values for the passed entry.
|
|
564
|
+
|
|
565
|
+
Examples:
|
|
566
|
+
>>> from metahq_core.query import Query
|
|
567
|
+
>>> query = Query(
|
|
568
|
+
"tissue",
|
|
569
|
+
level="sample",
|
|
570
|
+
ecodes=["expert-curated"],
|
|
571
|
+
species="homo sapiens",
|
|
572
|
+
technology="rnaseq",
|
|
573
|
+
)
|
|
574
|
+
>>> query.get_accession_ids('GSM281311')
|
|
575
|
+
{'sample': 'GSM281311', 'series': 'GSE11151', 'platform': 'GPL570'}
|
|
473
576
|
|
|
474
|
-
Returns
|
|
475
|
-
-------
|
|
476
|
-
Tuple of index, group, and platform ID for a given entry in the annotations
|
|
477
|
-
dictionary.
|
|
478
577
|
|
|
479
578
|
"""
|
|
480
579
|
if self.level == "sample":
|
|
@@ -489,17 +588,14 @@ class Query:
|
|
|
489
588
|
return accessions
|
|
490
589
|
|
|
491
590
|
def get_valid_annotations(self, entry: str) -> tuple[str, str]:
|
|
492
|
-
"""
|
|
493
|
-
Extract id and value annotations for each source of annotations in an entry.
|
|
591
|
+
"""Extract id and value annotations for each source of annotations in an entry.
|
|
494
592
|
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
A top-level key of the annotations dictionary.
|
|
593
|
+
Arguments:
|
|
594
|
+
entry (str):
|
|
595
|
+
A top-level key of the annotations dictionary.
|
|
499
596
|
|
|
500
|
-
Returns
|
|
501
|
-
|
|
502
|
-
Tuple of the annotation IDs and values.
|
|
597
|
+
Returns:
|
|
598
|
+
Tuple of the annotation IDs and values.
|
|
503
599
|
|
|
504
600
|
"""
|
|
505
601
|
return UnParsedEntry(
|
|
@@ -509,8 +605,17 @@ class Query:
|
|
|
509
605
|
self.species,
|
|
510
606
|
).get_annotations()
|
|
511
607
|
|
|
608
|
+
def _assign_index_groups(self):
|
|
609
|
+
if self.level == "series":
|
|
610
|
+
return "series", tuple(["platform"])
|
|
611
|
+
|
|
612
|
+
if self.level == "sample":
|
|
613
|
+
return "sample", tuple(["series", "platform"])
|
|
614
|
+
|
|
615
|
+
raise ValueError(f"Expected level in [sample, study], got {self.level}.")
|
|
616
|
+
|
|
512
617
|
def _load_annotations(self):
|
|
513
|
-
"""Loads the
|
|
618
|
+
"""Loads the MetaHQ database for the specified level."""
|
|
514
619
|
anno = load_bson(get_annotations(self.level))
|
|
515
620
|
|
|
516
621
|
return anno
|
|
@@ -543,7 +648,7 @@ class Query:
|
|
|
543
648
|
if species in map_:
|
|
544
649
|
return map_[species] # provided shorthand
|
|
545
650
|
if species in map_.values():
|
|
546
|
-
return
|
|
651
|
+
return species
|
|
547
652
|
raise ValueError(
|
|
548
653
|
f"Invalid species query: {species}. Run metahq supported species for available options."
|
|
549
654
|
)
|