metahq-core 0.1.2__py3-none-any.whl → 1.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metahq_core/query.py CHANGED
@@ -4,7 +4,7 @@ Class to query the annotations dictionary.
4
4
  Author: Parker Hicks
5
5
  Date: 2025-03
6
6
 
7
- Last updated: 2025-11-21 by Parker Hicks
7
+ Last updated: 2026-02-02 by Parker Hicks
8
8
  """
9
9
 
10
10
  from pathlib import Path
@@ -14,15 +14,17 @@ import polars as pl
14
14
 
15
15
  from metahq_core.curations.annotations import Annotations
16
16
  from metahq_core.logger import setup_logger
17
- from metahq_core.util.helpers import reverse_dict
17
+ from metahq_core.util.exceptions import NoResultsFound
18
18
  from metahq_core.util.io import load_bson
19
19
  from metahq_core.util.supported import (
20
20
  _ecodes,
21
21
  attributes,
22
22
  get_annotations,
23
+ get_default_log_dir,
23
24
  get_technologies,
24
25
  na_entities,
25
26
  species_map,
27
+ supported,
26
28
  technologies,
27
29
  )
28
30
 
@@ -73,100 +75,135 @@ class ParsedEntries:
73
75
 
74
76
 
75
77
  class LongAnnotations:
76
- """
77
- Annotations in long format.
78
- Exists to support modularity and readibility within the Query class.
79
-
80
- Attributes
81
- ----------
82
- annotations: pl.DataFrame
83
- DataFrame with columns storing accession IDs with an `id` and `value` column storing
84
- multiple annotations for a single entry.
85
-
86
- Methods
87
- -------
88
- column_intersection_with()
89
- Finds the intersection between a list of strings and the annotations columns.
78
+ """Annotations in long format.
90
79
 
91
- filter_na()
92
- Removes rows that contain NA values.
80
+ Exists to support modularity and readability within the Query class.
93
81
 
94
- pivot_wide()
95
- Converts the annotations in long format to one-hot-encoded wide format.
82
+ Attributes:
83
+ annotations (pl.DataFrame):
84
+ DataFrame with columns storing accession IDs with an `id` and `value` column storing
85
+ multiple annotations for a single entry.
86
+ """
96
87
 
97
- stage_anchor()
98
- Removes NA values from the `id` or `values` columns.
88
+ def __init__(self, annotations):
89
+ self.annotations: pl.DataFrame = annotations
99
90
 
100
- stage_level()
101
- Filters the annotations that have missing IDs.
91
+ def column_intersection_with(self, columns: list[str]) -> list[str]:
92
+ """Find intersection between `columns` and the columns in the `annotations` attribute.
102
93
 
103
- stage()
104
- Prepares the annotations for conversion to wide format.
94
+ Arguments:
95
+ columns (list[str]):
96
+ Any list of potential columns in the DataFrame.
105
97
 
106
- """
98
+ Returns:
99
+ The intersection of columns.
100
+ """
101
+ return list(set(columns) & set(self.annotations.columns))
107
102
 
108
- def __init__(self, annotations, id_cols):
109
- self.annotations: pl.DataFrame = annotations
110
- self.id_cols: list[str] = id_cols
103
+ def filter_na(self, column: str):
104
+ """Removes entries in a column that are NA-like values (e.g., 'NA' or 'none').
105
+ Updates the annotations attribute in place.
111
106
 
112
- def column_intersection_with(self, cols: list[str]) -> list[str]:
113
- """Find intersection between cols and annotations columns."""
114
- return list(set(cols) & set(self.annotations.columns))
107
+ Arguments:
108
+ column (str):
109
+ The name of a column in the DataFrame.
110
+ """
111
+ self.annotations = self.annotations.filter(~pl.col(column).is_in(na_entities()))
115
112
 
116
- def filter_na(self, col: str):
117
- """Removes entries in a column that are NA."""
118
- self.annotations = self.annotations.filter(~pl.col(col).is_in(na_entities()))
113
+ def stage_anchor(self, anchor: Literal["id", "value"]):
114
+ """Filters NA values from the anchor annotations column.
119
115
 
120
- def stage_anchor(self, anchor: str):
121
- """Filters NA values from the anchor annotations column."""
116
+ Arguments:
117
+ anchor (Literal["id", "value"]):
118
+ The column storing desired format of annotations.
119
+ """
122
120
  self.filter_na(anchor)
123
121
 
124
- def stage_level(self, level: str):
125
- """
126
- Filters NA values from the specified ID level column. If level
122
+ def stage_level(self, level: Literal["sample", "series"]):
123
+ """Filters NA values from the specified ID level column. If level
127
124
  is 'series', rows whose series ID is 'NA' are also removed and the `sample` column is dropped when present.
125
+
126
+ Arguments:
127
+ level (Literal['sample', 'series']):
128
+ Annotation level.
128
129
  """
129
- supported = ["sample", "series"]
130
- if not level in supported:
131
- raise ValueError(f"Expected level in {supported}, got {level}.")
130
+ if not level in supported("levels"):
131
+ raise ValueError(f"Expected level in {supported("levels")}, got {level}.")
132
132
 
133
- if level == "group":
134
- self.annotations = self.annotations.filter(pl.col("sample") == "NA").drop(
135
- "index"
136
- )
133
+ if level == "series":
134
+ self.annotations = self.annotations.filter(pl.col(level) != "NA")
135
+
136
+ if "sample" in self.annotations.columns:
137
+ self.annotations = self.annotations.drop("sample")
137
138
 
138
139
  self.filter_na(level)
139
140
 
140
- def stage(self, level: str, anchor: str):
141
- """Stages the annotations DataFrame to be converted to wide format."""
141
+ def stage(self, level: Literal["sample", "series"], anchor: Literal["id", "value"]):
142
+ """Stages the annotations DataFrame to be converted to wide format. Mutates the
143
+ annotations attribute in place.
144
+
145
+ Arguments:
146
+ level (Literal['sample', 'series']):
147
+ Annotation level.
148
+
149
+ anchor (Literal["id", "value"]):
150
+ The column storing desired format of annotations.
151
+
152
+ """
142
153
  self.stage_level(level)
143
154
  self.stage_anchor(anchor)
144
155
 
145
- def pivot_wide(self, level: str, anchor: str) -> pl.DataFrame:
146
- """
147
- Pivots the to wide annotations with one-hot-encoded binary entries for
156
+ def pivot_wide(
157
+ self,
158
+ level: Literal["sample", "series"],
159
+ anchor: Literal["id", "value"],
160
+ id_cols: list[str],
161
+ ) -> pl.DataFrame:
162
+ """Pivots the annotations to wide format with one-hot-encoded binary entries for
148
163
  each annotation.
149
164
 
150
- Args
151
- ----
152
- level: str
153
- ID level of the annotations. Either `index` or `group`.
154
- anchor: str
155
- Base of the annotations. Either `id` or `value`. Using `id` will return annotations
156
- to ontology terms for tissue and disease attributes, M/F for the sex attribute, or
157
- predetermined age groups for the age attribute. Using `value` will return annotations
158
- with the free text names noted by the annotators.
159
-
160
- Returns
161
- -------
162
- Annotations in one-hot-encoded wide format with the accession IDs for each annotation.
165
+ Arguments:
166
+ level (Literal['sample', 'series']):
167
+ Annotation level.
168
+
169
+ anchor (Literal["id", "value"]):
170
+ The column storing desired format of annotations.
171
+
172
+ id_cols (list[str]):
173
+ Columns to keep as IDs when pivoting.
174
+
175
+ Returns:
176
+ Annotations in one-hot-encoded wide format with the accession IDs for each annotation.
177
+
178
+ Examples:
179
+ >>> from metahq_core.query import LongAnnotations
180
+ >>> anno = pl.DataFrame({
181
+ 'sample': ['GSM1', 'GSM2', 'GSM3'],
182
+ 'series': ['GSE1', 'GSE1', 'GSE2'],
183
+ 'platform': ['GPL1', 'GPL2', 'GPL2'],
184
+ 'id': ['UBERON:0000948|UBERON:0002349', 'UBERON:0002113', 'UBERON:0000955'],
185
+ 'value': ['heart|myocardium', 'kidney', 'brain'],
186
+ })
187
+ >>> anno = LongAnnotations(anno)
188
+ >>> anno.pivot_wide(
189
+ level='sample', anchor='id', id_cols=['sample', 'series']
190
+ )
191
+ ┌────────┬────────┬────────────────┬────────────────┬────────────────┬────────────────┐
192
+ │ series ┆ sample ┆ UBERON:0000948 ┆ UBERON:0002349 ┆ UBERON:0002113 ┆ UBERON:0000955 │
193
+ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
194
+ │ str ┆ str ┆ i32 ┆ i32 ┆ i32 ┆ i32 │
195
+ ╞════════╪════════╪════════════════╪════════════════╪════════════════╪════════════════╡
196
+ │ GSE1 ┆ GSM1 ┆ 1 ┆ 1 ┆ 0 ┆ 0 │
197
+ │ GSE1 ┆ GSM2 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │
198
+ │ GSE2 ┆ GSM3 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │
199
+ └────────┴────────┴────────────────┴────────────────┴────────────────┴────────────────┘
163
200
 
164
201
  """
165
202
  # remove unused entries
166
203
  self.stage(level, anchor)
167
204
 
168
205
  # prepare accession IDs DataFrame
169
- id_cols = self.column_intersection_with(self.id_cols)
206
+ id_cols = self.column_intersection_with(id_cols)
170
207
  ids = self.annotations.select(id_cols)
171
208
 
172
209
  # remove unused columns for pivoting
@@ -195,55 +232,78 @@ class LongAnnotations:
195
232
 
196
233
 
197
234
  class UnParsedEntry:
198
- """
199
- Stores and extracts items from a single annotation entry of the annotations dictionary.
200
- Exists to support modularity and readibility within the Query class.
201
-
202
- Attrubtes
203
- ---------
204
- entry: dict[str, dict[str, dict[str, str]]]
205
- Nested dictionary of annotations in the following structure:
206
- ID: {
207
- attribute: {
208
- source: {
209
- id: "standardized ID",
210
- "value": "common name",
211
- } ...
212
- } ...
213
-
214
- attribute: str
215
- Attribute to extract annotations for.
216
-
217
- ecodes: str
218
- Permitted evidence codes for annotations.
235
+ """Stores and extracts items from a single annotation entry of the annotations dictionary.
219
236
 
220
- Methods
221
- -------
222
- get_annotations():
223
- Retrieves all available annotations that match the specified parameters.
224
-
225
- is_acceptable():
226
- Determines if an entry has annotations available for the attribute.
227
-
228
- get_id_value():
229
- Extracts ID and value entries for an individual source within a single entry.
237
+ Exists to support modularity and readability within the Query class.
230
238
 
239
+ Attributes:
240
+ entry (dict[str, dict[str, dict[str, str] | str]]):
241
+ Annotations for a single entry in the database.
242
+
243
+ attribute (str):
244
+ Attribute to extract annotations for.
245
+
246
+ ecodes (list[str]):
247
+ Permitted evidence codes for annotations.
248
+
249
+ species (str):
250
+ Species for which to extract annotations.
251
+
252
+ Examples:
253
+ >>> from metahq_core.query import UnParsedEntry
254
+ >>> entry = {
255
+ 'GSM281311': {
256
+ 'organism': 'homo sapiens',
257
+ 'tissue': {
258
+ 'ursa': {
259
+ 'id': 'UBERON:0002113', 'value': 'kidney', 'ecode': 'expert-curated'
260
+ }
261
+ }
262
+ }
263
+ }
264
+ >>> UnParsedEntry(
265
+ entry,
266
+ attribute='tissue',
267
+ ecodes=['expert-curated'],
268
+ species='homo sapiens'
269
+ )
231
270
  """
232
271
 
233
- def __init__(self, _entry, _attribute, _ecodes, _species):
234
- self.entry: dict[str, dict[str, dict[str, str]]] = _entry
235
- self.attribute: str = _attribute
236
- self.ecodes: list[str] = _ecodes
237
- self.species: str = _species
272
+ def __init__(self, entry, attribute, ecodes, species):
273
+ self.entry: dict[str, dict[str, dict[str, str]]] = entry
274
+ self.attribute: str = attribute
275
+ self.ecodes: list[str] = ecodes
276
+ self.species: str = species
238
277
 
239
278
  def get_annotations(self) -> tuple[str, str]:
240
279
  """
241
280
  Retrieves the ID and value annotations for a single entry.
242
281
 
243
- Returns
244
- -------
245
- ID and value annotations for a given attribute. If there are multiple annotations
246
- across sources, then they are concatenated with a `|` delimiter.
282
+ Returns:
283
+ ID and value annotations for a given attribute. If there are multiple annotations
284
+ across sources, then they are concatenated with a `|` delimiter. If no ID or
285
+ value annotations exist, `NA` is returned.
286
+
287
+ Examples:
288
+ >>> from metahq_core.query import UnParsedEntry
289
+ >>> entry = {
290
+ 'GSM281311': {
291
+ 'organism': 'homo sapiens',
292
+ 'tissue': {
293
+ 'ursa': {
294
+ 'id': 'UBERON:0002113', 'value': 'kidney', 'ecode': 'expert-curated'
295
+ }
296
+ }
297
+ }
298
+ }
299
+ >>> unparsed = UnParsedEntry(
300
+ entry,
301
+ attribute='tissue',
302
+ ecodes=['expert-curated'],
303
+ species='homo sapiens'
304
+ )
305
+ >>> unparsed.get_annotations()
306
+ ('UBERON:0002113', 'kidney')
247
307
 
248
308
  """
249
309
  if not self.is_acceptable():
@@ -263,26 +323,74 @@ class UnParsedEntry:
263
323
  return "|".join(ids), "|".join(values)
264
324
 
265
325
  def is_acceptable(self) -> bool:
266
- """Checks if an attribute annotation exists."""
326
+ """Checks if the entry is not empty and is an acceptable annotation given the
327
+ passed attribute, ecode, and species.
328
+
329
+ Returns:
330
+ True or False given the specified attributes.
331
+
332
+ Examples:
333
+ >>> from metahq_core.query import UnParsedEntry
334
+ >>> entry = {
335
+ 'GSM281311': {
336
+ 'organism': 'homo sapiens',
337
+ 'tissue': {
338
+ 'ursa': {
339
+ 'id': 'UBERON:0002113', 'value': 'kidney', 'ecode': 'expert-curated'
340
+ }
341
+ }
342
+ }
343
+ }
344
+ >>> unparsed = UnParsedEntry(
345
+ entry,
346
+ attribute='tissue',
347
+ ecodes=['expert-curated'],
348
+ species='homo sapiens'
349
+ )
350
+ >>> unparsed.is_acceptable()
351
+ True
352
+
353
+ If an attribute doesn't exist, it will return False.
354
+
355
+ >>> entry = {
356
+ 'GSM315993': {
357
+ 'organism': 'homo sapiens',
358
+ 'sex': {
359
+ 'Johnson 2023': {
360
+ 'id': 'F', 'ecode': 'expert-curated'
361
+ }
362
+ }
363
+ }
364
+ }
365
+ >>> unparsed = UnParsedEntry(
366
+ entry,
367
+ attribute='tissue',
368
+ ecodes=['expert-curated'],
369
+ species='homo sapiens'
370
+ )
371
+ >>> unparsed.is_acceptable()
372
+ False
373
+ """
267
374
  attr_exists = self.attribute in self.entry
268
- is_correct_species = self.entry["organism"] == self.species
269
375
  is_populated = len(self.entry) > 0
270
376
 
377
+ if "organism" in self.entry:
378
+ is_correct_species = self.entry["organism"] == self.species
379
+ else:
380
+ is_correct_species = False
381
+
271
382
  return attr_exists and is_populated and is_correct_species
272
383
 
273
384
  @staticmethod
274
- def get_id_value(source_anno):
275
- """
276
- Extracts the ID and value for an annotation.
385
+ def get_id_value(source_anno) -> tuple[str, str]:
386
+ """Extracts the ID and value for an annotation.
277
387
 
278
- Args
279
- ----
280
- source_anno: dict[str, str]
281
- Annotations from a single source. Has keys ['id', 'value', 'ecode'].
388
+ Arguments:
389
+ source_anno (dict[str, str]):
390
+ Annotations from a single source. Has keys ['id', 'value', 'ecode'].
282
391
 
283
- Returns
284
- -------
285
- Tuple of the ID and value for the attribute annotation from a single source.
392
+ Returns:
393
+ Tuple of the ID and value for the attribute annotation from a single source.
286
394
 
287
395
  """
288
396
  if "id" in source_anno:
@@ -294,63 +402,54 @@ class UnParsedEntry:
294
402
  value = source_anno["value"]
295
403
  else:
296
404
  value = "NA"
405
+
297
406
  return id_, value
298
407
 
299
408
 
300
409
  class Query:
301
- """
302
- Class to query the annotations dictionary.
303
-
304
- Attributes
305
- ----------
306
- database: str
307
- Database to query annotations.
308
-
309
- _annotations: dict
310
- Nested dictionary of annotations.
410
+ """Class to query the MetaHQ database.
311
411
 
312
- attribute: str
313
- Attribute to collect annotations for (e.g., tissue, disease, sex, age)
412
+ Attributes:
413
+ attribute (str):
414
+ Attribute to collect annotations for (e.g., tissue, disease, sex, age)
314
415
 
315
- ecodes: list[str]
316
- Acceptable evidence codes for annotations.
416
+ level (Literal['sample', 'series']):
417
+ Level of annotations to query.
317
418
 
318
- Methods
319
- -------
320
- annotations()
321
- Primary function to extract formatted annotations from the annotations dictionary.
322
- Can be propagated to labels if in wide format.
419
+ ecodes (list[str]):
420
+ Acceptable evidence codes for annotations.
323
421
 
324
- compile_annotations()
325
- Backend function of `annotations()`. Does the actual extracting.
422
+ species (str):
423
+ Species for which to query annotations.
326
424
 
327
- get_accession_ids()
328
- Retrives and structures accession IDs for a given entry.
425
+ technology (str):
426
+ Technology of the queried samples.
329
427
 
330
- get_valid_annotations()
331
- Retrives all valid annotatiosn for a given entry.
332
-
333
- _load_database()
334
- Loads the annotations dictionary.
335
-
336
- Example
337
- -------
338
- >>> from metahq import Query
339
- >>> query = Query("geo", "tissue", "expert-curated")
428
+ _annotations (dict):
429
+ Nested dictionary of annotations.
340
430
 
431
+ Examples:
432
+ >>> from metahq_core.query import Query
433
+ >>> query = Query(
434
+ "tissue",
435
+ level="sample",
436
+ ecodes=["expert-curated"],
437
+ species="homo sapiens",
438
+ technology="rnaseq",
439
+ )
341
440
  """
342
441
 
343
442
  def __init__(
344
443
  self,
345
444
  database,
346
445
  attribute,
347
- level="sample",
348
- ecode="expert-curated",
349
- species="human",
350
- technology="rnaseq",
446
+ level,
447
+ ecode,
448
+ species,
449
+ technology,
351
450
  logger=None,
352
451
  loglevel=20,
353
- logdir=Path("."),
452
+ logdir=get_default_log_dir(),
354
453
  verbose=True,
355
454
  ):
356
455
  self.database: str = database
@@ -367,52 +466,38 @@ class Query:
367
466
  self.log: logging.Logger = logger
368
467
  self.verbose: bool = verbose
369
468
 
370
- def annotations(self, anchor: str = "id"):
469
+ def annotations(self, anchor: Literal["id", "value"] = "id") -> Annotations:
470
+ """Retrieve annotations from the MetaHQ database.
471
+
472
+ Arguments:
473
+ anchor (Literal['id', 'value']):
474
+ Base of the annotations. Either `id` or `value`. Using `id` will return annotations
475
+ to ontology terms for tissue and disease attributes, M/F for the sex attribute, or
476
+ predetermined age groups for the age attribute. Using `value` will return
477
+ annotations with the free text names for each id.
478
+
479
+ Returns:
480
+ An `Annotations` object with one-hot-encoded annotations to the specified attribute.
481
+
482
+ Examples:
483
+ >>> from metahq_core.query import Query
484
+ >>> query = Query(
485
+ "tissue",
486
+ level="sample",
487
+ ecodes=["expert-curated"],
488
+ species="homo sapiens",
489
+ technology="rnaseq",
490
+ )
491
+ >>> query.annotations(anchor='id')
371
492
  """
372
- Retrieve annotations from the databse annotations dictionary.
373
-
374
- Args
375
- ----
376
- level: str
377
- Level of annotations. Can be `index` or `group`.
378
-
379
- anchor: str
380
- Base of the annotations. Either `id` or `value`. Using `id` will return annotations
381
- to ontology terms for tissue and disease attributes, M/F for the sex attribute, or
382
- predetermined age groups for the age attribute. Using `value` will return annotations
383
- with the free text names noted by the annotators.
384
-
385
- Returns
386
- -------
387
- An `Annotations` object with one-hot-encoded annotations to the specified attribute.
388
-
389
- Example
390
- -------
391
- >>> from metahq import Query
392
- >>> query = Query('geo', 'tissue', 'expert-curated', 'homo-sapiens')
393
- >>> query.annotations(level='index', anchor='id')
394
- ┌──────────┬───────────┬──────────┬────────────────┬───┬────────────────┐
395
- │ group ┆ index ┆ platform ┆ UBERON:0002113 ┆ … ┆ UBERON_0000057 │
396
- │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- │
397
- │ str ┆ str ┆ str ┆ i32 ┆ ┆ i32 │
398
- ╞══════════╪═══════════╪══════════╪════════════════╪═══╪════════════════╡
399
- │ GSE11151 ┆ GSM281311 ┆ GPL570 ┆ 1 ┆ … ┆ 0 │
400
- │ GSE11151 ┆ GSM281312 ┆ GPL570 ┆ 1 ┆ … ┆ 0 │
401
- │ GSE18969 ┆ GSM469548 ┆ NA ┆ 1 ┆ … ┆ 0 │
402
- │ GSE18969 ┆ GSM469549 ┆ NA ┆ 1 ┆ … ┆ 0 │
403
- │ GSE18969 ┆ GSM469550 ┆ NA ┆ 1 ┆ … ┆ 0 │
404
- │ … ┆ … ┆ … ┆ … ┆ … ┆ … │
405
- │ GSE2109 ┆ GSM152666 ┆ NA ┆ 0 ┆ … ┆ 0 │
406
- │ GSE2109 ┆ GSM179804 ┆ NA ┆ 0 ┆ … ┆ 0 │
407
- │ GSE2109 ┆ GSM353890 ┆ NA ┆ 0 ┆ … ┆ 0 │
408
- │ GSE2109 ┆ GSM102435 ┆ NA ┆ 0 ┆ … ┆ 0 │
409
- │ GSE2109 ┆ GSM353891 ┆ NA ┆ 0 ┆ … ┆ 0 │
410
- └──────────┴───────────┴──────────┴────────────────┴───┴────────────────┘
493
+ # get ID column names
494
+ index, groups = self._assign_index_groups()
495
+ id_cols = [index] + list(groups)
496
+
497
+ # construct the annotations
498
+ attr_anno = self.compile_annotations(id_cols)
499
+ attr_anno = LongAnnotations(attr_anno).pivot_wide(self.level, anchor, id_cols)
411
500
 
412
- """
413
- index, groups = self.assign_index_groups()
414
- fields = [index] + list(groups)
415
- attr_anno = self.compile_annotations(fields).pivot_wide(self.level, anchor)
416
501
  na_cols = list(set(attr_anno.columns) & set(na_entities()))
417
502
 
418
503
  return Annotations.from_df(
@@ -423,26 +508,22 @@ class Query:
423
508
  verbose=self.verbose,
424
509
  )
425
510
 
426
- def assign_index_groups(self):
427
- if self.level == "series":
428
- return "series", tuple(["platform"])
429
-
430
- if self.level == "sample":
431
- return "sample", tuple(["series", "platform"])
511
+ def compile_annotations(self, id_cols: list[str]) -> pl.DataFrame:
512
+ """Extract attribute annotations and accession IDs from the database.
432
513
 
433
- raise ValueError(f"Expected level in [sample, study], got {self.level}.")
514
+ Arguments:
515
+ id_cols (list[str]):
516
+ Accession IDs
434
517
 
435
- def compile_annotations(self, fields: list[str]) -> LongAnnotations:
436
- """
437
- Extract attribute annotations and accession IDs from the annotations dictionary.
518
+ Returns:
519
+ Polars DataFrame of all annotations in the annotations dictionary for a single
520
+ attribute.
438
521
 
439
- Returns
440
- -------
441
- Polars DataFrame of all annotations in the annotations dictionary for a single
442
- attribute.
522
+ Raises:
523
+ NoResultsFound: If no attribute annotations can be found.
443
524
 
444
525
  """
445
- parsed = ParsedEntries(fields)
526
+ parsed = ParsedEntries(id_cols)
446
527
  for entry in self._annotations:
447
528
  accessions = self.get_accession_ids(entry)
448
529
  id_, value = self.get_valid_annotations(entry)
@@ -454,27 +535,45 @@ class Query:
454
535
  ) # filter platforms just once for speed
455
536
 
456
537
  if parsed.height == 0:
457
- raise RuntimeError(
458
- f"""Unable to identify with provided parameters: [ATTRIBUTE: {self.attribute},
459
- SPECIES: {self.species}, ECODES: {self.ecodes}, TECHNOLOGY: {self.technology}]"""
538
+ msg = (
539
+ """Unable to identify with provided parameters: [ATTRIBUTE: %s,
540
+ SPECIES: %s, ECODES: %s, TECHNOLOGY: %s]""",
541
+ self.attribute,
542
+ self.species,
543
+ self.ecodes,
544
+ self.technology,
460
545
  )
546
+ if self.verbose:
547
+ self.log.error(msg)
548
+ raise NoResultsFound(msg)
461
549
 
462
- return LongAnnotations(parsed, fields)
550
+ return parsed
463
551
 
464
552
  def get_accession_ids(self, entry: str) -> dict[str, str]:
465
- """
466
- Updates an AccessionIDs object with index, group, and platform
553
+ """Updates an AccessionIDs object with index, group, and platform
467
554
  IDs from an annotations entry.
468
555
 
469
- Args
470
- ----
471
- entry: str
472
- Top key of the annotations dictionary to extract accession IDs for.
556
+ Arguments:
557
+ entry (str):
558
+ An ID with annotations in the database (i.e., one of the top level keys of
559
+ the database.)
560
+
561
+ Returns:
562
+ accessions (dict[str, str]):
563
+ A populated dictionary of accession IDs and values for the passed entry.
564
+
565
+ Examples:
566
+ >>> from metahq_core.query import Query
567
+ >>> query = Query(
568
+ "tissue",
569
+ level="sample",
570
+ ecodes=["expert-curated"],
571
+ species="homo sapiens",
572
+ technology="rnaseq",
573
+ )
574
+ >>> query.get_accession_ids('GSM281311')
575
+ {'sample': 'GSM281311', 'series': 'GSE11151', 'platform': 'GPL570'}
473
576
 
474
- Returns
475
- -------
476
- Tuple of index, group, and platform ID for a given entry in the annotations
477
- dictionary.
478
577
 
479
578
  """
480
579
  if self.level == "sample":
@@ -489,17 +588,14 @@ class Query:
489
588
  return accessions
490
589
 
491
590
  def get_valid_annotations(self, entry: str) -> tuple[str, str]:
492
- """
493
- Extract id and value annotations for each source of annotations in an entry.
591
+ """Extract id and value annotations for each source of annotations in an entry.
494
592
 
495
- Args
496
- ----
497
- entry: str
498
- A top-level key of the annotations dictionary.
593
+ Arguments:
594
+ entry (str):
595
+ A top-level key of the annotations dictionary.
499
596
 
500
- Returns
501
- -------
502
- Tuple of the annotation IDs and values.
597
+ Returns:
598
+ Tuple of the annotation IDs and values.
503
599
 
504
600
  """
505
601
  return UnParsedEntry(
@@ -509,8 +605,17 @@ class Query:
509
605
  self.species,
510
606
  ).get_annotations()
511
607
 
608
+ def _assign_index_groups(self):
609
+ if self.level == "series":
610
+ return "series", tuple(["platform"])
611
+
612
+ if self.level == "sample":
613
+ return "sample", tuple(["series", "platform"])
614
+
615
+ raise ValueError(f"Expected level in [sample, study], got {self.level}.")
616
+
512
617
  def _load_annotations(self):
513
- """Loads the annotations dictionary for the specified database."""
618
+ """Loads the MetaHQ database for the specified level."""
514
619
  anno = load_bson(get_annotations(self.level))
515
620
 
516
621
  return anno
@@ -543,7 +648,7 @@ class Query:
543
648
  if species in map_:
544
649
  return map_[species] # provided shorthand
545
650
  if species in map_.values():
546
- return reverse_dict(map_)[species]
651
+ return species
547
652
  raise ValueError(
548
653
  f"Invalid species query: {species}. Run metahq supported species for available options."
549
654
  )