metahq-core 0.1.2__py3-none-any.whl → 1.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,7 @@ Class for mutating and operating on sets of labels.
4
4
  Author: Parker Hicks
5
5
  Date: 2025-08-13
6
6
 
7
- Last updated: 2025-11-21 by Parker Hicks
7
+ Last updated: 2026-02-02 by Parker Hicks
8
8
  """
9
9
 
10
10
  from __future__ import annotations
@@ -12,14 +12,14 @@ from __future__ import annotations
12
12
  from pathlib import Path
13
13
  from typing import TYPE_CHECKING, Literal
14
14
 
15
- import numpy as np
16
15
  import polars as pl
17
16
 
18
17
  from metahq_core.curations.base import BaseCuration
19
18
  from metahq_core.curations.index import Ids
20
19
  from metahq_core.export.labels import LabelsExporter
21
20
  from metahq_core.logger import setup_logger
22
- from metahq_core.util.alltypes import FilePath, NpIntMatrix
21
+ from metahq_core.util.alltypes import NpIntMatrix
22
+ from metahq_core.util.supported import get_default_log_dir
23
23
 
24
24
  if TYPE_CHECKING:
25
25
  import logging
@@ -27,72 +27,24 @@ if TYPE_CHECKING:
27
27
 
28
28
  # TODO: Add method to remove redundant terms
29
29
  class Labels(BaseCuration):
30
- """
31
- Class for storing and mutating labels.
30
+ """Class for storing and mutating labels.
32
31
 
33
32
  Currently supports -1, 0, +1 labels.
34
33
 
35
- Attributes
36
- ---------
37
- data: pl.DataFrame
38
- Polars DataFrame with columns `index`, `groups` and columns for each
39
- attribute entity for each index (e.g. male or female, tissues, diseases, etc).
40
-
41
- disease: bool
42
- Indicates if the annotations are disease based. Used to account for control samples
43
- when converting annotations to labels.
44
-
45
- index_col: IdArray
46
- Name of the column of data that contains the index IDs.
47
-
48
- group_cols: tuple
49
- Names of columns of data that contain an ID for each index indicating if it belongs
50
- to a particular group (e.g. dataset, sex, platform, etc.).
51
-
52
- collapsed: bool
53
- Indicates if the annotations have already been collapsed.
54
-
55
- Methods
56
- -------
57
- drop()
58
- Wrapper for polars `drop`.
59
-
60
- filter()
61
- Wrapper for polars `filter`.
62
-
63
- head()
64
- Wrapper for polars `head`.
65
-
66
- select()
67
- Wrapper for polars `select`.
68
-
69
- slice()
70
- Wrapper for polars `slice`.
71
-
72
- Properties
73
- ---------
74
- entities: list[str]
75
- columns of the annotations frame of ontology terms.
76
-
77
- groups: list[str]
78
- Groups associated with each index of the annotations curation.
79
- Note that groups are not unique.
80
-
81
- ids: pl.DataFrame
82
- The frame of all IDs within the annotations curation.
83
-
84
- index
85
- The index IDs of the annotations frame.
34
+ Attributes:
35
+ data (pl.DataFrame):
36
+ Polars DataFrame with columns `index`, `groups` and columns for each
37
+ attribute entity for each index (e.g. male or female, tissues, diseases, etc).
86
38
 
87
- n_entities: int
88
- Number of unique entities.
39
+ index_col (str):
40
+ Name of the column of data that contains the index IDs.
89
41
 
90
- n_index: int
91
- Number of indices.
92
-
93
- unique_groups: list[str]
94
- Unique groups in the annotations curation.
42
+ group_cols (tuple[str, ...]):
43
+ Names of columns of data that contain an ID for each index indicating if it belongs
44
+ to a particular group (e.g. dataset, sex, platform, etc.).
95
45
 
46
+ collapsed (bool):
47
+ Indicates if the annotations have already been collapsed.
96
48
  """
97
49
 
98
50
  def __init__(
@@ -104,7 +56,7 @@ class Labels(BaseCuration):
104
56
  collapsed: bool = False,
105
57
  logger=None,
106
58
  loglevel=20,
107
- logdir=Path("."),
59
+ logdir=get_default_log_dir(),
108
60
  verbose=True,
109
61
  ):
110
62
  self.data = data
@@ -120,14 +72,21 @@ class Labels(BaseCuration):
120
72
  self.verbose: bool = verbose
121
73
 
122
74
  def add_ids(self, new: pl.DataFrame) -> Labels:
123
- """
124
- Append new group ID columns to the IDs of a Labels object. The new
75
+ """Append new group ID columns to the IDs of a Labels object. The new
125
76
  IDs must have a matching index.
77
+
78
+ Arguments:
79
+ new (pl.DataFrame):
80
+ A DataFrame of additional IDs to join with the current index column of `data`.
81
+ Must have a matching index column as the original `data`.
82
+
83
+ Returns:
84
+ A new Labels object including the new ID columns.
126
85
  """
127
86
  new_ids = new.join(
128
87
  self.ids, on=self.index_col, how="inner", maintain_order="right"
129
88
  )
130
- new_groups = tuple([col for col in new_ids.columns if col != self.index_col])
89
+ new_groups = tuple(col for col in new_ids.columns if col != self.index_col)
131
90
  assert new_ids.height == self.ids.height, "SRA IDs height mismatch."
132
91
  assert (
133
92
  new_ids[self.index_col].to_list() == self.index
@@ -138,11 +97,37 @@ class Labels(BaseCuration):
138
97
  )
139
98
 
140
99
  def drop(self, *args, **kwargs):
141
- """Wrapper for polars drop."""
100
+ """Wrapper for polars drop. Drops any of the term columns.
101
+ ID columns are not dropped through this method.
102
+ """
142
103
  self.data = self.data.drop(*args, **kwargs)
143
104
 
144
105
  def filter(self, condition: pl.Expr) -> Labels:
145
- """Filter both data and ids simultaneously using a mask."""
106
+ """Filter both data and ids simultaneously using a mask.
107
+
108
+ Arguments:
109
+ condition (pl.Expr):
110
+ Polars expression for filtering columns.
111
+
112
+ Examples:
113
+ >>> from metahq_core.curations.labels import Labels
114
+ >>> labels = {
115
+ 'sample': ['GSM1', 'GSM2', 'GSM3'],
116
+ 'series': ['GSE1', 'GSE1', 'GSE2'],
117
+ 'UBERON:0000948': [1, -1, -1],
118
+ 'UBERON:0002113': [-1, 1, -1],
119
+ 'UBERON:0000955': [-1, -1, 1],
120
+ }
121
+ >>> labels = Labels.from_df(anno, index_col="sample", group_cols=["series"])
122
+ >>> labels.filter(pl.col("UBERON:0000948") == 1)
123
+ ┌────────┬────────┬────────────────┬────────────────┬────────────────┐
124
+ │ sample ┆ series ┆ UBERON:0000948 ┆ UBERON:0002113 ┆ UBERON:0000955 │
125
+ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
126
+ │ str ┆ str ┆ i32 ┆ i32 ┆ i32 │
127
+ ╞════════╪════════╪════════════════╪════════════════╪════════════════╡
128
+ │ GSM1 ┆ GSE1 ┆ 1 ┆ -1 ┆ -1 │
129
+ └────────┴────────┴────────────────┴────────────────┴────────────────┘
130
+ """
146
131
  mask = self.data.select(condition.arg_true()).to_numpy().reshape(-1)
147
132
 
148
133
  filtered_data = (
@@ -166,29 +151,56 @@ class Labels(BaseCuration):
166
151
 
167
152
  def save(
168
153
  self,
169
- outfile: FilePath,
154
+ outfile: str | Path,
170
155
  fmt: Literal["json", "parquet", "csv", "tsv"],
156
+ attribute: str,
157
+ level: str,
171
158
  metadata: str | None = None,
172
159
  ):
173
- """
174
- Save labels curation to json. Keys are terms and values are
175
- positively annotated indices.
160
+ """Save the labels curation.
176
161
 
177
- Parameters
178
- ----------
179
- outfile: FilePath
180
- Path to outfile.json.
162
+ Arguments:
163
+ outfile (str | Path):
164
+ Path to the output file.
181
165
 
182
- metadata: bool
183
- If True, will add index titles to each entry.
166
+ fmt (Literal["json", "parquet", "csv", "tsv"]):
167
+ File format to save to.
168
+
169
+ attribute (str):
170
+ A supported MetaHQ annotated attribute.
171
+
172
+ level (str):
173
+ An index level supported by MetaHQ.
174
+
175
+ metadata (str | None):
176
+ Metadata fields to include formatted as a comma
177
+ delimited string.
178
+
179
+ Examples:
180
+
181
+ If `metadata` is None, will only save the index column
182
+ with the remaining labels.
183
+
184
+ >>> from metahq_core.curations.labels import Labels
185
+ >>> labels = {
186
+ 'sample': ['GSM1', 'GSM2', 'GSM3'],
187
+ 'series': ['GSE1', 'GSE1', 'GSE2'],
188
+ 'UBERON:0000948': [1, -1, -1],
189
+ 'UBERON:0002113': [-1, 1, -1],
190
+ 'UBERON:0000955': [-1, -1, 1],
191
+ }
192
+ >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
193
+ >>> labels.save(
194
+ '/path/to/out.parquet', fmt="parquet", attribute="tissue", level="sample"
195
+ )
184
196
 
185
197
  """
186
- LabelsExporter(logger=self.log, verbose=self.verbose).save(
198
+ LabelsExporter(attribute, level, logger=self.log, verbose=self.verbose).save(
187
199
  self, fmt, outfile, metadata
188
200
  )
189
201
 
190
202
  def select(self, *args, **kwargs) -> Labels:
191
- """Select annotation columns while maintaining ids."""
203
+ """Select label entity columns while maintaining ids."""
192
204
  selected_data = self.data.select(*args, **kwargs)
193
205
 
194
206
  return self.__class__(
@@ -202,48 +214,24 @@ class Labels(BaseCuration):
202
214
  )
203
215
 
204
216
  def slice(self, offset: int, length: int | None = None) -> Labels:
205
- """Slice both data and ids simultaneously using polars slice."""
206
- sliced_data = self.data.slice(offset, length)
207
- sliced_ids_data = self._ids.data.slice(offset, length)
208
-
209
- return self.__class__(
210
- data=sliced_data,
211
- ids=sliced_ids_data,
212
- index_col=self.index_col,
213
- group_cols=self.group_cols,
214
- collapsed=self.collapsed,
215
- logger=self.log,
216
- verbose=self.verbose,
217
- )
217
+ """Slice both data and ids simultaneously using `polars` slice.
218
218
 
219
- def subset_index(self, subset: list[str] | np.ndarray) -> Labels:
220
- """
221
- Selects rows of the expression frame whose sample IDs are in a specified
222
- subset. Note the returned order may not match.
223
-
224
- Parameters
225
- ----------
226
- subset: list[str] | np.ndarray
227
- Array-like of index IDs to select from the expression frame.
219
+ Arguments:
220
+ offset (int):
221
+ Index position to begin the slice.
228
222
 
229
- Returns
230
- -------
231
- A new LazyExp object with the subset of index IDs in the frame.
223
+ length (int | None):
224
+ Number of indices past `offset` to slice out.
232
225
 
226
+ Returns:
227
+ Sliced Labels object as a subset of the original Labels.
233
228
  """
234
- _, _, mask = np.intersect1d(
235
- np.array(subset), np.array(self.index), return_indices=True
236
- )
237
-
238
- diff = abs(len(mask) != len(subset))
239
- if (diff != 0) and self.verbose:
240
- self.log.warning("%s indices not found in the frame.", diff)
229
+ sliced_data = self.data.slice(offset, length)
230
+ sliced_ids_data = self._ids.data.slice(offset, length)
241
231
 
242
232
  return self.__class__(
243
- data=self.data.with_row_index()
244
- .filter(pl.col("index").is_in(mask))
245
- .drop("index"),
246
- ids=self._ids.filter_by_mask(mask).data,
233
+ data=sliced_data,
234
+ ids=sliced_ids_data,
247
235
  index_col=self.index_col,
248
236
  group_cols=self.group_cols,
249
237
  collapsed=self.collapsed,
@@ -253,17 +241,53 @@ class Labels(BaseCuration):
253
241
 
254
242
  def to_numpy(self) -> NpIntMatrix:
255
243
  """Wrapper for polars `to_numpy`."""
256
- return LabelsExporter().to_numpy(self)
244
+ return self.data.to_numpy()
257
245
 
258
246
  @classmethod
259
247
  def from_df(
260
248
  cls,
261
249
  df: pl.DataFrame,
262
250
  index_col: str,
263
- group_cols: tuple[str, ...] | list[str] = ("group", "platform"),
251
+ group_cols: tuple[str, ...] | list[str],
264
252
  **kwargs,
265
253
  ) -> Labels:
266
- """Creates a Labels object from a combined DataFrame."""
254
+ """Creates a Labels object from a combined DataFrame.
255
+
256
+ Arguments:
257
+ df (pl.DataFrame):
258
+ Polars DataFrame with index and group ID columns and columns for each
259
+ attribute entity for each index (e.g. male or female, tissues, diseases, etc).
260
+
261
+ index_col (str):
262
+ Name of the column of data that contains the index IDs.
263
+
264
+ group_cols (tuple[str, ...]):
265
+ Names of columns of data that contain an ID for each index indicating if it belongs
266
+ to a particular group (e.g. dataset, sex, platform, etc.).
267
+
268
+ Returns:
269
+ A Labels object constructed from `df`.
270
+
271
+ Examples:
272
+ >>> from metahq_core.curations.labels import Labels
273
+ >>> labels = {
274
+ 'sample': ['GSM1', 'GSM2', 'GSM3'],
275
+ 'series': ['GSE1', 'GSE1', 'GSE2'],
276
+ 'UBERON:0000948': [1, -1, -1],
277
+ 'UBERON:0002113': [-1, 1, -1],
278
+ 'UBERON:0000955': [-1, -1, 1],
279
+ }
280
+ >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
281
+ ┌────────┬────────┬────────────────┬────────────────┬────────────────┐
282
+ │ sample ┆ series ┆ UBERON:0000948 ┆ UBERON:0002113 ┆ UBERON:0000955 │
283
+ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
284
+ │ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
285
+ ╞════════╪════════╪════════════════╪════════════════╪════════════════╡
286
+ │ GSM1 ┆ GSE1 ┆ 1 ┆ -1 ┆ -1 │
287
+ │ GSM2 ┆ GSE1 ┆ -1 ┆ 1 ┆ -1 │
288
+ │ GSM3 ┆ GSE2 ┆ -1 ┆ -1 ┆ 1 │
289
+ └────────┴────────┴────────────────┴────────────────┴────────────────┘
290
+ """
267
291
  id_columns = [index_col] + list(group_cols)
268
292
  ids_data = df.select(id_columns)
269
293
  annotation_data = df.drop(id_columns)
@@ -278,37 +302,144 @@ class Labels(BaseCuration):
278
302
 
279
303
  @property
280
304
  def entities(self) -> list[str]:
281
- """Returns column names of the Annotations frame."""
305
+ """Returns column names of the Labels frame.
306
+
307
+ Examples:
308
+ >>> from metahq_core.curations.labels import Labels
309
+ >>> labels = {
310
+ 'sample': ['GSM1', 'GSM2', 'GSM3'],
311
+ 'series': ['GSE1', 'GSE1', 'GSE2'],
312
+ 'UBERON:0000948': [1, -1, -1],
313
+ 'UBERON:0002113': [-1, 1, -1],
314
+ 'UBERON:0000955': [-1, -1, 1],
315
+ }
316
+ >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
317
+ >>> labels.entities
318
+ ['UBERON:0000948', 'UBERON:0002113', 'UBERON:0000955']
319
+ """
282
320
  return self.data.columns
283
321
 
284
322
  @property
285
323
  def groups(self) -> list[str]:
286
- """Returns the groups column of the Annotations curation."""
324
+ """Returns the groups column of the Labels curation.
325
+
326
+ Examples:
327
+ >>> from metahq_core.curations.labels import Labels
328
+ >>> labels = {
329
+ 'sample': ['GSM1', 'GSM2', 'GSM3'],
330
+ 'series': ['GSE1', 'GSE1', 'GSE2'],
331
+ 'UBERON:0000948': [1, -1, -1],
332
+ 'UBERON:0002113': [-1, 1, -1],
333
+ 'UBERON:0000955': [-1, -1, 1],
334
+ }
335
+ >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
336
+ >>> labels.groups
337
+ ['GSE1', 'GSE1', 'GSE2']
338
+ """
287
339
  return self.ids["group"].to_list()
288
340
 
289
341
  @property
290
342
  def ids(self) -> pl.DataFrame:
291
- """Return the IDs dataframe."""
343
+ """Return the IDs dataframe.
344
+
345
+ Examples:
346
+ >>> from metahq_core.curations.labels import Labels
347
+ >>> labels = {
348
+ 'sample': ['GSM1', 'GSM2', 'GSM3'],
349
+ 'series': ['GSE1', 'GSE1', 'GSE2'],
350
+ 'UBERON:0000948': [1, -1, -1],
351
+ 'UBERON:0002113': [-1, 1, -1],
352
+ 'UBERON:0000955': [-1, -1, 1],
353
+ }
354
+ >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
355
+ >>> labels.ids
356
+ ┌────────┬────────┐
357
+ │ sample ┆ series │
358
+ │ --- ┆ --- │
359
+ │ str ┆ str │
360
+ ╞════════╪════════╡
361
+ │ GSM1 ┆ GSE1 │
362
+ │ GSM2 ┆ GSE1 │
363
+ │ GSM3 ┆ GSE2 │
364
+ └────────┴────────┘
365
+ """
292
366
  return self._ids.data
293
367
 
294
368
  @property
295
- def index(self) -> list:
296
- """Return the index column as a list."""
369
+ def index(self) -> list[str]:
370
+ """Return the index column as a list.
371
+
372
+ Examples:
373
+ >>> from metahq_core.curations.labels import Labels
374
+ >>> labels = {
375
+ 'sample': ['GSM1', 'GSM2', 'GSM3'],
376
+ 'series': ['GSE1', 'GSE1', 'GSE2'],
377
+ 'UBERON:0000948': [1, -1, -1],
378
+ 'UBERON:0002113': [-1, 1, -1],
379
+ 'UBERON:0000955': [-1, -1, 1],
380
+ }
381
+ >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
382
+ >>> labels.index
383
+ ['GSM1', 'GSM2', 'GSM3']
384
+ """
297
385
  return self._ids.index.to_list()
298
386
 
299
387
  @property
300
388
  def n_indices(self) -> int:
301
- """Returns number of indices."""
389
+ """Returns number of indices.
390
+
391
+ Examples:
392
+ >>> from metahq_core.curations.labels import Labels
393
+ >>> labels = {
394
+ 'sample': ['GSM1', 'GSM2', 'GSM3'],
395
+ 'series': ['GSE1', 'GSE1', 'GSE2'],
396
+ 'UBERON:0000948': [1, -1, -1],
397
+ 'UBERON:0002113': [-1, 1, -1],
398
+ 'UBERON:0000955': [-1, -1, 1],
399
+ }
400
+ >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
401
+ >>> labels.n_indices
402
+ 3
403
+ """
302
404
  return self.data.height
303
405
 
304
406
  @property
305
407
  def n_entities(self) -> int:
306
- """Returns number of entities."""
408
+ """Returns number of entities.
409
+
410
+ Examples:
411
+ >>> from metahq_core.curations.labels import Labels
412
+ >>> labels = {
413
+ 'sample': ['GSM1', 'GSM2', 'GSM3'],
414
+ 'series': ['GSE1', 'GSE1', 'GSE2'],
415
+ 'UBERON:0000948': [1, -1, -1],
416
+ 'UBERON:0002113': [-1, 1, -1],
417
+ 'UBERON:0000955': [-1, -1, 1],
418
+ 'UBERON:0002107': [-1, -1, -1],
419
+ }
420
+ >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
421
+ >>> labels.n_entities
422
+ 4
423
+ """
307
424
  return len(self.entities)
308
425
 
309
426
  @property
310
427
  def unique_groups(self) -> list[str]:
311
- """Returns unique groups."""
428
+ """Returns unique groups.
429
+
430
+ Examples:
431
+ >>> from metahq_core.curations.labels import Labels
432
+ >>> labels = {
433
+ 'sample': ['GSM1', 'GSM2', 'GSM3'],
434
+ 'series': ['GSE1', 'GSE1', 'GSE2'],
435
+ 'UBERON:0000948': [1, -1, -1],
436
+ 'UBERON:0002113': [-1, 1, -1],
437
+ 'UBERON:0000955': [-1, -1, 1],
438
+ }
439
+ >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
440
+ >>> labels.unique_groups
441
+ ['GSE1', 'GSE2']
442
+ """
312
443
  return list(set(self.groups))
313
444
 
314
445
  def __repr__(self):