metahq-core 0.1.2__py3-none-any.whl → 1.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,24 +1,25 @@
 """
 Class for performing annotation propagation.
 
-Assigns labels to terms by propagating annotations through
-an ontology structure.
-
-Applies the dot product between an annotations matrix and familial adjacency
+Applies the dot product between an annotations matrix and familial membership
 matrices. Below is the computation:
 
 (samples x reference_terms) @ (reference_terms, propagated_terms)
 -> (samples x propagated_terms).
 
-This is done once for ancestors and once for descendants. Then for each sample,
+To propagate upwards, the (reference_terms, propagated_terms) familial membership
+matrix indicates ancestor relationships. To propagate downwards, the
+(reference_terms, propagated_terms) familial membership matrix indicates descendant
+relationships.
+
+If labeling, this is done once for ancestors and once for descendants. Then for each sample,
 if a term is not an ancestor or descendant of that sample, then the sample is
 given a negative label for that term.
 
-
 Author: Parker Hicks
 Date: 2025-04-23
 
-Last updated: 2025-11-21 by Parker Hicks
+Last updated: 2025-11-28 by Parker Hicks
 """
 
 from pathlib import Path
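Note: the new module docstring above describes propagation as a single matrix product. A minimal sketch of that computation with toy data (the arrays, term names, and sample names below are illustrative only, not metahq-core objects):

```python
import numpy as np

# Rows are samples, columns are reference ontology terms; values are 0/1 annotations.
annotations = np.array(
    [
        [1, 0, 0],  # sample_1 annotated to term_A
        [0, 1, 0],  # sample_2 annotated to term_B
    ]
)

# Familial membership matrix for upward propagation: entry [i, j] == 1 means the
# propagated term in column j is an ancestor of the reference term in row i.
ancestors = np.array(
    [
        [0, 0, 1],  # term_C is an ancestor of term_A
        [0, 0, 1],  # term_C is an ancestor of term_B
        [0, 0, 0],
    ]
)

# (samples x reference_terms) @ (reference_terms x propagated_terms)
# -> (samples x propagated_terms); clipping keeps the result binary.
propagated = np.clip(annotations @ ancestors, 0, 1)
print(propagated)  # both samples gain a positive annotation for ancestor term_C
```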
@@ -30,7 +31,7 @@ import polars as pl
 from metahq_core.curations._multiprocess_propagator import MultiprocessPropagator
 from metahq_core.logger import setup_logger
 from metahq_core.util.alltypes import NpIntMatrix, NpStringArray
-from metahq_core.util.supported import onto_relations
+from metahq_core.util.supported import get_default_log_dir, onto_relations
 
 if TYPE_CHECKING:
     import logging
@@ -39,38 +40,23 @@ if TYPE_CHECKING:
 
 
 class Propagator:
-    """
-    Class to propagate annotations to labels given an ontology structure.
+    """Class to propagate annotations given a particular ontology structure.
 
-    Attributes
-    ----------
-    ontology: str
-        The name of an ontology supported by MetaHQ.
+    Attributes:
+        ontology (str):
+            The name of an ontology supported by MetaHQ.
 
-    anno: Annotations
-        A MetaHQ Annotations object with columns of ontology terms
-        rows as samples, and each value is a 1 or 0 indicating if a sample is
-        annotated to a particular term.
+        anno (Annotations):
+            A MetaHQ Annotations object with columns of ontology terms,
+            rows as samples, and each value is a 1 or 0 indicating if a sample is
+            annotated to a particular term.
 
-    to: list[str]
-        A list of ontology term IDs to propagate annotations up or down to.
-
-    family: dict[str, pl.DataFrame | list[str]]
-        A pointer to the ancestry and descendants adjacency matrices and ids
-        denoting their column ids.
-
-    Methods
-    -------
-    propagate_up()
-        Propagates annotations up to all terms in the annotations curation.
-        If an index is annotated to a descendant of a term in `to`, then it
-        is given an annotation of 1 to that term.
-
-    propagate_down()
-        Propagates annotations down to all terms in the annotations curation.
-        If an index is annotated to an ancestor of a term in `to`, then it
-        is given an annotation of 1 to that term.
+        to (list[str]):
+            A list of ontology term IDs to propagate annotations up or down to.
 
+        family (dict[str, pl.DataFrame | list[str]]):
+            A pointer to the ancestry and descendants adjacency matrices and ids
+            denoting their column ids.
     """
 
     def __init__(
@@ -81,7 +67,7 @@ class Propagator:
         relatives,
         logger=None,
         loglevel=20,
-        logdir=Path("."),
+        logdir=get_default_log_dir(),
         verbose=True,
     ):
         self.ontology: str = ontology
@@ -98,35 +84,34 @@ class Propagator:
         self.verbose: bool = verbose
         self._propagator = MultiprocessPropagator(logger=logger, verbose=verbose)
 
-    def propagate_down(
-        self, verbose: bool = False
-    ) -> tuple[NpIntMatrix, list[str], pl.DataFrame]:
-        """Propagates annotations down to the terms in self.to"""
-        if verbose:
+    def propagate_down(self) -> tuple[NpIntMatrix, list[str], pl.DataFrame]:
+        """Propagates annotations down to all terms in the annotations curation.
+        If an index is annotated to an ancestor of a term in `to`, then it
+        is given an annotation of 1 to that term.
+        """
+        if self.verbose:
             return self._propagate_to_family(
                 "descendants", task="Propagating descendants"
             )
         return self._propagate_to_family("descendants")
 
-    def propagate_up(
-        self, verbose: bool = False
-    ) -> tuple[NpIntMatrix, list[str], pl.DataFrame]:
-        """Propagates annotations up to the terms in self.to"""
-        if verbose:
+    def propagate_up(self) -> tuple[NpIntMatrix, list[str], pl.DataFrame]:
+        """Propagates annotations up to all terms in the annotations curation.
+        If an index is annotated to a descendant of a term in `to`, then it
+        is given an annotation of 1 to that term.
+        """
+        if self.verbose:
             return self._propagate_to_family("ancestors", task="Propagating ancestors")
         return self._propagate_to_family("ancestors")
 
     def _load_anscestors(
         self, lf: pl.LazyFrame, _from: list[str], all_terms: pl.Series
     ) -> NpIntMatrix:
-        """
-        Loads the relations matrix with a ancestor-forward orientation.
-
-        Returns
-        -------
-        Matrix of shape [_from, _to] where each value indicates if a particular
-        column is a ancestor of a particular row.
+        """Loads the relations matrix with an ancestor-forward orientation.
 
+        Returns:
+            Matrix of shape [_from, _to] where each value indicates if a particular
+            column is an ancestor of a particular row.
         """
         return (
             lf.select(_from)
@@ -142,14 +127,11 @@ class Propagator:
     def _load_descendants(
         self, lf: pl.LazyFrame, _from: list[str], all_terms: pl.Series
     ) -> NpIntMatrix:
-        """
-        Loads the relations matrix with a descendants-forward orientation.
-
-        Returns
-        -------
-        Matrix of shape [_from, _to] where each value indicates if a particular
-        column is a descendant of a particular row.
+        """Loads the relations matrix with a descendants-forward orientation.
 
+        Returns:
+            Matrix of shape [_from, _to] where each value indicates if a particular
+            column is a descendant of a particular row.
         """
         return (
             lf.select(sorted(self.to))
@@ -161,8 +143,7 @@ class Propagator:
         )
 
     def _load_family(self):
-        """
-        Loads the terms x terms relations matrix for ancestor and descendant relationships.
+        """Loads the terms x terms relations matrix for ancestor and descendant relationships.
         These matrices store column-wise relational annotations where if term_n is an ancestor
         of term_m, then ancestors[n, m] will be 1 and ancestors[m, n] will be 0. This matrix is
         transposed when loading to get row-wise relational annotations and match dimensions with
@@ -218,9 +199,8 @@ class Propagator:
         return propagated, list(self.family["ids"]), self.anno.ids
 
     def _split_anno(self) -> list:
-        """
-        Splits annotation matrix into chunks row-wise to reduce computational overhead
-        for matrix multiplication. Each chunk will have at most 1000 entries.
+        """Splits annotation matrix into chunks row-wise to reduce computational overhead
+        for matrix multiplication. Each chunk will have at most 500 entries.
        """
         nchunks = self.anno.ids.height // 500
         if nchunks == 0:
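Note: the row-wise chunking `_split_anno` describes follows directly from the `// 500` computation shown above. A rough sketch with toy data (the use of `numpy.array_split` here is an assumption for illustration, not necessarily the package's implementation):

```python
import numpy as np

matrix = np.ones((1_234, 10), dtype=np.int64)  # e.g. 1,234 samples x 10 terms

nchunks = matrix.shape[0] // 500  # 1_234 // 500 -> 2
if nchunks == 0:
    nchunks = 1  # small curations stay in a single chunk

# split the annotation matrix row-wise into nchunks pieces
chunks = np.array_split(matrix, nchunks, axis=0)
print([chunk.shape for chunk in chunks])  # [(617, 10), (617, 10)]
```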
@@ -241,30 +221,27 @@ def propagate_controls(
     a label of 2 for any disease term IDs that are labeled as positives for any other
     samples that come from the same study (group) as the control samples.
 
-    Parameters
-    ----------
-    labels: pl.DataFrame
-        Labels DataFrame with an index and group column specifically. Values are -1, 0, and 1
-        indicating if an index is labeled to a term (other columns), not, or unknown.
-
-    to_terms: list[str]
-        Ontology term IDs for which to generate labels. Must be in the columns of labels.
+    Parameters:
+        labels (pl.DataFrame):
+            Labels DataFrame with an index and group column specifically. Values are -1, 0, and 1
+            indicating if an index is labeled to a term (other columns), not, or unknown.
 
-    index_col: str
-        Name of the column in labels storing index IDs.
+        to_terms (list[str]):
+            Ontology term IDs for which to generate labels. Must be in the columns of `labels`.
 
-    group_col: str
-        Name of the column in labels storing group IDs.
+        index_col (str):
+            Name of the column in `labels` storing index IDs.
 
-    ctrl_ids: pl.DataFrame
-        DataFrame of index IDs that are healthy controls and any other ID columns that are
-        also in labels.
+        group_col (str):
+            Name of the column in `labels` storing group IDs.
 
-    Returns
-    -------
-    A DataFrame of -1, 0, 1, and 2 labels of all available indices where 2 indicates if an
-    index is a control for a particular disease.
+        ctrl_ids (pl.DataFrame):
+            DataFrame of index IDs that are healthy controls and any other ID columns that are
+            also in `labels`.
 
+    Returns:
+        A `polars.DataFrame` object of -1, 0, 1, and 2 labels of all available indices where 2
+        indicates if an index is a control for a particular disease.
     """
     mapper = {0: 0, 1: 1, -1: 0}
     select = to_terms + [group_col]
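Note: the control-labeling rule described in `propagate_controls` (remap with `mapper`, then mark controls with 2 for disease terms that are positive elsewhere in the same study group) can be pictured with a small polars frame. This is an illustrative sketch only; the column names, term ID, and sample IDs are hypothetical, and the real function in metahq-core differs in detail and signature:

```python
import polars as pl

labels = pl.DataFrame(
    {
        "sample": ["s1", "s2", "s3"],       # index column (name is hypothetical)
        "group": ["GSE1", "GSE1", "GSE2"],  # study / group column
        "MONDO:0005015": [1, 0, -1],        # s1 is a positive for this disease term
    }
)
ctrl_ids = ["s2"]  # healthy controls

# -1 (unknown) collapses to 0 before control assignment, mirroring the mapper above
remapped = labels.with_columns(pl.col("MONDO:0005015").replace({0: 0, 1: 1, -1: 0}))

# a control receives label 2 for terms that are positive in its study group
group_pos = remapped.group_by("group").agg(
    pl.col("MONDO:0005015").max().alias("group_pos")
)
out = (
    remapped.join(group_pos, on="group")
    .with_columns(
        pl.when(pl.col("sample").is_in(ctrl_ids) & (pl.col("group_pos") == 1))
        .then(2)
        .otherwise(pl.col("MONDO:0005015"))
        .alias("MONDO:0005015")
    )
    .drop("group_pos")
)
print(out)  # s1 -> 1, s2 -> 2 (control for its study's positive term), s3 -> 0
```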
File without changes
@@ -4,7 +4,7 @@ Class for Annotations export io classes.
 Author: Parker Hicks
 Date: 2025-09-08
 
-Last updated: 2025-11-21 by Parker Hicks
+Last updated: 2026-02-03 by Parker Hicks
 """
 
 from __future__ import annotations
@@ -21,6 +21,7 @@ from metahq_core.util.supported import (
     database_ids,
     geo_metadata,
     get_annotations,
+    get_default_log_dir,
     metadata_fields,
     supported,
 )
@@ -36,31 +37,82 @@ ANNOTATION_KEY = {"1": True, "0": False}
 
 
 class AnnotationsExporter(BaseExporter):
-    """Base abstract class for Exporter children."""
+    """Exporter for Annotations curations.
 
-    def __init__(self, logger=None, loglevel=20, logdir=Path("."), verbose=True):
+    Attributes:
+        attribute (Literal["tissue", "disease", "sex", "age"]):
+            Attribute of the annotations to save.
+
+        level (Literal["sample", "series"]):
+            Level of the annotations.
+
+        logger (logging.Logger):
+            Python builtin Logger.
+
+        loglevel (int):
+            Logging level.
+
+        logdir (str | Path):
+            Path to directory storing logs.
+
+        verbose (bool):
+            Controls logging outputs.
+
+    """
+
+    def __init__(
+        self,
+        attribute: str,
+        level: str,
+        logger=None,
+        loglevel=20,
+        logdir=get_default_log_dir(),
+        verbose=True,
+    ):
+        self.attribute = attribute
+        self._database = self._load_annotations(level)
 
         if logger is None:
             logger = setup_logger(__name__, level=loglevel, log_dir=logdir)
         self.log: logging.Logger = logger
         self.verbose: bool = verbose
 
+    def add_sources(self, anno: Annotations) -> Annotations:
+        """Add the sources that contributed to the labels of each sample or dataset.
+
+        Arguments:
+            anno (Annotations):
+                A populated Labels curation object.
+
+        Returns:
+            The Labels object with additional source IDs for each index.
+
+        """
+        sources = {anno.index_col: [], "sources": []}
+        for idx in anno.index:
+            sources[anno.index_col].append(idx)
+
+            # get sources for a particular index for the specified attribute
+            sources["sources"].append(
+                "|".join(list(self._database[idx][self.attribute].keys()))
+            )
+
+        return anno.add_ids(pl.DataFrame(sources))
+
     def get_sra(self, anno: Annotations, fields: list[str]) -> Annotations:
         """
         Retrieve SRA IDs from the annotations if they exist.
 
-        Parameters
-        ----------
-        anno: Annotations
-            An Annotations curation containing samples and terms matching user-specified
-            filters.
+        Arguments:
+            anno (Annotations):
+                An Annotations curation containing samples and terms matching user-specified
+                filters.
 
-        fields: list[str]
-            SRA ID levels (i.e., srr, srx, srs, or srp)
+            fields (list[str]):
+                SRA ID levels (i.e., srr, srx, srs, or srp)
 
-        Returns
-        -------
-        A new Annotations curation with merged SRA IDs.
+        Returns:
+            A new Annotations curation with merged SRA IDs.
 
         """
         _anno = self._load_annotations(level=anno.index_col)  # all MetaHQ annotations
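Note: the new `add_sources` method above builds a pipe-delimited "sources" column from the per-index, per-attribute entries in the exporter's internal annotations database. A toy illustration of that string construction (the index column name, IDs, and source names here are hypothetical):

```python
import polars as pl

# hypothetical stand-in for the exporter's internal annotations database
database = {
    "GSM0000001": {"tissue": {"manual": {}, "text-mined": {}}},
    "GSM0000002": {"tissue": {"manual": {}}},
}
attribute = "tissue"

sources = {"gsm": [], "sources": []}
for idx in database:
    sources["gsm"].append(idx)
    # join the source names that contributed to this index's tissue annotation
    sources["sources"].append("|".join(database[idx][attribute].keys()))

print(pl.DataFrame(sources))
# GSM0000001 -> "manual|text-mined", GSM0000002 -> "manual"
```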
@@ -88,18 +140,22 @@ class AnnotationsExporter(BaseExporter):
         metadata: str | None = None,
         **kwargs,
     ):
-        """
-
-        Save annotations curation to json. Keys are terms and values are
+        """Save annotations curation to json. Keys are terms and values are
         positively annotated indices.
 
-        Parameters
-        ----------
-        outfile: FilePath
-            Path to outfile.json.
+        Arguments:
+            anno (Annotations):
+                A populated Annotations object.
+
+            fmt (Literal["json", "parquet", "csv", "tsv"]):
+                File format to save to.
+
+            file (FilePath):
+                Path to outfile.json.
+
+            metadata (str):
+                Metadata fields to include.
 
-        metadata: str
-            Metadata fields to include.
         """
         _ = checkdir(file, is_file=True)
         opt = {
@@ -117,37 +173,39 @@ class AnnotationsExporter(BaseExporter):
     def to_csv(
         self, anno: Annotations, file: FilePath, metadata: str | None = None, **kwargs
     ):
-        """
-        Save annotations to csv.
+        """Save annotations to csv.
+
+        Arguments:
+            anno (Annotations):
+                A populated Annotations object.
 
-        Parameters
-        ----------
-        outfile: FilePath
-            Path to outfile.csv.
+            file (FilePath):
+                Path to outfile.csv.
 
-        metadata: str
-            Metadata fields to include.
+            metadata (str):
+                Metadata fields to include.
 
         """
         self._save_tabular("csv", anno, file, metadata, **kwargs)
 
     def to_json(self, anno: Annotations, file: FilePath, metadata: str | None = None):
-        """
-        Save annotations curation to json. Keys are terms and values are
+        """Save annotations curation to json. Keys are terms and values are
         positively annotated indices.
 
-        Parameters
-        ----------
-        file: FilePath
-            Path to outfile.json.
+        Arguments:
+            anno (Annotations):
+                A populated Annotations object.
+
+            file (FilePath):
+                Path to outfile.json.
 
-        metadata: str
-            Metadata fields to include.
+            metadata (str):
+                Metadata fields to include.
 
         """
 
         if self._only_index(metadata, anno.index_col):
-            self._save_json_only_index(anno, file)
+            self._save_json_with_metadata(anno, file, anno.index_col)
 
         elif isinstance(metadata, str):
             self._save_json_with_metadata(anno, file, metadata)
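Note: the JSON layout these docstrings reference ("keys are terms and values are positively annotated indices"), together with the `dict[str, dict[str, dict[str, str]]]` annotation later in the diff, implies a term -> index -> metadata nesting. A toy example of that shape (term IDs, sample IDs, and metadata values are hypothetical):

```python
import json

curation = {
    "UBERON:0002107": {  # liver
        "GSM0000001": {"sources": "manual|text-mined"},
    },
    "UBERON:0000955": {  # brain
        "GSM0000002": {"sources": "manual"},
    },
}
print(json.dumps(curation, indent=2))
```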
@@ -169,19 +227,17 @@ class AnnotationsExporter(BaseExporter):
         metadata: str | None = None,
         **kwargs,
     ):
-        """
-        Save annotations to parquet.
+        """Save annotations to parquet.
 
-        Parameters
-        ----------
-        anno: Annotations
-            Annotations curation object to save.
+        Arguments:
+            anno (Annotations):
+                Annotations curation object to save.
 
-        file: FilePath
-            Path to outfile.parquet.
+            file (FilePath):
+                Path to outfile.parquet.
 
-        metadata: str | None
-            Metadata fields to include.
+            metadata (str | None):
+                Metadata fields to include.
 
         """
         self._save_tabular("parquet", anno, file, metadata, **kwargs)
@@ -189,15 +245,17 @@ class AnnotationsExporter(BaseExporter):
     def to_tsv(
         self, anno: Annotations, file: FilePath, metadata: str | None = None, **kwargs
     ):
-        """
-        Save annotations to tsv.
-        Parameters
-        ----------
-        outfile: FilePath
-            Path to outfile.tsv.
+        """Save annotations to tsv.
+
+        Arguments:
+            anno (Annotations):
+                A populated Annotations object.
+
+            file (FilePath):
+                Path to outfile.tsv.
 
-        metadata: str
-            Metadata fields to include.
+            metadata (str):
+                Metadata fields to include.
 
         """
         self._save_tabular("tsv", anno, file, metadata, **kwargs)
@@ -277,8 +335,7 @@ class AnnotationsExporter(BaseExporter):
     def _save_table_with_description(
         self, file: FilePath, anno: Annotations, metadata: list[str], fmt: str, **kwargs
     ):
-        """
-        Fetches corresponding sample/study descriptions and saves the annotations
+        """Fetches corresponding sample/study descriptions and saves the annotations
         curation in tabular format (parquet, csv, tsv).
         """
 
@@ -317,6 +374,10 @@ class AnnotationsExporter(BaseExporter):
             anno, [field for field in _metadata if field in database_ids("sra")]
         )
 
+        # add sources
+        anno = self.add_sources(anno)
+        _metadata.extend(["sources"])
+
         if "description" in _metadata:
             self._save_table_with_description(file, anno, _metadata, fmt=fmt, **kwargs)
 
@@ -356,10 +417,15 @@ class AnnotationsExporter(BaseExporter):
         self, anno: Annotations, file: FilePath, metadata: str
     ):
         """Save annotations as JSON with requested metadata."""
+
+        # add sources
+        anno = self.add_sources(anno)
+
         _anno: dict[str, dict[str, dict[str, str]]] = {
             term: {} for term in anno.entities
         }
         _metadata = self._parse_metafields(anno.index_col, metadata)
+        _metadata.extend(["sources"])
 
         if self._sra_in_metadata(_metadata):
             anno = self.get_sra(