metahq-core 0.1.2__py3-none-any.whl → 1.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,24 +1,25 @@
 """
 Class for performing annotation propagation.
 
-Assigns labels to terms by propagating annotations through
-an ontology structure.
-
-Applies the dot product between an annotations matrix and familial adjacency
+Applies the dot product between an annotations matrix and familial membership
 matrices. Below is the computation:
 
 (samples x reference_terms) @ (reference_terms, propagated_terms)
 -> (samples x propagated_terms).
 
-This is done once for ancestors and once for descendants. Then for each sample,
+To propagate upwards, the (reference_terms, propagated_terms) familial membership
+matrix indicates ancestor relationships. To propagate downwards, the
+(reference_terms, propagated_terms) familial membership matrix indicates descendant
+relationships.
+
+If labeling, this is done once for ancestors and once for descendants. Then for each sample,
 if a term is not an ancestor or descendant of that sample, then the sample is
 given a negative label for that term.
 
-
 Author: Parker Hicks
 Date: 2025-04-23
 
-Last updated: 2025-11-21 by Parker Hicks
+Last updated: 2025-11-28 by Parker Hicks
 """
 
 from pathlib import Path
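Note: the new module docstring above describes propagation as a single matrix product. A minimal sketch of that computation with toy data (the arrays, term names, and sample names below are illustrative only, not metahq-core objects):

```python
import numpy as np

# Rows are samples, columns are reference ontology terms; values are 0/1 annotations.
annotations = np.array(
    [
        [1, 0, 0],  # sample_1 annotated to term_A
        [0, 1, 0],  # sample_2 annotated to term_B
    ]
)

# Familial membership matrix for upward propagation: entry [i, j] == 1 means the
# propagated term in column j is an ancestor of the reference term in row i.
ancestors = np.array(
    [
        [0, 0, 1],  # term_C is an ancestor of term_A
        [0, 0, 1],  # term_C is an ancestor of term_B
        [0, 0, 0],
    ]
)

# (samples x reference_terms) @ (reference_terms x propagated_terms)
# -> (samples x propagated_terms); clipping keeps the result binary.
propagated = np.clip(annotations @ ancestors, 0, 1)
print(propagated)  # both samples gain a positive annotation for ancestor term_C
```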
@@ -30,7 +31,7 @@ import polars as pl
 from metahq_core.curations._multiprocess_propagator import MultiprocessPropagator
 from metahq_core.logger import setup_logger
 from metahq_core.util.alltypes import NpIntMatrix, NpStringArray
-from metahq_core.util.supported import onto_relations
+from metahq_core.util.supported import get_default_log_dir, onto_relations
 
 if TYPE_CHECKING:
     import logging
@@ -39,38 +40,23 @@ if TYPE_CHECKING:
 
 
 class Propagator:
-    """
-    Class to propagate annotations to labels given an ontology structure.
+    """Class to propagate annotations given a particular ontology structure.
 
-    Attributes
-    ----------
-    ontology: str
-        The name of an ontology supported by MetaHQ.
+    Attributes:
+        ontology (str):
+            The name of an ontology supported by MetaHQ.
 
-    anno: Annotations
-        A MetaHQ Annotations object with columns of ontology terms
-        rows as samples, and each value is a 1 or 0 indicating if a sample is
-        annotated to a particular term.
+        anno (Annotations):
+            A MetaHQ Annotations object with columns of ontology terms,
+            rows as samples, and each value is a 1 or 0 indicating if a sample is
+            annotated to a particular term.
 
-    to: list[str]
-        A list of ontology term IDs to propagate annotations up or down to.
-
-    family: dict[str, pl.DataFrame | list[str]]
-        A pointer to the ancestry and descendants adjacency matrices and ids
-        denoting their column ids.
-
-    Methods
-    -------
-    propagate_up()
-        Propagates annotations up to all terms in the annotations curation.
-        If an index is annotated to a descendant of a term in `to`, then it
-        is given an annotation of 1 to that term.
-
-    propagate_down()
-        Propagates annotations down to all terms in the annotations curation.
-        If an index is annotated to an ancestor of a term in `to`, then it
-        is given an annotation of 1 to that term.
+        to (list[str]):
+            A list of ontology term IDs to propagate annotations up or down to.
 
+        family (dict[str, pl.DataFrame | list[str]]):
+            A pointer to the ancestry and descendants adjacency matrices and ids
+            denoting their column ids.
     """
 
     def __init__(
@@ -81,7 +67,7 @@ class Propagator:
         relatives,
         logger=None,
         loglevel=20,
-        logdir=Path("."),
+        logdir=get_default_log_dir(),
         verbose=True,
     ):
         self.ontology: str = ontology
@@ -98,35 +84,34 @@ class Propagator:
         self.verbose: bool = verbose
         self._propagator = MultiprocessPropagator(logger=logger, verbose=verbose)
 
-    def propagate_down(
-        self, verbose: bool = False
-    ) -> tuple[NpIntMatrix, list[str], pl.DataFrame]:
-        """Propagates annotations down to the terms in self.to"""
-        if verbose:
+    def propagate_down(self) -> tuple[NpIntMatrix, list[str], pl.DataFrame]:
+        """Propagates annotations down to all terms in the annotations curation.
+        If an index is annotated to an ancestor of a term in `to`, then it
+        is given an annotation of 1 to that term.
+        """
+        if self.verbose:
             return self._propagate_to_family(
                 "descendants", task="Propagating descendants"
             )
         return self._propagate_to_family("descendants")
 
-    def propagate_up(
-        self, verbose: bool = False
-    ) -> tuple[NpIntMatrix, list[str], pl.DataFrame]:
-        """Propagates annotations up to the terms in self.to"""
-        if verbose:
+    def propagate_up(self) -> tuple[NpIntMatrix, list[str], pl.DataFrame]:
+        """Propagates annotations up to all terms in the annotations curation.
+        If an index is annotated to a descendant of a term in `to`, then it
+        is given an annotation of 1 to that term.
+        """
+        if self.verbose:
             return self._propagate_to_family("ancestors", task="Propagating ancestors")
         return self._propagate_to_family("ancestors")
 
     def _load_anscestors(
         self, lf: pl.LazyFrame, _from: list[str], all_terms: pl.Series
     ) -> NpIntMatrix:
-        """
-        Loads the relations matrix with a ancestor-forward orientation.
-
-        Returns
-        -------
-        Matrix of shape [_from, _to] where each value indicates if a particular
-        column is a ancestor of a particular row.
+        """Loads the relations matrix with an ancestor-forward orientation.
 
+        Returns:
+            Matrix of shape [_from, _to] where each value indicates if a particular
+            column is an ancestor of a particular row.
         """
         return (
             lf.select(_from)
@@ -142,14 +127,11 @@ class Propagator:
     def _load_descendants(
         self, lf: pl.LazyFrame, _from: list[str], all_terms: pl.Series
     ) -> NpIntMatrix:
-        """
-        Loads the relations matrix with a descendants-forward orientation.
-
-        Returns
-        -------
-        Matrix of shape [_from, _to] where each value indicates if a particular
-        column is a descendant of a particular row.
+        """Loads the relations matrix with a descendants-forward orientation.
 
+        Returns:
+            Matrix of shape [_from, _to] where each value indicates if a particular
+            column is a descendant of a particular row.
         """
         return (
             lf.select(sorted(self.to))
@@ -161,8 +143,7 @@ class Propagator:
         )
 
     def _load_family(self):
-        """
-        Loads the terms x terms relations matrix for ancestor and descendant relationships.
+        """Loads the terms x terms relations matrix for ancestor and descendant relationships.
         These matrices store column-wise relational annotations where if term_n is an ancestor
         of term_m, then ancestors[n, m] will be 1 and ancestors[m, n] will be 0. This matrix is
         transposed when loading to get row-wise relational annotations and match dimensions with
@@ -218,9 +199,8 @@ class Propagator:
         return propagated, list(self.family["ids"]), self.anno.ids
 
     def _split_anno(self) -> list:
-        """
-        Splits annotation matrix into chunks row-wise to reduce computational overhead
-        for matrix multiplication. Each chunk will have at most 1000 entries.
+        """Splits annotation matrix into chunks row-wise to reduce computational overhead
+        for matrix multiplication. Each chunk will have at most 500 entries.
        """
         nchunks = self.anno.ids.height // 500
         if nchunks == 0:
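Note: the row-wise chunking `_split_anno` describes follows directly from the `// 500` computation shown above. A rough sketch with toy data (the use of `numpy.array_split` here is an assumption for illustration, not necessarily the package's implementation):

```python
import numpy as np

matrix = np.ones((1_234, 10), dtype=np.int64)  # e.g. 1,234 samples x 10 terms

nchunks = matrix.shape[0] // 500  # 1_234 // 500 -> 2
if nchunks == 0:
    nchunks = 1  # small curations stay in a single chunk

# split the annotation matrix row-wise into nchunks pieces
chunks = np.array_split(matrix, nchunks, axis=0)
print([chunk.shape for chunk in chunks])  # [(617, 10), (617, 10)]
```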
@@ -241,30 +221,27 @@ def propagate_controls(
     a label of 2 for any disease term IDs that are labeled as positives for any other
     samples that come from the same study (group) as the control samples.
 
-    Parameters
-    ----------
-    labels: pl.DataFrame
-        Labels DataFrame with an index and group column specifically. Values are -1, 0, and 1
-        indicating if an index is labeled to a term (other columns), not, or unknown.
-
-    to_terms: list[str]
-        Ontology term IDs for which to generate labels. Must be in the columns of labels.
+    Parameters:
+        labels (pl.DataFrame):
+            Labels DataFrame with an index and group column specifically. Values are -1, 0, and 1
+            indicating if an index is labeled to a term (other columns), not, or unknown.
 
-    index_col: str
-        Name of the column in labels storing index IDs.
+        to_terms (list[str]):
+            Ontology term IDs for which to generate labels. Must be in the columns of `labels`.
 
-    group_col: str
-        Name of the column in labels storing group IDs.
+        index_col (str):
+            Name of the column in `labels` storing index IDs.
 
-    ctrl_ids: pl.DataFrame
-        DataFrame of index IDs that are healthy controls and any other ID columns that are
-        also in labels.
+        group_col (str):
+            Name of the column in `labels` storing group IDs.
 
-    Returns
-    -------
-    A DataFrame of -1, 0, 1, and 2 labels of all available indices where 2 indicates if an
-    index is a control for a particular disease.
+        ctrl_ids (pl.DataFrame):
+            DataFrame of index IDs that are healthy controls and any other ID columns that are
+            also in `labels`.
 
+    Returns:
+        A `polars.DataFrame` object of -1, 0, 1, and 2 labels of all available indices where 2
+        indicates if an index is a control for a particular disease.
     """
     mapper = {0: 0, 1: 1, -1: 0}
     select = to_terms + [group_col]
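Note: the control-labeling rule described in `propagate_controls` (remap with `mapper`, then mark controls with 2 for disease terms that are positive elsewhere in the same study group) can be pictured with a small polars frame. This is an illustrative sketch only; the column names, term ID, and sample IDs are hypothetical, and the real function in metahq-core differs in detail and signature:

```python
import polars as pl

labels = pl.DataFrame(
    {
        "sample": ["s1", "s2", "s3"],       # index column (name is hypothetical)
        "group": ["GSE1", "GSE1", "GSE2"],  # study / group column
        "MONDO:0005015": [1, 0, -1],        # s1 is a positive for this disease term
    }
)
ctrl_ids = ["s2"]  # healthy controls

# -1 (unknown) collapses to 0 before control assignment, mirroring the mapper above
remapped = labels.with_columns(pl.col("MONDO:0005015").replace({0: 0, 1: 1, -1: 0}))

# a control receives label 2 for terms that are positive in its study group
group_pos = remapped.group_by("group").agg(
    pl.col("MONDO:0005015").max().alias("group_pos")
)
out = (
    remapped.join(group_pos, on="group")
    .with_columns(
        pl.when(pl.col("sample").is_in(ctrl_ids) & (pl.col("group_pos") == 1))
        .then(2)
        .otherwise(pl.col("MONDO:0005015"))
        .alias("MONDO:0005015")
    )
    .drop("group_pos")
)
print(out)  # s1 -> 1, s2 -> 2 (control for its study's positive term), s3 -> 0
```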
File without changes
@@ -4,7 +4,7 @@ Class for Annotations export io classes.
 Author: Parker Hicks
 Date: 2025-09-08
 
-Last updated: 2025-11-21 by Parker Hicks
+Last updated: 2026-02-03 by Parker Hicks
 """
 
 from __future__ import annotations
@@ -21,6 +21,7 @@ from metahq_core.util.supported import (
     database_ids,
     geo_metadata,
     get_annotations,
+    get_default_log_dir,
     metadata_fields,
     supported,
 )
@@ -36,31 +37,82 @@ ANNOTATION_KEY = {"1": True, "0": False}
 
 
 class AnnotationsExporter(BaseExporter):
-    """Base abstract class for Exporter children."""
+    """Exporter for Annotations curations.
 
-    def __init__(self, logger=None, loglevel=20, logdir=Path("."), verbose=True):
+    Attributes:
+        attribute (Literal["tissue", "disease", "sex", "age"]):
+            Attribute of the annotations to save.
+
+        level (Literal["sample", "series"]):
+            Level of the annotations.
+
+        logger (logging.Logger):
+            Python builtin Logger.
+
+        loglevel (int):
+            Logging level.
+
+        logdir (str | Path):
+            Path to directory storing logs.
+
+        verbose (bool):
+            Controls logging outputs.
+
+    """
+
+    def __init__(
+        self,
+        attribute: str,
+        level: str,
+        logger=None,
+        loglevel=20,
+        logdir=get_default_log_dir(),
+        verbose=True,
+    ):
+        self.attribute = attribute
+        self._database = self._load_annotations(level)
 
         if logger is None:
             logger = setup_logger(__name__, level=loglevel, log_dir=logdir)
         self.log: logging.Logger = logger
         self.verbose: bool = verbose
 
+    def add_sources(self, anno: Annotations) -> Annotations:
+        """Add the sources that contributed to the labels of each sample or dataset.
+
+        Arguments:
+            anno (Annotations):
+                A populated Labels curation object.
+
+        Returns:
+            The Labels object with additional source IDs for each index.
+
+        """
+        sources = {anno.index_col: [], "sources": []}
+        for idx in anno.index:
+            sources[anno.index_col].append(idx)
+
+            # get sources for a particular index for the specified attribute
+            sources["sources"].append(
+                "|".join(list(self._database[idx][self.attribute].keys()))
+            )
+
+        return anno.add_ids(pl.DataFrame(sources))
+
     def get_sra(self, anno: Annotations, fields: list[str]) -> Annotations:
         """
         Retrieve SRA IDs from the annotations if they exist.
 
-        Parameters
-        ----------
-        anno: Annotations
-            An Annotations curation containing samples and terms matching user-specified
-            filters.
+        Arguments:
+            anno (Annotations):
+                An Annotations curation containing samples and terms matching user-specified
+                filters.
 
-        fields: list[str]
-            SRA ID levels (i.e., srr, srx, srs, or srp)
+            fields (list[str]):
+                SRA ID levels (i.e., srr, srx, srs, or srp)
 
-        Returns
-        -------
-        A new Annotations curation with merged SRA IDs.
+        Returns:
+            A new Annotations curation with merged SRA IDs.
 
         """
         _anno = self._load_annotations(level=anno.index_col)  # all MetaHQ annotations
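Note: the new `add_sources` method above builds a pipe-delimited "sources" column from the per-index, per-attribute entries in the exporter's internal annotations database. A toy illustration of that string construction (the index column name, IDs, and source names here are hypothetical):

```python
import polars as pl

# hypothetical stand-in for the exporter's internal annotations database
database = {
    "GSM0000001": {"tissue": {"manual": {}, "text-mined": {}}},
    "GSM0000002": {"tissue": {"manual": {}}},
}
attribute = "tissue"

sources = {"gsm": [], "sources": []}
for idx in database:
    sources["gsm"].append(idx)
    # join the source names that contributed to this index's tissue annotation
    sources["sources"].append("|".join(database[idx][attribute].keys()))

print(pl.DataFrame(sources))
# GSM0000001 -> "manual|text-mined", GSM0000002 -> "manual"
```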
@@ -88,18 +140,22 @@ class AnnotationsExporter(BaseExporter):
         metadata: str | None = None,
         **kwargs,
     ):
-        """
-
-        Save annotations curation to json. Keys are terms and values are
+        """Save annotations curation to json. Keys are terms and values are
         positively annotated indices.
 
-        Parameters
-        ----------
-        outfile: FilePath
-            Path to outfile.json.
+        Arguments:
+            anno (Annotations):
+                A populated Annotations object.
+
+            fmt (Literal["json", "parquet", "csv", "tsv"]):
+                File format to save to.
+
+            file (FilePath):
+                Path to outfile.json.
+
+            metadata (str):
+                Metadata fields to include.
 
-        metadata: str
-            Metadata fields to include.
         """
         _ = checkdir(file, is_file=True)
         opt = {
@@ -117,37 +173,39 @@ class AnnotationsExporter(BaseExporter):
     def to_csv(
         self, anno: Annotations, file: FilePath, metadata: str | None = None, **kwargs
     ):
-        """
-        Save annotations to csv.
+        """Save annotations to csv.
+
+        Arguments:
+            anno (Annotations):
+                A populated Annotations object.
 
-        Parameters
-        ----------
-        outfile: FilePath
-            Path to outfile.csv.
+            file (FilePath):
+                Path to outfile.csv.
 
-        metadata: str
-            Metadata fields to include.
+            metadata (str):
+                Metadata fields to include.
 
         """
         self._save_tabular("csv", anno, file, metadata, **kwargs)
 
     def to_json(self, anno: Annotations, file: FilePath, metadata: str | None = None):
-        """
-        Save annotations curation to json. Keys are terms and values are
+        """Save annotations curation to json. Keys are terms and values are
         positively annotated indices.
 
-        Parameters
-        ----------
-        file: FilePath
-            Path to outfile.json.
+        Arguments:
+            anno (Annotations):
+                A populated Annotations object.
+
+            file (FilePath):
+                Path to outfile.json.
 
-        metadata: str
-            Metadata fields to include.
+            metadata (str):
+                Metadata fields to include.
 
         """
 
         if self._only_index(metadata, anno.index_col):
-            self._save_json_only_index(anno, file)
+            self._save_json_with_metadata(anno, file, anno.index_col)
 
         elif isinstance(metadata, str):
             self._save_json_with_metadata(anno, file, metadata)
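Note: the JSON layout these docstrings reference ("keys are terms and values are positively annotated indices"), together with the `dict[str, dict[str, dict[str, str]]]` annotation later in the diff, implies a term -> index -> metadata nesting. A toy example of that shape (term IDs, sample IDs, and metadata values are hypothetical):

```python
import json

curation = {
    "UBERON:0002107": {  # liver
        "GSM0000001": {"sources": "manual|text-mined"},
    },
    "UBERON:0000955": {  # brain
        "GSM0000002": {"sources": "manual"},
    },
}
print(json.dumps(curation, indent=2))
```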
@@ -169,19 +227,17 @@ class AnnotationsExporter(BaseExporter):
         metadata: str | None = None,
         **kwargs,
     ):
-        """
-        Save annotations to parquet.
+        """Save annotations to parquet.
 
-        Parameters
-        ----------
-        anno: Annotations
-            Annotations curation object to save.
+        Arguments:
+            anno (Annotations):
+                Annotations curation object to save.
 
-        file: FilePath
-            Path to outfile.parquet.
+            file (FilePath):
+                Path to outfile.parquet.
 
-        metadata: str | None
-            Metadata fields to include.
+            metadata (str | None):
+                Metadata fields to include.
 
         """
         self._save_tabular("parquet", anno, file, metadata, **kwargs)
@@ -189,15 +245,17 @@ class AnnotationsExporter(BaseExporter):
     def to_tsv(
         self, anno: Annotations, file: FilePath, metadata: str | None = None, **kwargs
     ):
-        """
-        Save annotations to tsv.
-        Parameters
-        ----------
-        outfile: FilePath
-            Path to outfile.tsv.
+        """Save annotations to tsv.
+
+        Arguments:
+            anno (Annotations):
+                A populated Annotations object.
+
+            file (FilePath):
+                Path to outfile.tsv.
 
-        metadata: str
-            Metadata fields to include.
+            metadata (str):
+                Metadata fields to include.
 
         """
         self._save_tabular("tsv", anno, file, metadata, **kwargs)
@@ -277,8 +335,7 @@ class AnnotationsExporter(BaseExporter):
     def _save_table_with_description(
         self, file: FilePath, anno: Annotations, metadata: list[str], fmt: str, **kwargs
     ):
-        """
-        Fetches corresponding sample/study descriptions and saves the annotations
+        """Fetches corresponding sample/study descriptions and saves the annotations
         curation in tabular format (parquet, csv, tsv).
         """
 
@@ -317,6 +374,10 @@ class AnnotationsExporter(BaseExporter):
             anno, [field for field in _metadata if field in database_ids("sra")]
         )
 
+        # add sources
+        anno = self.add_sources(anno)
+        _metadata.extend(["sources"])
+
         if "description" in _metadata:
             self._save_table_with_description(file, anno, _metadata, fmt=fmt, **kwargs)
 
@@ -356,10 +417,15 @@ class AnnotationsExporter(BaseExporter):
         self, anno: Annotations, file: FilePath, metadata: str
     ):
         """Save annotations as JSON with requested metadata."""
+
+        # add sources
+        anno = self.add_sources(anno)
+
         _anno: dict[str, dict[str, dict[str, str]]] = {
             term: {} for term in anno.entities
         }
         _metadata = self._parse_metafields(anno.index_col, metadata)
+        _metadata.extend(["sources"])
 
         if self._sra_in_metadata(_metadata):
             anno = self.get_sra(