PyPI - metahq-core - Versions diffs - 0.1.2__py3-none-any.whl → 1.0.0rc1__py3-none-any.whl - Mend

metahq-core 0.1.2__py3-none-any.whl → 1.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

metahq_core/__init__.py +1 -1
metahq_core/curations/annotation_converter.py +5 -5
metahq_core/curations/annotations.py +361 -151
metahq_core/curations/index.py +104 -43
metahq_core/curations/labels.py +259 -128
metahq_core/curations/propagator.py +62 -85
metahq_core/export/__init__.py +0 -0
metahq_core/export/annotations.py +125 -59
metahq_core/export/labels.py +128 -70
metahq_core/logger.py +11 -18
metahq_core/query.py +346 -241
metahq_core/{ontology/loader.py → relations_loader.py} +2 -1
metahq_core/search.py +37 -14
metahq_core/util/io.py +109 -46
metahq_core/util/supported.py +16 -5
{metahq_core-0.1.2.dist-info → metahq_core-1.0.0rc1.dist-info}/METADATA +13 -6
metahq_core-1.0.0rc1.dist-info/RECORD +30 -0
{metahq_core-0.1.2.dist-info → metahq_core-1.0.0rc1.dist-info}/WHEEL +1 -1
metahq_core-1.0.0rc1.dist-info/licenses/LICENSE +28 -0
metahq_core/ontology/base.py +0 -376
metahq_core/ontology/graph.py +0 -252
metahq_core-0.1.2.dist-info/RECORD +0 -30
/metahq_core/{ontology → curations}/__init__.py +0 -0

metahq_core/curations/labels.py CHANGED Viewed

@@ -4,7 +4,7 @@ Class for mutating and operating on sets of labels.
 Author: Parker Hicks
 Date: 2025-08-13
-Last updated: 2025-11-21 by Parker Hicks
+Last updated: 2026-02-02 by Parker Hicks
 """
 from __future__ import annotations
@@ -12,14 +12,14 @@ from __future__ import annotations
 from pathlib import Path
 from typing import TYPE_CHECKING, Literal
-import numpy as np
 import polars as pl
 from metahq_core.curations.base import BaseCuration
 from metahq_core.curations.index import Ids
 from metahq_core.export.labels import LabelsExporter
 from metahq_core.logger import setup_logger
-from metahq_core.util.alltypes import FilePath, NpIntMatrix
+from metahq_core.util.alltypes import NpIntMatrix
+from metahq_core.util.supported import get_default_log_dir
 if TYPE_CHECKING:
     import logging
@@ -27,72 +27,24 @@ if TYPE_CHECKING:
 # TODO: Add method to remove redundant terms
 class Labels(BaseCuration):
-    """
-    Class for storing and mutating labels.
+    """Class for storing and mutating labels.
     Currently supports -1, 0, +1 labels.
-    Attributes
-    ---------
-    data: pl.DataFrame
-        Polars DataFrame with columns `index`, `groups` and columns for each
-        attribute entity for each index (e.g. male or female, tissues, diseases, etc).
-    disease: bool
-        Indicates if the annotations are disease based. Used to account for control samples
-        when converting annotations to labels.
-    index_col: IdArray
-        Name of the column of data that contains the index IDs.
-    group_cols: tuple
-        Names of columns of data that contain an ID for each index indicating if it belongs
-        to a particular group (e.g. dataset, sex, platform, etc.).
-    collapsed: bool
-        Indicates if the annotations have already been collapsed.
-    Methods
-    -------
-    drop()
-        Wrapper for polars `drop`.
-    filter()
-        Wrapper for polars `filter`.
-    head()
-        Wrapper for polars `head`.
-    select()
-        Wrapper for polars `select`.
-    slice()
-        Wrapper for polars `slice`.
-    Properties
-    ---------
-    entities: list[str]
-        columns of the annotations frame of ontology terms.
-    groups: list[str]
-        Groups associated with each index of the annotations curation.
-        Note that groups are not unique.
-    ids: pl.DataFrame
-        The frame of all IDs within the annotations curation.
-    index
-        The index IDs of the annotations frame.
+    Attributes:
+        data (pl.DataFrame):
+            Polars DataFrame with columns `index`, `groups` and columns for each
+            attribute entity for each index (e.g. male or female, tissues, diseases, etc).
-    n_entities: int
-        Number of unique entities.
+        index_col (str):
+            Name of the column of data that contains the index IDs.
-    n_index: int
-        Number of indices.
-    unique_groups: list[str]
-        Unique groups in the annotations curation.
+        group_cols (tuple[str, ...]):
+            Names of columns of data that contain an ID for each index indicating if it belongs
+            to a particular group (e.g. dataset, sex, platform, etc.).
+        collapsed (bool):
+            Indicates if the annotations have already been collapsed.
     """
     def __init__(
@@ -104,7 +56,7 @@ class Labels(BaseCuration):
         collapsed: bool = False,
         logger=None,
         loglevel=20,
-        logdir=Path("."),
+        logdir=get_default_log_dir(),
         verbose=True,
     ):
         self.data = data
@@ -120,14 +72,21 @@ class Labels(BaseCuration):
         self.verbose: bool = verbose
     def add_ids(self, new: pl.DataFrame) -> Labels:
-        """
-        Append new group ID columns to the IDs of a Labels object. The new
+        """Append new group ID columns to the IDs of a Labels object. The new
         IDs must have a matching index.
+        Arguments:
+            new (pl.DataFrame):
+                A DataFrame of additional IDs to join with the current index column of `data`.
+                    Must have a matching index column as the original `data`.
+        Returns:
+            A new Labels object including the new ID columns.
         """
         new_ids = new.join(
             self.ids, on=self.index_col, how="inner", maintain_order="right"
         )
-        new_groups = tuple([col for col in new_ids.columns if col != self.index_col])
+        new_groups = tuple(col for col in new_ids.columns if col != self.index_col)
         assert new_ids.height == self.ids.height, "SRA IDs height mismatch."
         assert (
             new_ids[self.index_col].to_list() == self.index
@@ -138,11 +97,37 @@ class Labels(BaseCuration):
         )
     def drop(self, *args, **kwargs):
-        """Wrapper for polars drop."""
+        """Wrapper for polars drop. Drops any of the term columns.
+        ID columns are not dropped through this method.
+        """
         self.data = self.data.drop(*args, **kwargs)
     def filter(self, condition: pl.Expr) -> Labels:
-        """Filter both data and ids simultaneously using a mask."""
+        """Filter both data and ids simultaneously using a mask.
+        Arguments:
+            condition (pl.Expr):
+                Polars expression for filtering columns.
+        Examples:
+            >>> from metahq_core.curations.labels import Labels
+            >>> labels = {
+                    'sample': ['GSM1', 'GSM2', 'GSM3'],
+                    'series': ['GSE1', 'GSE1', 'GSE2'],
+                    'UBERON:0000948': [1, -1, -1],
+                    'UBERON:0002113': [-1, 1, -1],
+                    'UBERON:0000955': [-1, -1, 1],
+                }
+            >>> labels = Labels.from_df(anno, index_col="sample", group_cols=["series"])
+            >>> labels.filter(pl.col("UBERON:0000948") == 1)
+            ┌────────┬────────┬────────────────┬────────────────┬────────────────┐
+            │ sample ┆ series ┆ UBERON:0000948 ┆ UBERON:0002113 ┆ UBERON:0000955 │
+            │ ---    ┆ ---    ┆ ---            ┆ ---            ┆ ---            │
+            │ str    ┆ str    ┆ i32            ┆ i32            ┆ i32            │
+            ╞════════╪════════╪════════════════╪════════════════╪════════════════╡
+            │ GSM1   ┆ GSE1   ┆ 1              ┆ -1             ┆ -1             │
+            └────────┴────────┴────────────────┴────────────────┴────────────────┘
+        """
         mask = self.data.select(condition.arg_true()).to_numpy().reshape(-1)
         filtered_data = (
@@ -166,29 +151,56 @@ class Labels(BaseCuration):
     def save(
         self,
-        outfile: FilePath,
+        outfile: str | Path,
         fmt: Literal["json", "parquet", "csv", "tsv"],
+        attribute: str,
+        level: str,
         metadata: str | None = None,
     ):
-        """
-        Save labels curation to json. Keys are terms and values are
-        positively annotated indices.
+        """Save the labels curation.
-        Parameters
-        ----------
-        outfile: FilePath
-            Path to outfile.json.
+        Arguments:
+            outfile (str | Path):
+                Path to outfile.json.
-        metadata: bool
-            If True, will add index titles to each entry.
+            fmt (Literal["json", "parquet", "csv", "tsv"]):
+                File format to save to.
+            attribute (str):
+                A supported MetaHQ annotated attribute.
+            level (str):
+                An index level supported by MetaHQ.
+            metadata (str | None):
+                Metadata fields to include formatted as a comma
+                delimited string.
+        Examples:
+            If `metadata` is None, will only save the index column
+            with the remaining labels.
+            >>> from metahq_core.curations.labels import Labels
+            >>> labels = {
+                    'sample': ['GSM1', 'GSM2', 'GSM3'],
+                    'series': ['GSE1', 'GSE1', 'GSE2'],
+                    'UBERON:0000948': [1, -1, -1],
+                    'UBERON:0002113': [-1, 1, -1],
+                    'UBERON:0000955': [-1, -1, 1],
+                }
+            >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
+            >>> labels.save(
+                    '/path/to/out.parquet', fmt="parquet", attribute="tissue", level="sample"
+                )
         """
-        LabelsExporter(logger=self.log, verbose=self.verbose).save(
+        LabelsExporter(attribute, level, logger=self.log, verbose=self.verbose).save(
             self, fmt, outfile, metadata
         )
     def select(self, *args, **kwargs) -> Labels:
-        """Select annotation columns while maintaining ids."""
+        """Select label entity columns while maintaining ids."""
         selected_data = self.data.select(*args, **kwargs)
         return self.__class__(
@@ -202,48 +214,24 @@ class Labels(BaseCuration):
         )
     def slice(self, offset: int, length: int | None = None) -> Labels:
-        """Slice both data and ids simultaneously using polars slice."""
-        sliced_data = self.data.slice(offset, length)
-        sliced_ids_data = self._ids.data.slice(offset, length)
-        return self.__class__(
-            data=sliced_data,
-            ids=sliced_ids_data,
-            index_col=self.index_col,
-            group_cols=self.group_cols,
-            collapsed=self.collapsed,
-            logger=self.log,
-            verbose=self.verbose,
-        )
+        """Slice both data and ids simultaneously using `polars` slice.
-    def subset_index(self, subset: list[str] | np.ndarray) -> Labels:
-        """
-        Selects rows of the expression frame whose sample IDs are in a specified
-        subset. Note the returned order may not match.
-        Parameters
-        ----------
-        subset: list[str] | np.ndarray
-            Array-like of index IDs to select from the expression frame.
+        Arguments:
+            offset (int):
+                Index position to begin the slice.
-        Returns
-        -------
-        A new LazyExp object with the subset of index IDs in the frame.
+            length (int | None):
+                Number of indices past `offset` to slice out.
+        Returns:
+            Sliced Labels object as a subset of the original Labels.
         """
-        _, _, mask = np.intersect1d(
-            np.array(subset), np.array(self.index), return_indices=True
-        )
-        diff = abs(len(mask) != len(subset))
-        if (diff != 0) and self.verbose:
-            self.log.warning("%s indices not found in the frame.", diff)
+        sliced_data = self.data.slice(offset, length)
+        sliced_ids_data = self._ids.data.slice(offset, length)
         return self.__class__(
-            data=self.data.with_row_index()
-            .filter(pl.col("index").is_in(mask))
-            .drop("index"),
-            ids=self._ids.filter_by_mask(mask).data,
+            data=sliced_data,
+            ids=sliced_ids_data,
             index_col=self.index_col,
             group_cols=self.group_cols,
             collapsed=self.collapsed,
@@ -253,17 +241,53 @@ class Labels(BaseCuration):
     def to_numpy(self) -> NpIntMatrix:
         """Wrapper for polars `to_numpy`."""
-        return LabelsExporter().to_numpy(self)
+        return self.data.to_numpy()
     @classmethod
     def from_df(
         cls,
         df: pl.DataFrame,
         index_col: str,
-        group_cols: tuple[str, ...] | list[str] = ("group", "platform"),
+        group_cols: tuple[str, ...] | list[str],
         **kwargs,
     ) -> Labels:
-        """Creates a Labels object from a combined DataFrame."""
+        """Creates a Labels object from a combined DataFrame.
+        Arguments:
+            df (pl.DataFrame):
+                Polars DataFrame with index and group ID columns and columns for each
+                    attribute entity for each index (e.g. male or female, tissues, diseases, etc).
+            index_col (str):
+                Name of the column of data that contains the index IDs.
+            group_cols (tuple[str, ...]):
+                Names of columns of data that contain an ID for each index indicating if it belongs
+                    to a particular group (e.g. dataset, sex, platform, etc.).
+        Returns:
+            A Labels object constructed from `df`.
+        Examples:
+            >>> from metahq_core.curations.labels import Labels
+            >>> labels = {
+                    'sample': ['GSM1', 'GSM2', 'GSM3'],
+                    'series': ['GSE1', 'GSE1', 'GSE2'],
+                    'UBERON:0000948': [1, -1, -1],
+                    'UBERON:0002113': [-1, 1, -1],
+                    'UBERON:0000955': [-1, -1, 1],
+                }
+            >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
+            ┌────────┬────────┬────────────────┬────────────────┬────────────────┐
+            │ sample ┆ series ┆ UBERON:0000948 ┆ UBERON:0002113 ┆ UBERON:0000955 │
+            │ ---    ┆ ---    ┆ ---            ┆ ---            ┆ ---            │
+            │ str    ┆ str    ┆ i64            ┆ i64            ┆ i64            │
+            ╞════════╪════════╪════════════════╪════════════════╪════════════════╡
+            │ GSM1   ┆ GSE1   ┆ 1              ┆ -1             ┆ -1             │
+            │ GSM2   ┆ GSE1   ┆ -1             ┆ 1              ┆ -1             │
+            │ GSM3   ┆ GSE2   ┆ -1             ┆ -1             ┆ 1              │
+            └────────┴────────┴────────────────┴────────────────┴────────────────┘
+        """
         id_columns = [index_col] + list(group_cols)
         ids_data = df.select(id_columns)
         annotation_data = df.drop(id_columns)
@@ -278,37 +302,144 @@ class Labels(BaseCuration):
     @property
     def entities(self) -> list[str]:
-        """Returns column names of the Annotations frame."""
+        """Returns column names of the Labels frame.
+        Examples:
+            >>> from metahq_core.curations.labels import Labels
+            >>> labels = {
+                    'sample': ['GSM1', 'GSM2', 'GSM3'],
+                    'series': ['GSE1', 'GSE1', 'GSE2'],
+                    'UBERON:0000948': [1, -1, -1],
+                    'UBERON:0002113': [-1, 1, -1],
+                    'UBERON:0000955': [-1, -1, 1],
+                }
+            >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
+            >>> labels.entities
+            ['UBERON:0000948', 'UBERON:0002113', 'UBERON:0000955']
+        """
         return self.data.columns
     @property
     def groups(self) -> list[str]:
-        """Returns the groups column of the Annotations curation."""
+        """Returns the groups column of the Labels curation.
+        Examples:
+            >>> from metahq_core.curations.labels import Labels
+            >>> labels = {
+                    'sample': ['GSM1', 'GSM2', 'GSM3'],
+                    'series': ['GSE1', 'GSE1', 'GSE2'],
+                    'UBERON:0000948': [1, -1, -1],
+                    'UBERON:0002113': [-1, 1, -1],
+                    'UBERON:0000955': [-1, -1, 1],
+                }
+            >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
+            >>> labels.groups
+            ['GSE1', 'GSE1', 'GSE2']
+        """
         return self.ids["group"].to_list()
     @property
     def ids(self) -> pl.DataFrame:
-        """Return the IDs dataframe."""
+        """Return the IDs dataframe.
+        Examples:
+            >>> from metahq_core.curations.labels import Labels
+            >>> labels = {
+                    'sample': ['GSM1', 'GSM2', 'GSM3'],
+                    'series': ['GSE1', 'GSE1', 'GSE2'],
+                    'UBERON:0000948': [1, -1, -1],
+                    'UBERON:0002113': [-1, 1, -1],
+                    'UBERON:0000955': [-1, -1, 1],
+                }
+            >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
+            >>> labels.ids
+            ┌────────┬────────┐
+            │ sample ┆ series │
+            │ ---    ┆ ---    │
+            │ str    ┆ str    │
+            ╞════════╪════════╡
+            │ GSM1   ┆ GSE1   │
+            │ GSM2   ┆ GSE1   │
+            │ GSM3   ┆ GSE2   │
+            └────────┴────────┘
+        """
         return self._ids.data
     @property
-    def index(self) -> list:
-        """Return the index column as a list."""
+    def index(self) -> list[str]:
+        """Return the index column as a list.
+        Examples:
+            >>> from metahq_core.curations.labels import Labels
+            >>> labels = {
+                    'sample': ['GSM1', 'GSM2', 'GSM3'],
+                    'series': ['GSE1', 'GSE1', 'GSE2'],
+                    'UBERON:0000948': [1, -1, -1],
+                    'UBERON:0002113': [-1, 1, -1],
+                    'UBERON:0000955': [-1, -1, 1],
+                }
+            >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
+            >>> labels.index
+            ['GSM1', 'GSM2', 'GSM3']
+        """
         return self._ids.index.to_list()
     @property
     def n_indices(self) -> int:
-        """Returns number of indices."""
+        """Returns number of indices.
+        Examples:
+            >>> from metahq_core.curations.labels import Labels
+            >>> labels = {
+                    'sample': ['GSM1', 'GSM2', 'GSM3'],
+                    'series': ['GSE1', 'GSE1', 'GSE2'],
+                    'UBERON:0000948': [1, -1, -1],
+                    'UBERON:0002113': [-1, 1, -1],
+                    'UBERON:0000955': [-1, -1, 1],
+                }
+            >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
+            >>> labels.n_indices
+            3
+        """
         return self.data.height
     @property
     def n_entities(self) -> int:
-        """Returns number of entities."""
+        """Returns number of entities.
+        Examples:
+            >>> from metahq_core.curations.labels import Labels
+            >>> labels = {
+                    'sample': ['GSM1', 'GSM2', 'GSM3'],
+                    'series': ['GSE1', 'GSE1', 'GSE2'],
+                    'UBERON:0000948': [1, -1, -1],
+                    'UBERON:0002113': [-1, 1, -1],
+                    'UBERON:0000955': [-1, -1, 1],
+                    'UBERON:0002107': [-1, -1, -1],
+                }
+            >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
+            >>> labels.n_entities
+            4
+        """
         return len(self.entities)
     @property
     def unique_groups(self) -> list[str]:
-        """Returns unique groups."""
+        """Returns unique groups.
+        Examples:
+            >>> from metahq_core.curations.labels import Labels
+            >>> labels = {
+                    'sample': ['GSM1', 'GSM2', 'GSM3'],
+                    'series': ['GSE1', 'GSE1', 'GSE2'],
+                    'UBERON:0000948': [1, -1, -1],
+                    'UBERON:0002113': [-1, 1, -1],
+                    'UBERON:0000955': [-1, -1, 1],
+                }
+            >>> labels = Labels.from_df(anno, index_col='sample', group_cols=['series'])
+            >>> labels.unique_groups
+            ['GSE1', 'GSE2']
+        """
         return list(set(self.groups))
     def __repr__(self):

metahq-core 0.1.2__py3-none-any.whl → 1.0.0rc1__py3-none-any.whl

metahq-core 0.1.2__py3-none-any.whl → 1.0.0rc1__py3-none-any.whl