PyPI - bioextract - Versions diffs - 0.0.1__py3-none-any.whl - Mend

bioextract 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

bioextract/__init__.py +28 -0
bioextract/_shared.py +140 -0
bioextract/omnipath/__init__.py +7 -0
bioextract/omnipath/constant.py +85 -0
bioextract/omnipath/omnipath.py +335 -0
bioextract/omnipath/spec.py +13 -0
bioextract/omnipath/util.py +203 -0
bioextract/stringdb/__init__.py +7 -0
bioextract/stringdb/constant.py +64 -0
bioextract/stringdb/spec.py +13 -0
bioextract/stringdb/stringdb.py +501 -0
bioextract/stringdb/util.py +215 -0
bioextract-0.0.1.dist-info/METADATA +110 -0
bioextract-0.0.1.dist-info/RECORD +16 -0
bioextract-0.0.1.dist-info/WHEEL +4 -0
bioextract-0.0.1.dist-info/entry_points.txt +4 -0

bioextract/__init__.py ADDED Viewed

@@ -0,0 +1,28 @@
+from importlib import import_module
+from types import ModuleType
+from typing import TYPE_CHECKING, Any
+# Names exposed by the package; both are submodules that the module-level
+# __getattr__ imports on first access.
+__all__ = ["omnipath", "stringdb"]
+if TYPE_CHECKING:
+    # Only static type checkers execute these imports; at runtime the
+    # submodules are not imported here.
+    import bioextract.omnipath as omnipath
+    import bioextract.stringdb as stringdb
+# Maps each lazily exposed attribute name to the dotted module path to import.
+_ALIAS_MODULES: dict[str, str] = {
+    "omnipath": "bioextract.omnipath",
+    "stringdb": "bioextract.stringdb",
+}
+def __getattr__(name: str) -> Any:
+    module_name = _ALIAS_MODULES.get(name)
+    if module_name is None:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    module_loaded: ModuleType = import_module(module_name)
+    globals()[name] = module_loaded
+    return module_loaded
+def __dir__() -> list[str]:
+    return sorted(set(globals()) | set(__all__))

bioextract/_shared.py ADDED Viewed

@@ -0,0 +1,140 @@
+import re
+from collections.abc import Collection, Iterable, Mapping
+from dataclasses import dataclass
+from pathlib import Path
+import polars as pl
+from polars._typing import SchemaDict
+# Captures the text between the first and second "|" of a value that starts
+# with a pipe-delimited prefix (for example "sp|P12345|NAME" -> "P12345").
+RE_UNIPROT_PIPE = re.compile(r"^[^|]+\|([^|]+)\|")
+@dataclass(frozen=True, slots=True)
+class GroupInputFrames:
+    """Immutable pair of frames returned by `create_group_input_frames`."""
+    # One row per normalized group ID, sorted by "GroupId".
+    df_groups: pl.DataFrame
+    # Distinct ("GroupId", "InputId") pairs, sorted by both columns.
+    df_input_ids: pl.DataFrame
+def normalize_input_id(value: str) -> str:
+    value = value.strip()
+    if (match_pipe := RE_UNIPROT_PIPE.match(value)) is not None:
+        return match_pipe.group(1).strip()
+    return value
+def validate_file_size(
+    *,
+    file_path: Path,
+    size_max: int | None,
+    label: str,
+) -> None:
+    if size_max is None:
+        return
+    file_size = file_path.stat().st_size
+    if file_size > size_max:
+        raise ValueError(
+            f"{label} exceeds configured size limit: "
+            f"path={file_path}, size_bytes={file_size}, limit_bytes={size_max}"
+        )
+def validate_count_limit(
+    *,
+    count: int,
+    limit_max: int | None,
+    label: str,
+) -> None:
+    if limit_max is None:
+        return
+    if count > limit_max:
+        raise ValueError(
+            f"{label} exceeds configured limit: count={count}, limit={limit_max}"
+        )
+def validate_group_ids(group_ids: list[str]) -> None:
+    group_ids_seen: set[str] = set()
+    for group_id in group_ids:
+        if not group_id:
+            raise ValueError("GroupId must be a non-empty string after normalization")
+        if group_id in group_ids_seen:
+            raise ValueError(
+                f"GroupId values must be unique after normalization: {group_id!r}"
+            )
+        group_ids_seen.add(group_id)
+def validate_required_cols(
+    cols_available: Collection[str], cols_required: Collection[str], context: str
+) -> None:
+    cols_missing = set(cols_required) - set(cols_available)
+    if cols_missing:
+        raise ValueError(
+            f"{context} is missing required columns: "
+            f"{sorted(cols_missing)}; available={cols_available}"
+        )
+def create_input_id_frame(
+    input_ids: Iterable[str],
+    *,
+    schema_unmapped: SchemaDict,
+) -> pl.DataFrame:
+    ids_normalized: list[str] = []
+    for input_id in input_ids:
+        if input_id_normalized := normalize_input_id(str(input_id)):
+            ids_normalized.append(input_id_normalized)
+    if not ids_normalized:
+        return pl.DataFrame(schema=schema_unmapped)
+    return (
+        pl.DataFrame({"InputId": ids_normalized}, schema=schema_unmapped)
+        .unique(subset=["InputId"])
+        .sort("InputId")
+    )
+def create_group_input_frames(
+    group_to_ids: Mapping[str, Iterable[str]],
+    *,
+    schema_groups: SchemaDict,
+    schema_group_input_ids: SchemaDict,
+) -> GroupInputFrames:
+    group_ids_normalized: list[str] = []
+    group_ids_col: list[str] = []
+    input_ids_col: list[str] = []
+    for group_id_raw, ids in group_to_ids.items():
+        group_id = str(group_id_raw).strip()
+        group_ids_normalized.append(group_id)
+        for input_id in ids:
+            if input_id_normalized := normalize_input_id(str(input_id)):
+                group_ids_col.append(group_id)
+                input_ids_col.append(input_id_normalized)
+    validate_group_ids(group_ids_normalized)
+    if group_ids_normalized:
+        df_groups = pl.DataFrame(
+            {"GroupId": group_ids_normalized},
+            schema=schema_groups,
+        ).sort("GroupId")
+    else:
+        df_groups = pl.DataFrame(schema=schema_groups)
+    if not input_ids_col:
+        return GroupInputFrames(
+            df_groups=df_groups,
+            df_input_ids=pl.DataFrame(schema=schema_group_input_ids),
+        )
+    df_group_input_ids = (
+        pl.DataFrame(
+            {"GroupId": group_ids_col, "InputId": input_ids_col},
+            schema=schema_group_input_ids,
+        )
+        .unique()
+        .sort("GroupId", "InputId")
+    )
+    return GroupInputFrames(df_groups=df_groups, df_input_ids=df_group_input_ids)

bioextract/omnipath/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+from .omnipath import OmniPathDb
+from .spec import OmniPathResourceLimits
+# Names re-exported from the `omnipath` and `spec` submodules imported above.
+__all__ = [
+    "OmniPathDb",
+    "OmniPathResourceLimits",
+]

bioextract/omnipath/constant.py ADDED Viewed

@@ -0,0 +1,85 @@
+import re
+from typing import Literal
+import polars as pl
+# Raw enzsub columns, all typed as strings.
+# NOTE(review): presumably the columns read from the OmniPath enzsub file;
+# the reader is not in this module - confirm against util.py.
+SCHEMA_ENZSUB_RAW = {
+    "enzyme": pl.String,
+    "substrate": pl.String,
+    "residue_type": pl.String,
+    "residue_offset": pl.String,
+    "modification": pl.String,
+}
+# Raw interactions columns. The three `is_*` flags are typed as strings here,
+# whereas SCHEMA_INTERACTIONS below types the renamed columns as Boolean.
+SCHEMA_INTERACTIONS_RAW = {
+    "source": pl.String,
+    "target": pl.String,
+    "is_directed": pl.String,
+    "is_stimulation": pl.String,
+    "is_inhibition": pl.String,
+}
+# Schema of the input-ID frame of a single (ungrouped) selection.
+SCHEMA_UNMAPPED = {
+    "InputId": pl.String,
+}
+# Schema of the frame listing the group IDs of a grouped selection.
+SCHEMA_GROUPS = {
+    "GroupId": pl.String,
+}
+# Schema of the (group, input ID) pair frame of a grouped selection.
+SCHEMA_GROUP_INPUT_IDS = {
+    "GroupId": pl.String,
+    "InputId": pl.String,
+}
+# Enzyme-substrate output columns for a single selection.
+SCHEMA_ENZSUB = {
+    "SourceId": pl.String,
+    "TargetId": pl.String,
+    "TargetSite": pl.String,
+    "Modification": pl.String,
+}
+# Same as SCHEMA_ENZSUB with a leading "GroupId" column.
+SCHEMA_GROUP_ENZSUB = {
+    "GroupId": pl.String,
+    "SourceId": pl.String,
+    "TargetId": pl.String,
+    "TargetSite": pl.String,
+    "Modification": pl.String,
+}
+# Interaction output columns for a single selection.
+SCHEMA_INTERACTIONS = {
+    "SourceId": pl.String,
+    "TargetId": pl.String,
+    "IsDirected": pl.Boolean,
+    "IsStimulation": pl.Boolean,
+    "IsInhibition": pl.Boolean,
+}
+# Same as SCHEMA_INTERACTIONS with a leading "GroupId" column.
+SCHEMA_GROUP_INTERACTIONS = {
+    "GroupId": pl.String,
+    "SourceId": pl.String,
+    "TargetId": pl.String,
+    "IsDirected": pl.Boolean,
+    "IsStimulation": pl.Boolean,
+    "IsInhibition": pl.Boolean,
+}
+# Raw enzsub column name -> renamed column name.
+# NOTE(review): "_ResidueType" and "_ResidueOffset" are not part of
+# SCHEMA_ENZSUB; presumably they are combined into "TargetSite" elsewhere -
+# confirm against util.py.
+COLS_RENAMED_ENZSUB = {
+    "enzyme": "SourceId",
+    "substrate": "TargetId",
+    "residue_type": "_ResidueType",
+    "residue_offset": "_ResidueOffset",
+    "modification": "Modification",
+}
+# Raw interactions column name -> renamed column name.
+COLS_RENAMED_INTERACTIONS = {
+    "source": "SourceId",
+    "target": "TargetId",
+    "is_directed": "IsDirected",
+    "is_stimulation": "IsStimulation",
+    "is_inhibition": "IsInhibition",
+}
+# Captures the text between the first and second "|" of a pipe-delimited
+# value. The same pattern is also defined in bioextract/_shared.py.
+RE_UNIPROT_PIPE = re.compile(r"^[^|]+\|([^|]+)\|")
+# Closed set of resource names accepted by the OmniPath API.
+OmniPathResourceName = Literal["enzsub", "interactions"]

bioextract/omnipath/omnipath.py ADDED Viewed

@@ -0,0 +1,335 @@
+import os
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass, field
+from pathlib import Path
+import polars as pl
+from bioextract._shared import validate_count_limit, validate_file_size
+from .._shared import create_group_input_frames, create_input_id_frame
+from .constant import (
+    SCHEMA_GROUP_INPUT_IDS,
+    SCHEMA_GROUPS,
+    SCHEMA_UNMAPPED,
+    OmniPathResourceName,
+)
+from .spec import OmniPathResourceLimits
+from .util import (
+    extract_enzsub_frame,
+    extract_interactions_frame,
+)
+# NOTE(review): `OmniPathSelection` is returned by the public `OmniPathDb`
+# methods but is not listed here - confirm whether that is intentional.
+__all__ = [
+    "OmniPathDb",
+]
+@dataclass(frozen=True, slots=True)
+class _OmniPathSnapshot:
+    """Immutable record of the local resource file paths behind a dataset."""
+    # Path of the enzsub file, or None when that resource was not provided.
+    file_enzsub: Path | None = None
+    # Path of the interactions file, or None when it was not provided.
+    file_interactions: Path | None = None
+@dataclass(slots=True)
+class OmniPathDb:
+    """Path-first access to local OmniPath relation files.
+    `OmniPathDb` is the public entrypoint for extracting OmniPath
+    enzyme-substrate relations and interaction relations from local files.
+    It keeps dataset-level resource limits and exposes single and grouped
+    selections through one selection type.
+    """
+    snapshot: _OmniPathSnapshot
+    limits: OmniPathResourceLimits = field(default_factory=OmniPathResourceLimits)
+    DEFAULT_RESOURCE_LIMITS = OmniPathResourceLimits()
+    @classmethod
+    def from_files(
+        cls,
+        *,
+        file_enzsub: os.PathLike[str] | str | None = None,
+        file_interactions: os.PathLike[str] | str | None = None,
+        limits: OmniPathResourceLimits | None = None,
+    ) -> "OmniPathDb":
+        """Create a dataset handle from local OmniPath files.
+        Args:
+            file_enzsub: Path to a local OmniPath `enzsub` text or gzip file.
+            file_interactions: Path to a local OmniPath `interactions` text or
+                gzip file.
+            limits: Dataset-level resource limits. When omitted, default
+                fail-fast limits are used. see :class:`OmniPathResourceLimits` and
+                `OmniPathDb.DEFAULT_RESOURCE_LIMITS` for details.
+        Returns:
+            A dataset handle that can produce single or grouped selections.
+        Raises:
+            FileNotFoundError: If any provided file does not exist.
+            ValueError: If no resource files are provided or a configured
+                file-size limit is exceeded.
+        """
+        if file_enzsub is None and file_interactions is None:
+            raise ValueError("At least one OmniPath resource file must be provided")
+        limits_resolved = OmniPathResourceLimits() if limits is None else limits
+        if file_enzsub is not None:
+            file_enzsub = Path(file_enzsub)
+            if not file_enzsub.exists():
+                raise FileNotFoundError(
+                    f"OmniPath enzsub file not found: {file_enzsub}"
+                )
+            validate_file_size(
+                file_path=file_enzsub,
+                size_max=limits_resolved.file_enzsub_bytes_max,
+                label="OmniPath enzsub file",
+            )
+        if file_interactions is not None:
+            file_interactions = Path(file_interactions)
+            if not file_interactions.exists():
+                raise FileNotFoundError(
+                    f"OmniPath interactions file not found: {file_interactions}"
+                )
+            validate_file_size(
+                file_path=file_interactions,
+                size_max=limits_resolved.file_interactions_bytes_max,
+                label="OmniPath interactions file",
+            )
+        return cls(
+            snapshot=_OmniPathSnapshot(
+                file_enzsub=file_enzsub,
+                file_interactions=file_interactions,
+            ),
+            limits=limits_resolved,
+        )
+    @property
+    def available_resources(self) -> frozenset[OmniPathResourceName]:
+        resources: set[OmniPathResourceName] = set()
+        if self.snapshot.file_enzsub is not None:
+            resources.add("enzsub")
+        if self.snapshot.file_interactions is not None:
+            resources.add("interactions")
+        return frozenset(resources)
+    def select_ids(self, ids: Iterable[str]) -> OmniPathSelection:
+        """Create a single-query selection from input IDs."""
+        df_input_ids = create_input_id_frame(ids, schema_unmapped=SCHEMA_UNMAPPED)
+        validate_count_limit(
+            count=df_input_ids.height,
+            limit_max=self.limits.num_input_ids_max,
+            label="Normalized input ID count",
+        )
+        return OmniPathSelection(
+            dataset=self,
+            _df_input_ids=df_input_ids,
+            _df_groups=None,
+            resources_selected=self.available_resources,
+        )
+    def select_groups(
+        self,
+        group_to_ids: Mapping[str, Iterable[str]],
+    ) -> OmniPathSelection:
+        """Create a grouped selection from multiple input-ID sets."""
+        grp_in_frames = create_group_input_frames(
+            group_to_ids,
+            schema_groups=SCHEMA_GROUPS,
+            schema_group_input_ids=SCHEMA_GROUP_INPUT_IDS,
+        )
+        validate_count_limit(
+            count=grp_in_frames.df_groups.height,
+            limit_max=self.limits.num_groups_max,
+            label="Group count",
+        )
+        validate_count_limit(
+            count=grp_in_frames.df_input_ids.height,
+            limit_max=self.limits.num_input_ids_max,
+            label="Normalized input ID count",
+        )
+        return OmniPathSelection(
+            dataset=self,
+            _df_groups=grp_in_frames.df_groups,
+            _df_input_ids=grp_in_frames.df_input_ids,
+            resources_selected=self.available_resources,
+        )
+@dataclass(slots=True)
+class OmniPathSelection:
+    """Selection handle for both single and grouped OmniPath queries."""
+    dataset: OmniPathDb
+    _df_input_ids: pl.DataFrame = field(repr=False)
+    _df_groups: pl.DataFrame | None = field(repr=False)
+    resources_selected: frozenset[OmniPathResourceName]
+    _df_enzsub: pl.DataFrame | None = field(default=None, repr=False)
+    _df_interactions: pl.DataFrame | None = field(default=None, repr=False)
+    _df_unmapped: pl.DataFrame | None = field(default=None, repr=False)
+    @property
+    def is_grouped(self) -> bool:
+        """Report whether this selection carries `GroupId` through outputs."""
+        return self._df_groups is not None
+    @property
+    def _col_group_id(self) -> tuple[str, ...]:
+        """Return the group ID column when this selection is grouped."""
+        return ("GroupId",) if self.is_grouped else ()
+    def with_resources(
+        self,
+        resources: Iterable[OmniPathResourceName],
+    ) -> OmniPathSelection:
+        """Create a new selection constrained to the given OmniPath resources."""
+        resources_selected = frozenset(resources)
+        resources_invalid = resources_selected.difference({"enzsub", "interactions"})
+        if resources_invalid:
+            raise ValueError(
+                f"Unsupported OmniPath resources: {sorted(resources_invalid)}"
+            )
+        if not resources_selected:
+            raise ValueError("At least one OmniPath resource must be selected")
+        return OmniPathSelection(
+            dataset=self.dataset,
+            _df_input_ids=self._df_input_ids,
+            _df_groups=self._df_groups,
+            resources_selected=resources_selected,
+            _df_enzsub=self._df_enzsub if "enzsub" in resources_selected else None,
+            _df_interactions=(
+                self._df_interactions if "interactions" in resources_selected else None
+            ),
+        )
+    def with_enzsub(self) -> OmniPathSelection:
+        """Create a new selection constrained to OmniPath enzsub relations."""
+        return self.with_resources(["enzsub"])
+    def with_interactions(self) -> OmniPathSelection:
+        """Create a new selection constrained to OmniPath interactions."""
+        return self.with_resources(["interactions"])
+    def extract_enzsub(self) -> pl.DataFrame:
+        """Extract matched OmniPath enzyme-substrate relations.
+        Returns:
+            A materialized table with one of these schemas:
+            - single selection: `SourceId`, `TargetId`, `TargetSite`, `Modification`
+            - grouped selection: `GroupId`, `SourceId`, `TargetId`, `TargetSite`,
+              `Modification`
+        Raises:
+            ValueError: If the selection does not enable `enzsub`, if the
+                `enzsub` file is missing, or if the file is missing required
+                columns.
+        """
+        if "enzsub" not in self.resources_selected:
+            raise ValueError(
+                "OmniPath resource 'enzsub' is not enabled for this selection"
+            )
+        if self.dataset.snapshot.file_enzsub is None:
+            raise ValueError("Cannot extract OmniPath enzsub without enzsub file")
+        if self._df_enzsub is None:
+            self._df_enzsub = extract_enzsub_frame(
+                file_enzsub=self.dataset.snapshot.file_enzsub,
+                df_input_ids=self._df_input_ids,
+                cols_group_id=self._col_group_id,
+            )
+        return self._df_enzsub
+    def extract_interactions(self) -> pl.DataFrame:
+        """Extract matched OmniPath interaction relations.
+        Returns:
+            A materialized table with one of these schemas:
+            - single selection: `SourceId`, `TargetId`, `IsDirected`,
+              `IsStimulation`, `IsInhibition`
+            - grouped selection: `GroupId`, `SourceId`, `TargetId`,
+              `IsDirected`, `IsStimulation`, `IsInhibition`
+        Raises:
+            ValueError: If the selection does not enable `interactions`, if the
+                interactions file is missing, or if the file is missing
+                required columns.
+        """
+        if "interactions" not in self.resources_selected:
+            raise ValueError(
+                "OmniPath resource 'interactions' is not enabled for this selection"
+            )
+        if self.dataset.snapshot.file_interactions is None:
+            raise ValueError(
+                "Cannot extract OmniPath interactions without interactions file"
+            )
+        if self._df_interactions is None:
+            self._df_interactions = extract_interactions_frame(
+                file_interactions=self.dataset.snapshot.file_interactions,
+                df_input_ids=self._df_input_ids,
+                cols_group_id=self._col_group_id,
+            )
+        return self._df_interactions
+    def extract_unmapped_input_ids(self) -> pl.DataFrame:
+        """Extract normalized input IDs not found in the selected resources.
+        Returns:
+            A materialized table with one of these schemas:
+            - single selection: `InputId`
+            - grouped selection: `GroupId`, `InputId`
+        """
+        if self._df_unmapped is None:
+            col_group_id = list(self._col_group_id)
+            cols_index = col_group_id + ["InputId"]
+            df_matched_parts: list[pl.DataFrame] = []
+            if "enzsub" in self.resources_selected:
+                df_enzsub = self.extract_enzsub()
+                df_matched_parts.extend(
+                    [
+                        df_enzsub.select(
+                            col_group_id + [pl.col("SourceId").alias("InputId")]
+                        ),
+                        df_enzsub.select(
+                            col_group_id + [pl.col("TargetId").alias("InputId")]
+                        ),
+                    ]
+                )
+            if "interactions" in self.resources_selected:
+                df_interactions = self.extract_interactions()
+                df_matched_parts.extend(
+                    [
+                        df_interactions.select(
+                            col_group_id + [pl.col("SourceId").alias("InputId")]
+                        ),
+                        df_interactions.select(
+                            col_group_id + [pl.col("TargetId").alias("InputId")]
+                        ),
+                    ]
+                )
+            df_matched_input_ids = (
+                pl.concat(df_matched_parts, how="vertical_relaxed")
+                .join(self._df_input_ids, on=cols_index, how="inner")
+                .unique(subset=cols_index)
+                .sort(cols_index)
+            )
+            self._df_unmapped = (
+                self._df_input_ids.join(
+                    df_matched_input_ids,
+                    on=cols_index,
+                    how="anti",
+                )
+                .select(cols_index)
+                .sort(cols_index)
+            )
+        return self._df_unmapped

bioextract/omnipath/spec.py ADDED Viewed

@@ -0,0 +1,13 @@
+from dataclasses import dataclass
+# The limits dataclass is the only public name of this module.
+__all__ = [
+    "OmniPathResourceLimits",
+]
+@dataclass(frozen=True, slots=True)
+class OmniPathResourceLimits:
+    """Immutable resource limits for an OmniPath dataset.
+    Every limit may be set to `None`.
+    """
+    # Largest accepted enzsub file size in bytes (default 512 MiB).
+    file_enzsub_bytes_max: int | None = 512 * 1024 * 1024
+    # Largest accepted interactions file size in bytes (default 4 GiB).
+    file_interactions_bytes_max: int | None = 4 * 1024 * 1024 * 1024
+    # Largest accepted number of normalized input IDs.
+    num_input_ids_max: int | None = 100_000
+    # Largest accepted number of groups in a grouped selection.
+    num_groups_max: int | None = 1_000