bioextract 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,110 @@
1
+ Metadata-Version: 2.1
2
+ Name: bioextract
3
+ Version: 0.0.1
4
+ Summary: Library-first extraction helpers for bioinformatics resource snapshots.
5
+ Author-Email: FuqingZh <fu.qing.zhang.work@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.14
8
+ Requires-Dist: polars>=1.39.3
9
+ Description-Content-Type: text/markdown
10
+
11
+ # bioextract
12
+
13
+ Library-first extraction helpers for bioinformatics resource snapshots.
14
+
15
+ ## Install
16
+
17
+ - `pip install bioextract`
18
+
19
+ ## STRINGdb
20
+
21
+ ```python
22
+ from bioextract.stringdb import StringDb, StringResourceLimits
23
+
24
+ selection = (
25
+ StringDb.from_files(
26
+ file_aliases="9606.protein.aliases.v12.0.txt.gz",
27
+ file_links="9606.protein.links.v12.0.txt.gz",
28
+ limits=StringResourceLimits(num_input_ids_max=50_000),
29
+ )
30
+ .select_ids(["P04637", "EGFR", "CDK2"])
31
+ .with_score_min(400)
32
+ )
33
+
34
+ df_mapping = selection.extract_string_mapping()
35
+ df_unmapped = selection.extract_unmapped_input_ids()
36
+ df_edges = selection.extract_edges()
37
+
38
+ print(df_mapping)
39
+ print(df_unmapped)
40
+ print(df_edges)
41
+ ```
42
+
43
+ ```python
44
+ from bioextract.stringdb import StringDb
45
+
46
+ df_group_edges = (
47
+ StringDb.from_files(
48
+ file_aliases="9606.protein.aliases.v12.0.txt.gz",
49
+ file_links="9606.protein.links.v12.0.txt.gz",
50
+ )
51
+ .select_groups(
52
+ {
53
+ "TumorA": ["TP53", "EGFR"],
54
+ "TumorB": ["CDK2", "TP53"],
55
+ }
56
+ )
57
+ .with_score_min(400)
58
+ .extract_edges()
59
+ )
60
+ ```
61
+
62
+ ## OmniPath
63
+
64
+ ```python
65
+ from bioextract.omnipath import OmniPathDb
66
+
67
+ selection = (
68
+ OmniPathDb.from_files(
69
+ file_enzsub="enzsub.tsv.gz",
70
+ file_interactions="interactions.tsv.gz",
71
+ )
72
+ .select_ids(["P31749", "AKT1", "BAD"])
73
+ .with_enzsub()
74
+ )
75
+
76
+ df_enzsub = selection.extract_enzsub()
77
+ df_unmapped = selection.extract_unmapped_input_ids()
78
+
79
+ print(df_enzsub)
80
+ print(df_unmapped)
81
+ ```
82
+
83
+ ```python
84
+ from bioextract.omnipath import OmniPathDb
85
+
86
+ df_group_interactions = (
87
+ OmniPathDb.from_files(file_interactions="interactions.tsv.gz")
88
+ .select_groups(
89
+ {
90
+ "TumorA": ["AKT1", "MTOR"],
91
+ "TumorB": ["EGFR", "ERBB2"],
92
+ }
93
+ )
94
+ .with_interactions()
95
+ .extract_interactions()
96
+ )
97
+ ```
98
+
99
+ ## Development
100
+
101
+ - `PYTHONPATH=src pytest`
102
+ - `PYTHONPATH=src python scripts/benchmark_stringdb.py`
103
+
104
+ ## Release
105
+
106
+ - GitHub Actions provides:
107
+ - `.github/workflows/py-ci.yml` for test-and-build checks on push and pull request
108
+ - `.github/workflows/publish.yml` for tag-triggered PyPI publishing
109
+ - Release tags must be canonical PEP 440 versions such as `0.1.1`
110
+ - The publish workflow expects PyPI trusted publishing to be configured for the `pypi` environment
@@ -0,0 +1,100 @@
1
+ # bioextract
2
+
3
+ Library-first extraction helpers for bioinformatics resource snapshots.
4
+
5
+ ## Install
6
+
7
+ - `pip install bioextract`
8
+
9
+ ## STRINGdb
10
+
11
+ ```python
12
+ from bioextract.stringdb import StringDb, StringResourceLimits
13
+
14
+ selection = (
15
+ StringDb.from_files(
16
+ file_aliases="9606.protein.aliases.v12.0.txt.gz",
17
+ file_links="9606.protein.links.v12.0.txt.gz",
18
+ limits=StringResourceLimits(num_input_ids_max=50_000),
19
+ )
20
+ .select_ids(["P04637", "EGFR", "CDK2"])
21
+ .with_score_min(400)
22
+ )
23
+
24
+ df_mapping = selection.extract_string_mapping()
25
+ df_unmapped = selection.extract_unmapped_input_ids()
26
+ df_edges = selection.extract_edges()
27
+
28
+ print(df_mapping)
29
+ print(df_unmapped)
30
+ print(df_edges)
31
+ ```
32
+
33
+ ```python
34
+ from bioextract.stringdb import StringDb
35
+
36
+ df_group_edges = (
37
+ StringDb.from_files(
38
+ file_aliases="9606.protein.aliases.v12.0.txt.gz",
39
+ file_links="9606.protein.links.v12.0.txt.gz",
40
+ )
41
+ .select_groups(
42
+ {
43
+ "TumorA": ["TP53", "EGFR"],
44
+ "TumorB": ["CDK2", "TP53"],
45
+ }
46
+ )
47
+ .with_score_min(400)
48
+ .extract_edges()
49
+ )
50
+ ```
51
+
52
+ ## OmniPath
53
+
54
+ ```python
55
+ from bioextract.omnipath import OmniPathDb
56
+
57
+ selection = (
58
+ OmniPathDb.from_files(
59
+ file_enzsub="enzsub.tsv.gz",
60
+ file_interactions="interactions.tsv.gz",
61
+ )
62
+ .select_ids(["P31749", "AKT1", "BAD"])
63
+ .with_enzsub()
64
+ )
65
+
66
+ df_enzsub = selection.extract_enzsub()
67
+ df_unmapped = selection.extract_unmapped_input_ids()
68
+
69
+ print(df_enzsub)
70
+ print(df_unmapped)
71
+ ```
72
+
73
+ ```python
74
+ from bioextract.omnipath import OmniPathDb
75
+
76
+ df_group_interactions = (
77
+ OmniPathDb.from_files(file_interactions="interactions.tsv.gz")
78
+ .select_groups(
79
+ {
80
+ "TumorA": ["AKT1", "MTOR"],
81
+ "TumorB": ["EGFR", "ERBB2"],
82
+ }
83
+ )
84
+ .with_interactions()
85
+ .extract_interactions()
86
+ )
87
+ ```
88
+
89
+ ## Development
90
+
91
+ - `PYTHONPATH=src pytest`
92
+ - `PYTHONPATH=src python scripts/benchmark_stringdb.py`
93
+
94
+ ## Release
95
+
96
+ - GitHub Actions provides:
97
+ - `.github/workflows/py-ci.yml` for test-and-build checks on push and pull request
98
+ - `.github/workflows/publish.yml` for tag-triggered PyPI publishing
99
+ - Release tags must be canonical PEP 440 versions such as `0.1.1`
100
+ - The publish workflow expects PyPI trusted publishing to be configured for the `pypi` environment
@@ -0,0 +1,29 @@
1
+ [project]
2
+ name = "bioextract"
3
+ version = "0.0.1"
4
+ description = "Library-first extraction helpers for bioinformatics resource snapshots."
5
+ authors = [
6
+ { name = "FuqingZh", email = "fu.qing.zhang.work@gmail.com" },
7
+ ]
8
+ dependencies = [
9
+ "polars>=1.39.3",
10
+ ]
11
+ requires-python = ">=3.14"
12
+ readme = "README.md"
13
+
14
+ [project.license]
15
+ text = "MIT"
16
+
17
+ [build-system]
18
+ requires = [
19
+ "pdm-backend",
20
+ ]
21
+ build-backend = "pdm.backend"
22
+
23
+ [tool.pdm]
24
+ distribution = true
25
+
26
+ [tool.pdm.dev-dependencies]
27
+ dev = [
28
+ "pytest>=9.0.2",
29
+ ]
@@ -0,0 +1,28 @@
1
+ from importlib import import_module
2
+ from types import ModuleType
3
+ from typing import TYPE_CHECKING, Any
4
+
5
__all__ = ["omnipath", "stringdb"]

if TYPE_CHECKING:
    # Imports visible to static type checkers only; at runtime the
    # submodules are loaded lazily via __getattr__ below so importing
    # `bioextract` stays cheap.
    import bioextract.omnipath as omnipath
    import bioextract.stringdb as stringdb

# Public attribute name -> fully qualified module path, consumed by the
# lazy __getattr__ loader below.
_ALIAS_MODULES: dict[str, str] = {
    "omnipath": "bioextract.omnipath",
    "stringdb": "bioextract.stringdb",
}
15
+
16
+
17
def __getattr__(name: str) -> Any:
    """Lazily import and cache the aliased submodule *name* (PEP 562).

    Raises AttributeError for any name not registered in _ALIAS_MODULES,
    matching the normal module attribute protocol.
    """
    if name not in _ALIAS_MODULES:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    module_loaded: ModuleType = import_module(_ALIAS_MODULES[name])
    # Cache on the module namespace so future lookups bypass this hook.
    globals()[name] = module_loaded
    return module_loaded
25
+
26
+
27
def __dir__() -> list[str]:
    """Return module attributes, including lazily loaded public aliases."""
    names = set(globals())
    names.update(__all__)
    return sorted(names)
@@ -0,0 +1,140 @@
1
+ import re
2
+ from collections.abc import Collection, Iterable, Mapping
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
+ import polars as pl
7
+ from polars._typing import SchemaDict
8
+
9
# Captures the accession from UniProt-style "db|ACCESSION|entry" identifiers,
# e.g. "sp|P04637|P53_HUMAN" -> "P04637".
RE_UNIPROT_PIPE = re.compile(r"^[^|]+\|([^|]+)\|")
10
+
11
+
12
@dataclass(frozen=True, slots=True)
class GroupInputFrames:
    """Immutable result bundle returned by create_group_input_frames."""

    # One sorted row per normalized GroupId.
    df_groups: pl.DataFrame
    # Deduplicated, sorted (GroupId, InputId) membership pairs.
    df_input_ids: pl.DataFrame
16
+
17
+
18
def normalize_input_id(value: str) -> str:
    """Strip whitespace and unwrap UniProt ``db|ACCESSION|name`` identifiers.

    Returns the pipe-delimited accession (stripped) when the input matches
    RE_UNIPROT_PIPE; otherwise returns the stripped input unchanged.
    """
    stripped = value.strip()
    match_pipe = RE_UNIPROT_PIPE.match(stripped)
    if match_pipe is None:
        return stripped
    return match_pipe.group(1).strip()
23
+
24
+
25
+ def validate_file_size(
26
+ *,
27
+ file_path: Path,
28
+ size_max: int | None,
29
+ label: str,
30
+ ) -> None:
31
+ if size_max is None:
32
+ return
33
+ file_size = file_path.stat().st_size
34
+ if file_size > size_max:
35
+ raise ValueError(
36
+ f"{label} exceeds configured size limit: "
37
+ f"path={file_path}, size_bytes={file_size}, limit_bytes={size_max}"
38
+ )
39
+
40
+
41
+ def validate_count_limit(
42
+ *,
43
+ count: int,
44
+ limit_max: int | None,
45
+ label: str,
46
+ ) -> None:
47
+ if limit_max is None:
48
+ return
49
+ if count > limit_max:
50
+ raise ValueError(
51
+ f"{label} exceeds configured limit: count={count}, limit={limit_max}"
52
+ )
53
+
54
+
55
def validate_group_ids(group_ids: list[str]) -> None:
    """Validate normalized GroupId values: each must be non-empty and unique.

    Raises ValueError on the first offending entry, preserving input order
    (a single pass keeps empty-vs-duplicate error precedence stable).
    """
    ids_seen: set[str] = set()
    for gid in group_ids:
        if not gid:
            raise ValueError("GroupId must be a non-empty string after normalization")
        if gid in ids_seen:
            raise ValueError(
                f"GroupId values must be unique after normalization: {gid!r}"
            )
        ids_seen.add(gid)
65
+
66
+
67
def validate_required_cols(
    cols_available: Collection[str], cols_required: Collection[str], context: str
) -> None:
    """Raise ValueError if any required column is absent from *cols_available*."""
    cols_missing = set(cols_required).difference(cols_available)
    if not cols_missing:
        return
    raise ValueError(
        f"{context} is missing required columns: "
        f"{sorted(cols_missing)}; available={cols_available}"
    )
76
+
77
+
78
def create_input_id_frame(
    input_ids: Iterable[str],
    *,
    schema_unmapped: SchemaDict,
) -> pl.DataFrame:
    """Build a sorted, deduplicated single-column ``InputId`` frame.

    Each entry is normalized via normalize_input_id; entries that normalize
    to the empty string are dropped. Returns an empty frame carrying
    *schema_unmapped* when nothing survives normalization.
    """
    ids_normalized = [
        id_norm for raw in input_ids if (id_norm := normalize_input_id(str(raw)))
    ]
    if not ids_normalized:
        return pl.DataFrame(schema=schema_unmapped)

    df_ids = pl.DataFrame({"InputId": ids_normalized}, schema=schema_unmapped)
    return df_ids.unique(subset=["InputId"]).sort("InputId")
96
+
97
+
98
def create_group_input_frames(
    group_to_ids: Mapping[str, Iterable[str]],
    *,
    schema_groups: SchemaDict,
    schema_group_input_ids: SchemaDict,
) -> GroupInputFrames:
    """Normalize a group -> input-ids mapping into two polars frames.

    Returns a GroupInputFrames holding:
    - ``df_groups``: one sorted row per stripped group id;
    - ``df_input_ids``: deduplicated, sorted (GroupId, InputId) pairs whose
      InputId values were normalized via normalize_input_id (empty results
      are dropped).

    Raises ValueError (via validate_group_ids) when a normalized group id
    is empty or duplicated.
    """
    groups_normalized: list[str] = []
    pairs: list[tuple[str, str]] = []

    for raw_group, raw_ids in group_to_ids.items():
        gid = str(raw_group).strip()
        groups_normalized.append(gid)
        for raw_id in raw_ids:
            normalized = normalize_input_id(str(raw_id))
            if normalized:
                pairs.append((gid, normalized))

    # Validate after collection so empty/duplicate group ids are reported
    # even when a group contributes no usable input ids.
    validate_group_ids(groups_normalized)

    if not groups_normalized:
        df_groups = pl.DataFrame(schema=schema_groups)
    else:
        df_groups = pl.DataFrame(
            {"GroupId": groups_normalized},
            schema=schema_groups,
        ).sort("GroupId")

    if not pairs:
        df_empty = pl.DataFrame(schema=schema_group_input_ids)
        return GroupInputFrames(df_groups=df_groups, df_input_ids=df_empty)

    df_pairs = pl.DataFrame(
        {
            "GroupId": [gid for gid, _ in pairs],
            "InputId": [input_id for _, input_id in pairs],
        },
        schema=schema_group_input_ids,
    )
    return GroupInputFrames(
        df_groups=df_groups,
        df_input_ids=df_pairs.unique().sort("GroupId", "InputId"),
    )
@@ -0,0 +1,7 @@
1
# Public API of the bioextract.omnipath subpackage: re-export the database
# facade and its resource-limit configuration.
from .omnipath import OmniPathDb
from .spec import OmniPathResourceLimits

__all__ = [
    "OmniPathDb",
    "OmniPathResourceLimits",
]
@@ -0,0 +1,85 @@
1
+ import re
2
+ from typing import Literal
3
+
4
+ import polars as pl
5
+
6
# Raw snapshot schemas: every column is read as pl.String and renamed via
# the COLS_RENAMED_* maps below before being exposed.
SCHEMA_ENZSUB_RAW = {
    "enzyme": pl.String,
    "substrate": pl.String,
    "residue_type": pl.String,
    "residue_offset": pl.String,
    "modification": pl.String,
}

SCHEMA_INTERACTIONS_RAW = {
    "source": pl.String,
    "target": pl.String,
    "is_directed": pl.String,
    "is_stimulation": pl.String,
    "is_inhibition": pl.String,
}

# Single-column frame of input identifiers that could not be mapped.
SCHEMA_UNMAPPED = {
    "InputId": pl.String,
}

# One row per user-supplied group label.
SCHEMA_GROUPS = {
    "GroupId": pl.String,
}

# Membership pairs linking each group to its normalized input ids.
SCHEMA_GROUP_INPUT_IDS = {
    "GroupId": pl.String,
    "InputId": pl.String,
}

# Output schema for enzyme-substrate extractions.
SCHEMA_ENZSUB = {
    "SourceId": pl.String,
    "TargetId": pl.String,
    "TargetSite": pl.String,
    "Modification": pl.String,
}

# Group-scoped variant of SCHEMA_ENZSUB (adds the GroupId column).
SCHEMA_GROUP_ENZSUB = {
    "GroupId": pl.String,
    "SourceId": pl.String,
    "TargetId": pl.String,
    "TargetSite": pl.String,
    "Modification": pl.String,
}

# Output schema for interaction extractions. The Is* flags are Boolean here
# but String in SCHEMA_INTERACTIONS_RAW, so a cast presumably happens during
# extraction — NOTE(review): confirm in the omnipath reader module.
SCHEMA_INTERACTIONS = {
    "SourceId": pl.String,
    "TargetId": pl.String,
    "IsDirected": pl.Boolean,
    "IsStimulation": pl.Boolean,
    "IsInhibition": pl.Boolean,
}

# Group-scoped variant of SCHEMA_INTERACTIONS (adds the GroupId column).
SCHEMA_GROUP_INTERACTIONS = {
    "GroupId": pl.String,
    "SourceId": pl.String,
    "TargetId": pl.String,
    "IsDirected": pl.Boolean,
    "IsStimulation": pl.Boolean,
    "IsInhibition": pl.Boolean,
}

# Raw-column -> public-column rename map for the enzsub resource. The
# underscore-prefixed targets are intermediate columns, presumably combined
# into TargetSite downstream — TODO confirm against the extraction code.
COLS_RENAMED_ENZSUB = {
    "enzyme": "SourceId",
    "substrate": "TargetId",
    "residue_type": "_ResidueType",
    "residue_offset": "_ResidueOffset",
    "modification": "Modification",
}

# Raw-column -> public-column rename map for the interactions resource.
COLS_RENAMED_INTERACTIONS = {
    "source": "SourceId",
    "target": "TargetId",
    "is_directed": "IsDirected",
    "is_stimulation": "IsStimulation",
    "is_inhibition": "IsInhibition",
}

# Captures the accession from UniProt-style "db|ACCESSION|entry" identifiers,
# e.g. "sp|P04637|P53_HUMAN" -> "P04637".
RE_UNIPROT_PIPE = re.compile(r"^[^|]+\|([^|]+)\|")

# Names of the OmniPath snapshot resources this subpackage can read.
OmniPathResourceName = Literal["enzsub", "interactions"]