bioextract 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,110 @@
1
+ Metadata-Version: 2.1
2
+ Name: bioextract
3
+ Version: 0.0.1
4
+ Summary: Library-first extraction helpers for bioinformatics resource snapshots.
5
+ Author-Email: FuqingZh <fu.qing.zhang.work@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.14
8
+ Requires-Dist: polars>=1.39.3
9
+ Description-Content-Type: text/markdown
10
+
11
+ # bioextract
12
+
13
+ Library-first extraction helpers for bioinformatics resource snapshots.
14
+
15
+ ## Install
16
+
17
+ - `pip install bioextract`
18
+
19
+ ## STRINGdb
20
+
21
+ ```python
22
+ from bioextract.stringdb import StringDb, StringResourceLimits
23
+
24
+ selection = (
25
+ StringDb.from_files(
26
+ file_aliases="9606.protein.aliases.v12.0.txt.gz",
27
+ file_links="9606.protein.links.v12.0.txt.gz",
28
+ limits=StringResourceLimits(num_input_ids_max=50_000),
29
+ )
30
+ .select_ids(["P04637", "EGFR", "CDK2"])
31
+ .with_score_min(400)
32
+ )
33
+
34
+ df_mapping = selection.extract_string_mapping()
35
+ df_unmapped = selection.extract_unmapped_input_ids()
36
+ df_edges = selection.extract_edges()
37
+
38
+ print(df_mapping)
39
+ print(df_unmapped)
40
+ print(df_edges)
41
+ ```
42
+
43
+ ```python
44
+ from bioextract.stringdb import StringDb
45
+
46
+ df_group_edges = (
47
+ StringDb.from_files(
48
+ file_aliases="9606.protein.aliases.v12.0.txt.gz",
49
+ file_links="9606.protein.links.v12.0.txt.gz",
50
+ )
51
+ .select_groups(
52
+ {
53
+ "TumorA": ["TP53", "EGFR"],
54
+ "TumorB": ["CDK2", "TP53"],
55
+ }
56
+ )
57
+ .with_score_min(400)
58
+ .extract_edges()
59
+ )
60
+ ```
61
+
62
+ ## OmniPath
63
+
64
+ ```python
65
+ from bioextract.omnipath import OmniPathDb
66
+
67
+ selection = (
68
+ OmniPathDb.from_files(
69
+ file_enzsub="enzsub.tsv.gz",
70
+ file_interactions="interactions.tsv.gz",
71
+ )
72
+ .select_ids(["P31749", "AKT1", "BAD"])
73
+ .with_enzsub()
74
+ )
75
+
76
+ df_enzsub = selection.extract_enzsub()
77
+ df_unmapped = selection.extract_unmapped_input_ids()
78
+
79
+ print(df_enzsub)
80
+ print(df_unmapped)
81
+ ```
82
+
83
+ ```python
84
+ from bioextract.omnipath import OmniPathDb
85
+
86
+ df_group_interactions = (
87
+ OmniPathDb.from_files(file_interactions="interactions.tsv.gz")
88
+ .select_groups(
89
+ {
90
+ "TumorA": ["AKT1", "MTOR"],
91
+ "TumorB": ["EGFR", "ERBB2"],
92
+ }
93
+ )
94
+ .with_interactions()
95
+ .extract_interactions()
96
+ )
97
+ ```
98
+
99
+ ## Development
100
+
101
+ - `PYTHONPATH=src pytest`
102
+ - `PYTHONPATH=src python scripts/benchmark_stringdb.py`
103
+
104
+ ## Release
105
+
106
+ - GitHub Actions provides:
107
+ - `.github/workflows/py-ci.yml` for test-and-build checks on push and pull request
108
+ - `.github/workflows/publish.yml` for tag-triggered PyPI publishing
109
+ - Release tags must be canonical PEP 440 versions such as `0.1.1`
110
+ - The publish workflow expects PyPI trusted publishing to be configured for the `pypi` environment
@@ -0,0 +1,100 @@
1
+ # bioextract
2
+
3
+ Library-first extraction helpers for bioinformatics resource snapshots.
4
+
5
+ ## Install
6
+
7
+ - `pip install bioextract`
8
+
9
+ ## STRINGdb
10
+
11
+ ```python
12
+ from bioextract.stringdb import StringDb, StringResourceLimits
13
+
14
+ selection = (
15
+ StringDb.from_files(
16
+ file_aliases="9606.protein.aliases.v12.0.txt.gz",
17
+ file_links="9606.protein.links.v12.0.txt.gz",
18
+ limits=StringResourceLimits(num_input_ids_max=50_000),
19
+ )
20
+ .select_ids(["P04637", "EGFR", "CDK2"])
21
+ .with_score_min(400)
22
+ )
23
+
24
+ df_mapping = selection.extract_string_mapping()
25
+ df_unmapped = selection.extract_unmapped_input_ids()
26
+ df_edges = selection.extract_edges()
27
+
28
+ print(df_mapping)
29
+ print(df_unmapped)
30
+ print(df_edges)
31
+ ```
32
+
33
+ ```python
34
+ from bioextract.stringdb import StringDb
35
+
36
+ df_group_edges = (
37
+ StringDb.from_files(
38
+ file_aliases="9606.protein.aliases.v12.0.txt.gz",
39
+ file_links="9606.protein.links.v12.0.txt.gz",
40
+ )
41
+ .select_groups(
42
+ {
43
+ "TumorA": ["TP53", "EGFR"],
44
+ "TumorB": ["CDK2", "TP53"],
45
+ }
46
+ )
47
+ .with_score_min(400)
48
+ .extract_edges()
49
+ )
50
+ ```
51
+
52
+ ## OmniPath
53
+
54
+ ```python
55
+ from bioextract.omnipath import OmniPathDb
56
+
57
+ selection = (
58
+ OmniPathDb.from_files(
59
+ file_enzsub="enzsub.tsv.gz",
60
+ file_interactions="interactions.tsv.gz",
61
+ )
62
+ .select_ids(["P31749", "AKT1", "BAD"])
63
+ .with_enzsub()
64
+ )
65
+
66
+ df_enzsub = selection.extract_enzsub()
67
+ df_unmapped = selection.extract_unmapped_input_ids()
68
+
69
+ print(df_enzsub)
70
+ print(df_unmapped)
71
+ ```
72
+
73
+ ```python
74
+ from bioextract.omnipath import OmniPathDb
75
+
76
+ df_group_interactions = (
77
+ OmniPathDb.from_files(file_interactions="interactions.tsv.gz")
78
+ .select_groups(
79
+ {
80
+ "TumorA": ["AKT1", "MTOR"],
81
+ "TumorB": ["EGFR", "ERBB2"],
82
+ }
83
+ )
84
+ .with_interactions()
85
+ .extract_interactions()
86
+ )
87
+ ```
88
+
89
+ ## Development
90
+
91
+ - `PYTHONPATH=src pytest`
92
+ - `PYTHONPATH=src python scripts/benchmark_stringdb.py`
93
+
94
+ ## Release
95
+
96
+ - GitHub Actions provides:
97
+ - `.github/workflows/py-ci.yml` for test-and-build checks on push and pull request
98
+ - `.github/workflows/publish.yml` for tag-triggered PyPI publishing
99
+ - Release tags must be canonical PEP 440 versions such as `0.1.1`
100
+ - The publish workflow expects PyPI trusted publishing to be configured for the `pypi` environment
@@ -0,0 +1,29 @@
1
+ [project]
2
+ name = "bioextract"
3
+ version = "0.0.1"
4
+ description = "Library-first extraction helpers for bioinformatics resource snapshots."
5
+ authors = [
6
+ { name = "FuqingZh", email = "fu.qing.zhang.work@gmail.com" },
7
+ ]
8
+ dependencies = [
9
+ "polars>=1.39.3",
10
+ ]
11
+ requires-python = ">=3.14"
12
+ readme = "README.md"
13
+
14
+ [project.license]
15
+ text = "MIT"
16
+
17
+ [build-system]
18
+ requires = [
19
+ "pdm-backend",
20
+ ]
21
+ build-backend = "pdm.backend"
22
+
23
+ [tool.pdm]
24
+ distribution = true
25
+
26
+ [tool.pdm.dev-dependencies]
27
+ dev = [
28
+ "pytest>=9.0.2",
29
+ ]
@@ -0,0 +1,28 @@
1
+ from importlib import import_module
2
+ from types import ModuleType
3
+ from typing import TYPE_CHECKING, Any
4
+
5
__all__ = ["omnipath", "stringdb"]

if TYPE_CHECKING:
    # Imports visible to static type checkers only; at runtime the
    # submodules are loaded lazily via __getattr__ below so importing
    # `bioextract` stays cheap.
    import bioextract.omnipath as omnipath
    import bioextract.stringdb as stringdb

# Public attribute name -> fully qualified module path, consumed by the
# lazy __getattr__ loader below.
_ALIAS_MODULES: dict[str, str] = {
    "omnipath": "bioextract.omnipath",
    "stringdb": "bioextract.stringdb",
}
15
+
16
+
17
def __getattr__(name: str) -> Any:
    """Lazily import and cache the aliased submodule *name* (PEP 562).

    Raises AttributeError for any name not registered in _ALIAS_MODULES,
    matching the normal module attribute protocol.
    """
    if name not in _ALIAS_MODULES:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    module_loaded: ModuleType = import_module(_ALIAS_MODULES[name])
    # Cache on the module namespace so future lookups bypass this hook.
    globals()[name] = module_loaded
    return module_loaded
25
+
26
+
27
def __dir__() -> list[str]:
    """Return module attributes, including lazily loaded public aliases."""
    names = set(globals())
    names.update(__all__)
    return sorted(names)
@@ -0,0 +1,140 @@
1
+ import re
2
+ from collections.abc import Collection, Iterable, Mapping
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
+ import polars as pl
7
+ from polars._typing import SchemaDict
8
+
9
# Captures the accession from UniProt-style "db|ACCESSION|entry" identifiers,
# e.g. "sp|P04637|P53_HUMAN" -> "P04637".
RE_UNIPROT_PIPE = re.compile(r"^[^|]+\|([^|]+)\|")
10
+
11
+
12
@dataclass(frozen=True, slots=True)
class GroupInputFrames:
    """Immutable result bundle returned by create_group_input_frames."""

    # One sorted row per normalized GroupId.
    df_groups: pl.DataFrame
    # Deduplicated, sorted (GroupId, InputId) membership pairs.
    df_input_ids: pl.DataFrame
16
+
17
+
18
def normalize_input_id(value: str) -> str:
    """Strip whitespace and unwrap UniProt ``db|ACCESSION|name`` identifiers.

    Returns the pipe-delimited accession (stripped) when the input matches
    RE_UNIPROT_PIPE; otherwise returns the stripped input unchanged.
    """
    stripped = value.strip()
    match_pipe = RE_UNIPROT_PIPE.match(stripped)
    if match_pipe is None:
        return stripped
    return match_pipe.group(1).strip()
23
+
24
+
25
+ def validate_file_size(
26
+ *,
27
+ file_path: Path,
28
+ size_max: int | None,
29
+ label: str,
30
+ ) -> None:
31
+ if size_max is None:
32
+ return
33
+ file_size = file_path.stat().st_size
34
+ if file_size > size_max:
35
+ raise ValueError(
36
+ f"{label} exceeds configured size limit: "
37
+ f"path={file_path}, size_bytes={file_size}, limit_bytes={size_max}"
38
+ )
39
+
40
+
41
+ def validate_count_limit(
42
+ *,
43
+ count: int,
44
+ limit_max: int | None,
45
+ label: str,
46
+ ) -> None:
47
+ if limit_max is None:
48
+ return
49
+ if count > limit_max:
50
+ raise ValueError(
51
+ f"{label} exceeds configured limit: count={count}, limit={limit_max}"
52
+ )
53
+
54
+
55
def validate_group_ids(group_ids: list[str]) -> None:
    """Validate normalized GroupId values: each must be non-empty and unique.

    Raises ValueError on the first offending entry, preserving input order
    (a single pass keeps empty-vs-duplicate error precedence stable).
    """
    ids_seen: set[str] = set()
    for gid in group_ids:
        if not gid:
            raise ValueError("GroupId must be a non-empty string after normalization")
        if gid in ids_seen:
            raise ValueError(
                f"GroupId values must be unique after normalization: {gid!r}"
            )
        ids_seen.add(gid)
65
+
66
+
67
def validate_required_cols(
    cols_available: Collection[str], cols_required: Collection[str], context: str
) -> None:
    """Raise ValueError if any required column is absent from *cols_available*."""
    cols_missing = set(cols_required).difference(cols_available)
    if not cols_missing:
        return
    raise ValueError(
        f"{context} is missing required columns: "
        f"{sorted(cols_missing)}; available={cols_available}"
    )
76
+
77
+
78
def create_input_id_frame(
    input_ids: Iterable[str],
    *,
    schema_unmapped: SchemaDict,
) -> pl.DataFrame:
    """Build a sorted, deduplicated single-column ``InputId`` frame.

    Each entry is normalized via normalize_input_id; entries that normalize
    to the empty string are dropped. Returns an empty frame carrying
    *schema_unmapped* when nothing survives normalization.
    """
    ids_normalized = [
        id_norm for raw in input_ids if (id_norm := normalize_input_id(str(raw)))
    ]
    if not ids_normalized:
        return pl.DataFrame(schema=schema_unmapped)

    df_ids = pl.DataFrame({"InputId": ids_normalized}, schema=schema_unmapped)
    return df_ids.unique(subset=["InputId"]).sort("InputId")
96
+
97
+
98
def create_group_input_frames(
    group_to_ids: Mapping[str, Iterable[str]],
    *,
    schema_groups: SchemaDict,
    schema_group_input_ids: SchemaDict,
) -> GroupInputFrames:
    """Normalize a group -> input-ids mapping into two polars frames.

    Returns a GroupInputFrames holding:
    - ``df_groups``: one sorted row per stripped group id;
    - ``df_input_ids``: deduplicated, sorted (GroupId, InputId) pairs whose
      InputId values were normalized via normalize_input_id (empty results
      are dropped).

    Raises ValueError (via validate_group_ids) when a normalized group id
    is empty or duplicated.
    """
    groups_normalized: list[str] = []
    pairs: list[tuple[str, str]] = []

    for raw_group, raw_ids in group_to_ids.items():
        gid = str(raw_group).strip()
        groups_normalized.append(gid)
        for raw_id in raw_ids:
            normalized = normalize_input_id(str(raw_id))
            if normalized:
                pairs.append((gid, normalized))

    # Validate after collection so empty/duplicate group ids are reported
    # even when a group contributes no usable input ids.
    validate_group_ids(groups_normalized)

    if not groups_normalized:
        df_groups = pl.DataFrame(schema=schema_groups)
    else:
        df_groups = pl.DataFrame(
            {"GroupId": groups_normalized},
            schema=schema_groups,
        ).sort("GroupId")

    if not pairs:
        df_empty = pl.DataFrame(schema=schema_group_input_ids)
        return GroupInputFrames(df_groups=df_groups, df_input_ids=df_empty)

    df_pairs = pl.DataFrame(
        {
            "GroupId": [gid for gid, _ in pairs],
            "InputId": [input_id for _, input_id in pairs],
        },
        schema=schema_group_input_ids,
    )
    return GroupInputFrames(
        df_groups=df_groups,
        df_input_ids=df_pairs.unique().sort("GroupId", "InputId"),
    )
@@ -0,0 +1,7 @@
1
# Public API of the bioextract.omnipath subpackage: re-export the database
# facade and its resource-limit configuration.
from .omnipath import OmniPathDb
from .spec import OmniPathResourceLimits

__all__ = [
    "OmniPathDb",
    "OmniPathResourceLimits",
]
@@ -0,0 +1,85 @@
1
+ import re
2
+ from typing import Literal
3
+
4
+ import polars as pl
5
+
6
# Raw snapshot schemas: every column is read as pl.String and renamed via
# the COLS_RENAMED_* maps below before being exposed.
SCHEMA_ENZSUB_RAW = {
    "enzyme": pl.String,
    "substrate": pl.String,
    "residue_type": pl.String,
    "residue_offset": pl.String,
    "modification": pl.String,
}

SCHEMA_INTERACTIONS_RAW = {
    "source": pl.String,
    "target": pl.String,
    "is_directed": pl.String,
    "is_stimulation": pl.String,
    "is_inhibition": pl.String,
}

# Single-column frame of input identifiers that could not be mapped.
SCHEMA_UNMAPPED = {
    "InputId": pl.String,
}

# One row per user-supplied group label.
SCHEMA_GROUPS = {
    "GroupId": pl.String,
}

# Membership pairs linking each group to its normalized input ids.
SCHEMA_GROUP_INPUT_IDS = {
    "GroupId": pl.String,
    "InputId": pl.String,
}

# Output schema for enzyme-substrate extractions.
SCHEMA_ENZSUB = {
    "SourceId": pl.String,
    "TargetId": pl.String,
    "TargetSite": pl.String,
    "Modification": pl.String,
}

# Group-scoped variant of SCHEMA_ENZSUB (adds the GroupId column).
SCHEMA_GROUP_ENZSUB = {
    "GroupId": pl.String,
    "SourceId": pl.String,
    "TargetId": pl.String,
    "TargetSite": pl.String,
    "Modification": pl.String,
}

# Output schema for interaction extractions. The Is* flags are Boolean here
# but String in SCHEMA_INTERACTIONS_RAW, so a cast presumably happens during
# extraction — NOTE(review): confirm in the omnipath reader module.
SCHEMA_INTERACTIONS = {
    "SourceId": pl.String,
    "TargetId": pl.String,
    "IsDirected": pl.Boolean,
    "IsStimulation": pl.Boolean,
    "IsInhibition": pl.Boolean,
}

# Group-scoped variant of SCHEMA_INTERACTIONS (adds the GroupId column).
SCHEMA_GROUP_INTERACTIONS = {
    "GroupId": pl.String,
    "SourceId": pl.String,
    "TargetId": pl.String,
    "IsDirected": pl.Boolean,
    "IsStimulation": pl.Boolean,
    "IsInhibition": pl.Boolean,
}

# Raw-column -> public-column rename map for the enzsub resource. The
# underscore-prefixed targets are intermediate columns, presumably combined
# into TargetSite downstream — TODO confirm against the extraction code.
COLS_RENAMED_ENZSUB = {
    "enzyme": "SourceId",
    "substrate": "TargetId",
    "residue_type": "_ResidueType",
    "residue_offset": "_ResidueOffset",
    "modification": "Modification",
}

# Raw-column -> public-column rename map for the interactions resource.
COLS_RENAMED_INTERACTIONS = {
    "source": "SourceId",
    "target": "TargetId",
    "is_directed": "IsDirected",
    "is_stimulation": "IsStimulation",
    "is_inhibition": "IsInhibition",
}

# Captures the accession from UniProt-style "db|ACCESSION|entry" identifiers,
# e.g. "sp|P04637|P53_HUMAN" -> "P04637".
RE_UNIPROT_PIPE = re.compile(r"^[^|]+\|([^|]+)\|")

# Names of the OmniPath snapshot resources this subpackage can read.
OmniPathResourceName = Literal["enzsub", "interactions"]