bioextract 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bioextract/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ from importlib import import_module
2
+ from types import ModuleType
3
+ from typing import TYPE_CHECKING, Any
4
+
5
# Public submodules; each one is imported lazily on first attribute access
# through the module-level ``__getattr__`` hook defined in this file.
__all__ = ["omnipath", "stringdb"]

if TYPE_CHECKING:
    # Imported only for static analysis so type checkers can resolve the
    # lazily exposed names; nothing is imported here at runtime.
    import bioextract.omnipath as omnipath
    import bioextract.stringdb as stringdb

# Maps each lazily exposed attribute name to the dotted module path that is
# imported when the attribute is first requested.
_ALIAS_MODULES: dict[str, str] = {
    "omnipath": "bioextract.omnipath",
    "stringdb": "bioextract.stringdb",
}
15
+
16
+
17
def __getattr__(name: str) -> Any:
    """Import a lazily exposed submodule the first time it is requested.

    The imported module is stored in the module globals under ``name``, so
    subsequent lookups find it directly instead of calling this hook again.

    Args:
        name: Attribute requested on this package.

    Returns:
        The imported submodule registered for ``name``.

    Raises:
        AttributeError: If ``name`` is not a key of ``_ALIAS_MODULES``.
    """
    if name not in _ALIAS_MODULES:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    submodule: ModuleType = import_module(_ALIAS_MODULES[name])
    # Cache on the package so the import only happens once per name.
    globals()[name] = submodule
    return submodule
25
+
26
+
27
def __dir__() -> list[str]:
    """Return the sorted module attribute names plus every name in ``__all__``.

    Including ``__all__`` makes the lazy submodules visible to ``dir()`` even
    before they have been imported.
    """
    names = set(globals())
    names.update(__all__)
    return sorted(names)
bioextract/_shared.py ADDED
@@ -0,0 +1,140 @@
1
+ import re
2
+ from collections.abc import Collection, Iterable, Mapping
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
+ import polars as pl
7
+ from polars._typing import SchemaDict
8
+
9
# Matches a pipe-delimited identifier at the start of a string (for example
# "sp|P12345|NAME") and captures the second field, i.e. the text between the
# first and the second pipe.
RE_UNIPROT_PIPE = re.compile(r"^[^|]+\|([^|]+)\|")
10
+
11
+
12
@dataclass(frozen=True, slots=True)
class GroupInputFrames:
    """Immutable pair of frames returned by ``create_group_input_frames``."""

    # Normalized group IDs, one row per group.
    df_groups: pl.DataFrame
    # Normalized (GroupId, InputId) pairs; an empty frame when no input ID
    # survives normalization.
    df_input_ids: pl.DataFrame
16
+
17
+
18
def normalize_input_id(value: str) -> str:
    """Normalize one raw input ID.

    Surrounding whitespace is removed. When the stripped value starts with a
    pipe-delimited prefix matched by ``RE_UNIPROT_PIPE``, only the second
    pipe-separated field (stripped again) is returned.

    Args:
        value: Raw identifier text.

    Returns:
        The normalized identifier; may be an empty string.
    """
    stripped = value.strip()
    pipe_match = RE_UNIPROT_PIPE.match(stripped)
    if pipe_match is None:
        return stripped
    return pipe_match.group(1).strip()
23
+
24
+
25
+ def validate_file_size(
26
+ *,
27
+ file_path: Path,
28
+ size_max: int | None,
29
+ label: str,
30
+ ) -> None:
31
+ if size_max is None:
32
+ return
33
+ file_size = file_path.stat().st_size
34
+ if file_size > size_max:
35
+ raise ValueError(
36
+ f"{label} exceeds configured size limit: "
37
+ f"path={file_path}, size_bytes={file_size}, limit_bytes={size_max}"
38
+ )
39
+
40
+
41
+ def validate_count_limit(
42
+ *,
43
+ count: int,
44
+ limit_max: int | None,
45
+ label: str,
46
+ ) -> None:
47
+ if limit_max is None:
48
+ return
49
+ if count > limit_max:
50
+ raise ValueError(
51
+ f"{label} exceeds configured limit: count={count}, limit={limit_max}"
52
+ )
53
+
54
+
55
def validate_group_ids(group_ids: list[str]) -> None:
    """Check that normalized group IDs are non-empty and unique.

    IDs are checked in order, so the first offending entry determines which
    error is raised.

    Args:
        group_ids: Group IDs after normalization.

    Raises:
        ValueError: If an ID is empty or appears more than once.
    """
    seen: set[str] = set()
    for candidate in group_ids:
        if not candidate:
            raise ValueError("GroupId must be a non-empty string after normalization")
        size_before = len(seen)
        seen.add(candidate)
        # The set did not grow, so this ID was already present.
        if len(seen) == size_before:
            raise ValueError(
                f"GroupId values must be unique after normalization: {candidate!r}"
            )
65
+
66
+
67
def validate_required_cols(
    cols_available: Collection[str], cols_required: Collection[str], context: str
) -> None:
    """Fail when any required column is absent from the available columns.

    Args:
        cols_available: Column names that are present.
        cols_required: Column names that must be present.
        context: Description of the checked object, used as the error prefix.

    Raises:
        ValueError: If at least one required column is missing; the message
            lists the missing names in sorted order.
    """
    missing = sorted(set(cols_required).difference(cols_available))
    if missing:
        raise ValueError(
            f"{context} is missing required columns: "
            f"{missing}; available={cols_available}"
        )
76
+
77
+
78
def create_input_id_frame(
    input_ids: Iterable[str],
    *,
    schema_unmapped: SchemaDict,
) -> pl.DataFrame:
    """Build a de-duplicated, sorted frame of normalized input IDs.

    Each value is converted with ``str`` and passed through
    ``normalize_input_id``; values that normalize to an empty string are
    dropped.

    Args:
        input_ids: Raw input identifiers.
        schema_unmapped: Schema applied to the resulting frame.

    Returns:
        A frame with unique ``InputId`` values sorted ascending, or an empty
        frame with ``schema_unmapped`` when no ID remains.
    """
    candidates = (normalize_input_id(str(raw_id)) for raw_id in input_ids)
    ids_kept = [candidate for candidate in candidates if candidate]

    if ids_kept:
        df_ids = pl.DataFrame({"InputId": ids_kept}, schema=schema_unmapped)
        return df_ids.unique(subset=["InputId"]).sort("InputId")

    return pl.DataFrame(schema=schema_unmapped)
96
+
97
+
98
def create_group_input_frames(
    group_to_ids: Mapping[str, Iterable[str]],
    *,
    schema_groups: SchemaDict,
    schema_group_input_ids: SchemaDict,
) -> GroupInputFrames:
    """Build the group frame and the group/input-ID frame for a grouped query.

    Group IDs are converted with ``str`` and stripped; input IDs go through
    ``normalize_input_id`` and are dropped when they normalize to an empty
    string. Group IDs are validated only after every group has been read.

    Args:
        group_to_ids: Mapping from raw group ID to its raw input IDs.
        schema_groups: Schema applied to the group frame.
        schema_group_input_ids: Schema applied to the group/input-ID frame.

    Returns:
        The sorted group frame together with the unique, sorted
        ``(GroupId, InputId)`` pairs.

    Raises:
        ValueError: If a normalized group ID is empty or duplicated.
    """
    groups_seen: list[str] = []
    pairs: list[tuple[str, str]] = []

    for raw_group, raw_ids in group_to_ids.items():
        group = str(raw_group).strip()
        groups_seen.append(group)
        for raw_id in raw_ids:
            normalized = normalize_input_id(str(raw_id))
            if normalized:
                pairs.append((group, normalized))

    validate_group_ids(groups_seen)

    df_groups = (
        pl.DataFrame({"GroupId": groups_seen}, schema=schema_groups).sort("GroupId")
        if groups_seen
        else pl.DataFrame(schema=schema_groups)
    )

    if pairs:
        df_pairs = (
            pl.DataFrame(
                {
                    "GroupId": [group for group, _ in pairs],
                    "InputId": [input_id for _, input_id in pairs],
                },
                schema=schema_group_input_ids,
            )
            .unique()
            .sort("GroupId", "InputId")
        )
    else:
        df_pairs = pl.DataFrame(schema=schema_group_input_ids)

    return GroupInputFrames(df_groups=df_groups, df_input_ids=df_pairs)
@@ -0,0 +1,7 @@
1
+ from .omnipath import OmniPathDb
2
+ from .spec import OmniPathResourceLimits
3
+
4
+ __all__ = [
5
+ "OmniPathDb",
6
+ "OmniPathResourceLimits",
7
+ ]
@@ -0,0 +1,85 @@
1
+ import re
2
+ from typing import Literal
3
+
4
+ import polars as pl
5
+
6
# Raw column names and dtypes for the OmniPath `enzsub` resource. Every column
# is typed as a string here.
# NOTE(review): presumably used when reading the enzsub file — the reader is
# not visible in this module; confirm against `util`.
SCHEMA_ENZSUB_RAW = {
    "enzyme": pl.String,
    "substrate": pl.String,
    "residue_type": pl.String,
    "residue_offset": pl.String,
    "modification": pl.String,
}

# Raw column names and dtypes for the OmniPath `interactions` resource. The
# flag columns are typed as strings here, while SCHEMA_INTERACTIONS below
# declares the renamed flags as booleans.
SCHEMA_INTERACTIONS_RAW = {
    "source": pl.String,
    "target": pl.String,
    "is_directed": pl.String,
    "is_stimulation": pl.String,
    "is_inhibition": pl.String,
}

# Single-column schema holding normalized input IDs.
SCHEMA_UNMAPPED = {
    "InputId": pl.String,
}

# Single-column schema holding normalized group IDs.
SCHEMA_GROUPS = {
    "GroupId": pl.String,
}

# Schema of (GroupId, InputId) pairs for grouped selections.
SCHEMA_GROUP_INPUT_IDS = {
    "GroupId": pl.String,
    "InputId": pl.String,
}

# Output schema for enzsub relations of a single (ungrouped) selection.
SCHEMA_ENZSUB = {
    "SourceId": pl.String,
    "TargetId": pl.String,
    "TargetSite": pl.String,
    "Modification": pl.String,
}

# Same as SCHEMA_ENZSUB with a leading `GroupId` column.
SCHEMA_GROUP_ENZSUB = {
    "GroupId": pl.String,
    "SourceId": pl.String,
    "TargetId": pl.String,
    "TargetSite": pl.String,
    "Modification": pl.String,
}

# Output schema for interaction relations of a single (ungrouped) selection.
SCHEMA_INTERACTIONS = {
    "SourceId": pl.String,
    "TargetId": pl.String,
    "IsDirected": pl.Boolean,
    "IsStimulation": pl.Boolean,
    "IsInhibition": pl.Boolean,
}

# Same as SCHEMA_INTERACTIONS with a leading `GroupId` column.
SCHEMA_GROUP_INTERACTIONS = {
    "GroupId": pl.String,
    "SourceId": pl.String,
    "TargetId": pl.String,
    "IsDirected": pl.Boolean,
    "IsStimulation": pl.Boolean,
    "IsInhibition": pl.Boolean,
}

# Raw enzsub column name -> renamed column name.
# NOTE(review): `_ResidueType` and `_ResidueOffset` do not appear in
# SCHEMA_ENZSUB; presumably they are intermediate columns combined into
# `TargetSite` — confirm against `util`.
COLS_RENAMED_ENZSUB = {
    "enzyme": "SourceId",
    "substrate": "TargetId",
    "residue_type": "_ResidueType",
    "residue_offset": "_ResidueOffset",
    "modification": "Modification",
}

# Raw interactions column name -> renamed column name.
COLS_RENAMED_INTERACTIONS = {
    "source": "SourceId",
    "target": "TargetId",
    "is_directed": "IsDirected",
    "is_stimulation": "IsStimulation",
    "is_inhibition": "IsInhibition",
}

# Captures the second field of a pipe-delimited identifier. This is the same
# pattern as `RE_UNIPROT_PIPE` in `bioextract/_shared.py`.
RE_UNIPROT_PIPE = re.compile(r"^[^|]+\|([^|]+)\|")

# Names of the OmniPath resources supported by this package.
OmniPathResourceName = Literal["enzsub", "interactions"]
@@ -0,0 +1,335 @@
1
+ import os
2
+ from collections.abc import Iterable, Mapping
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+
6
+ import polars as pl
7
+
8
+ from bioextract._shared import validate_count_limit, validate_file_size
9
+
10
+ from .._shared import create_group_input_frames, create_input_id_frame
11
+ from .constant import (
12
+ SCHEMA_GROUP_INPUT_IDS,
13
+ SCHEMA_GROUPS,
14
+ SCHEMA_UNMAPPED,
15
+ OmniPathResourceName,
16
+ )
17
+ from .spec import OmniPathResourceLimits
18
+ from .util import (
19
+ extract_enzsub_frame,
20
+ extract_interactions_frame,
21
+ )
22
+
23
+ __all__ = [
24
+ "OmniPathDb",
25
+ ]
26
+
27
+
28
+ @dataclass(frozen=True, slots=True)
29
+ class _OmniPathSnapshot:
30
+ file_enzsub: Path | None = None
31
+ file_interactions: Path | None = None
32
+
33
+
34
+ @dataclass(slots=True)
35
+ class OmniPathDb:
36
+ """Path-first access to local OmniPath relation files.
37
+
38
+ `OmniPathDb` is the public entrypoint for extracting OmniPath
39
+ enzyme-substrate relations and interaction relations from local files.
40
+ It keeps dataset-level resource limits and exposes single and grouped
41
+ selections through one selection type.
42
+ """
43
+
44
+ snapshot: _OmniPathSnapshot
45
+ limits: OmniPathResourceLimits = field(default_factory=OmniPathResourceLimits)
46
+
47
+ DEFAULT_RESOURCE_LIMITS = OmniPathResourceLimits()
48
+
49
+ @classmethod
50
+ def from_files(
51
+ cls,
52
+ *,
53
+ file_enzsub: os.PathLike[str] | str | None = None,
54
+ file_interactions: os.PathLike[str] | str | None = None,
55
+ limits: OmniPathResourceLimits | None = None,
56
+ ) -> "OmniPathDb":
57
+ """Create a dataset handle from local OmniPath files.
58
+
59
+ Args:
60
+ file_enzsub: Path to a local OmniPath `enzsub` text or gzip file.
61
+ file_interactions: Path to a local OmniPath `interactions` text or
62
+ gzip file.
63
+ limits: Dataset-level resource limits. When omitted, default
64
+ fail-fast limits are used. see :class:`OmniPathResourceLimits` and
65
+ `OmniPathDb.DEFAULT_RESOURCE_LIMITS` for details.
66
+
67
+ Returns:
68
+ A dataset handle that can produce single or grouped selections.
69
+
70
+ Raises:
71
+ FileNotFoundError: If any provided file does not exist.
72
+ ValueError: If no resource files are provided or a configured
73
+ file-size limit is exceeded.
74
+ """
75
+ if file_enzsub is None and file_interactions is None:
76
+ raise ValueError("At least one OmniPath resource file must be provided")
77
+
78
+ limits_resolved = OmniPathResourceLimits() if limits is None else limits
79
+
80
+ if file_enzsub is not None:
81
+ file_enzsub = Path(file_enzsub)
82
+ if not file_enzsub.exists():
83
+ raise FileNotFoundError(
84
+ f"OmniPath enzsub file not found: {file_enzsub}"
85
+ )
86
+ validate_file_size(
87
+ file_path=file_enzsub,
88
+ size_max=limits_resolved.file_enzsub_bytes_max,
89
+ label="OmniPath enzsub file",
90
+ )
91
+
92
+ if file_interactions is not None:
93
+ file_interactions = Path(file_interactions)
94
+ if not file_interactions.exists():
95
+ raise FileNotFoundError(
96
+ f"OmniPath interactions file not found: {file_interactions}"
97
+ )
98
+ validate_file_size(
99
+ file_path=file_interactions,
100
+ size_max=limits_resolved.file_interactions_bytes_max,
101
+ label="OmniPath interactions file",
102
+ )
103
+
104
+ return cls(
105
+ snapshot=_OmniPathSnapshot(
106
+ file_enzsub=file_enzsub,
107
+ file_interactions=file_interactions,
108
+ ),
109
+ limits=limits_resolved,
110
+ )
111
+
112
+ @property
113
+ def available_resources(self) -> frozenset[OmniPathResourceName]:
114
+ resources: set[OmniPathResourceName] = set()
115
+ if self.snapshot.file_enzsub is not None:
116
+ resources.add("enzsub")
117
+ if self.snapshot.file_interactions is not None:
118
+ resources.add("interactions")
119
+ return frozenset(resources)
120
+
121
+ def select_ids(self, ids: Iterable[str]) -> OmniPathSelection:
122
+ """Create a single-query selection from input IDs."""
123
+ df_input_ids = create_input_id_frame(ids, schema_unmapped=SCHEMA_UNMAPPED)
124
+ validate_count_limit(
125
+ count=df_input_ids.height,
126
+ limit_max=self.limits.num_input_ids_max,
127
+ label="Normalized input ID count",
128
+ )
129
+ return OmniPathSelection(
130
+ dataset=self,
131
+ _df_input_ids=df_input_ids,
132
+ _df_groups=None,
133
+ resources_selected=self.available_resources,
134
+ )
135
+
136
+ def select_groups(
137
+ self,
138
+ group_to_ids: Mapping[str, Iterable[str]],
139
+ ) -> OmniPathSelection:
140
+ """Create a grouped selection from multiple input-ID sets."""
141
+ grp_in_frames = create_group_input_frames(
142
+ group_to_ids,
143
+ schema_groups=SCHEMA_GROUPS,
144
+ schema_group_input_ids=SCHEMA_GROUP_INPUT_IDS,
145
+ )
146
+ validate_count_limit(
147
+ count=grp_in_frames.df_groups.height,
148
+ limit_max=self.limits.num_groups_max,
149
+ label="Group count",
150
+ )
151
+ validate_count_limit(
152
+ count=grp_in_frames.df_input_ids.height,
153
+ limit_max=self.limits.num_input_ids_max,
154
+ label="Normalized input ID count",
155
+ )
156
+ return OmniPathSelection(
157
+ dataset=self,
158
+ _df_groups=grp_in_frames.df_groups,
159
+ _df_input_ids=grp_in_frames.df_input_ids,
160
+ resources_selected=self.available_resources,
161
+ )
162
+
163
+
164
+ @dataclass(slots=True)
165
+ class OmniPathSelection:
166
+ """Selection handle for both single and grouped OmniPath queries."""
167
+
168
+ dataset: OmniPathDb
169
+ _df_input_ids: pl.DataFrame = field(repr=False)
170
+ _df_groups: pl.DataFrame | None = field(repr=False)
171
+ resources_selected: frozenset[OmniPathResourceName]
172
+ _df_enzsub: pl.DataFrame | None = field(default=None, repr=False)
173
+ _df_interactions: pl.DataFrame | None = field(default=None, repr=False)
174
+ _df_unmapped: pl.DataFrame | None = field(default=None, repr=False)
175
+
176
+ @property
177
+ def is_grouped(self) -> bool:
178
+ """Report whether this selection carries `GroupId` through outputs."""
179
+ return self._df_groups is not None
180
+
181
+ @property
182
+ def _col_group_id(self) -> tuple[str, ...]:
183
+ """Return the group ID column when this selection is grouped."""
184
+ return ("GroupId",) if self.is_grouped else ()
185
+
186
+ def with_resources(
187
+ self,
188
+ resources: Iterable[OmniPathResourceName],
189
+ ) -> OmniPathSelection:
190
+ """Create a new selection constrained to the given OmniPath resources."""
191
+ resources_selected = frozenset(resources)
192
+ resources_invalid = resources_selected.difference({"enzsub", "interactions"})
193
+ if resources_invalid:
194
+ raise ValueError(
195
+ f"Unsupported OmniPath resources: {sorted(resources_invalid)}"
196
+ )
197
+ if not resources_selected:
198
+ raise ValueError("At least one OmniPath resource must be selected")
199
+
200
+ return OmniPathSelection(
201
+ dataset=self.dataset,
202
+ _df_input_ids=self._df_input_ids,
203
+ _df_groups=self._df_groups,
204
+ resources_selected=resources_selected,
205
+ _df_enzsub=self._df_enzsub if "enzsub" in resources_selected else None,
206
+ _df_interactions=(
207
+ self._df_interactions if "interactions" in resources_selected else None
208
+ ),
209
+ )
210
+
211
+ def with_enzsub(self) -> OmniPathSelection:
212
+ """Create a new selection constrained to OmniPath enzsub relations."""
213
+ return self.with_resources(["enzsub"])
214
+
215
+ def with_interactions(self) -> OmniPathSelection:
216
+ """Create a new selection constrained to OmniPath interactions."""
217
+ return self.with_resources(["interactions"])
218
+
219
+ def extract_enzsub(self) -> pl.DataFrame:
220
+ """Extract matched OmniPath enzyme-substrate relations.
221
+
222
+ Returns:
223
+ A materialized table with one of these schemas:
224
+
225
+ - single selection: `SourceId`, `TargetId`, `TargetSite`, `Modification`
226
+ - grouped selection: `GroupId`, `SourceId`, `TargetId`, `TargetSite`,
227
+ `Modification`
228
+
229
+ Raises:
230
+ ValueError: If the selection does not enable `enzsub`, if the
231
+ `enzsub` file is missing, or if the file is missing required
232
+ columns.
233
+ """
234
+ if "enzsub" not in self.resources_selected:
235
+ raise ValueError(
236
+ "OmniPath resource 'enzsub' is not enabled for this selection"
237
+ )
238
+ if self.dataset.snapshot.file_enzsub is None:
239
+ raise ValueError("Cannot extract OmniPath enzsub without enzsub file")
240
+ if self._df_enzsub is None:
241
+ self._df_enzsub = extract_enzsub_frame(
242
+ file_enzsub=self.dataset.snapshot.file_enzsub,
243
+ df_input_ids=self._df_input_ids,
244
+ cols_group_id=self._col_group_id,
245
+ )
246
+ return self._df_enzsub
247
+
248
+ def extract_interactions(self) -> pl.DataFrame:
249
+ """Extract matched OmniPath interaction relations.
250
+
251
+ Returns:
252
+ A materialized table with one of these schemas:
253
+
254
+ - single selection: `SourceId`, `TargetId`, `IsDirected`,
255
+ `IsStimulation`, `IsInhibition`
256
+ - grouped selection: `GroupId`, `SourceId`, `TargetId`,
257
+ `IsDirected`, `IsStimulation`, `IsInhibition`
258
+
259
+ Raises:
260
+ ValueError: If the selection does not enable `interactions`, if the
261
+ interactions file is missing, or if the file is missing
262
+ required columns.
263
+ """
264
+ if "interactions" not in self.resources_selected:
265
+ raise ValueError(
266
+ "OmniPath resource 'interactions' is not enabled for this selection"
267
+ )
268
+ if self.dataset.snapshot.file_interactions is None:
269
+ raise ValueError(
270
+ "Cannot extract OmniPath interactions without interactions file"
271
+ )
272
+ if self._df_interactions is None:
273
+ self._df_interactions = extract_interactions_frame(
274
+ file_interactions=self.dataset.snapshot.file_interactions,
275
+ df_input_ids=self._df_input_ids,
276
+ cols_group_id=self._col_group_id,
277
+ )
278
+ return self._df_interactions
279
+
280
+ def extract_unmapped_input_ids(self) -> pl.DataFrame:
281
+ """Extract normalized input IDs not found in the selected resources.
282
+
283
+ Returns:
284
+ A materialized table with one of these schemas:
285
+
286
+ - single selection: `InputId`
287
+ - grouped selection: `GroupId`, `InputId`
288
+ """
289
+ if self._df_unmapped is None:
290
+ col_group_id = list(self._col_group_id)
291
+ cols_index = col_group_id + ["InputId"]
292
+
293
+ df_matched_parts: list[pl.DataFrame] = []
294
+ if "enzsub" in self.resources_selected:
295
+ df_enzsub = self.extract_enzsub()
296
+ df_matched_parts.extend(
297
+ [
298
+ df_enzsub.select(
299
+ col_group_id + [pl.col("SourceId").alias("InputId")]
300
+ ),
301
+ df_enzsub.select(
302
+ col_group_id + [pl.col("TargetId").alias("InputId")]
303
+ ),
304
+ ]
305
+ )
306
+
307
+ if "interactions" in self.resources_selected:
308
+ df_interactions = self.extract_interactions()
309
+ df_matched_parts.extend(
310
+ [
311
+ df_interactions.select(
312
+ col_group_id + [pl.col("SourceId").alias("InputId")]
313
+ ),
314
+ df_interactions.select(
315
+ col_group_id + [pl.col("TargetId").alias("InputId")]
316
+ ),
317
+ ]
318
+ )
319
+
320
+ df_matched_input_ids = (
321
+ pl.concat(df_matched_parts, how="vertical_relaxed")
322
+ .join(self._df_input_ids, on=cols_index, how="inner")
323
+ .unique(subset=cols_index)
324
+ .sort(cols_index)
325
+ )
326
+ self._df_unmapped = (
327
+ self._df_input_ids.join(
328
+ df_matched_input_ids,
329
+ on=cols_index,
330
+ how="anti",
331
+ )
332
+ .select(cols_index)
333
+ .sort(cols_index)
334
+ )
335
+ return self._df_unmapped
@@ -0,0 +1,13 @@
1
+ from dataclasses import dataclass
2
+
3
+ __all__ = [
4
+ "OmniPathResourceLimits",
5
+ ]
6
+
7
+
8
+ @dataclass(frozen=True, slots=True)
9
+ class OmniPathResourceLimits:
10
+ file_enzsub_bytes_max: int | None = 512 * 1024 * 1024
11
+ file_interactions_bytes_max: int | None = 4 * 1024 * 1024 * 1024
12
+ num_input_ids_max: int | None = 100_000
13
+ num_groups_max: int | None = 1_000