alphapulldown-input-parser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alphapulldown_input_parser/__init__.py +21 -0
- alphapulldown_input_parser/parser.py +273 -0
- alphapulldown_input_parser-0.1.0.dist-info/METADATA +50 -0
- alphapulldown_input_parser-0.1.0.dist-info/RECORD +7 -0
- alphapulldown_input_parser-0.1.0.dist-info/WHEEL +5 -0
- alphapulldown_input_parser-0.1.0.dist-info/licenses/LICENSE +25 -0
- alphapulldown_input_parser-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Utilities for parsing AlphaPulldown fold specifications."""
|
|
2
|
+
|
|
3
|
+
from .parser import (
|
|
4
|
+
FormatError,
|
|
5
|
+
FeatureIndex,
|
|
6
|
+
Region,
|
|
7
|
+
RegionSelection,
|
|
8
|
+
expand_fold_specification,
|
|
9
|
+
generate_fold_specifications,
|
|
10
|
+
parse_fold,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
# Names re-exported from the ``.parser`` module as this package's public API.
__all__ = [
    "expand_fold_specification",
    "parse_fold",
    "FormatError",
    "FeatureIndex",
    "Region",
    "RegionSelection",
    "generate_fold_specifications",
]
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
"""Standalone parser for AlphaPulldown fold specifications."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import warnings
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from itertools import product
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Dict, Iterable, List, NamedTuple, Optional, Sequence, Tuple, Union
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class FormatError(ValueError):
    """Signal that a fold specification string is malformed.

    Derives from ``ValueError``, so code that catches ``ValueError`` also
    catches this exception.
    """
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _format_error(spec: str, msg: str | None = None) -> None:
    """Raise ``FormatError`` using the historical AlphaPulldown wording.

    Args:
        spec: The offending fold specification, echoed in the message.
        msg: Optional detail appended to the base message after one space.

    Raises:
        FormatError: Always; this function never returns normally.
    """
    message = f"Your format: {spec} is wrong. The program will terminate."
    if msg:
        message = f"{message} {msg}"
    raise FormatError(message)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(frozen=True)
class Region:
    """Inclusive ``[start, end]`` range of positions in a protein sequence.

    Raises:
        ValueError: If either bound is negative, or if ``start`` is greater
            than ``end``.
    """

    start: int
    end: int

    def __post_init__(self) -> None:
        # Validate eagerly so that an invalid Region can never be constructed.
        lower, upper = self.start, self.end
        if lower < 0 or upper < 0:
            raise ValueError("Region boundaries must be non-negative integers.")
        if upper < lower:
            raise ValueError("Region start must not exceed region end.")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(frozen=True)
class RegionSelection:
    """Region selection attached to one fold entry.

    ``regions`` is either a tuple of ``Region`` objects or ``None``. ``None``
    means that no explicit regions were specified (the "all" selection).
    """

    regions: Tuple[Region, ...] | None = None

    @classmethod
    def all(cls) -> "RegionSelection":
        """Build the selection whose ``regions`` is ``None``."""
        return cls(regions=None)

    @property
    def is_all(self) -> bool:
        """Return True when ``regions`` is ``None``."""
        unrestricted = self.regions is None
        return unrestricted
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# One fold entry as built by expand_fold_specification: either
# {"json_input": <path of a JSON file>} or {<protein name>: RegionSelection}.
FoldEntry = Dict[str, Union[str, RegionSelection]]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class ExpandResult(NamedTuple):
    """Outcome of expanding one fold specification.

    Attributes:
        formatted_folds: Fold entries produced for the features that were found.
        missing_features: Names of the features that were not found.
    """

    formatted_folds: List[FoldEntry]
    missing_features: List[str]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass(frozen=True)
class FeatureIndex:
    """Lookup tables describing which feature files are available.

    Attributes:
        pkl: Maps a feature base name to the paths of its pickle files.
        json: Maps a JSON file name to its path.
    """

    pkl: Dict[str, Tuple[str, ...]]
    json: Dict[str, str]

    def has_pkl(self, name: str) -> bool:
        """Tell whether ``name`` is a key of the ``pkl`` table."""
        known = name in self.pkl
        return known

    def json_path(self, name: str) -> Optional[str]:
        """Return the path stored for the JSON file ``name``, or ``None`` if absent."""
        return self.json.get(name, None)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _build_feature_index(directories: Sequence[Path]) -> FeatureIndex:
    """Scan ``directories`` once and index the feature files found in them.

    Only regular files located directly inside each directory are considered;
    entries of ``directories`` that are not directories are skipped. Files
    ending in ``.json`` are indexed by their full file name, and the first
    path recorded for a given name is kept. Files ending in ``.pkl`` or
    ``.pkl.xz`` are grouped under the file name with that suffix removed.

    Args:
        directories: Directories to scan.

    Returns:
        A ``FeatureIndex`` holding the collected pickle and JSON paths.
    """
    pickles_by_name: Dict[str, List[str]] = {}
    json_by_name: Dict[str, str] = {}

    for folder in directories:
        if not folder.is_dir():
            continue
        for item in folder.iterdir():
            if not item.is_file():
                continue
            file_name = item.name
            if file_name.endswith(".json"):
                # Keep the first path seen for a given JSON file name.
                if file_name not in json_by_name:
                    json_by_name[file_name] = str(item)
                continue
            for suffix in (".pkl", ".pkl.xz"):
                if file_name.endswith(suffix):
                    stem = file_name[: -len(suffix)]
                    pickles_by_name.setdefault(stem, []).append(str(item))
                    break

    frozen_pickles = {name: tuple(paths) for name, paths in pickles_by_name.items()}
    return FeatureIndex(pkl=frozen_pickles, json=json_by_name)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _extract_copy_and_regions(tokens: Sequence[str], spec: str) -> Tuple[int, Sequence[str]]:
|
|
101
|
+
"""Return copy count and the remaining region tokens."""
|
|
102
|
+
if len(tokens) > 1:
|
|
103
|
+
try:
|
|
104
|
+
return int(tokens[1]), tokens[2:]
|
|
105
|
+
except ValueError:
|
|
106
|
+
pass
|
|
107
|
+
try:
|
|
108
|
+
return int(tokens[-1]), tokens[1:-1]
|
|
109
|
+
except ValueError:
|
|
110
|
+
pass
|
|
111
|
+
return 1, tokens[1:]
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _parse_regions(region_tokens: Sequence[str], spec: str) -> RegionSelection:
    """Turn ``start-stop`` tokens into a ``RegionSelection``.

    Args:
        region_tokens: Region tokens of one protein entry; may be empty.
        spec: The full fold specification, used only for error reporting.

    Returns:
        ``RegionSelection.all()`` when there are no tokens, otherwise a
        selection holding one ``Region`` per token, in the given order.

    Raises:
        FormatError: If a token is not of the form ``start-stop``, has
            non-integer bounds, or is rejected by ``Region`` validation.
    """
    if not region_tokens:
        return RegionSelection.all()

    parsed: List[Region] = []
    for token in region_tokens:
        bounds = token.split("-")
        if len(bounds) != 2:
            _format_error(spec, msg=f"Region token '{token}' is not of form start-stop.")
        try:
            first, last = int(bounds[0]), int(bounds[1])
        except ValueError:
            _format_error(spec, msg=f"Region token '{token}' contains non-integer bounds.")
        try:
            region = Region(start=first, end=last)
        except ValueError as err:
            # Re-report Region's own validation message in the FormatError wording.
            _format_error(spec, msg=str(err))
        parsed.append(region)
    return RegionSelection(regions=tuple(parsed))
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def expand_fold_specification(
    spec: str,
    features_directory: Iterable[str],
    protein_delimiter: str,
    *,
    feature_index: FeatureIndex | None = None,
) -> ExpandResult:
    """Expand a single fold specification.

    ``spec`` is split on ``protein_delimiter``. A piece ending in ``.json`` is
    looked up by file name and, when found, yields ``{"json_input": path}``.
    Any other piece is read as colon-separated tokens: a protein name plus an
    optional copy count and optional ``start-stop`` regions. It yields one
    ``{name: RegionSelection}`` entry per copy when pickle features exist for
    the name. Features that cannot be found are reported in the result rather
    than raised.

    Args:
        spec: The fold specification string.
        features_directory: Directories to scan for feature files. A single
            ``str`` or ``Path`` is accepted and treated as one directory.
            Ignored when ``feature_index`` is given.
        protein_delimiter: Separator between the entries of ``spec``.
        feature_index: Optional pre-built index, to avoid re-scanning the
            directories for every specification.

    Returns:
        An ``ExpandResult`` of ``(formatted_folds, missing_features)``.

    Raises:
        FormatError: If a protein name is empty, or if the copy/region tokens
            of an entry are malformed.
    """
    index = feature_index
    if index is None:
        # A lone str would be iterated character by character (indexing "/" and
        # single-letter paths) and a lone Path is not iterable at all, so treat
        # either as a single directory.
        if isinstance(features_directory, (str, Path)):
            features_directory = [features_directory]
        directories = tuple(Path(d).expanduser().resolve() for d in features_directory)
        index = _build_feature_index(directories)

    formatted_folds: List[FoldEntry] = []
    missing_features: List[str] = []

    for pf in spec.split(protein_delimiter):
        if pf.endswith(".json"):
            json_path = index.json_path(pf)
            if json_path:
                formatted_folds.append({"json_input": json_path})
            else:
                missing_features.append(pf)
            continue

        # str.split always returns at least one element, so only the name
        # itself needs to be checked for emptiness.
        tokens = pf.split(":")
        name = tokens[0]
        if not name:
            _format_error(spec, msg="Protein token is empty.")

        number, region_tokens = _extract_copy_and_regions(tokens, spec)
        regions = _parse_regions(region_tokens, spec)

        if not index.has_pkl(name):
            missing_features.append(name)
            continue

        # One separate entry per requested copy of the protein.
        formatted_folds.extend({name: regions} for _ in range(number))

    return ExpandResult(formatted_folds=formatted_folds, missing_features=missing_features)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def parse_fold(
    input_list: List[str],
    features_directory: Iterable[str],
    protein_delimiter: str,
) -> List[List[FoldEntry]]:
    """Parse a list of fold specifications into folding jobs.

    The feature directories are indexed once and every specification is
    expanded against that index.

    Args:
        input_list: Fold specification strings. A single ``str`` is accepted
            and treated as one specification.
        features_directory: Directories containing feature files. A single
            ``str`` or ``Path`` is accepted and treated as one directory.
        protein_delimiter: Separator between the entries of a specification.

    Returns:
        One list of fold entries per specification, in input order.
        Specifications that produce no fold entries are omitted.

    Raises:
        FileNotFoundError: If any referenced feature is missing from the
            given directories; the message lists all of them, sorted.
        FormatError: If a specification is malformed.
    """
    # A bare string is itself iterable, so it would be consumed character by
    # character; wrap single values so they are handled as one item.
    if isinstance(input_list, str):
        input_list = [input_list]
    if isinstance(features_directory, (str, Path)):
        features_directory = [features_directory]

    directories = tuple(features_directory)
    # Keep the directories as the caller wrote them for the error message.
    directory_labels = [str(d) for d in directories]
    directory_paths = tuple(Path(d).expanduser().resolve() for d in directories)
    feature_index = _build_feature_index(directory_paths)

    all_folding_jobs: List[List[FoldEntry]] = []
    missing_features: set[str] = set()

    for spec in input_list:
        result = expand_fold_specification(
            spec=spec,
            features_directory=directories,
            protein_delimiter=protein_delimiter,
            feature_index=feature_index,
        )
        missing_features.update(result.missing_features)
        if result.formatted_folds:
            all_folding_jobs.append(result.formatted_folds)

    # Report every missing feature at once instead of failing on the first.
    if missing_features:
        raise FileNotFoundError(f"{sorted(missing_features)} not found in {directory_labels}")

    return all_folding_jobs
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _read_nonempty_lines(path: Path) -> List[str]:
|
|
215
|
+
with path.open(mode="r", encoding="utf-8") as handle:
|
|
216
|
+
return [line.strip() for line in handle if line.strip()]
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def generate_fold_specifications(
    input_files: Sequence[Union[str, Path]],
    *,
    delimiter: str = "+",
    exclude_permutations: bool = True,
    output_path: Optional[Union[str, Path]] = None,
) -> List[str]:
    """Build fold specifications from the Cartesian product of input files.

    Every input file contributes its non-blank lines. One line is taken from
    each file, in file order, and joined with ``delimiter``.

    Args:
        input_files: Paths to text files containing one specification per line.
        delimiter: String used to join the members of a combination.
        exclude_permutations: When True, drop any combination whose members
            are a reordering of a combination that appeared earlier.
        output_path: Optional file to write the specifications to, one per
            line; missing parent directories are created.

    Returns:
        The joined specification strings. If any input file has no non-blank
        lines, a ``RuntimeWarning`` is issued and an empty list is returned
        immediately, without writing ``output_path``.
    """
    resolved_inputs = [Path(item).expanduser().resolve() for item in input_files]

    per_file_lines: List[List[str]] = []
    for source in resolved_inputs:
        entries = _read_nonempty_lines(source)
        if not entries:
            warnings.warn(
                f"Input file '{source}' contains no specifications; skipping combination generation.",
                RuntimeWarning,
            )
            return []
        per_file_lines.append(entries)

    # product() with no arguments yields a single empty tuple, so guard the
    # no-input case explicitly to get no combinations at all.
    if per_file_lines:
        combos = [tuple(str(part) for part in combo) for combo in product(*per_file_lines)]
    else:
        combos = []

    if exclude_permutations:
        seen_keys = set()
        unique_combos = []
        for combo in combos:
            # Sorting the members gives an order-independent key.
            key = tuple(sorted(combo))
            if key not in seen_keys:
                seen_keys.add(key)
                unique_combos.append(combo)
        combos = unique_combos

    specifications = [delimiter.join(combo) for combo in combos]

    if output_path:
        destination = Path(output_path).expanduser().resolve()
        destination.parent.mkdir(parents=True, exist_ok=True)
        trailer = "\n" if specifications else ""
        destination.write_text("\n".join(specifications) + trailer, encoding="utf-8")

    return specifications
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: alphapulldown-input-parser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fold specification parser for AlphaPulldown
|
|
5
|
+
Author-email: Kosinski Lab <alphapulldown@embl-hamburg.de>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/KosinskiLab/alphapulldown-input-parser
|
|
8
|
+
Project-URL: Issues, https://github.com/KosinskiLab/alphapulldown-input-parser/issues
|
|
9
|
+
Keywords: alphapulldown,folding,bioinformatics,parser
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Requires-Python: >=3.8
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest; extra == "dev"
|
|
23
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
24
|
+
Requires-Dist: build; extra == "dev"
|
|
25
|
+
Requires-Dist: twine; extra == "dev"
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# alphapulldown-input-parser
|
|
29
|
+
|
|
30
|
+
Reusable parser for AlphaPulldown-style fold specifications. Install it with:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install alphapulldown-input-parser
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
or, for local development:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install -e /path/to/alphapulldown-input-parser
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
The package exposes two helpers:
|
|
43
|
+
|
|
44
|
+
* `parse_fold(...)` – mirrors the historical AlphaPulldown helper and performs
|
|
45
|
+
feature existence checks.
|
|
46
|
+
* `expand_fold_specification(...)` – expands a single fold string without
|
|
47
|
+
raising if features are missing.
|
|
48
|
+
|
|
49
|
+
The parser is dependency-free and works across AlphaPulldown, the Snakemake
|
|
50
|
+
pipeline, or any other tooling that consumes the same fold syntax.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
alphapulldown_input_parser/__init__.py,sha256=jTB7jBzk3AiL3iwLy_nW1HmoEd3PkhgprlRLv9hnH-4,415
|
|
2
|
+
alphapulldown_input_parser/parser.py,sha256=F9N1bsDl5e_Ya7aIxiakV_HEjfy8rB2nG5AR344TVgM,8728
|
|
3
|
+
alphapulldown_input_parser-0.1.0.dist-info/licenses/LICENSE,sha256=qmUOg-ZOGHg3dGfFr1Wj2SGwiZLa1pU95KuqxWdGY5E,1170
|
|
4
|
+
alphapulldown_input_parser-0.1.0.dist-info/METADATA,sha256=e8nmBrVr0RKpOQ8LTheNRDKGdl2Ks2XJNoBGYBGEa9Y,1747
|
|
5
|
+
alphapulldown_input_parser-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
6
|
+
alphapulldown_input_parser-0.1.0.dist-info/top_level.txt,sha256=rnTxJyazk4dq0QWOEUwHkqF0wyyGiAXKu_fVX0rXfYI,27
|
|
7
|
+
alphapulldown_input_parser-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Kosinski Lab
|
|
4
|
+
Copyright (c) 2025 KosinskiLab
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
10
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
11
|
+
in the Software without restriction, including without limitation the rights
|
|
12
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
13
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
14
|
+
furnished to do so, subject to the following conditions:
|
|
15
|
+
|
|
16
|
+
The above copyright notice and this permission notice shall be included in all
|
|
17
|
+
copies or substantial portions of the Software.
|
|
18
|
+
|
|
19
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
20
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
21
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
22
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
23
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
24
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
25
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
alphapulldown_input_parser
|