alphapulldown-input-parser 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (14) hide show
  1. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.2.0}/LICENSE +0 -4
  2. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.2.0}/PKG-INFO +1 -1
  3. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.2.0}/pyproject.toml +1 -1
  4. alphapulldown_input_parser-0.2.0/src/alphapulldown_input_parser/parser.py +450 -0
  5. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.2.0}/src/alphapulldown_input_parser.egg-info/PKG-INFO +1 -1
  6. alphapulldown_input_parser-0.1.0/src/alphapulldown_input_parser/parser.py +0 -273
  7. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.2.0}/README.md +0 -0
  8. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.2.0}/setup.cfg +0 -0
  9. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.2.0}/src/alphapulldown_input_parser/__init__.py +0 -0
  10. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.2.0}/src/alphapulldown_input_parser.egg-info/SOURCES.txt +0 -0
  11. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.2.0}/src/alphapulldown_input_parser.egg-info/dependency_links.txt +0 -0
  12. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.2.0}/src/alphapulldown_input_parser.egg-info/requires.txt +0 -0
  13. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.2.0}/src/alphapulldown_input_parser.egg-info/top_level.txt +0 -0
  14. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.2.0}/test/test_parser.py +0 -0
@@ -1,10 +1,6 @@
1
1
  MIT License
2
2
 
3
- <<<<<<< HEAD
4
- Copyright (c) 2024 Kosinski Lab
5
- =======
6
3
  Copyright (c) 2025 KosinskiLab
7
- >>>>>>> f65396f8370fada264ee798bbeb0666d4c5726d3
8
4
 
9
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
10
6
  of this software and associated documentation files (the "Software"), to deal
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alphapulldown-input-parser
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Fold specification parser for AlphaPulldown
5
5
  Author-email: Kosinski Lab <alphapulldown@embl-hamburg.de>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "alphapulldown-input-parser"
7
- version = "0.1.0"
7
+ version = "0.2.0"
8
8
  description = "Fold specification parser for AlphaPulldown"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -0,0 +1,450 @@
1
+ """Standalone parser for AlphaPulldown fold specifications."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import warnings
6
+ from dataclasses import dataclass
7
+ from itertools import product
8
+ from pathlib import Path
9
+ from typing import Dict, Iterable, List, NamedTuple, Optional, Sequence, Tuple, Union
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # Helpers
13
+ # ---------------------------------------------------------------------------
14
+
15
+
16
+ def _deduplicate_preserve_order(items: Iterable[str]) -> Tuple[str, ...]:
17
+ """Return a tuple containing the first occurrence of every item."""
18
+ return tuple(dict.fromkeys(items))
19
+
20
+
21
+ def _strip_path_and_extension(value: str) -> str:
22
+ return Path(value).stem
23
+
24
+
25
def _format_error(spec: str, msg: str | None = None) -> None:
    """Raise ``FormatError`` using the historical AlphaPulldown wording.

    Args:
        spec: The offending fold specification, echoed in the message.
        msg: Optional detail appended after a single space.

    Raises:
        FormatError: Always; this function never returns normally.
    """
    message = f"Your format: {spec} is wrong. The program will terminate."
    if msg:
        message = f"{message} {msg}"
    raise FormatError(message)
30
+
31
+
32
+ def _read_nonempty_lines(path: Path) -> List[str]:
33
+ with path.open(mode="r", encoding="utf-8") as handle:
34
+ return [line.strip() for line in handle if line.strip()]
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Core data structures
39
+ # ---------------------------------------------------------------------------
40
+
41
+
42
class FormatError(ValueError):
    """Signal that a fold specification string is malformed."""
44
+
45
+
46
@dataclass(frozen=True)
class Region:
    """Inclusive residue range expressed with 1-based positions.

    Raises:
        ValueError: If either bound is below 1, or ``start`` is greater than ``end``.
    """

    start: int
    end: int

    def __post_init__(self) -> None:
        # Positivity is checked before ordering, so a non-positive bound is
        # always the error that gets reported.
        if any(bound < 1 for bound in (self.start, self.end)):
            raise ValueError("Region boundaries must be positive integers (1-based).")
        if self.start > self.end:
            raise ValueError("Region start must not exceed region end.")
58
+
59
+
60
@dataclass(frozen=True)
class RegionSelection:
    """Regions chosen for one fold entry; ``regions=None`` is the "all" marker."""

    regions: Tuple[Region, ...] | None = None

    @classmethod
    def all(cls) -> "RegionSelection":
        """Build a selection whose ``regions`` is ``None``."""
        return cls(regions=None)

    @property
    def is_all(self) -> bool:
        """``True`` only when ``regions`` is ``None`` (an empty tuple is not "all")."""
        no_explicit_regions = self.regions is None
        return no_explicit_regions
73
+
74
+
75
# A fold entry is a small mapping of one of two shapes:
#   {"json_input": "/path/to.json"}  or  {"CHAIN_A": RegionSelection(...)}
FoldEntry = Dict[str, Union[str, RegionSelection]]
77
+
78
+
79
class ExpandResult(NamedTuple):
    """Outcome of expanding one fold specification."""

    # Fold entries produced, in the order they were encountered.
    formatted_folds: List[FoldEntry]
    # Names for which the feature index had no matching file.
    missing_features: List[str]
82
+
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Fold dataset
86
+ # ---------------------------------------------------------------------------
87
+
88
+
89
@dataclass
class FoldDataset:
    """Parsed fold specifications together with the sequences they reference."""

    # Normalized specification strings, de-duplicated in first-seen order.
    fold_specifications: Tuple[str, ...]
    # Referenced inputs grouped under the keys "uniprot" and "local".
    sequences_by_origin: Dict[str, Tuple[str, ...]]
    # Normalized specification -> unique sequence base names used in that fold.
    sequences_by_fold: Dict[str, Tuple[str, ...]]

    @property
    def unique_sequences(self) -> Tuple[str, ...]:
        """Sequence base names across all folds, de-duplicated in first-seen order."""
        ordered: List[str] = []
        for spec in self.fold_specifications:
            ordered.extend(self.sequences_by_fold.get(spec, ()))
        return _deduplicate_preserve_order(ordered)

    def symlink_local_files(self, output_directory: Union[str, Path]) -> None:
        """Create a symlink in *output_directory* for every "local" sequence file.

        The directory is created if needed. An existing entry with the same
        name is replaced, except when that entry already *is* the source file:
        unlinking it would delete the source and leave a self-referential link.

        Args:
            output_directory: Directory that receives the symlinks.
        """
        output = Path(output_directory).expanduser().resolve()
        output.mkdir(parents=True, exist_ok=True)
        for file in self.sequences_by_origin.get("local", ()):
            source = Path(file).expanduser()
            target = output / Path(file).name
            if target.is_symlink():
                # Covers both valid and dangling links.
                target.unlink()
            elif target.exists():
                try:
                    already_in_place = target.samefile(source)
                except OSError:
                    # Source missing or unreadable: fall back to replacing.
                    already_in_place = False
                if already_in_place:
                    continue
                target.unlink()
            target.symlink_to(source)

    @classmethod
    def from_fold_specifications(
        cls,
        fold_specifications: Sequence[str],
        *,
        protein_delimiter: str = "+",
    ) -> "FoldDataset":
        """Build a dataset from raw specification strings.

        Each specification is split on *protein_delimiter*; each protein token
        is split on ":" and its first part is taken as the protein reference.
        References are reduced to their file stem for the normalized
        specification. A reference is classed as "local" when it names an
        existing file (or symlink) and contains a path separator or a suffix;
        everything else is classed as "uniprot".

        Args:
            fold_specifications: Raw specification strings.
            protein_delimiter: Separator between proteins within one specification.

        Returns:
            A new ``FoldDataset``.
        """
        normalized_specs: List[str] = []
        sequences_by_fold: Dict[str, Tuple[str, ...]] = {}
        referenced_sequences: List[str] = []

        for specification in fold_specifications:
            tokens = [
                token.strip()
                for token in specification.split(protein_delimiter)
                if token.strip()
            ]
            if not tokens:
                # Blank specification (or only delimiters): nothing to record.
                continue

            normalized_tokens: List[str] = []
            per_fold_sequences: List[str] = []
            for token in tokens:
                parts = [part.strip() for part in token.split(":")]
                protein_reference = parts[0]
                referenced_sequences.append(protein_reference)

                base_name = _strip_path_and_extension(protein_reference)
                per_fold_sequences.append(base_name)
                # Empty suffix parts (e.g. from "prot::2") are dropped.
                suffix_components = [part for part in parts[1:] if part]
                normalized_tokens.append(
                    ":".join([base_name, *suffix_components]) if suffix_components else base_name
                )

            normalized_spec = protein_delimiter.join(normalized_tokens)
            normalized_specs.append(normalized_spec)
            sequences_by_fold[normalized_spec] = _deduplicate_preserve_order(per_fold_sequences)

        unique_inputs = _deduplicate_preserve_order(referenced_sequences)
        sequences_by_origin: Dict[str, List[str]] = {"uniprot": [], "local": []}
        for sequence in unique_inputs:
            path = Path(sequence).expanduser()
            has_separator = "/" in sequence or "\\" in sequence
            has_suffix = path.suffix != ""
            exists = path.exists() and (path.is_file() or path.is_symlink())
            is_local = exists and (has_separator or has_suffix)
            if is_local:
                sequences_by_origin["local"].append(str(path.resolve()))
            else:
                sequences_by_origin["uniprot"].append(sequence)

        return cls(
            fold_specifications=_deduplicate_preserve_order(normalized_specs),
            sequences_by_origin={
                key: tuple(values) for key, values in sequences_by_origin.items()
            },
            sequences_by_fold=sequences_by_fold,
        )

    @classmethod
    def from_file(
        cls,
        filepath: Union[str, Path],
        *,
        protein_delimiter: str = "+",
    ) -> "FoldDataset":
        """Build a dataset from a text file with one specification per line.

        Blank lines are ignored and duplicate lines are collapsed before parsing.
        """
        path = Path(filepath).expanduser().resolve()
        specifications = _deduplicate_preserve_order(_read_nonempty_lines(path))
        return cls.from_fold_specifications(
            specifications,
            protein_delimiter=protein_delimiter,
        )
186
+
187
+ # ---------------------------------------------------------------------------
188
+ # Feature index
189
+ # ---------------------------------------------------------------------------
190
+
191
+
192
@dataclass(frozen=True)
class FeatureIndex:
    """Lookup tables of the feature files found on disk."""

    # Lookup key -> paths of matching pickle feature files.
    pkl: Dict[str, Tuple[str, ...]]
    # Lookup key -> path of the matching JSON file.
    json: Dict[str, str]

    def has_pkl(self, name: str) -> bool:
        """Report whether *name* is a key of the pickle table."""
        return name in self.pkl

    def json_path(self, name: str) -> Optional[str]:
        """Return the JSON path stored under *name*, or ``None`` when absent."""
        return self.json[name] if name in self.json else None
204
+
205
+
206
def _build_feature_index(directories: Sequence[Path]) -> FeatureIndex:
    """Scan *directories* (non-recursively) and index their feature files.

    ``.json`` files are indexed by file name and by stem. ``.pkl`` and
    ``.pkl.xz`` files are indexed by file name and by the name with the
    extension removed. Entries that are not regular files and paths that are
    not directories are ignored.

    Args:
        directories: Directories to scan, in priority order. For JSON keys the
            first file seen wins.

    Returns:
        A ``FeatureIndex`` whose contents are reproducible for a given
        directory state.
    """
    pkl: Dict[str, List[str]] = {}
    json_files: Dict[str, str] = {}

    for directory in directories:
        if not directory.is_dir():
            continue
        # iterdir() yields entries in arbitrary order. Sorting makes the
        # first-match-wins JSON lookup and the order of the pickle path
        # tuples deterministic.
        for entry in sorted(directory.iterdir()):
            if not entry.is_file():
                continue
            filename = entry.name
            location = str(entry)
            if filename.endswith(".json"):
                for key in {filename, entry.stem}:
                    json_files.setdefault(key, location)
            elif filename.endswith(".pkl"):
                for key in {filename[:-4], filename, entry.stem}:
                    pkl.setdefault(key, []).append(location)
            elif filename.endswith(".pkl.xz"):
                for key in {filename[:-7], filename, Path(filename[:-3]).stem}:
                    pkl.setdefault(key, []).append(location)

    return FeatureIndex(
        # Drop repeated paths while preserving order.
        pkl={name: tuple(dict.fromkeys(paths)) for name, paths in pkl.items()},
        json=json_files,
    )
237
+
238
+
239
+ # ---------------------------------------------------------------------------
240
+ # Expansion helpers
241
+ # ---------------------------------------------------------------------------
242
+
243
+
244
+ def _extract_copy_and_regions(tokens: Sequence[str], spec: str) -> Tuple[int, Sequence[str]]:
245
+ """Return copy count and the remaining region tokens."""
246
+ if len(tokens) > 1:
247
+ try:
248
+ return int(tokens[1]), tokens[2:]
249
+ except ValueError:
250
+ pass
251
+ try:
252
+ return int(tokens[-1]), tokens[1:-1]
253
+ except ValueError:
254
+ pass
255
+ return 1, tokens[1:]
256
+
257
+
258
def _parse_regions(region_tokens: Sequence[str], spec: str) -> RegionSelection:
    """Turn ``start-stop`` tokens into a ``RegionSelection``.

    An empty token sequence yields ``RegionSelection.all()``. Any token that
    is not two integers joined by one ``-``, or that ``Region`` rejects, is
    reported through ``_format_error`` (which raises ``FormatError``).
    """
    if not region_tokens:
        return RegionSelection.all()

    parsed: List[Region] = []
    for token in region_tokens:
        bounds = token.split("-")
        if len(bounds) != 2:
            _format_error(spec, msg=f"Region token '{token}' is not of form start-stop.")
        try:
            first, last = (int(bound) for bound in bounds)
        except ValueError:
            _format_error(spec, msg=f"Region token '{token}' contains non-integer bounds.")
        try:
            region = Region(start=first, end=last)
        except ValueError as exc:
            _format_error(spec, msg=str(exc))
        parsed.append(region)
    return RegionSelection(regions=tuple(parsed))
276
+
277
+
278
+ # ---------------------------------------------------------------------------
279
+ # Expansion logic
280
+ # ---------------------------------------------------------------------------
281
+
282
+
283
def expand_fold_specification(
    spec: str,
    features_directory: Union[str, Path, Iterable[Union[str, Path]]],
    protein_delimiter: str,
    *,
    feature_index: FeatureIndex | None = None,
) -> ExpandResult:
    """Expand a single fold specification into fold entries.

    Example:
        expand_fold_specification("protA:1-10+protB", ["/features"], "+", feature_index=index)

    Args:
        spec: Specification such as ``"protA:2:1-10+protB"`` or ``"job.json"``.
        features_directory: One directory or an iterable of directories. Only
            consulted (and scanned on disk) when *feature_index* is ``None``.
        protein_delimiter: Separator between proteins in *spec*.
        feature_index: Prebuilt index; supplying it avoids any directory scan.

    Returns:
        ExpandResult with the fold entries and the names that had no features.

    Raises:
        FormatError: Via ``_format_error`` for an empty protein name or a
            malformed region token.
    """
    index = feature_index
    if index is None:
        # A bare str is itself iterable; without this guard every character
        # would be resolved as a separate directory.
        if isinstance(features_directory, (str, Path)):
            features_directory = [features_directory]
        directories = tuple(Path(d).expanduser().resolve() for d in features_directory)
        index = _build_feature_index(directories)

    formatted_folds: List[FoldEntry] = []
    missing_features: List[str] = []

    for raw_pf in spec.split(protein_delimiter):
        pf = raw_pf.strip()
        if not pf:
            continue

        if pf.endswith(".json"):
            path_pf = Path(pf)
            # Prefer the exact file name, then fall back to the stem.
            json_path: Optional[str] = index.json_path(path_pf.name) or index.json_path(
                path_pf.stem
            )
            if json_path:
                formatted_folds.append({"json_input": json_path})
            else:
                missing_features.append(path_pf.name)
            continue

        tokens = [token.strip() for token in pf.split(":")]
        if not tokens or not tokens[0]:
            _format_error(spec, msg="Protein token is empty.")

        name = tokens[0]
        name_path = Path(name)
        # The copy count can be either the second or the last token.
        number, region_tokens = _extract_copy_and_regions(tokens, spec)
        regions = _parse_regions(region_tokens, spec)

        # Try the name as given, then its basename, then its stem.
        canonical_name = next(
            (
                candidate
                for candidate in (name, name_path.name, name_path.stem)
                if index.has_pkl(candidate)
            ),
            None,
        )
        if canonical_name is None:
            missing_features.append(name)
            continue

        # One independent entry per requested copy.
        formatted_folds.extend({canonical_name: regions} for _ in range(number))

    return ExpandResult(formatted_folds=formatted_folds, missing_features=missing_features)
350
+
351
+
352
def parse_fold(
    input_list: List[str],
    features_directory: Union[str, Path, Iterable[Union[str, Path]]],
    protein_delimiter: str,
) -> List[List[FoldEntry]]:
    """Parse a list of fold specifications into folding jobs.

    Example:
        parse_fold(["protA+protB"], ["/features"], "+")

    Args:
        input_list: Fold specification strings.
        features_directory: One directory or an iterable of directories that
            are scanned once to build the feature index.
        protein_delimiter: Separator between proteins in a specification.

    Returns:
        List of jobs (each job is a list of FoldEntry). Specifications that
        expand to no entries are omitted.

    Raises:
        FileNotFoundError: If any referenced feature is missing; all missing
            names are reported together, sorted.
    """
    # A bare str is itself iterable; without this guard it would be split
    # into one "directory" per character.
    if isinstance(features_directory, (str, Path)):
        features_directory = [features_directory]

    directories = tuple(features_directory)
    directory_labels = [str(d) for d in directories]
    directory_paths = tuple(Path(d).expanduser().resolve() for d in directories)
    feature_index = _build_feature_index(directory_paths)

    all_folding_jobs: List[List[FoldEntry]] = []
    missing_features = set()

    for spec in input_list:
        result = expand_fold_specification(
            spec=spec,
            features_directory=directories,
            protein_delimiter=protein_delimiter,
            feature_index=feature_index,
        )
        missing_features.update(result.missing_features)
        if result.formatted_folds:
            all_folding_jobs.append(result.formatted_folds)

    if missing_features:
        raise FileNotFoundError(f"{sorted(missing_features)} not found in {directory_labels}")

    return all_folding_jobs
389
+
390
+
391
+ # ---------------------------------------------------------------------------
392
+ # Public API
393
+ # ---------------------------------------------------------------------------
394
+
395
+
396
def generate_fold_specifications(
    input_files: Sequence[Union[str, Path]],
    *,
    delimiter: str = "+",
    exclude_permutations: bool = True,
    output_path: Optional[Union[str, Path]] = None,
) -> List[str]:
    """Build the Cartesian product of the lines of several specification files.

    Args:
        input_files: Paths to text files holding one specification per line.
        delimiter: String used to join one combination into a specification.
        exclude_permutations: When True, keep only the first combination of
            any group that contains the same entries in a different order.
        output_path: Optional file to write the specifications to, one per line.

    Returns:
        The joined specification strings. If any input file has no non-blank
        lines, a ``RuntimeWarning`` is issued and an empty list is returned
        without writing *output_path*.
    """
    resolved_inputs = [Path(p).expanduser().resolve() for p in input_files]

    lines_per_file: List[List[str]] = []
    for source in resolved_inputs:
        entries = _read_nonempty_lines(source)
        if not entries:
            warnings.warn(
                f"Input file '{source}' contains no specifications; skipping combination generation.",
                RuntimeWarning,
            )
            return []
        lines_per_file.append(entries)

    specifications: List[str] = []
    # product() of zero iterables yields one empty tuple, so only run it when
    # there is at least one input file.
    if lines_per_file:
        seen: set = set()
        for combo in product(*lines_per_file):
            members = tuple(str(item) for item in combo)
            if exclude_permutations:
                signature = tuple(sorted(members))
                if signature in seen:
                    continue
                seen.add(signature)
            specifications.append(delimiter.join(members))

    if output_path:
        destination = Path(output_path).expanduser().resolve()
        destination.parent.mkdir(parents=True, exist_ok=True)
        trailer = "\n" if specifications else ""
        destination.write_text("\n".join(specifications) + trailer, encoding="utf-8")

    return specifications
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alphapulldown-input-parser
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Fold specification parser for AlphaPulldown
5
5
  Author-email: Kosinski Lab <alphapulldown@embl-hamburg.de>
6
6
  License: MIT
@@ -1,273 +0,0 @@
1
- """Standalone parser for AlphaPulldown fold specifications."""
2
-
3
- from __future__ import annotations
4
-
5
- import warnings
6
- from dataclasses import dataclass
7
- from itertools import product
8
- from pathlib import Path
9
- from typing import Dict, Iterable, List, NamedTuple, Optional, Sequence, Tuple, Union
10
-
11
-
12
class FormatError(ValueError):
    """Signal that a fold specification string is malformed."""
14
-
15
-
16
def _format_error(spec: str, msg: str | None = None) -> None:
    """Raise ``FormatError`` using the historical AlphaPulldown wording.

    Args:
        spec: The offending fold specification, echoed in the message.
        msg: Optional detail appended after a single space.

    Raises:
        FormatError: Always; this function never returns normally.
    """
    message = f"Your format: {spec} is wrong. The program will terminate."
    if msg:
        message = f"{message} {msg}"
    raise FormatError(message)
21
-
22
-
23
@dataclass(frozen=True)
class Region:
    """Inclusive range over the protein sequence with non-negative bounds.

    Raises:
        ValueError: If either bound is negative, or ``start`` is greater than ``end``.
    """

    start: int
    end: int

    def __post_init__(self) -> None:
        # Sign is checked before ordering, so a negative bound is always the
        # error that gets reported.
        if any(bound < 0 for bound in (self.start, self.end)):
            raise ValueError("Region boundaries must be non-negative integers.")
        if self.start > self.end:
            raise ValueError("Region start must not exceed region end.")
35
-
36
-
37
@dataclass(frozen=True)
class RegionSelection:
    """Regions chosen for one fold entry; ``regions=None`` is the "all" marker."""

    regions: Tuple[Region, ...] | None = None

    @classmethod
    def all(cls) -> "RegionSelection":
        """Build a selection whose ``regions`` is ``None``."""
        return cls(regions=None)

    @property
    def is_all(self) -> bool:
        """``True`` only when ``regions`` is ``None`` (an empty tuple is not "all")."""
        no_explicit_regions = self.regions is None
        return no_explicit_regions
50
-
51
-
52
# A fold entry is a small mapping of one of two shapes:
#   {"json_input": "/path/to.json"}  or  {"CHAIN_A": RegionSelection(...)}
FoldEntry = Dict[str, Union[str, RegionSelection]]
53
-
54
-
55
class ExpandResult(NamedTuple):
    """Outcome of expanding one fold specification."""

    # Fold entries produced, in the order they were encountered.
    formatted_folds: List[FoldEntry]
    # Names for which the feature index had no matching file.
    missing_features: List[str]
58
-
59
-
60
@dataclass(frozen=True)
class FeatureIndex:
    """Lookup tables of the feature files found on disk."""

    # Lookup key -> paths of matching pickle feature files.
    pkl: Dict[str, Tuple[str, ...]]
    # Lookup key -> path of the matching JSON file.
    json: Dict[str, str]

    def has_pkl(self, name: str) -> bool:
        """Report whether *name* is a key of the pickle table."""
        return name in self.pkl

    def json_path(self, name: str) -> Optional[str]:
        """Return the JSON path stored under *name*, or ``None`` when absent."""
        return self.json[name] if name in self.json else None
72
-
73
-
74
def _build_feature_index(directories: Sequence[Path]) -> FeatureIndex:
    """Scan *directories* (non-recursively) and index their feature files.

    ``.json`` files are indexed by file name; ``.pkl`` and ``.pkl.xz`` files
    are indexed by the name with that extension removed. Entries that are not
    regular files and paths that are not directories are ignored.

    Args:
        directories: Directories to scan, in priority order. For JSON keys the
            first file seen wins.

    Returns:
        A ``FeatureIndex`` whose contents are reproducible for a given
        directory state.
    """
    pkl: Dict[str, List[str]] = {}
    json_files: Dict[str, str] = {}

    for directory in directories:
        if not directory.is_dir():
            continue
        # iterdir() yields entries in arbitrary order. Sorting keeps the order
        # of the pickle path tuples (e.g. x.pkl before x.pkl.xz) stable
        # across runs.
        for entry in sorted(directory.iterdir()):
            if not entry.is_file():
                continue
            filename = entry.name
            if filename.endswith(".json"):
                json_files.setdefault(filename, str(entry))
            elif filename.endswith(".pkl"):
                pkl.setdefault(filename[:-4], []).append(str(entry))
            elif filename.endswith(".pkl.xz"):
                pkl.setdefault(filename[:-7], []).append(str(entry))

    return FeatureIndex(
        pkl={name: tuple(paths) for name, paths in pkl.items()},
        json=json_files,
    )
98
-
99
-
100
- def _extract_copy_and_regions(tokens: Sequence[str], spec: str) -> Tuple[int, Sequence[str]]:
101
- """Return copy count and the remaining region tokens."""
102
- if len(tokens) > 1:
103
- try:
104
- return int(tokens[1]), tokens[2:]
105
- except ValueError:
106
- pass
107
- try:
108
- return int(tokens[-1]), tokens[1:-1]
109
- except ValueError:
110
- pass
111
- return 1, tokens[1:]
112
-
113
-
114
def _parse_regions(region_tokens: Sequence[str], spec: str) -> RegionSelection:
    """Turn ``start-stop`` tokens into a ``RegionSelection``.

    An empty token sequence yields ``RegionSelection.all()``. Any token that
    is not two integers joined by one ``-``, or that ``Region`` rejects, is
    reported through ``_format_error`` (which raises ``FormatError``).
    """
    if not region_tokens:
        return RegionSelection.all()

    parsed: List[Region] = []
    for token in region_tokens:
        bounds = token.split("-")
        if len(bounds) != 2:
            _format_error(spec, msg=f"Region token '{token}' is not of form start-stop.")
        try:
            first, last = (int(bound) for bound in bounds)
        except ValueError:
            _format_error(spec, msg=f"Region token '{token}' contains non-integer bounds.")
        try:
            region = Region(start=first, end=last)
        except ValueError as exc:
            _format_error(spec, msg=str(exc))
        parsed.append(region)
    return RegionSelection(regions=tuple(parsed))
132
-
133
-
134
def expand_fold_specification(
    spec: str,
    features_directory: Union[str, Path, Iterable[Union[str, Path]]],
    protein_delimiter: str,
    *,
    feature_index: FeatureIndex | None = None,
) -> ExpandResult:
    """Expand a single fold specification.

    Args:
        spec: Specification such as ``"protA:2:1-10+protB"`` or ``"job.json"``.
        features_directory: One directory or an iterable of directories. Only
            consulted (and scanned on disk) when *feature_index* is ``None``.
        protein_delimiter: Separator between proteins in *spec*.
        feature_index: Prebuilt index; supplying it avoids any directory scan.

    Returns:
        ExpandResult of (formatted_folds, missing_features).

    Raises:
        FormatError: Via ``_format_error`` for an empty protein name or a
            malformed region token.
    """
    index = feature_index
    if index is None:
        # A bare str is itself iterable; without this guard every character
        # would be resolved as a separate directory.
        if isinstance(features_directory, (str, Path)):
            features_directory = [features_directory]
        directories = tuple(Path(d).expanduser().resolve() for d in features_directory)
        index = _build_feature_index(directories)

    formatted_folds: List[FoldEntry] = []
    missing_features: List[str] = []

    for pf in spec.split(protein_delimiter):
        if pf.endswith(".json"):
            # JSON inputs are looked up by the token exactly as written.
            json_path = index.json_path(pf)
            if json_path:
                formatted_folds.append({"json_input": json_path})
            else:
                missing_features.append(pf)
            continue

        tokens = pf.split(":")
        if not tokens or not tokens[0]:
            _format_error(spec, msg="Protein token is empty.")

        name = tokens[0]
        number, region_tokens = _extract_copy_and_regions(tokens, spec)
        regions = _parse_regions(region_tokens, spec)

        if not index.has_pkl(name):
            missing_features.append(name)
            continue

        # One independent entry per requested copy.
        formatted_folds.extend({name: regions} for _ in range(number))

    return ExpandResult(formatted_folds=formatted_folds, missing_features=missing_features)
180
-
181
-
182
def parse_fold(
    input_list: List[str],
    features_directory: Union[str, Path, Iterable[Union[str, Path]]],
    protein_delimiter: str,
) -> List[List[FoldEntry]]:
    """Parse a list of fold specifications into folding jobs.

    Args:
        input_list: Fold specification strings.
        features_directory: One directory or an iterable of directories that
            are scanned once to build the feature index.
        protein_delimiter: Separator between proteins in a specification.

    Returns:
        List of jobs (each job is a list of FoldEntry). Specifications that
        expand to no entries are omitted.

    Raises:
        FileNotFoundError: If any referenced feature is missing; all missing
            names are reported together, sorted.
    """
    # A bare str is itself iterable; without this guard it would be split
    # into one "directory" per character.
    if isinstance(features_directory, (str, Path)):
        features_directory = [features_directory]

    directories = tuple(features_directory)
    directory_labels = [str(d) for d in directories]
    directory_paths = tuple(Path(d).expanduser().resolve() for d in directories)
    feature_index = _build_feature_index(directory_paths)

    all_folding_jobs: List[List[FoldEntry]] = []
    missing_features = set()

    for spec in input_list:
        result = expand_fold_specification(
            spec=spec,
            features_directory=directories,
            protein_delimiter=protein_delimiter,
            feature_index=feature_index,
        )
        missing_features.update(result.missing_features)
        if result.formatted_folds:
            all_folding_jobs.append(result.formatted_folds)

    if missing_features:
        raise FileNotFoundError(f"{sorted(missing_features)} not found in {directory_labels}")

    return all_folding_jobs
212
-
213
-
214
- def _read_nonempty_lines(path: Path) -> List[str]:
215
- with path.open(mode="r", encoding="utf-8") as handle:
216
- return [line.strip() for line in handle if line.strip()]
217
-
218
-
219
def generate_fold_specifications(
    input_files: Sequence[Union[str, Path]],
    *,
    delimiter: str = "+",
    exclude_permutations: bool = True,
    output_path: Optional[Union[str, Path]] = None,
) -> List[str]:
    """Build the Cartesian product of the lines of several specification files.

    Args:
        input_files: Paths to text files holding one specification per line.
        delimiter: String used to join one combination into a specification.
        exclude_permutations: When True, keep only the first combination of
            any group that contains the same entries in a different order.
        output_path: Optional file to write the specifications to, one per line.

    Returns:
        The joined specification strings. If any input file has no non-blank
        lines, a ``RuntimeWarning`` is issued and an empty list is returned
        without writing *output_path*.
    """
    resolved_inputs = [Path(p).expanduser().resolve() for p in input_files]

    lines_per_file: List[List[str]] = []
    for source in resolved_inputs:
        entries = _read_nonempty_lines(source)
        if not entries:
            warnings.warn(
                f"Input file '{source}' contains no specifications; skipping combination generation.",
                RuntimeWarning,
            )
            return []
        lines_per_file.append(entries)

    specifications: List[str] = []
    # product() of zero iterables yields one empty tuple, so only run it when
    # there is at least one input file.
    if lines_per_file:
        seen: set = set()
        for combo in product(*lines_per_file):
            members = tuple(str(item) for item in combo)
            if exclude_permutations:
                signature = tuple(sorted(members))
                if signature in seen:
                    continue
                seen.add(signature)
            specifications.append(delimiter.join(members))

    if output_path:
        destination = Path(output_path).expanduser().resolve()
        destination.parent.mkdir(parents=True, exist_ok=True)
        trailer = "\n" if specifications else ""
        destination.write_text("\n".join(specifications) + trailer, encoding="utf-8")

    return specifications