alphapulldown-input-parser 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (14) hide show
  1. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.3.0}/LICENSE +0 -4
  2. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.3.0}/PKG-INFO +1 -1
  3. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.3.0}/pyproject.toml +1 -1
  4. alphapulldown_input_parser-0.3.0/src/alphapulldown_input_parser/parser.py +478 -0
  5. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.3.0}/src/alphapulldown_input_parser.egg-info/PKG-INFO +1 -1
  6. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.3.0}/test/test_parser.py +26 -0
  7. alphapulldown_input_parser-0.1.0/src/alphapulldown_input_parser/parser.py +0 -273
  8. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.3.0}/README.md +0 -0
  9. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.3.0}/setup.cfg +0 -0
  10. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.3.0}/src/alphapulldown_input_parser/__init__.py +0 -0
  11. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.3.0}/src/alphapulldown_input_parser.egg-info/SOURCES.txt +0 -0
  12. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.3.0}/src/alphapulldown_input_parser.egg-info/dependency_links.txt +0 -0
  13. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.3.0}/src/alphapulldown_input_parser.egg-info/requires.txt +0 -0
  14. {alphapulldown_input_parser-0.1.0 → alphapulldown_input_parser-0.3.0}/src/alphapulldown_input_parser.egg-info/top_level.txt +0 -0
@@ -1,10 +1,6 @@
1
1
  MIT License
2
2
 
3
- <<<<<<< HEAD
4
- Copyright (c) 2024 Kosinski Lab
5
- =======
6
3
  Copyright (c) 2025 KosinskiLab
7
- >>>>>>> f65396f8370fada264ee798bbeb0666d4c5726d3
8
4
 
9
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
10
6
  of this software and associated documentation files (the "Software"), to deal
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alphapulldown-input-parser
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: Fold specification parser for AlphaPulldown
5
5
  Author-email: Kosinski Lab <alphapulldown@embl-hamburg.de>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "alphapulldown-input-parser"
7
- version = "0.1.0"
7
+ version = "0.3.0"
8
8
  description = "Fold specification parser for AlphaPulldown"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -0,0 +1,478 @@
1
+ """Standalone parser for AlphaPulldown fold specifications."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import warnings
6
+ from dataclasses import dataclass
7
+ from itertools import product
8
+ from pathlib import Path
9
+ from typing import Dict, Iterable, List, NamedTuple, Optional, Sequence, Tuple, Union
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # Helpers
13
+ # ---------------------------------------------------------------------------
14
+
15
+
16
+ def _deduplicate_preserve_order(items: Iterable[str]) -> Tuple[str, ...]:
17
+ """Return a tuple containing the first occurrence of every item."""
18
+ return tuple(dict.fromkeys(items))
19
+
20
+
21
+ def _strip_path_and_extension(value: str) -> str:
22
+ return Path(value).stem
23
+
24
+
25
def _format_error(spec: str, msg: str | None = None) -> None:
    """Raise FormatError using the historical AlphaPulldown wording.

    Args:
        spec: The offending fold specification, quoted in the message.
        msg: Optional detail appended after the standard sentence.

    Raises:
        FormatError: Always.
    """
    message = f"Your format: {spec} is wrong. The program will terminate."
    if msg:
        message += f" {msg}"
    raise FormatError(message)
30
+
31
+
32
+ def _read_nonempty_lines(path: Path) -> List[str]:
33
+ with path.open(mode="r", encoding="utf-8") as handle:
34
+ return [line.strip() for line in handle if line.strip()]
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Core data structures
39
+ # ---------------------------------------------------------------------------
40
+
41
+
42
class FormatError(ValueError):
    """Signal that a fold specification is malformed and cannot be parsed."""
44
+
45
+
46
@dataclass(frozen=True)
class Region:
    """Inclusive residue range expressed in 1-based sequence positions."""

    start: int
    end: int

    def __post_init__(self) -> None:
        """Validate the bounds right after construction.

        Raises:
            ValueError: If a bound is below 1 or ``start`` is greater than ``end``.
        """
        if any(bound < 1 for bound in (self.start, self.end)):
            raise ValueError("Region boundaries must be positive integers (1-based).")
        if self.start > self.end:
            raise ValueError("Region start must not exceed region end.")
58
+
59
+
60
@dataclass(frozen=True)
class RegionSelection:
    """Regions chosen for one fold entry; ``regions=None`` stands for the whole sequence."""

    regions: Tuple[Region, ...] | None = None

    @classmethod
    def all(cls) -> "RegionSelection":
        """Build a selection without explicit regions (whole sequence)."""
        return cls(regions=None)

    @property
    def is_all(self) -> bool:
        """Whether no explicit regions are set."""
        has_explicit_regions = self.regions is not None
        return not has_explicit_regions
73
+
74
+
75
# A fold entry is a one-item mapping in one of two shapes:
#   {"json_input": "/path/to.json"}    - a JSON feature file, keyed by the literal "json_input"
#   {"CHAIN_A": RegionSelection(...)}  - a protein name mapped to its selected regions
FoldEntry = Dict[str, Union[str, RegionSelection]]
77
+
78
+
79
class ExpandResult(NamedTuple):
    """Outcome of expanding one fold specification."""

    # Fold entries produced for the specification, in the order they were parsed.
    formatted_folds: List[FoldEntry]
    # Names whose feature files were not found in the index.
    missing_features: List[str]
82
+
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Fold dataset
86
+ # ---------------------------------------------------------------------------
87
+
88
+
89
@dataclass
class FoldDataset:
    """Container encapsulating parsed fold specifications.

    Attributes:
        fold_specifications: Normalized specification strings, de-duplicated
            in input order.
        sequences_by_origin: Referenced sequences grouped under the keys
            ``"uniprot"`` and ``"local"``; local entries are resolved paths.
        sequences_by_fold: For every normalized specification, the protein
            base names it refers to.
    """

    fold_specifications: Tuple[str, ...]
    sequences_by_origin: Dict[str, Tuple[str, ...]]
    sequences_by_fold: Dict[str, Tuple[str, ...]]

    @property
    def unique_sequences(self) -> Tuple[str, ...]:
        """Protein base names across all specifications, first occurrence first."""
        ordered: List[str] = []
        for spec in self.fold_specifications:
            ordered.extend(self.sequences_by_fold.get(spec, ()))
        return _deduplicate_preserve_order(ordered)

    def symlink_local_files(self, output_directory: Union[str, Path]) -> None:
        """Symlink every local sequence file into ``output_directory``.

        The directory is created when missing. An existing entry with the same
        file name is replaced, unless that entry already is (or already points
        to) the source file itself, in which case it is left untouched.

        Args:
            output_directory: Directory that receives the symlinks.
        """
        output = Path(output_directory).expanduser().resolve()
        output.mkdir(parents=True, exist_ok=True)
        for file in self.sequences_by_origin.get("local", ()):
            source = Path(file).expanduser()
            target = output / Path(file).name
            # If the output directory is the file's own directory, target and
            # source are the same file: unlinking it would delete the sequence
            # and leave a symlink that points at itself.
            if source.exists() and target.exists() and target.samefile(source):
                continue
            if target.exists() or target.is_symlink():
                target.unlink()
            target.symlink_to(source)

    @classmethod
    def from_fold_specifications(
        cls,
        fold_specifications: Sequence[str],
        *,
        protein_delimiter: str = "+",
    ) -> "FoldDataset":
        """Build a dataset from raw specification strings.

        Each specification is split on ``protein_delimiter`` into protein
        tokens of the form ``reference[:suffix...]``. The reference is reduced
        to its base name (directory and last extension removed); non-empty
        suffix parts are kept. Specifications without any token are skipped.

        A reference counts as a local file when it exists on disk as a file
        and either contains a path separator or has a suffix; every other
        reference is listed under ``"uniprot"``.

        Args:
            fold_specifications: Raw specification strings.
            protein_delimiter: Separator between proteins of one specification.

        Returns:
            The populated dataset.
        """
        normalized_specs: List[str] = []
        sequences_by_fold: Dict[str, Tuple[str, ...]] = {}
        referenced_sequences: List[str] = []

        for specification in fold_specifications:
            tokens = [
                token.strip()
                for token in specification.split(protein_delimiter)
                if token.strip()
            ]
            if not tokens:
                continue

            normalized_tokens: List[str] = []
            per_fold_sequences: List[str] = []
            for token in tokens:
                parts = [part.strip() for part in token.split(":")]
                protein_reference = parts[0]
                referenced_sequences.append(protein_reference)

                base_name = _strip_path_and_extension(protein_reference)
                per_fold_sequences.append(base_name)
                suffix_components = [part for part in parts[1:] if part]
                normalized_tokens.append(
                    ":".join([base_name, *suffix_components]) if suffix_components else base_name
                )

            normalized_spec = protein_delimiter.join(normalized_tokens)
            normalized_specs.append(normalized_spec)
            sequences_by_fold[normalized_spec] = _deduplicate_preserve_order(per_fold_sequences)

        unique_inputs = _deduplicate_preserve_order(referenced_sequences)
        sequences_by_origin: Dict[str, List[str]] = {"uniprot": [], "local": []}
        for sequence in unique_inputs:
            path = Path(sequence).expanduser()
            has_separator = "/" in sequence or "\\" in sequence
            has_suffix = path.suffix != ""
            exists = path.exists() and (path.is_file() or path.is_symlink())
            # A bare name without separator or suffix is never treated as a
            # local file, even if a file of that name happens to exist.
            is_local = exists and (has_separator or has_suffix)
            if is_local:
                sequences_by_origin["local"].append(str(path.resolve()))
            else:
                sequences_by_origin["uniprot"].append(sequence)

        return cls(
            fold_specifications=_deduplicate_preserve_order(normalized_specs),
            sequences_by_origin={
                key: tuple(values) for key, values in sequences_by_origin.items()
            },
            sequences_by_fold=sequences_by_fold,
        )

    @classmethod
    def from_file(
        cls,
        filepath: Union[str, Path],
        *,
        protein_delimiter: str = "+",
    ) -> "FoldDataset":
        """Build a dataset from a text file holding one specification per line.

        Blank lines are ignored and repeated lines are used once.

        Args:
            filepath: Path of the specification file (read as UTF-8).
            protein_delimiter: Separator between proteins of one specification.

        Returns:
            The populated dataset.
        """
        path = Path(filepath).expanduser().resolve()
        specifications = _deduplicate_preserve_order(_read_nonempty_lines(path))
        return cls.from_fold_specifications(
            specifications,
            protein_delimiter=protein_delimiter,
        )
186
+
187
+ # ---------------------------------------------------------------------------
188
+ # Feature index
189
+ # ---------------------------------------------------------------------------
190
+
191
+
192
@dataclass(frozen=True)
class FeatureIndex:
    """Lookup tables recording which feature files are available."""

    # lookup key -> paths of the pickle feature files registered under it
    pkl: Dict[str, Tuple[str, ...]]
    # lookup key -> path of the JSON feature file registered under it
    json: Dict[str, str]

    def has_pkl(self, name: str) -> bool:
        """Tell whether a pickle feature file is registered under ``name``."""
        known_names = self.pkl
        return name in known_names

    def json_path(self, name: str) -> Optional[str]:
        """Return the JSON path registered under ``name``, or ``None`` if there is none."""
        return self.json.get(name, None)
204
+
205
+
206
def _build_feature_index(directories: Sequence[Path]) -> FeatureIndex:
    """Scan feature directories and index the feature files they contain.

    Only regular files directly inside each directory are considered; entries
    of ``directories`` that are not directories are skipped. Files are
    registered under several lookup keys:

    * ``*.json``   - the file name and its stem; the first file seen for a
      key wins.
    * ``*.pkl``    - the name without ``.pkl``, the file name, and the stem.
    * ``*.pkl.xz`` - the name without ``.pkl.xz`` and the file name.

    Args:
        directories: Directories to scan, in priority order.

    Returns:
        The populated FeatureIndex.
    """
    pkl: Dict[str, List[str]] = {}
    json_files: Dict[str, str] = {}

    for directory in directories:
        if not directory.is_dir():
            continue
        # iterdir() yields entries in arbitrary order. Sorting makes the
        # "first one wins" rule for JSON keys and the order of pickle paths
        # reproducible across filesystems and runs.
        for entry in sorted(directory.iterdir()):
            if not entry.is_file():
                continue
            filename = entry.name
            if filename.endswith(".json"):
                keys = (entry.name, entry.stem)
                for key in dict.fromkeys(keys):
                    json_files.setdefault(key, str(entry))
            elif filename.endswith(".pkl"):
                base = filename[:-4]
                keys = (base, entry.name, entry.stem)
                for key in dict.fromkeys(keys):
                    pkl.setdefault(key, []).append(str(entry))
            elif filename.endswith(".pkl.xz"):
                base = filename[:-7]
                keys = (base, entry.name, Path(filename[:-3]).stem)
                for key in dict.fromkeys(keys):
                    pkl.setdefault(key, []).append(str(entry))

    return FeatureIndex(
        # deduplicate paths while preserving order
        pkl={name: tuple(dict.fromkeys(paths)) for name, paths in pkl.items()},
        json=json_files,
    )
237
+
238
+
239
+ # ---------------------------------------------------------------------------
240
+ # Expansion helpers
241
+ # ---------------------------------------------------------------------------
242
+
243
+
244
+ def _extract_copy_and_regions(tokens: Sequence[str], spec: str) -> Tuple[int, Sequence[str]]:
245
+ """Return copy count and the remaining region tokens."""
246
+ if len(tokens) > 1:
247
+ try:
248
+ return int(tokens[1]), tokens[2:]
249
+ except ValueError:
250
+ pass
251
+ try:
252
+ return int(tokens[-1]), tokens[1:-1]
253
+ except ValueError:
254
+ pass
255
+ return 1, tokens[1:]
256
+
257
+
258
def _parse_regions(region_tokens: Sequence[str], spec: str) -> RegionSelection:
    """Turn ``start-stop`` tokens into a RegionSelection.

    Without any token the whole sequence is selected. Malformed tokens are
    reported through ``_format_error``, which raises FormatError.

    Args:
        region_tokens: Region tokens of one protein token.
        spec: Full specification, quoted in error messages.
    """
    if not region_tokens:
        return RegionSelection.all()

    parsed: List[Region] = []
    for tok in region_tokens:
        bounds = tok.split("-")
        if len(bounds) != 2:
            _format_error(spec, msg=f"Region token '{tok}' is not of form start-stop.")
        try:
            first, last = int(bounds[0]), int(bounds[1])
        except ValueError:
            _format_error(spec, msg=f"Region token '{tok}' contains non-integer bounds.")
        try:
            region = Region(start=first, end=last)
        except ValueError as exc:
            # Region validates its own bounds; surface its message as a format error.
            _format_error(spec, msg=str(exc))
        parsed.append(region)
    return RegionSelection(regions=tuple(parsed))
276
+
277
+
278
+ # ---------------------------------------------------------------------------
279
+ # Expansion logic
280
+ # ---------------------------------------------------------------------------
281
+
282
+
283
def expand_fold_specification(
    spec: str,
    features_directory: Iterable[str],
    protein_delimiter: str,
    *,
    feature_index: FeatureIndex | None = None,
) -> ExpandResult:
    """Expand a single fold specification into fold entries.

    The specification is split on ``protein_delimiter``; empty pieces are
    ignored. Each remaining piece has the form ``name[:token...]``:

    * A name ending in ``.json`` is looked up among the indexed JSON files,
      first by file name, then by stem. It accepts at most one extra token,
      the copy number; region ranges are rejected.
    * Any other name is looked up among the indexed pickle files as given,
      then by file name, then by stem. It may carry a copy number and
      ``start-stop`` region tokens.

    Example::

        expand_fold_specification("protA:1-10+protB", ["/features"], "+")

    Args:
        spec: Fold specification string.
        features_directory: Directories holding feature files. A single path
            given as ``str`` or ``Path`` is treated as one directory. Only
            read when ``feature_index`` is not supplied.
        protein_delimiter: Separator between proteins in ``spec``.
        feature_index: Prebuilt index; when given, the filesystem is not scanned.

    Returns:
        ExpandResult with the fold entries and the names of missing features.

    Raises:
        FormatError: If a piece of the specification is malformed.
    """

    index = feature_index
    if index is None:
        # A lone path string would otherwise be iterated character by character.
        if isinstance(features_directory, (str, Path)):
            features_directory = [features_directory]
        directories = tuple(Path(d).expanduser().resolve() for d in features_directory)
        index = _build_feature_index(directories)

    formatted_folds: List[FoldEntry] = []
    missing_features: List[str] = []

    for raw_pf in spec.split(protein_delimiter):
        pf = raw_pf.strip()
        if not pf:
            continue

        tokens = [token.strip() for token in pf.split(":")]
        name = tokens[0]

        # JSON inputs: support optional copy number, but no ranges.
        if name.endswith(".json"):
            path_pf = Path(name)
            json_path = index.json_path(path_pf.name) or index.json_path(path_pf.stem)
            if not json_path:
                missing_features.append(path_pf.name)
                continue

            copies = 1
            extra_tokens = tokens[1:]
            if extra_tokens:
                if any("-" in tok for tok in extra_tokens):
                    _format_error(
                        spec,
                        msg="Region ranges are not supported for JSON feature files.",
                    )
                if len(extra_tokens) != 1:
                    _format_error(
                        spec,
                        msg="JSON feature files support only an optional copy number.",
                    )
                try:
                    copies = int(extra_tokens[0])
                except ValueError:
                    _format_error(
                        spec,
                        msg="Copy number for JSON feature file must be an integer.",
                    )
                # Zero copies would silently drop the entry from the fold.
                if copies < 1:
                    _format_error(
                        spec,
                        msg="Copy number for JSON feature file must be a positive integer.",
                    )

            formatted_folds.extend({"json_input": json_path} for _ in range(copies))
            continue

        if not name:
            _format_error(spec, msg="Protein token is empty.")

        # copy count can be either second or last token
        number, region_tokens = _extract_copy_and_regions(tokens, spec)
        regions = _parse_regions(region_tokens, spec)

        # try different name representations against prebuilt index
        name_path = Path(name)
        name_candidates = (name, name_path.name, name_path.stem)
        canonical_name = next(
            (candidate for candidate in name_candidates if index.has_pkl(candidate)),
            None,
        )
        if canonical_name is None:
            missing_features.append(name)
            continue

        formatted_folds.extend({canonical_name: regions} for _ in range(number))

    return ExpandResult(formatted_folds=formatted_folds, missing_features=missing_features)
378
+
379
+
380
def parse_fold(
    input_list: List[str],
    features_directory: Iterable[str],
    protein_delimiter: str,
) -> List[List[FoldEntry]]:
    """Parse a list of fold specifications into folding jobs.

    The feature directories are indexed once and every specification is
    expanded against that index. Specifications that expand to no entries
    produce no job.

    Example::

        parse_fold(["protA+protB"], ["/features"], "+")

    Args:
        input_list: Fold specifications. A single ``str`` is treated as one
            specification.
        features_directory: Directories holding feature files. A single path
            given as ``str`` or ``Path`` is treated as one directory.
        protein_delimiter: Separator between proteins in a specification.

    Returns:
        List of jobs (each job is a list of FoldEntry).

    Raises:
        FileNotFoundError: If any referenced feature is missing; the message
            lists the sorted missing names and the directories searched.
        FormatError: If a specification is malformed.
    """

    # Bare strings are iterable: without these guards a lone path or
    # specification would be processed one character at a time.
    if isinstance(input_list, str):
        input_list = [input_list]
    if isinstance(features_directory, (str, Path)):
        directories = (features_directory,)
    else:
        directories = tuple(features_directory)

    directory_labels = [str(d) for d in directories]
    directory_paths = tuple(Path(d).expanduser().resolve() for d in directories)
    feature_index = _build_feature_index(directory_paths)

    all_folding_jobs: List[List[FoldEntry]] = []
    missing_features = set()

    for spec in input_list:
        result = expand_fold_specification(
            spec=spec,
            features_directory=directories,
            protein_delimiter=protein_delimiter,
            feature_index=feature_index,
        )
        missing_features.update(result.missing_features)
        if result.formatted_folds:
            all_folding_jobs.append(result.formatted_folds)

    if missing_features:
        raise FileNotFoundError(f"{sorted(missing_features)} not found in {directory_labels}")

    return all_folding_jobs
417
+
418
+
419
+ # ---------------------------------------------------------------------------
420
+ # Public API
421
+ # ---------------------------------------------------------------------------
422
+
423
+
424
def generate_fold_specifications(
    input_files: Sequence[Union[str, Path]],
    *,
    delimiter: str = "+",
    exclude_permutations: bool = True,
    output_path: Optional[Union[str, Path]] = None,
) -> List[str]:
    """Combine specification files into their Cartesian product.

    Args:
        input_files: Text files with one specification per line.
        delimiter: String placed between the members of a combination.
        exclude_permutations: When True, a combination is dropped if an
            earlier one holds the same members in a different order.
        output_path: Optional file that receives the result, one
            specification per line.

    Returns:
        The joined specification strings. The list is empty (after a
        RuntimeWarning, and without writing ``output_path``) as soon as one
        input file has no specifications.
    """

    resolved_paths = [Path(item).expanduser().resolve() for item in input_files]

    per_file_lines: List[List[str]] = []
    for resolved in resolved_paths:
        entries = _read_nonempty_lines(resolved)
        if entries:
            per_file_lines.append(entries)
            continue
        warnings.warn(
            f"Input file '{resolved}' contains no specifications; skipping combination generation.",
            RuntimeWarning,
        )
        return []

    kept: List[Tuple[str, ...]] = []
    if per_file_lines:
        already_seen: set[Tuple[str, ...]] = set()
        for candidate in product(*per_file_lines):
            members = tuple(str(part) for part in candidate)
            if exclude_permutations:
                # Order-insensitive signature of the combination.
                signature = tuple(sorted(members))
                if signature in already_seen:
                    continue
                already_seen.add(signature)
            kept.append(members)

    specifications = [delimiter.join(members) for members in kept]

    if output_path:
        destination = Path(output_path).expanduser().resolve()
        destination.parent.mkdir(parents=True, exist_ok=True)
        text = "\n".join(specifications)
        if specifications:
            text += "\n"
        destination.write_text(text, encoding="utf-8")

    return specifications
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alphapulldown-input-parser
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: Fold specification parser for AlphaPulldown
5
5
  Author-email: Kosinski Lab <alphapulldown@embl-hamburg.de>
6
6
  License: MIT
@@ -211,6 +211,32 @@ def patch_feature_index(monkeypatch):
211
211
  None,
212
212
  id="json_in_multiple_dirs",
213
213
  ),
214
+ pytest.param(
215
+ ["protein1.json:3"],
216
+ ["dir1"],
217
+ "+",
218
+ {"pkl": {}, "json": {"protein1.json": "dir1/protein1.json"}},
219
+ [
220
+ [
221
+ {"json_input": "dir1/protein1.json"},
222
+ {"json_input": "dir1/protein1.json"},
223
+ {"json_input": "dir1/protein1.json"},
224
+ ]
225
+ ],
226
+ None,
227
+ None,
228
+ id="json_with_copy_number",
229
+ ),
230
+ pytest.param(
231
+ ["protein1.json:1-10"],
232
+ ["dir1"],
233
+ "+",
234
+ {"pkl": {}, "json": {"protein1.json": "dir1/protein1.json"}},
235
+ None,
236
+ FormatError,
237
+ None,
238
+ id="json_with_range_not_supported",
239
+ ),
214
240
  ],
215
241
  )
216
242
  def test_parse_fold(
@@ -1,273 +0,0 @@
1
- """Standalone parser for AlphaPulldown fold specifications."""
2
-
3
- from __future__ import annotations
4
-
5
- import warnings
6
- from dataclasses import dataclass
7
- from itertools import product
8
- from pathlib import Path
9
- from typing import Dict, Iterable, List, NamedTuple, Optional, Sequence, Tuple, Union
10
-
11
-
12
class FormatError(ValueError):
    """Signal that a fold specification is malformed and cannot be parsed."""
14
-
15
-
16
def _format_error(spec: str, msg: str | None = None) -> None:
    """Raise FormatError using the historical AlphaPulldown wording.

    Args:
        spec: The offending fold specification, quoted in the message.
        msg: Optional detail appended after the standard sentence.

    Raises:
        FormatError: Always.
    """
    message = f"Your format: {spec} is wrong. The program will terminate."
    if msg:
        message += f" {msg}"
    raise FormatError(message)
21
-
22
-
23
@dataclass(frozen=True)
class Region:
    """Inclusive interval of positions on the protein sequence."""

    start: int
    end: int

    def __post_init__(self) -> None:
        """Validate the bounds right after construction.

        Raises:
            ValueError: If a bound is negative or ``start`` is greater than ``end``.
        """
        if any(bound < 0 for bound in (self.start, self.end)):
            raise ValueError("Region boundaries must be non-negative integers.")
        if self.start > self.end:
            raise ValueError("Region start must not exceed region end.")
35
-
36
-
37
@dataclass(frozen=True)
class RegionSelection:
    """Regions chosen for one fold entry; ``regions=None`` stands for the whole sequence."""

    regions: Tuple[Region, ...] | None = None

    @classmethod
    def all(cls) -> "RegionSelection":
        """Build a selection without explicit regions (whole sequence)."""
        return cls(regions=None)

    @property
    def is_all(self) -> bool:
        """Whether no explicit regions are set."""
        has_explicit_regions = self.regions is not None
        return not has_explicit_regions
50
-
51
-
52
# A fold entry is a one-item mapping: either {"json_input": <path of a JSON
# feature file>} or {<protein name>: RegionSelection(...)}.
FoldEntry = Dict[str, Union[str, RegionSelection]]
53
-
54
-
55
class ExpandResult(NamedTuple):
    """Outcome of expanding one fold specification."""

    # Fold entries produced for the specification, in the order they were parsed.
    formatted_folds: List[FoldEntry]
    # Names whose feature files were not found in the index.
    missing_features: List[str]
58
-
59
-
60
@dataclass(frozen=True)
class FeatureIndex:
    """Lookup tables recording which feature files are available."""

    # lookup key -> paths of the pickle feature files registered under it
    pkl: Dict[str, Tuple[str, ...]]
    # lookup key -> path of the JSON feature file registered under it
    json: Dict[str, str]

    def has_pkl(self, name: str) -> bool:
        """Tell whether a pickle feature file is registered under ``name``."""
        known_names = self.pkl
        return name in known_names

    def json_path(self, name: str) -> Optional[str]:
        """Return the JSON path registered under ``name``, or ``None`` if there is none."""
        return self.json.get(name, None)
72
-
73
-
74
def _build_feature_index(directories: Sequence[Path]) -> FeatureIndex:
    """Scan feature directories and index the feature files they contain.

    Only regular files directly inside each directory are considered; entries
    of ``directories`` that are not directories are skipped. JSON files are
    keyed by file name (the first file seen for a name wins); ``.pkl`` and
    ``.pkl.xz`` files are keyed by the name with that extension removed.

    Args:
        directories: Directories to scan, in priority order.

    Returns:
        The populated FeatureIndex.
    """
    pkl: Dict[str, List[str]] = {}
    json_files: Dict[str, str] = {}

    for directory in directories:
        if not directory.is_dir():
            continue
        # iterdir() yields entries in arbitrary order; sorting keeps the order
        # of the pickle paths collected for one key reproducible.
        for entry in sorted(directory.iterdir()):
            if not entry.is_file():
                continue
            filename = entry.name
            if filename.endswith(".json"):
                json_files.setdefault(filename, str(entry))
            elif filename.endswith(".pkl"):
                pkl.setdefault(filename[:-4], []).append(str(entry))
            elif filename.endswith(".pkl.xz"):
                pkl.setdefault(filename[:-7], []).append(str(entry))

    return FeatureIndex(
        pkl={name: tuple(paths) for name, paths in pkl.items()},
        json=json_files,
    )
98
-
99
-
100
- def _extract_copy_and_regions(tokens: Sequence[str], spec: str) -> Tuple[int, Sequence[str]]:
101
- """Return copy count and the remaining region tokens."""
102
- if len(tokens) > 1:
103
- try:
104
- return int(tokens[1]), tokens[2:]
105
- except ValueError:
106
- pass
107
- try:
108
- return int(tokens[-1]), tokens[1:-1]
109
- except ValueError:
110
- pass
111
- return 1, tokens[1:]
112
-
113
-
114
def _parse_regions(region_tokens: Sequence[str], spec: str) -> RegionSelection:
    """Turn ``start-stop`` tokens into a RegionSelection.

    Without any token the whole sequence is selected. Malformed tokens are
    reported through ``_format_error``, which raises FormatError.

    Args:
        region_tokens: Region tokens of one protein token.
        spec: Full specification, quoted in error messages.
    """
    if not region_tokens:
        return RegionSelection.all()

    parsed: List[Region] = []
    for tok in region_tokens:
        bounds = tok.split("-")
        if len(bounds) != 2:
            _format_error(spec, msg=f"Region token '{tok}' is not of form start-stop.")
        try:
            first, last = int(bounds[0]), int(bounds[1])
        except ValueError:
            _format_error(spec, msg=f"Region token '{tok}' contains non-integer bounds.")
        try:
            region = Region(start=first, end=last)
        except ValueError as exc:
            # Region validates its own bounds; surface its message as a format error.
            _format_error(spec, msg=str(exc))
        parsed.append(region)
    return RegionSelection(regions=tuple(parsed))
132
-
133
-
134
def expand_fold_specification(
    spec: str,
    features_directory: Iterable[str],
    protein_delimiter: str,
    *,
    feature_index: FeatureIndex | None = None,
) -> ExpandResult:
    """Expand a single fold specification.

    The specification is split on ``protein_delimiter``. A piece ending in
    ``.json`` is looked up verbatim among the indexed JSON files; any other
    piece has the form ``name[:token...]`` where the tokens carry a copy
    number and ``start-stop`` regions, and ``name`` is looked up among the
    indexed pickle files.

    Args:
        spec: Fold specification string.
        features_directory: Directories holding feature files. A single path
            given as ``str`` or ``Path`` is treated as one directory. Only
            read when ``feature_index`` is not supplied.
        protein_delimiter: Separator between proteins in ``spec``.
        feature_index: Prebuilt index; when given, the filesystem is not scanned.

    Returns:
        ExpandResult, a named tuple of (formatted_folds, missing_features).

    Raises:
        FormatError: If a piece of the specification is malformed.
    """

    index = feature_index
    if index is None:
        # A lone path string would otherwise be iterated character by character.
        if isinstance(features_directory, (str, Path)):
            features_directory = [features_directory]
        directories = tuple(Path(d).expanduser().resolve() for d in features_directory)
        index = _build_feature_index(directories)

    formatted_folds: List[FoldEntry] = []
    missing_features: List[str] = []

    for pf in spec.split(protein_delimiter):
        if pf.endswith(".json"):
            json_path = index.json_path(pf)
            if json_path:
                formatted_folds.append({"json_input": json_path})
            else:
                missing_features.append(pf)
            continue

        tokens = pf.split(":")
        name = tokens[0]
        if not name:
            _format_error(spec, msg="Protein token is empty.")

        number, region_tokens = _extract_copy_and_regions(tokens, spec)
        regions = _parse_regions(region_tokens, spec)

        if not index.has_pkl(name):
            missing_features.append(name)
            continue

        formatted_folds.extend({name: regions} for _ in range(number))

    return ExpandResult(formatted_folds=formatted_folds, missing_features=missing_features)
180
-
181
-
182
def parse_fold(
    input_list: List[str],
    features_directory: Iterable[str],
    protein_delimiter: str,
) -> List[List[FoldEntry]]:
    """Parse a list of fold specifications into folding jobs.

    The feature directories are indexed once and every specification is
    expanded against that index. Specifications that expand to no entries
    produce no job.

    Args:
        input_list: Fold specifications. A single ``str`` is treated as one
            specification.
        features_directory: Directories holding feature files. A single path
            given as ``str`` or ``Path`` is treated as one directory.
        protein_delimiter: Separator between proteins in a specification.

    Returns:
        List of jobs (each job is a list of FoldEntry).

    Raises:
        FileNotFoundError: If any referenced feature is missing; the message
            lists the sorted missing names and the directories searched.
        FormatError: If a specification is malformed.
    """

    # Bare strings are iterable: without these guards a lone path or
    # specification would be processed one character at a time.
    if isinstance(input_list, str):
        input_list = [input_list]
    if isinstance(features_directory, (str, Path)):
        directories = (features_directory,)
    else:
        directories = tuple(features_directory)

    directory_labels = [str(d) for d in directories]
    directory_paths = tuple(Path(d).expanduser().resolve() for d in directories)
    feature_index = _build_feature_index(directory_paths)

    all_folding_jobs: List[List[FoldEntry]] = []
    missing_features = set()

    for spec in input_list:
        result = expand_fold_specification(
            spec=spec,
            features_directory=directories,
            protein_delimiter=protein_delimiter,
            feature_index=feature_index,
        )
        missing_features.update(result.missing_features)
        if result.formatted_folds:
            all_folding_jobs.append(result.formatted_folds)

    if missing_features:
        raise FileNotFoundError(f"{sorted(missing_features)} not found in {directory_labels}")

    return all_folding_jobs
212
-
213
-
214
- def _read_nonempty_lines(path: Path) -> List[str]:
215
- with path.open(mode="r", encoding="utf-8") as handle:
216
- return [line.strip() for line in handle if line.strip()]
217
-
218
-
219
def generate_fold_specifications(
    input_files: Sequence[Union[str, Path]],
    *,
    delimiter: str = "+",
    exclude_permutations: bool = True,
    output_path: Optional[Union[str, Path]] = None,
) -> List[str]:
    """Compute the Cartesian product of specification files.

    Args:
        input_files: Paths to text files containing one specification per line.
        delimiter: Delimiter used to join the combination into a specification string.
        exclude_permutations: When True, filter out combinations that are permutations
            of entries that already appear.
        output_path: Optional destination to persist the resulting specifications.

    Returns:
        List of joined specification strings. The list is empty (after a
        RuntimeWarning, and without writing ``output_path``) as soon as one
        input file has no specifications.
    """

    paths = [Path(p).expanduser().resolve() for p in input_files]
    lines_per_file: List[List[str]] = []
    for path in paths:
        lines = _read_nonempty_lines(path)
        if not lines:
            warnings.warn(
                f"Input file '{path}' contains no specifications; skipping combination generation.",
                RuntimeWarning,
            )
            return []
        lines_per_file.append(lines)

    combinations: List[Tuple[str, ...]] = []
    if lines_per_file:
        # Consume the product lazily: only the combinations that survive the
        # permutation filter are kept in memory, not the full product.
        seen: set[Tuple[str, ...]] = set()
        for combo in product(*lines_per_file):
            typed_combo = tuple(map(str, combo))
            if exclude_permutations:
                normalized = tuple(sorted(typed_combo))
                if normalized in seen:
                    continue
                seen.add(normalized)
            combinations.append(typed_combo)

    specifications = [delimiter.join(combo) for combo in combinations]

    if output_path:
        output = Path(output_path).expanduser().resolve()
        output.parent.mkdir(parents=True, exist_ok=True)
        output.write_text("\n".join(specifications) + ("\n" if specifications else ""), encoding="utf-8")

    return specifications