if-split 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. {if_split-0.1.0 → if_split-0.2.0}/PKG-INFO +87 -15
  2. {if_split-0.1.0 → if_split-0.2.0}/README.md +86 -14
  3. {if_split-0.1.0 → if_split-0.2.0}/config/default.yaml +12 -4
  4. {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/README.md +19 -15
  5. if_split-0.2.0/examples/IF-Split-2026.05.31/STATS.txt +20 -0
  6. {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/manifest.json +35 -32
  7. {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/test/metal_test.json +16 -103
  8. {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/test/small_molecule_test.json +3183 -0
  9. {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/test.json +13 -0
  10. {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/train.json +14 -24
  11. {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/val.json +8 -14
  12. {if_split-0.1.0 → if_split-0.2.0}/pyproject.toml +1 -1
  13. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/__init__.py +1 -1
  14. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/cli.py +48 -0
  15. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/config.py +71 -4
  16. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/ligands.py +99 -23
  17. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/manifest.py +1 -1
  18. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/rcsb.py +8 -1
  19. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/schema.py +13 -1
  20. {if_split-0.1.0 → if_split-0.2.0}/tests/test_config.py +51 -0
  21. {if_split-0.1.0 → if_split-0.2.0}/tests/test_ligands.py +112 -17
  22. {if_split-0.1.0 → if_split-0.2.0}/uv.lock +1 -1
  23. if_split-0.1.0/examples/IF-Split-2026.05.31/STATS.txt +0 -20
  24. {if_split-0.1.0 → if_split-0.2.0}/.github/workflows/ci.yml +0 -0
  25. {if_split-0.1.0 → if_split-0.2.0}/.github/workflows/publish.yml +0 -0
  26. {if_split-0.1.0 → if_split-0.2.0}/.gitignore +0 -0
  27. {if_split-0.1.0 → if_split-0.2.0}/.python-version +0 -0
  28. {if_split-0.1.0 → if_split-0.2.0}/CLAUDE.md +0 -0
  29. {if_split-0.1.0 → if_split-0.2.0}/LICENSE +0 -0
  30. {if_split-0.1.0 → if_split-0.2.0}/PLAN.md +0 -0
  31. {if_split-0.1.0 → if_split-0.2.0}/data/cache/.gitkeep +0 -0
  32. {if_split-0.1.0 → if_split-0.2.0}/data/out/.gitkeep +0 -0
  33. {if_split-0.1.0 → if_split-0.2.0}/examples/IF-Split-2026.05.31/config.yaml +0 -0
  34. /if_split-0.1.0/examples/IF-Split-2026.05.31/test/nucleotide_test.json → /if_split-0.2.0/examples/IF-Split-2026.05.31/test/nucleic_acid_test.json +0 -0
  35. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/__main__.py +0 -0
  36. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/cluster.py +0 -0
  37. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/dataset.py +0 -0
  38. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/download.py +0 -0
  39. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/enumerate.py +0 -0
  40. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/hydrate.py +0 -0
  41. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/parse.py +0 -0
  42. {if_split-0.1.0 → if_split-0.2.0}/src/ifsplit/split.py +0 -0
  43. {if_split-0.1.0 → if_split-0.2.0}/tests/conftest.py +0 -0
  44. {if_split-0.1.0 → if_split-0.2.0}/tests/test_download.py +0 -0
  45. {if_split-0.1.0 → if_split-0.2.0}/tests/test_enumerate_lock.py +0 -0
  46. {if_split-0.1.0 → if_split-0.2.0}/tests/test_integration.py +0 -0
  47. {if_split-0.1.0 → if_split-0.2.0}/tests/test_loader.py +0 -0
  48. {if_split-0.1.0 → if_split-0.2.0}/tests/test_pipeline.py +0 -0
  49. {if_split-0.1.0 → if_split-0.2.0}/tests/test_schema.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: if-split
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Reproducible, date-pinned, ligand-aware train/val/test splitter for the PDB (LigandMPNN-style).
5
5
  Author: WSobo
6
6
  License: MIT
@@ -19,6 +19,10 @@ Description-Content-Type: text/markdown
19
19
 
20
20
  # IF-Split
21
21
 
22
+ [![CI](https://github.com/WSobo/IF-Split/actions/workflows/ci.yml/badge.svg)](https://github.com/WSobo/IF-Split/actions/workflows/ci.yml)
23
+ [![PyPI](https://img.shields.io/pypi/v/if-split.svg)](https://pypi.org/project/if-split/)
24
+ [![Python](https://img.shields.io/pypi/pyversions/if-split.svg)](https://pypi.org/project/if-split/)
25
+
22
26
  **A reproducible, date-pinned, ligand-aware train/val/test splitter for the PDB.**
23
27
 
24
28
  IF-Split borrows the *split logic* of LigandMPNN (Dauparas et al., *Nature
@@ -60,16 +64,23 @@ records and sequences. Coordinates are an optional, downstream concern.
60
64
 
61
65
  ## Install
62
66
 
63
- Requires Python ≥ 3.11 and [`uv`](https://docs.astral.sh/uv/). `build` needs
64
- only network access to RCSB — no external binaries. (The optional `mmseqs2`
65
- clustering backend and the optional coordinate/featurization path via `gemmi`
66
- are Linux-native, so run under Linux/WSL if you use them.)
67
+ Requires Python ≥ 3.11. `build` needs only network access to RCSB — no external
68
+ binaries.
69
+
70
+ ```bash
71
+ pip install if-split # from PyPI
72
+ ```
73
+
74
+ Or for development, with [`uv`](https://docs.astral.sh/uv/):
67
75
 
68
76
  ```bash
77
+ git clone https://github.com/WSobo/IF-Split && cd IF-Split
69
78
  uv sync # creates .venv from uv.lock, installs deps + dev tools (ruff, pytest)
70
79
  ```
71
80
 
72
- `uv.lock` is committed, so environments are reproducible.
81
+ `uv.lock` is committed, so dev environments are reproducible. (The optional
82
+ coordinate/featurization path via `gemmi` is Linux-native, so run under
83
+ Linux/WSL if you use `fetch`.)
73
84
 
74
85
  ## Quickstart
75
86
 
@@ -91,6 +102,9 @@ uv run if-split build --registry data/out/splits.registry.json --out data/out2
91
102
 
92
103
  # OPTIONAL: download the actual structures for a built split (see below).
93
104
  uv run if-split fetch data/out/manifest.json --split test --out data/structures
105
+
106
+ # Emit a portable, shareable split spec (see "Sharing a split spec" below).
107
+ uv run if-split spec data/out/manifest.json --name my-split --out my-split.ifsplit.yaml
94
108
  ```
95
109
 
96
110
  ### Outputs (`--out` directory)
@@ -163,7 +177,7 @@ A `build` runs eight stages; none touch coordinates.
163
177
  |---|---|---|
164
178
  | 1 — enumerate | `enumerate.py`, `rcsb.py` | RCSB Search → entry IDs; Data API (GraphQL, batched) → sequences, ligands, residue counts, cluster membership → `candidates.jsonl`. |
165
179
  | 3 — filter | `parse.py` | Drop no-protein / no-sequence / oversized entries (assembly-1 residue count vs `max_total_residues`), plus optional wwPDB validation-report quality caps (clashscore, R-free, Ramachandran/rotamer/RSRZ) — all from metadata. Every drop is logged with its reason. |
166
- | 4 — ligands | `ligands.py` | Tier each non-protein component `functional`/`ambiguous`/`artifact`; derive class labels (metal / small-molecule / nucleotide). Nucleotide is functional only with a verified protein↔NA assembly interface. **Annotate, never drop.** |
180
+ | 4 — ligands | `ligands.py` | Tier each non-protein component `functional`/`ambiguous`/`artifact`; derive class labels (metal / small-molecule / nucleic-acid). `nucleic_acid` = a protein↔DNA/RNA *complex* (verified assembly interface), **not** a bound mononucleotide. **Annotate, never drop.** |
167
181
  | 5 — cluster | `cluster.py` | Group protein entities by RCSB precomputed cluster id at `identity_threshold`; canonical key = smallest member id. |
168
182
  | 6 — split | `split.py` | Deterministic hash → train/val/test; assert no cluster spans two splits; audit residual secondary-chain overlap. |
169
183
  | 7 — manifest | `manifest.py` | Emit lock + manifest + registry (all deterministic, no wall-clock fields). |
@@ -213,16 +227,30 @@ machine-readable reason, from RCSB metadata signals:
213
227
  **Holo gating (metadata-only).** Presence isn't enough. A small molecule or metal
214
228
  is `functional` only if RCSB reports it *contacting* the protein (`bound_components`)
215
229
  or it has a measured binding affinity; an unbound one is `ambiguous`. A DNA/RNA
216
- chain is `functional` *nucleotide* only when the biological assembly has a verified
217
- protein↔nucleic-acid interface (`rcsb_interface_info.polymer_composition == "Protein/NA"`)
218
- — a co-deposited but non-contacting oligo is reported `ambiguous`, never silently
230
+ chain is `functional` `nucleic_acid` only when the biological assembly has a verified
231
+ protein↔nucleic-acid interface (`num_prot_na_interface_entities > 0`) — a
232
+ co-deposited but non-contacting oligo is reported `ambiguous`, never silently
219
233
  labelled. (Interfaces are RCSB-computed metadata, available for X-ray *and* cryo-EM,
220
234
  so no coordinates are downloaded.)
221
235
 
236
+ > The `nucleic_acid` class is the protein–nucleic-acid **complex** category (DNA/RNA
237
+ > polymer chains), matching LigandMPNN's "nucleotide" split. Bound *mononucleotide*
238
+ > ligands (ATP, GTP, NAD, SAM, …) are not this class — they fall under
239
+ > `small_molecule`.
240
+
222
241
  The His-tag/Ni curation catches a known blemish in the LigandMPNN metal set:
223
242
  structures whose only "metal site" is a poly-His tag chelating Ni/Co from
224
- affinity purification. Live examples from a real build: `101M {HEM:
225
- functional, SO4: artifact}`, `102L {BME: artifact, CL: artifact}`.
243
+ affinity purification. A poly-His run anywhere — or a short run at a chain
244
+ terminus (`histag_terminal_min_run`, catching 6×His tags left partial by
245
+ unmodeled or trimmed residues) — flags the entry's Ni/Co as an `artifact`.
246
+
247
+ But a real audit showed the deeper issue: **~96% of lone Ni/Co entries have no
248
+ His-tag in the deposited sequence at all** (the tag is trimmed from the SEQRES
249
+ record, not just unmodeled), so a sequence scan can never see it. Therefore, even with no
250
+ detectable tag, a *lone* Ni/Co (the entry's only metal) with no measured affinity
251
+ is demoted from `functional` to `ambiguous` — reported, not labelled. Real metals
252
+ (Zn, Mg, Fe, …), and Ni/Co backed by an affinity or sitting alongside a genuine
253
+ metal, are untouched. On the full PDB this re-tiers ~2.7% of the metal set.
226
254
 
227
255
  Crucially, **the structure always stays in its split** — a protein with a junk
228
256
  ion is still a good backbone; we just don't label the junk. A consumer wanting
@@ -255,11 +283,54 @@ for epoch in range(3):
255
283
  batch_ids = ds.train.sample_by_cluster(seed=epoch)
256
284
  ```
257
285
 
286
+ ## Sharing a split spec
287
+
288
+ The config **is** the shareable recipe. Everything that affects the split lives in
289
+ one small YAML file with a content hash, so you can hand someone that file and they can
290
+ reproduce your methodology exactly — like `params.yaml` in DVC. `if-split spec`
291
+ emits a portable, self-identifying version from any build or config:
292
+
293
+ ```bash
294
+ # Extract a stand-alone spec from a finished build (config is embedded in the manifest):
295
+ uv run if-split spec data/out/manifest.json --name "my-split" --author "you" \
296
+ --out my-split.ifsplit.yaml
297
+
298
+ # Anyone reproduces your split from just that file:
299
+ uv run if-split build --config my-split.ifsplit.yaml --out their/out
300
+ ```
301
+
302
+ The emitted file carries a `spec:` header that announces what it is and pins the
303
+ expected hash:
304
+
305
+ ```yaml
306
+ spec:
307
+ ifsplit_spec: ifsplit/config@1 # schema id — the file says what it is
308
+ name: my-split
309
+ author: you
310
+ created_with: if-split 0.1.0
311
+ expected_config_hash: 3b63318286fd2ac4994f34d10936be05
312
+ snapshot_date: '2026-05-31'
313
+ resolution_max_A: 3.5
314
+ # ... all output-affecting settings ...
315
+ ```
316
+
317
+ On load, if `expected_config_hash` no longer matches the settings (someone edited
318
+ them after stamping), IF-Split warns. The `spec:` metadata is **excluded from the
319
+ hash**, so name/author/description never change the split identity — two specs that
320
+ differ only in their labels produce byte-identical outputs.
321
+
322
+ | Artifact | Question it answers | Size |
323
+ |---|---|--:|
324
+ | `*.ifsplit.yaml` (or `config.yaml`) | *"How did you make this split?"* — the recipe | ~KB |
325
+ | `manifest.json` | *"What's in it?"* — counts, provenance, file index | ~KB |
326
+ | `dataset.lock` | *"Reproduce the exact bytes"* — pins entry set + candidates SHA | ~MB |
327
+
258
328
  ## Configuration
259
329
 
260
330
  Everything that affects the output lives in one YAML file
261
331
  ([`config/default.yaml`](config/default.yaml)); its canonical hash is embedded
262
- in every manifest, so two builds with the same hash used identical settings.
332
+ in every manifest, so two builds with the same hash used identical settings. It
333
+ doubles as a shareable **split spec** — see [Sharing a split spec](#sharing-a-split-spec).
263
334
 
264
335
  | Key | Default | Meaning |
265
336
  |---|---|---|
@@ -270,8 +341,9 @@ in every manifest, so two builds with the same hash used identical settings.
270
341
  | `excluded_het` | waters + common ions | Extra components forced to `artifact`. |
271
342
  | `use_biological_assembly` | `true` | Count residues from assembly 1, not the deposited asymmetric unit. |
272
343
  | `purification_metals` | `[NI, CO]` | Metals treated as IMAC tags; `[]` disables the heuristic. |
273
- | `histag_min_run` | `6` | His-run length that marks a purification tag. |
274
- | `exclude_purification_artifacts` | `true` | Demote His-tag metals to `artifact`. |
344
+ | `histag_min_run` | `6` | His-run length (anywhere) that marks a purification tag. |
345
+ | `histag_terminal_min_run` | `3` | Shorter His-run at a chain terminus that also counts as a tag (partial/unmodeled 6×His). |
346
+ | `exclude_purification_artifacts` | `true` | Demote His-tag metals to `artifact`; lone uncorroborated Ni/Co → `ambiguous`. |
275
347
  | `identity_threshold` | `0.30` | Clustering cutoff (RCSB levels: 30/50/70/90/95/100). |
276
348
  | `clustering_backend` | `precomputed` | `precomputed` (RCSB clusters) or `mmseqs2` (run your own). |
277
349
  | `split_fractions` | 0.80 / 0.10 / 0.10 | train / val / test. |
@@ -1,5 +1,9 @@
1
1
  # IF-Split
2
2
 
3
+ [![CI](https://github.com/WSobo/IF-Split/actions/workflows/ci.yml/badge.svg)](https://github.com/WSobo/IF-Split/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/if-split.svg)](https://pypi.org/project/if-split/)
5
+ [![Python](https://img.shields.io/pypi/pyversions/if-split.svg)](https://pypi.org/project/if-split/)
6
+
3
7
  **A reproducible, date-pinned, ligand-aware train/val/test splitter for the PDB.**
4
8
 
5
9
  IF-Split borrows the *split logic* of LigandMPNN (Dauparas et al., *Nature
@@ -41,16 +45,23 @@ records and sequences. Coordinates are an optional, downstream concern.
41
45
 
42
46
  ## Install
43
47
 
44
- Requires Python ≥ 3.11 and [`uv`](https://docs.astral.sh/uv/). `build` needs
45
- only network access to RCSB — no external binaries. (The optional `mmseqs2`
46
- clustering backend and the optional coordinate/featurization path via `gemmi`
47
- are Linux-native, so run under Linux/WSL if you use them.)
48
+ Requires Python ≥ 3.11. `build` needs only network access to RCSB — no external
49
+ binaries.
50
+
51
+ ```bash
52
+ pip install if-split # from PyPI
53
+ ```
54
+
55
+ Or for development, with [`uv`](https://docs.astral.sh/uv/):
48
56
 
49
57
  ```bash
58
+ git clone https://github.com/WSobo/IF-Split && cd IF-Split
50
59
  uv sync # creates .venv from uv.lock, installs deps + dev tools (ruff, pytest)
51
60
  ```
52
61
 
53
- `uv.lock` is committed, so environments are reproducible.
62
+ `uv.lock` is committed, so dev environments are reproducible. (The optional
63
+ coordinate/featurization path via `gemmi` is Linux-native, so run under
64
+ Linux/WSL if you use `fetch`.)
54
65
 
55
66
  ## Quickstart
56
67
 
@@ -72,6 +83,9 @@ uv run if-split build --registry data/out/splits.registry.json --out data/out2
72
83
 
73
84
  # OPTIONAL: download the actual structures for a built split (see below).
74
85
  uv run if-split fetch data/out/manifest.json --split test --out data/structures
86
+
87
+ # Emit a portable, shareable split spec (see "Sharing a split spec" below).
88
+ uv run if-split spec data/out/manifest.json --name my-split --out my-split.ifsplit.yaml
75
89
  ```
76
90
 
77
91
  ### Outputs (`--out` directory)
@@ -144,7 +158,7 @@ A `build` runs eight stages; none touch coordinates.
144
158
  |---|---|---|
145
159
  | 1 — enumerate | `enumerate.py`, `rcsb.py` | RCSB Search → entry IDs; Data API (GraphQL, batched) → sequences, ligands, residue counts, cluster membership → `candidates.jsonl`. |
146
160
  | 3 — filter | `parse.py` | Drop no-protein / no-sequence / oversized entries (assembly-1 residue count vs `max_total_residues`), plus optional wwPDB validation-report quality caps (clashscore, R-free, Ramachandran/rotamer/RSRZ) — all from metadata. Every drop is logged with its reason. |
147
- | 4 — ligands | `ligands.py` | Tier each non-protein component `functional`/`ambiguous`/`artifact`; derive class labels (metal / small-molecule / nucleotide). Nucleotide is functional only with a verified protein↔NA assembly interface. **Annotate, never drop.** |
161
+ | 4 — ligands | `ligands.py` | Tier each non-protein component `functional`/`ambiguous`/`artifact`; derive class labels (metal / small-molecule / nucleic-acid). `nucleic_acid` = a protein↔DNA/RNA *complex* (verified assembly interface), **not** a bound mononucleotide. **Annotate, never drop.** |
148
162
  | 5 — cluster | `cluster.py` | Group protein entities by RCSB precomputed cluster id at `identity_threshold`; canonical key = smallest member id. |
149
163
  | 6 — split | `split.py` | Deterministic hash → train/val/test; assert no cluster spans two splits; audit residual secondary-chain overlap. |
150
164
  | 7 — manifest | `manifest.py` | Emit lock + manifest + registry (all deterministic, no wall-clock fields). |
@@ -194,16 +208,30 @@ machine-readable reason, from RCSB metadata signals:
194
208
  **Holo gating (metadata-only).** Presence isn't enough. A small molecule or metal
195
209
  is `functional` only if RCSB reports it *contacting* the protein (`bound_components`)
196
210
  or it has a measured binding affinity; an unbound one is `ambiguous`. A DNA/RNA
197
- chain is `functional` *nucleotide* only when the biological assembly has a verified
198
- protein↔nucleic-acid interface (`rcsb_interface_info.polymer_composition == "Protein/NA"`)
199
- — a co-deposited but non-contacting oligo is reported `ambiguous`, never silently
211
+ chain is `functional` `nucleic_acid` only when the biological assembly has a verified
212
+ protein↔nucleic-acid interface (`num_prot_na_interface_entities > 0`) — a
213
+ co-deposited but non-contacting oligo is reported `ambiguous`, never silently
200
214
  labelled. (Interfaces are RCSB-computed metadata, available for X-ray *and* cryo-EM,
201
215
  so no coordinates are downloaded.)
202
216
 
217
+ > The `nucleic_acid` class is the protein–nucleic-acid **complex** category (DNA/RNA
218
+ > polymer chains), matching LigandMPNN's "nucleotide" split. Bound *mononucleotide*
219
+ > ligands (ATP, GTP, NAD, SAM, …) are not this class — they fall under
220
+ > `small_molecule`.
221
+
203
222
  The His-tag/Ni curation catches a known blemish in the LigandMPNN metal set:
204
223
  structures whose only "metal site" is a poly-His tag chelating Ni/Co from
205
- affinity purification. Live examples from a real build: `101M {HEM:
206
- functional, SO4: artifact}`, `102L {BME: artifact, CL: artifact}`.
224
+ affinity purification. A poly-His run anywhere — or a short run at a chain
225
+ terminus (`histag_terminal_min_run`, catching 6×His tags left partial by
226
+ unmodeled or trimmed residues) — flags the entry's Ni/Co as an `artifact`.
227
+
228
+ But a real audit showed the deeper issue: **~96% of lone Ni/Co entries have no
229
+ His-tag in the deposited sequence at all** (the tag is trimmed from the SEQRES
230
+ record, not just unmodeled), so a sequence scan can never see it. Therefore, even with no
231
+ detectable tag, a *lone* Ni/Co (the entry's only metal) with no measured affinity
232
+ is demoted from `functional` to `ambiguous` — reported, not labelled. Real metals
233
+ (Zn, Mg, Fe, …), and Ni/Co backed by an affinity or sitting alongside a genuine
234
+ metal, are untouched. On the full PDB this re-tiers ~2.7% of the metal set.
207
235
 
208
236
  Crucially, **the structure always stays in its split** — a protein with a junk
209
237
  ion is still a good backbone; we just don't label the junk. A consumer wanting
@@ -236,11 +264,54 @@ for epoch in range(3):
236
264
  batch_ids = ds.train.sample_by_cluster(seed=epoch)
237
265
  ```
238
266
 
267
+ ## Sharing a split spec
268
+
269
+ The config **is** the shareable recipe. Everything that affects the split lives in
270
+ one small YAML file with a content hash, so you can hand someone that file and they can
271
+ reproduce your methodology exactly — like `params.yaml` in DVC. `if-split spec`
272
+ emits a portable, self-identifying version from any build or config:
273
+
274
+ ```bash
275
+ # Extract a stand-alone spec from a finished build (config is embedded in the manifest):
276
+ uv run if-split spec data/out/manifest.json --name "my-split" --author "you" \
277
+ --out my-split.ifsplit.yaml
278
+
279
+ # Anyone reproduces your split from just that file:
280
+ uv run if-split build --config my-split.ifsplit.yaml --out their/out
281
+ ```
282
+
283
+ The emitted file carries a `spec:` header that announces what it is and pins the
284
+ expected hash:
285
+
286
+ ```yaml
287
+ spec:
288
+ ifsplit_spec: ifsplit/config@1 # schema id — the file says what it is
289
+ name: my-split
290
+ author: you
291
+ created_with: if-split 0.1.0
292
+ expected_config_hash: 3b63318286fd2ac4994f34d10936be05
293
+ snapshot_date: '2026-05-31'
294
+ resolution_max_A: 3.5
295
+ # ... all output-affecting settings ...
296
+ ```
297
+
298
+ On load, if `expected_config_hash` no longer matches the settings (someone edited
299
+ them after stamping), IF-Split warns. The `spec:` metadata is **excluded from the
300
+ hash**, so name/author/description never change the split identity — two specs that
301
+ differ only in their labels produce byte-identical outputs.
302
+
303
+ | Artifact | Question it answers | Size |
304
+ |---|---|--:|
305
+ | `*.ifsplit.yaml` (or `config.yaml`) | *"How did you make this split?"* — the recipe | ~KB |
306
+ | `manifest.json` | *"What's in it?"* — counts, provenance, file index | ~KB |
307
+ | `dataset.lock` | *"Reproduce the exact bytes"* — pins entry set + candidates SHA | ~MB |
308
+
239
309
  ## Configuration
240
310
 
241
311
  Everything that affects the output lives in one YAML file
242
312
  ([`config/default.yaml`](config/default.yaml)); its canonical hash is embedded
243
- in every manifest, so two builds with the same hash used identical settings.
313
+ in every manifest, so two builds with the same hash used identical settings. It
314
+ doubles as a shareable **split spec** — see [Sharing a split spec](#sharing-a-split-spec).
244
315
 
245
316
  | Key | Default | Meaning |
246
317
  |---|---|---|
@@ -251,8 +322,9 @@ in every manifest, so two builds with the same hash used identical settings.
251
322
  | `excluded_het` | waters + common ions | Extra components forced to `artifact`. |
252
323
  | `use_biological_assembly` | `true` | Count residues from assembly 1, not the deposited asymmetric unit. |
253
324
  | `purification_metals` | `[NI, CO]` | Metals treated as IMAC tags; `[]` disables the heuristic. |
254
- | `histag_min_run` | `6` | His-run length that marks a purification tag. |
255
- | `exclude_purification_artifacts` | `true` | Demote His-tag metals to `artifact`. |
325
+ | `histag_min_run` | `6` | His-run length (anywhere) that marks a purification tag. |
326
+ | `histag_terminal_min_run` | `3` | Shorter His-run at a chain terminus that also counts as a tag (partial/unmodeled 6×His). |
327
+ | `exclude_purification_artifacts` | `true` | Demote His-tag metals to `artifact`; lone uncorroborated Ni/Co → `ambiguous`. |
256
328
  | `identity_threshold` | `0.30` | Clustering cutoff (RCSB levels: 30/50/70/90/95/100). |
257
329
  | `clustering_backend` | `precomputed` | `precomputed` (RCSB clusters) or `mmseqs2` (run your own). |
258
330
  | `split_fractions` | 0.80 / 0.10 / 0.10 | train / val / test. |
@@ -14,11 +14,19 @@ use_biological_assembly: true # biounits, as in LigandMPNN (assembly 1)
14
14
  # Curation: purification-artifact detection (Stage 4). A poly-His tag that
15
15
  # coordinates Ni/Co is an artifact of affinity purification, not a biological
16
16
  # metal site — a known blemish in the LigandMPNN metal set. An entry whose only
17
- # metal is one of these AND that has a His-tag of >= histag_min_run residues is
18
- # flagged, and (by default) its purification metal is dropped from the metal
19
- # class. Set purification_metals: [] to disable.
17
+ # metal is a purification metal AND that carries a His-tag (a >= histag_min_run
18
+ # run anywhere, OR a >= histag_terminal_min_run run at a chain terminus to
19
+ # catch 6xHis tags left partial by unmodeled/trimmed residues) is flagged as an
20
+ # artifact. Set purification_metals: [] to disable.
21
+ #
22
+ # Even without a detectable tag, a *lone* Ni/Co with no measured affinity is
23
+ # demoted from `functional` to `ambiguous` (reported, not labelled): ~96% of
24
+ # lone Ni/Co in the PDB have no His-tag in the deposited sequence, so a bare
25
+ # contact can't be trusted as a biological metal site. Real metals (Zn, Mg, Fe,
26
+ # Mn, …) and Ni/Co alongside another real metal or with affinity are unaffected.
20
27
  purification_metals: ["NI", "CO"]
21
28
  histag_min_run: 6
29
+ histag_terminal_min_run: 3
22
30
  exclude_purification_artifacts: true
23
31
 
24
32
  identity_threshold: 0.30 # clustering cutoff (Data API levels: 30/50/70/90/95/100)
@@ -33,7 +41,7 @@ seed: 0
33
41
  # components (never individual entries -> no leakage) into test, in deterministic
34
42
  # hash order, skipping registry-pinned components (growth stays stable). A floor
35
43
  # larger than the available supply is met as far as possible; the shortfall is
36
- # reported in the manifest, not forced. Example: {metal: 500, nucleotide: 200}
44
+ # reported in the manifest, not forced. Example: {metal: 500, nucleic_acid: 200}
37
45
  test_min_per_class: {}
38
46
 
39
47
  # Quality filters (Stage 3) — wwPDB validation-report metrics, fetched as
@@ -18,7 +18,7 @@ uv run if-split build --config examples/IF-Split-2026.05.31/config.yaml --out da
18
18
  | [`test.json`](test.json) | 100 KB | test-set PDB ids (all of them) |
19
19
  | [`test/metal_test.json`](test/metal_test.json) | 36 KB | test ids with a functional **metal** site |
20
20
  | [`test/small_molecule_test.json`](test/small_molecule_test.json) | 28 KB | test ids with a functional **small molecule** |
21
- | [`test/nucleotide_test.json`](test/nucleotide_test.json) | 8 KB | test ids that are protein↔**nucleic-acid** complexes |
21
+ | [`test/nucleic_acid_test.json`](test/nucleic_acid_test.json) | 8 KB | test ids that are protein↔**nucleic-acid** complexes (DNA/RNA chains) |
22
22
  | [`manifest.json`](manifest.json) | 4 KB | provenance: config, counts, clustering stats, file index |
23
23
  | [`config.yaml`](config.yaml) | — | the exact config used (= `config/default.yaml`, cutoff pinned) |
24
24
  | [`STATS.txt`](STATS.txt) | — | `if-split stats` output |
@@ -50,16 +50,16 @@ output).
50
50
 
51
51
  ## Headline numbers
52
52
 
53
- - **223,422** candidates (X-ray + EM, ≤ 3.5 Å, released ≤ 2026-05-31)
54
- - **214,794 kept** — dropped 3,078 no-protein, 5,550 over-size (≥ 6000 residues)
55
- - **34,222** raw RCSB sequence clusters @ 30% identity → **19,587 leakage-safe
56
- components** after union-find merged **38,834** multi-chain bridging entries
53
+ - **223,419** candidates (X-ray + EM, ≤ 3.5 Å, released ≤ 2026-05-31)
54
+ - **214,791 kept** — dropped 3,078 no-protein, 5,550 over-size (≥ 6000 residues)
55
+ - **34,222** raw RCSB sequence clusters @ 30% identity → **19,589 leakage-safe
56
+ components** after union-find merged **38,832** multi-chain bridging entries
57
57
 
58
58
  | Split | Entries | Components | Component % |
59
59
  |---|--:|--:|--:|
60
- | train | 188,672 | 15,613 | 79.7% |
61
- | val | 13,726 | 1,993 | 10.2% |
62
- | test | 12,396 | 1,981 | 10.1% |
60
+ | train | 188,662 | 15,615 | 79.7% |
61
+ | val | 13,720 | 1,992 | 10.2% |
62
+ | test | 12,409 | 1,982 | 10.1% |
63
63
 
64
64
  **The split is balanced on sequence *components*, not entry counts** — that's why
65
65
  train holds ~88% of entries (redundant families like lysozyme carry many entries
@@ -68,13 +68,17 @@ per component). Splitting on components is what prevents cross-split leakage; us
68
68
 
69
69
  ## Curation highlights (holo-gated, annotate-never-destroy)
70
70
 
71
- - **Test set, functional tier:** metal 4,099 · small-molecule 3,530 · nucleotide 586
72
- - **Test set, ambiguous (reported, not labelled):** small-molecule 3,596 · metal 73 · nucleotide 1
73
- The small-molecule ambiguous count ≈ the functional one: roughly half of
74
- bound-looking small molecules aren't corroborated by contact or a measured
75
- affinity, so they're flagged rather than silently labelled.
76
- - **404** His-tag/Ni(Co) purification artifacts flagged and demoted from the metal
77
- class — the LigandMPNN metal-set blemish, caught automatically.
71
+ - **Test set, functional tier:** metal 4,012 · small-molecule 6,713 · nucleic-acid 586
72
+ - **Test set, ambiguous (reported, not labelled):** small-molecule 419 · metal 166 · nucleic-acid 1
73
+ - Functional small molecules far exceed ambiguous (6,713 vs 419): the
74
+ `is_subject_of_investigation` gate recovers non-covalently bound cofactors
75
+ (FAD/NAD/FMN/NADP) and inhibitors that the bond-based contact field misses.
76
+ - The metal ambiguous count includes lone, uncorroborated Ni/Co (likely IMAC
77
+ artifacts whose His-tag is absent from the deposited sequence) demoted from
78
+ functional, not dropped.
79
+ - **415** His-tag/Ni(Co) purification artifacts flagged and demoted from the metal
80
+ class — the LigandMPNN metal-set blemish, caught automatically (full His run or
81
+ a partial terminal tag).
78
82
 
79
83
  Every structure stays in its split regardless of ligand quality; only the labels
80
84
  and confidence tiers change.
@@ -0,0 +1,20 @@
1
+ IF-Split-2026.05.31 (config 3b63318286fd2ac4994f34d10936be05)
2
+ candidates: 223419 kept: 214791 dropped: 8628
3
+ - no_protein_entity: 3078
4
+ - too_large: 5550
5
+ clustering: precomputed @ 30% components=19589 (from 34222 raw) multichain=38832
6
+ splits (entries / components):
7
+ train: 188662 entries 15615 components
8
+ val : 13720 entries 1992 components
9
+ test : 12409 entries 1982 components
10
+ test set by ligand class (functional tier):
11
+ metal: 4012
12
+ nucleic_acid: 586
13
+ small_molecule: 6713
14
+ test set ambiguous (reported, not labelled):
15
+ metal: 166
16
+ nucleic_acid: 1
17
+ small_molecule: 419
18
+ His-tag/Ni purification artifacts flagged: 415
19
+ split files: test.json, train.json, val.json
20
+ per-class test files: test/metal_test.json, test/nucleic_acid_test.json, test/small_molecule_test.json
@@ -1,13 +1,13 @@
1
1
  {
2
2
  "candidates": {
3
- "count": 223422,
4
- "sha256": "62116f0183c2eb9a1c3bf69a36331748b920a1b9e182c65eae7518b0c86336ad"
3
+ "count": 223419,
4
+ "sha256": "302da149ddc0bc8ef62e92fe8f249146f9f37119fca7ea4ab5fbe26f3c6b15d7"
5
5
  },
6
6
  "clustering": {
7
7
  "backend": "precomputed",
8
8
  "identity": 30,
9
- "multichain_entries": 38834,
10
- "n_clusters": 19587,
9
+ "multichain_entries": 38832,
10
+ "n_clusters": 19589,
11
11
  "n_raw_clusters": 34222,
12
12
  "unclustered_entries": 316
13
13
  },
@@ -26,6 +26,7 @@
26
26
  "ELECTRON MICROSCOPY"
27
27
  ],
28
28
  "histag_min_run": 6,
29
+ "histag_terminal_min_run": 3,
29
30
  "identity_threshold": 0.3,
30
31
  "ligand_context_radius_A": 8.0,
31
32
  "max_clashscore": null,
@@ -49,9 +50,10 @@
49
50
  "val": 0.1
50
51
  },
51
52
  "split_salt": "snapsplit-v1",
53
+ "test_min_per_class": {},
52
54
  "use_biological_assembly": true
53
55
  },
54
- "config_hash": "eaa0d1757f53cb929794c21e7d47fcf4",
56
+ "config_hash": "3b63318286fd2ac4994f34d10936be05",
55
57
  "dataset_version": "IF-Split-2026.05.31",
56
58
  "files": {
57
59
  "clusters": "clusters.json",
@@ -64,7 +66,7 @@
64
66
  },
65
67
  "test_by_class": {
66
68
  "metal": "test/metal_test.json",
67
- "nucleotide": "test/nucleotide_test.json",
69
+ "nucleic_acid": "test/nucleic_acid_test.json",
68
70
  "small_molecule": "test/small_molecule_test.json"
69
71
  }
70
72
  },
@@ -74,56 +76,57 @@
74
76
  "too_large": 5550
75
77
  },
76
78
  "dropped": 8628,
77
- "kept": 214794
79
+ "kept": 214791
78
80
  },
79
81
  "if_split_version": "0.1.0",
80
82
  "ligands": {
81
- "n_purification_artifacts": 404
83
+ "n_purification_artifacts": 415
82
84
  },
83
85
  "manifest_schema": "if-split/manifest@2",
84
86
  "splits": {
85
87
  "cluster_counts": {
86
- "test": 1981,
87
- "train": 15613,
88
- "val": 1993
88
+ "test": 1982,
89
+ "train": 15615,
90
+ "val": 1992
89
91
  },
90
92
  "entry_counts": {
91
- "test": 12396,
92
- "train": 188672,
93
- "val": 13726
93
+ "test": 12409,
94
+ "train": 188662,
95
+ "val": 13720
94
96
  },
95
97
  "per_split_ambiguous_counts": {
96
98
  "test": {
97
- "metal": 73,
98
- "nucleotide": 1,
99
- "small_molecule": 3596
99
+ "metal": 166,
100
+ "nucleic_acid": 1,
101
+ "small_molecule": 419
100
102
  },
101
103
  "train": {
102
- "metal": 1233,
103
- "nucleotide": 11,
104
- "small_molecule": 58418
104
+ "metal": 2061,
105
+ "nucleic_acid": 11,
106
+ "small_molecule": 6144
105
107
  },
106
108
  "val": {
107
- "metal": 54,
108
- "small_molecule": 4059
109
+ "metal": 562,
110
+ "small_molecule": 514
109
111
  }
110
112
  },
111
113
  "per_split_class_counts": {
112
114
  "test": {
113
- "metal": 4099,
114
- "nucleotide": 586,
115
- "small_molecule": 3530
115
+ "metal": 4012,
116
+ "nucleic_acid": 586,
117
+ "small_molecule": 6713
116
118
  },
117
119
  "train": {
118
- "metal": 55952,
119
- "nucleotide": 9948,
120
- "small_molecule": 50497
120
+ "metal": 55113,
121
+ "nucleic_acid": 9948,
122
+ "small_molecule": 102762
121
123
  },
122
124
  "val": {
123
- "metal": 4535,
124
- "nucleotide": 539,
125
- "small_molecule": 3147
125
+ "metal": 4019,
126
+ "nucleic_acid": 539,
127
+ "small_molecule": 6692
126
128
  }
127
- }
129
+ },
130
+ "test_minimum_shortfalls": {}
128
131
  }
129
132
  }