if-split 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- if_split-0.1.0.dist-info/METADATA +312 -0
- if_split-0.1.0.dist-info/RECORD +20 -0
- if_split-0.1.0.dist-info/WHEEL +4 -0
- if_split-0.1.0.dist-info/entry_points.txt +2 -0
- if_split-0.1.0.dist-info/licenses/LICENSE +21 -0
- ifsplit/__init__.py +8 -0
- ifsplit/__main__.py +8 -0
- ifsplit/cli.py +317 -0
- ifsplit/cluster.py +130 -0
- ifsplit/config.py +146 -0
- ifsplit/dataset.py +112 -0
- ifsplit/download.py +229 -0
- ifsplit/enumerate.py +111 -0
- ifsplit/hydrate.py +216 -0
- ifsplit/ligands.py +267 -0
- ifsplit/manifest.py +417 -0
- ifsplit/parse.py +111 -0
- ifsplit/rcsb.py +251 -0
- ifsplit/schema.py +241 -0
- ifsplit/split.py +177 -0
ifsplit/parse.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Stage 3 - Filter candidates on metadata (no coordinates).
|
|
2
|
+
|
|
3
|
+
Operates on the records in ``candidates.jsonl``. Drops entries that:
|
|
4
|
+
|
|
5
|
+
- have no protein polymer entity (``no_protein_entity``),
|
|
6
|
+
- have a protein entity but no usable sequence (``no_protein_sequence``),
|
|
7
|
+
- meet or exceed the residue cap (``too_large``): ``total_residues >= max_total_residues``,
|
|
8
|
+
- violate an (optional) wwPDB validation-report quality cap — clashscore, R-free,
|
|
9
|
+
Ramachandran/rotamer/RSRZ outliers — or lack a report when one is required.
|
|
10
|
+
|
|
11
|
+
When ``use_biological_assembly`` the residue count is taken from assembly 1
|
|
12
|
+
(``<entry>-1``); otherwise the deposited polymer monomer count is used. Quality
|
|
13
|
+
metrics come from the metadata API (no coordinates); a cap fires only when both
|
|
14
|
+
the cap and the metric are present, so a missing metric never drops an entry.
|
|
15
|
+
Every drop is recorded with its reason so the build is auditable.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from .config import Config
|
|
21
|
+
from .schema import CandidateRecord
|
|
22
|
+
|
|
23
|
+
# Reason codes written to the drop log by this stage.

# Structural filters.
DROP_NO_PROTEIN = "no_protein_entity"
DROP_NO_SEQUENCE = "no_protein_sequence"
DROP_TOO_LARGE = "too_large"

# Validation-report quality caps.
DROP_CLASHSCORE = "clashscore_too_high"
DROP_RFREE = "rfree_too_high"
DROP_RAMACHANDRAN = "ramachandran_outliers_too_high"
DROP_ROTAMER = "rotamer_outliers_too_high"
DROP_RSRZ = "rsrz_outliers_too_high"

# Entry has no validation report although one is required.
DROP_NO_VALIDATION = "no_validation_report"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def assembly1_residue_count(record: CandidateRecord) -> int | None:
    """Return the residue count of biological assembly 1, when available.

    Assembly 1 is recognised by an id ending in ``-1``. If no id matches, the
    count of the lexicographically first assembly id is returned instead.
    Returns ``None`` when the record lists no assemblies.
    """
    counts = record.assemblies
    if not counts:
        return None
    ordered_ids = sorted(counts)
    # ordered_ids is non-empty here, so the fallback index is always valid.
    chosen = next((aid for aid in ordered_ids if aid.endswith("-1")), ordered_ids[0])
    return counts[chosen]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def total_residues(record: CandidateRecord, cfg: Config) -> int | None:
    """Return the residue count the size filter should compare against.

    With ``cfg.use_biological_assembly`` set, the assembly-1 count is preferred;
    if that is unavailable (or the flag is off) the deposited residue count is
    returned, which may itself be ``None``.
    """
    assembly_count = (
        assembly1_residue_count(record) if cfg.use_biological_assembly else None
    )
    if assembly_count is None:
        return record.deposited_residues
    return assembly_count
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def quality_drop(record: CandidateRecord, cfg: Config) -> tuple[str, float] | None:
    """Report the first validation-report cap the record breaks, if any.

    Caps are tested in a fixed order: clashscore, R-free, Ramachandran
    outliers, rotamer outliers, RSRZ outliers. A cap only applies when both
    the configured limit and the record's metric are present and the metric is
    strictly greater than the limit, so an absent metric never causes a drop.
    When ``cfg.require_validation_report`` is set and the record has no report,
    the record is dropped with a placeholder value of ``0.0``.

    Returns ``(reason, value)`` for the drop log, or ``None`` to keep the entry.
    """
    metrics = record.quality
    if cfg.require_validation_report and not metrics.has_report:
        return (DROP_NO_VALIDATION, 0.0)

    # (configured limit, observed metric, drop reason), in priority order.
    caps_in_order = (
        (cfg.max_clashscore, metrics.clashscore, DROP_CLASHSCORE),
        (cfg.max_rfree, metrics.rfree, DROP_RFREE),
        (
            cfg.max_ramachandran_outlier_pct,
            metrics.ramachandran_outlier_pct,
            DROP_RAMACHANDRAN,
        ),
        (cfg.max_rotamer_outlier_pct, metrics.rotamer_outlier_pct, DROP_ROTAMER),
        (cfg.max_rsrz_outlier_pct, metrics.rsrz_outlier_pct, DROP_RSRZ),
    )
    violations = (
        (reason, observed)
        for limit, observed, reason in caps_in_order
        if limit is not None and observed is not None and observed > limit
    )
    return next(violations, None)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def filter_candidates(
    records: list[CandidateRecord], cfg: Config
) -> tuple[list[CandidateRecord], list[dict]]:
    """Split ``records`` into kept entries and logged drops.

    Filters run in order and the first failure wins: no protein entity, no
    protein sequence, residue count at or above ``cfg.max_total_residues``,
    then the validation-report caps. Both returned lists are sorted by
    ``entry_id``; each drop is a dict ``{entry_id, reason, ...}``.
    """
    survivors: list[CandidateRecord] = []
    rejected: list[dict] = []
    for rec in records:
        protein_entities = [ent for ent in rec.polymer_entities if ent.is_protein]
        if not protein_entities:
            rejected.append({"entry_id": rec.entry_id, "reason": DROP_NO_PROTEIN})
        elif not any(ent.seq for ent in protein_entities):
            rejected.append({"entry_id": rec.entry_id, "reason": DROP_NO_SEQUENCE})
        else:
            size = total_residues(rec, cfg)
            if size is not None and size >= cfg.max_total_residues:
                rejected.append(
                    {"entry_id": rec.entry_id, "reason": DROP_TOO_LARGE, "residues": size}
                )
                continue
            verdict = quality_drop(rec, cfg)
            if verdict is None:
                survivors.append(rec)
            else:
                reason, value = verdict
                rejected.append(
                    {"entry_id": rec.entry_id, "reason": reason, "value": value}
                )
    survivors.sort(key=lambda rec: rec.entry_id)
    rejected.sort(key=lambda drop: drop["entry_id"])
    return survivors, rejected
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def drop_summary(drops: list[dict]) -> dict[str, int]:
    """Tally drops per reason.

    Keys appear in order of first occurrence; callers that need a
    deterministic order sort the keys themselves.
    """
    tally: dict[str, int] = {}
    for drop in drops:
        reason = drop["reason"]
        if reason in tally:
            tally[reason] += 1
        else:
            tally[reason] = 1
    return tally
|
ifsplit/rcsb.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""Thin, polite RCSB API client (Search v2 + Data GraphQL).
|
|
2
|
+
|
|
3
|
+
No coordinates are ever fetched here — only metadata and sequences (see
|
|
4
|
+
PLAN.md §1.5). Endpoints are centralized so field paths live in one place, and
|
|
5
|
+
all requests retry with backoff on transient failures (429 / 5xx / network).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import time
|
|
11
|
+
from collections.abc import Iterator
|
|
12
|
+
|
|
13
|
+
import httpx
|
|
14
|
+
|
|
15
|
+
from .config import Config
|
|
16
|
+
|
|
17
|
+
# RCSB service endpoints.
SEARCH_URL = "https://search.rcsb.org/rcsbsearch/v2/query"
DATA_GRAPHQL_URL = "https://data.rcsb.org/graphql"

# Conservative page/batch sizes: large enough to keep request counts sane,
# small enough to stay friendly and within response-size limits. The Data API
# caps batch endpoints at 1000 ids.
SEARCH_PAGE_ROWS = 5000
DATA_BATCH_SIZE = 200

# HTTP status codes that the client retries instead of returning.
_RETRY_STATUS = {429, 500, 502, 503, 504}
|
|
27
|
+
|
|
28
|
+
# GraphQL: everything Stages 3-6 need, no coordinates. Validated live against
|
|
29
|
+
# 4HHB / 1A1F / 1IEP. Notable curation signals:
|
|
30
|
+
# - rcsb_cluster_membership: precomputed cluster id per identity level
|
|
31
|
+
# (30/50/70/90/95/100) for protein entities -> no cluster-file download.
|
|
32
|
+
# - rcsb_entry_info.nonpolymer_bound_components: comp ids that actually contact
|
|
33
|
+
# the protein (the cheap buffer-vs-ligand gate; e.g. 4HHB -> ["HEM"], its
|
|
34
|
+
# PO4/Cl buffer is absent).
|
|
35
|
+
# - rcsb_binding_affinity.comp_id: comps with a measured affinity (sparse but a
|
|
36
|
+
# strong positive "this is a real ligand" signal).
|
|
37
|
+
# - pdbx_vrpt_summary_{geometry,diffraction,em}: wwPDB validation-report metrics
|
|
38
|
+
# (clashscore, Ramachandran/rotamer outliers, R-free, RSRZ). Geometry is
|
|
39
|
+
# reported for X-ray AND EM; diffraction is X-ray-only; each comes back as a
|
|
40
|
+
# 1-element list. Metadata, not coordinates — keeps the no-download invariant.
|
|
41
|
+
# - rcsb_assembly_info.num_prot_na_interface_entities: RCSB-computed count of
|
|
42
|
+
# protein<->nucleic-acid interface entities in the assembly; > 0 verifies a real
|
|
43
|
+
# protein/NA contact (the holo gate for the nucleotide class). A single integer
|
|
44
|
+
# in the assembly-info block — far cheaper than listing every interface object
|
|
45
|
+
# (a ribosome has hundreds). Present for X-ray AND EM.
|
|
46
|
+
_ENTRY_QUERY = """
|
|
47
|
+
query($ids: [String!]!) {
|
|
48
|
+
entries(entry_ids: $ids) {
|
|
49
|
+
rcsb_id
|
|
50
|
+
exptl { method }
|
|
51
|
+
rcsb_entry_info {
|
|
52
|
+
resolution_combined
|
|
53
|
+
deposited_polymer_monomer_count
|
|
54
|
+
nonpolymer_bound_components
|
|
55
|
+
}
|
|
56
|
+
rcsb_accession_info { initial_release_date }
|
|
57
|
+
rcsb_binding_affinity { comp_id }
|
|
58
|
+
pdbx_vrpt_summary_geometry {
|
|
59
|
+
clashscore
|
|
60
|
+
percent_ramachandran_outliers
|
|
61
|
+
percent_rotamer_outliers
|
|
62
|
+
}
|
|
63
|
+
pdbx_vrpt_summary_diffraction {
|
|
64
|
+
DCC_Rfree
|
|
65
|
+
percent_RSRZ_outliers
|
|
66
|
+
}
|
|
67
|
+
pdbx_vrpt_summary_em {
|
|
68
|
+
atom_inclusion_backbone
|
|
69
|
+
}
|
|
70
|
+
polymer_entities {
|
|
71
|
+
rcsb_id
|
|
72
|
+
entity_poly {
|
|
73
|
+
rcsb_entity_polymer_type
|
|
74
|
+
pdbx_seq_one_letter_code_can
|
|
75
|
+
}
|
|
76
|
+
rcsb_cluster_membership { cluster_id identity }
|
|
77
|
+
}
|
|
78
|
+
nonpolymer_entities {
|
|
79
|
+
nonpolymer_comp { chem_comp { id name formula type } }
|
|
80
|
+
}
|
|
81
|
+
assemblies {
|
|
82
|
+
rcsb_id
|
|
83
|
+
rcsb_assembly_info {
|
|
84
|
+
polymer_monomer_count
|
|
85
|
+
num_prot_na_interface_entities
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
"""
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class RcsbError(RuntimeError):
    """Signals an RCSB request that still failed once retries were used up."""
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class RcsbClient:
    """Minimal client for the two RCSB services IF-Split needs.

    Owns one ``httpx.Client`` and routes every request through :meth:`_post`,
    which retries transient failures with exponential backoff. Usable as a
    context manager; leaving the ``with`` block closes the HTTP client.
    """

    def __init__(
        self,
        *,
        timeout: float = 60.0,
        max_retries: int = 4,
        backoff_base: float = 1.5,
        sleep=time.sleep,
    ) -> None:
        """Create the underlying HTTP client.

        Args:
            timeout: Timeout handed to ``httpx.Client``.
            max_retries: Number of retries after the first attempt.
            backoff_base: Base of the exponential delay; the wait before retry
                ``k`` (0-based) is ``backoff_base ** k``.
            sleep: Callable used to wait between attempts (injectable so the
                delay can be replaced).
        """
        self._client = httpx.Client(
            timeout=timeout,
            headers={"User-Agent": "IF-Split/0.1 (reproducible PDB splitter)"},
        )
        self._max_retries = max_retries
        self._backoff_base = backoff_base
        self._sleep = sleep

    def __enter__(self) -> RcsbClient:
        return self

    def __exit__(self, *exc) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    # --- low-level POST with retry/backoff ---
    def _post(self, url: str, json_body: dict) -> httpx.Response:
        """POST ``json_body`` to ``url``, retrying transient failures.

        A response whose status is not in ``_RETRY_STATUS`` is returned as-is
        (including other 4xx codes; callers decide how to treat them). An
        ``httpx.HTTPError`` or a retryable status triggers another attempt
        after sleeping ``backoff_base ** attempt``.

        Raises:
            RcsbError: when all ``max_retries + 1`` attempts failed; the last
                underlying failure is attached as ``__cause__``.
        """
        last_exc: Exception | None = None
        for attempt in range(self._max_retries + 1):
            try:
                resp = self._client.post(url, json=json_body)
            except httpx.HTTPError as exc:  # network/timeout
                last_exc = exc
            else:
                if resp.status_code not in _RETRY_STATUS:
                    return resp
                last_exc = RcsbError(f"{url} -> HTTP {resp.status_code}")
            # No point sleeping after the final attempt.
            if attempt < self._max_retries:
                self._sleep(self._backoff_base**attempt)
        raise RcsbError(
            f"request to {url} failed after retries: {last_exc}"
        ) from last_exc

    # --- Search API: entry IDs matching the snapshot filters ---
    def _search_query_body(self, cfg: Config) -> dict:
        """Build the Search request body (without ``request_options``).

        Combines three AND-ed terminal filters taken from ``cfg``: experimental
        method, maximum resolution, and initial release date up to the end of
        the snapshot day.
        """
        cutoff = f"{cfg.snapshot_date.isoformat()}T23:59:59Z"
        return {
            "query": {
                "type": "group",
                "logical_operator": "and",
                "nodes": [
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": "exptl.method",
                            "operator": "in",
                            "value": list(cfg.experimental_methods),
                        },
                    },
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": "rcsb_entry_info.resolution_combined",
                            "operator": "less_or_equal",
                            "value": cfg.resolution_max_A,
                        },
                    },
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": "rcsb_accession_info.initial_release_date",
                            "operator": "less_or_equal",
                            "value": cutoff,
                        },
                    },
                ],
            },
            "return_type": "entry",
        }

    def count_entries(self, cfg: Config) -> int:
        """Total entries matching the snapshot filters (no paging).

        An HTTP 204 response, which ``search_entry_ids`` treats as "no
        results", yields ``0`` rather than a JSON decode failure on the empty
        body.
        """
        body = self._search_query_body(cfg)
        body["request_options"] = {"return_counts": True}
        resp = self._post(SEARCH_URL, body)
        if resp.status_code == 204:  # no results
            return 0
        resp.raise_for_status()
        return int(resp.json()["total_count"])

    def search_entry_ids(
        self, cfg: Config, limit: int | None = None, *, progress=None
    ) -> list[str]:
        """All matching entry IDs, sorted ascending for determinism.

        ``limit`` (dev convenience) takes the first N in sorted order, so a
        limited run is itself reproducible. ``progress`` (optional) is called with
        a status string after each page, so a full-PDB enumeration isn't silent.
        """
        # The query itself does not change between pages; only paging does.
        query_body = self._search_query_body(cfg)
        ids: list[str] = []
        start = 0
        while True:
            rows = SEARCH_PAGE_ROWS
            if limit is not None:
                remaining = limit - len(ids)
                if remaining <= 0:
                    break
                rows = min(rows, remaining)
            body = {
                **query_body,
                "request_options": {
                    "paginate": {"start": start, "rows": rows},
                    "sort": [
                        {
                            "sort_by": "rcsb_entry_container_identifiers.entry_id",
                            "direction": "asc",
                        }
                    ],
                },
            }
            resp = self._post(SEARCH_URL, body)
            if resp.status_code == 204:  # no (more) results
                break
            resp.raise_for_status()
            page = resp.json().get("result_set", [])
            if not page:
                break
            ids.extend(hit["identifier"] for hit in page)
            start += len(page)
            if progress:
                progress(f"search: {len(ids)} entry ids found...")
            # A short page means the result set is exhausted.
            if len(page) < rows:
                break
        return ids

    # --- Data API: batched metadata enrichment ---
    def fetch_entries(self, ids: list[str]) -> Iterator[dict]:
        """Yield raw Data-API entry objects for ``ids``, batched.

        Requests are sent in chunks of ``DATA_BATCH_SIZE`` ids. ``None``
        entries in a response are skipped.

        Raises:
            RcsbError: when a response carries GraphQL ``errors``.
        """
        for batch in _chunks(ids, DATA_BATCH_SIZE):
            resp = self._post(
                DATA_GRAPHQL_URL,
                {"query": _ENTRY_QUERY, "variables": {"ids": batch}},
            )
            resp.raise_for_status()
            payload = resp.json()
            if payload.get("errors"):
                raise RcsbError(f"GraphQL errors: {payload['errors'][:1]}")
            for entry in payload["data"]["entries"]:
                if entry is not None:
                    yield entry
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _chunks(seq: list[str], size: int) -> Iterator[list[str]]:
|
|
250
|
+
for i in range(0, len(seq), size):
|
|
251
|
+
yield seq[i : i + size]
|
ifsplit/schema.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
"""Candidate-record schema + canonical serialization.
|
|
2
|
+
|
|
3
|
+
A ``CandidateRecord`` is one entry's snapshot metadata — everything Stages 3-6
|
|
4
|
+
need, with no coordinates. ``candidates.jsonl`` is the canonical, byte-stable
|
|
5
|
+
serialization of these records (sorted entries, sorted keys), which is what the
|
|
6
|
+
snapshot lock hashes.
|
|
7
|
+
|
|
8
|
+
PDB-ID compatibility: identifiers (entry_id, entity_id) are stored *verbatim* as
|
|
9
|
+
returned by the RCSB Data API in ``rcsb_id`` — never sliced, length-validated, or
|
|
10
|
+
case-folded. This makes the schema agnostic to legacy 4-character IDs (``4HHB``,
|
|
11
|
+
entity ``4HHB_1``) and the extended ``pdb_xxxxxxxx`` form alike.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import hashlib
|
|
17
|
+
import json
|
|
18
|
+
from collections.abc import Iterable
|
|
19
|
+
|
|
20
|
+
from pydantic import BaseModel, ConfigDict
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PolymerEntity(BaseModel):
    """One polymer entity of an entry: id, polymer type, sequence, clusters."""

    model_config = ConfigDict(extra="forbid")

    entity_id: str  # RCSB rcsb_id, verbatim (e.g. "4HHB_1")
    polymer_type: str  # rcsb_entity_polymer_type: Protein / DNA / RNA / NA-hybrid / Other
    seq_len: int
    seq: str
    # RCSB precomputed cluster ids by identity level, e.g. {30: 48, 95: 1239}.
    # Empty for non-protein entities (RCSB clusters proteins only).
    cluster_ids: dict[int, int] = {}

    @property
    def is_protein(self) -> bool:
        """True when the polymer type contains ``PROTEIN`` (case-insensitive)."""
        return "PROTEIN" in self.polymer_type.upper()

    @property
    def is_nucleic(self) -> bool:
        """True for nucleic-acid polymer types (case-insensitive).

        Matches ``DNA``, ``RNA`` and ``NA-hybrid``. The hybrid type is checked
        explicitly because its name contains neither ``DNA`` nor ``RNA``.
        """
        kind = self.polymer_type.upper()
        return any(tag in kind for tag in ("DNA", "RNA", "NA-HYBRID"))
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class NonpolymerComp(BaseModel):
    """A non-polymer chemical component; only ``comp_id`` is required."""

    model_config = ConfigDict(extra="forbid")

    # chem_comp id, e.g. "HEM", "ZN" (CCD codes are uppercase)
    comp_id: str
    name: str | None = None
    formula: str | None = None
    comp_type: str | None = None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class QualityMetrics(BaseModel):
    """Summary metrics from the wwPDB validation report (no coordinates).

    Geometry metrics (clashscore, Ramachandran, rotamer) exist for X-ray and
    cryo-EM entries alike. Diffraction metrics (R-free, RSRZ) are X-ray only,
    and the EM map-fit metric (backbone atom inclusion) is cryo-EM only. Every
    field may be ``None`` when the report lacks it; a missing metric never
    penalizes an entry (see :func:`ifsplit.parse.quality_drop`).
    """

    model_config = ConfigDict(extra="forbid")

    # all-atom clashes / 1000 atoms (lower better)
    clashscore: float | None = None
    # % backbone Ramachandran outliers
    ramachandran_outlier_pct: float | None = None
    # % sidechain rotamer outliers
    rotamer_outlier_pct: float | None = None
    # diffraction DCC_Rfree (X-ray only)
    rfree: float | None = None
    # diffraction % RSRZ outliers (X-ray only)
    rsrz_outlier_pct: float | None = None
    # EM backbone atom inclusion (higher better)
    em_backbone_inclusion: float | None = None

    @property
    def has_report(self) -> bool:
        """Whether at least one metric was supplied by the validation report."""
        metrics = (
            self.clashscore,
            self.ramachandran_outlier_pct,
            self.rotamer_outlier_pct,
            self.rfree,
            self.rsrz_outlier_pct,
            self.em_backbone_inclusion,
        )
        return not all(metric is None for metric in metrics)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class CandidateRecord(BaseModel):
    """Snapshot metadata for a single PDB entry."""

    model_config = ConfigDict(extra="forbid")

    entry_id: str
    methods: list[str]
    resolution_A: float | None
    release_date: str  # YYYY-MM-DD
    deposited_residues: int | None
    assemblies: dict[str, int]  # assembly_id -> polymer_monomer_count
    polymer_entities: list[PolymerEntity]
    nonpolymer_comps: list[NonpolymerComp]
    # Curation signals (Stage 4). comp ids that actually contact the protein
    # (rcsb_entry_info.nonpolymer_bound_components) and comp ids with a measured
    # binding affinity (rcsb_binding_affinity). Both are the buffer-vs-ligand gate.
    bound_components: list[str] = []
    affinity_comp_ids: list[str] = []
    # RCSB-computed count of protein<->nucleic-acid interface entities across the
    # entry's assemblies (rcsb_assembly_info.num_prot_na_interface_entities). > 0
    # verifies a real protein/NA contact (Stage 4 holo gate for the nucleotide
    # class) — distinguishing a true complex from a co-deposited oligo.
    protein_na_interface_count: int = 0
    # wwPDB validation-report summary (Stage 3 quality filters). All fields
    # optional; defaults to an empty report when RCSB has none.
    quality: QualityMetrics = QualityMetrics()

    @classmethod
    def from_data_api(cls, entry: dict) -> CandidateRecord:
        """Construct a record from one raw Data-API entry object.

        Missing or null sections are treated as empty. Lists are sorted and
        component ids upper-cased so the same input always yields the same
        record; entry and entity ids are copied verbatim from ``rcsb_id``.
        """
        # Verbatim canonical id from RCSB (legacy or extended) — never reformat.
        entry_id = entry["rcsb_id"]

        method_names = [
            exp["method"] for exp in (entry.get("exptl") or []) if exp.get("method")
        ]
        method_names.sort()

        entry_info = entry.get("rcsb_entry_info") or {}
        resolutions = entry_info.get("resolution_combined") or []
        best_resolution = min(resolutions) if resolutions else None
        deposited_count = entry_info.get("deposited_polymer_monomer_count")
        bound_ids = sorted(
            {comp.upper() for comp in (entry_info.get("nonpolymer_bound_components") or [])}
        )

        accession = entry.get("rcsb_accession_info") or {}
        released = accession.get("initial_release_date")
        # Keep only the leading YYYY-MM-DD part of the timestamp.
        release_day = released[:10] if released else ""

        affinity_ids = sorted(
            {
                hit["comp_id"].upper()
                for hit in (entry.get("rcsb_binding_affinity") or [])
                if hit.get("comp_id")
            }
        )

        assembly_counts: dict[str, int] = {}
        interface_total = 0
        for assembly in entry.get("assemblies") or []:
            assembly_id = assembly.get("rcsb_id")
            assembly_info = assembly.get("rcsb_assembly_info") or {}
            monomers = assembly_info.get("polymer_monomer_count")
            if assembly_id is not None and monomers is not None:
                assembly_counts[assembly_id] = monomers
            # Summed over every assembly, including those without a monomer count.
            interface_total += assembly_info.get("num_prot_na_interface_entities") or 0

        entities: list[PolymerEntity] = []
        for raw_entity in entry.get("polymer_entities") or []:
            entity_poly = raw_entity.get("entity_poly") or {}
            raw_seq = entity_poly.get("pdbx_seq_one_letter_code_can") or ""
            # strip newlines/whitespace the API may insert
            sequence = "".join(raw_seq.split())
            memberships = raw_entity.get("rcsb_cluster_membership") or []
            clusters: dict[int, int] = {
                int(member["identity"]): int(member["cluster_id"])
                for member in memberships
                if member.get("identity") is not None
                and member.get("cluster_id") is not None
            }
            entities.append(
                PolymerEntity(
                    entity_id=raw_entity["rcsb_id"],  # verbatim
                    polymer_type=entity_poly.get("rcsb_entity_polymer_type") or "Other",
                    seq_len=len(sequence),
                    seq=sequence,
                    cluster_ids=clusters,
                )
            )
        entities = sorted(entities, key=lambda ent: ent.entity_id)

        # First occurrence of each (upper-cased) component id wins.
        comps_by_id: dict[str, NonpolymerComp] = {}
        for raw_comp in entry.get("nonpolymer_entities") or []:
            chem = (raw_comp.get("nonpolymer_comp") or {}).get("chem_comp") or {}
            raw_id = chem.get("id")
            if not raw_id:
                continue
            comp_key = raw_id.upper()
            if comp_key in comps_by_id:
                continue
            comps_by_id[comp_key] = NonpolymerComp(
                comp_id=comp_key,
                name=chem.get("name"),
                formula=chem.get("formula"),
                comp_type=chem.get("type"),
            )
        nonpolymer_list = [comps_by_id[key] for key in sorted(comps_by_id)]

        # Validation-report summaries arrive as 1-element lists (or null).
        def _only(block) -> dict:
            if not block:
                return {}
            return block[0] or {}

        geometry = _only(entry.get("pdbx_vrpt_summary_geometry"))
        diffraction = _only(entry.get("pdbx_vrpt_summary_diffraction"))
        em_fit = _only(entry.get("pdbx_vrpt_summary_em"))
        quality_metrics = QualityMetrics(
            clashscore=geometry.get("clashscore"),
            ramachandran_outlier_pct=geometry.get("percent_ramachandran_outliers"),
            rotamer_outlier_pct=geometry.get("percent_rotamer_outliers"),
            rfree=diffraction.get("DCC_Rfree"),
            rsrz_outlier_pct=diffraction.get("percent_RSRZ_outliers"),
            em_backbone_inclusion=em_fit.get("atom_inclusion_backbone"),
        )

        return cls(
            entry_id=entry_id,
            methods=method_names,
            resolution_A=best_resolution,
            release_date=release_day,
            deposited_residues=deposited_count,
            assemblies=dict(sorted(assembly_counts.items())),
            polymer_entities=entities,
            nonpolymer_comps=nonpolymer_list,
            bound_components=bound_ids,
            affinity_comp_ids=affinity_ids,
            protein_na_interface_count=interface_total,
            quality=quality_metrics,
        )

    def to_canonical_json(self) -> str:
        """Serialize to one line of compact JSON with sorted keys (byte-stable)."""
        payload = self.model_dump(mode="json")
        return json.dumps(payload, sort_keys=True, separators=(",", ":"))
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def canonical_jsonl_bytes(records: Iterable[CandidateRecord]) -> bytes:
    """Encode ``records`` as byte-stable candidates.jsonl content.

    Records are ordered by ``entry_id`` and written one canonical JSON
    document per line, each terminated by a newline, as UTF-8.
    """
    json_lines = [
        rec.to_canonical_json() for rec in sorted(records, key=lambda rec: rec.entry_id)
    ]
    return "".join(f"{line}\n" for line in json_lines).encode("utf-8")
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def sha256_hex(data: bytes) -> str:
    """Return the SHA-256 digest of ``data`` as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def read_candidates_jsonl(path) -> list[CandidateRecord]:
    """Load a candidates.jsonl file into validated records.

    The file is read as UTF-8; lines that are empty after stripping
    whitespace are ignored.
    """
    with open(path, encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [
            CandidateRecord.model_validate_json(text)
            for text in stripped_lines
            if text
        ]
|