if-split 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ifsplit/parse.py ADDED
@@ -0,0 +1,111 @@
1
+ """Stage 3 - Filter candidates on metadata (no coordinates).
2
+
3
+ Operates on the records in ``candidates.jsonl``. Drops entries that:
4
+
5
+ - have no protein polymer entity (``no_protein_entity``),
6
+ - have a protein entity but no usable sequence (``no_protein_sequence``),
7
+ - meet or exceed the residue cap (``too_large``): ``total_residues >= max_total_residues``,
8
+ - violate an (optional) wwPDB validation-report quality cap — clashscore, R-free,
9
+ Ramachandran/rotamer/RSRZ outliers — or lack a report when one is required.
10
+
11
+ When ``use_biological_assembly`` the residue count is taken from assembly 1
12
+ (``<entry>-1``); otherwise the deposited polymer monomer count is used. Quality
13
+ metrics come from the metadata API (no coordinates); a cap fires only when both
14
+ the cap and the metric are present, so a missing metric never drops an entry.
15
+ Every drop is recorded with its reason so the build is auditable.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from .config import Config
21
+ from .schema import CandidateRecord
22
+
23
+ DROP_NO_PROTEIN = "no_protein_entity"
24
+ DROP_NO_SEQUENCE = "no_protein_sequence"
25
+ DROP_TOO_LARGE = "too_large"
26
+ DROP_CLASHSCORE = "clashscore_too_high"
27
+ DROP_RFREE = "rfree_too_high"
28
+ DROP_RAMACHANDRAN = "ramachandran_outliers_too_high"
29
+ DROP_ROTAMER = "rotamer_outliers_too_high"
30
+ DROP_RSRZ = "rsrz_outliers_too_high"
31
+ DROP_NO_VALIDATION = "no_validation_report"
32
+
33
+
34
def assembly1_residue_count(record: CandidateRecord) -> int | None:
    """Return the residue count recorded for biological assembly 1.

    Assembly 1 is recognised by an assembly id ending in ``-1``. When no such
    id is present, the count of the lexicographically first assembly id is
    returned instead. ``None`` is returned when the record has no assemblies.
    """
    counts = record.assemblies
    if not counts:
        return None
    ordered_ids = sorted(counts)
    # Prefer the "-1" assembly; otherwise fall back to the first id in sort order.
    chosen = next(
        (aid for aid in ordered_ids if aid.endswith("-1")),
        ordered_ids[0],
    )
    return counts[chosen]
42
+
43
+
44
def total_residues(record: CandidateRecord, cfg: Config) -> int | None:
    """Pick the residue count that the size filter should compare against.

    With ``cfg.use_biological_assembly`` set, the assembly-1 count is used when
    one is available; in every other case the deposited residue count of the
    record is returned (which may itself be ``None``).
    """
    assembly_count = (
        assembly1_residue_count(record) if cfg.use_biological_assembly else None
    )
    if assembly_count is None:
        # Assembly counting disabled, or no assembly count recorded.
        return record.deposited_residues
    return assembly_count
51
+
52
+
53
def quality_drop(record: CandidateRecord, cfg: Config) -> tuple[str, float] | None:
    """Find the first validation-report cap that ``record`` violates.

    Returns ``(reason, value)`` for the first cap (in the order clashscore,
    R-free, Ramachandran, rotamer, RSRZ) whose metric is strictly greater than
    the configured limit, or ``None`` when nothing is violated. A cap is only
    evaluated when both the limit and the metric are not ``None``. When
    ``cfg.require_validation_report`` is set and the record has no report, the
    result is ``(DROP_NO_VALIDATION, 0.0)``.
    """
    metrics = record.quality
    if cfg.require_validation_report and not metrics.has_report:
        # No offending number exists here; 0.0 is recorded as a placeholder.
        return (DROP_NO_VALIDATION, 0.0)

    # (configured limit, observed metric, drop reason) — checked in this order.
    cap_table = (
        (cfg.max_clashscore, metrics.clashscore, DROP_CLASHSCORE),
        (cfg.max_rfree, metrics.rfree, DROP_RFREE),
        (
            cfg.max_ramachandran_outlier_pct,
            metrics.ramachandran_outlier_pct,
            DROP_RAMACHANDRAN,
        ),
        (cfg.max_rotamer_outlier_pct, metrics.rotamer_outlier_pct, DROP_ROTAMER),
        (cfg.max_rsrz_outlier_pct, metrics.rsrz_outlier_pct, DROP_RSRZ),
    )
    violations = (
        (reason, observed)
        for limit, observed, reason in cap_table
        if limit is not None and observed is not None and observed > limit
    )
    return next(violations, None)
75
+
76
+
77
def filter_candidates(
    records: list[CandidateRecord], cfg: Config
) -> tuple[list[CandidateRecord], list[dict]]:
    """Split ``records`` into kept records and an auditable list of drops.

    Checks run in a fixed order per record — protein entity present, protein
    sequence present, residue cap, validation-report caps — and the first
    failing check decides the drop reason. Each drop is a dict with
    ``entry_id`` and ``reason`` plus ``residues`` (size drops) or ``value``
    (quality drops). Both returned lists are sorted by ``entry_id``.
    """
    survivors: list[CandidateRecord] = []
    rejected: list[dict] = []

    def reject(rec: CandidateRecord, why: str, **extra) -> None:
        # Key order is entry_id, reason, then any extra detail.
        rejected.append({"entry_id": rec.entry_id, "reason": why, **extra})

    for rec in records:
        protein_entities = [ent for ent in rec.polymer_entities if ent.is_protein]
        if not protein_entities:
            reject(rec, DROP_NO_PROTEIN)
            continue
        if not any(ent.seq for ent in protein_entities):
            reject(rec, DROP_NO_SEQUENCE)
            continue

        size = total_residues(rec, cfg)
        if size is not None and size >= cfg.max_total_residues:
            reject(rec, DROP_TOO_LARGE, residues=size)
            continue

        violation = quality_drop(rec, cfg)
        if violation is None:
            survivors.append(rec)
        else:
            reason, value = violation
            reject(rec, reason, value=value)

    survivors = sorted(survivors, key=lambda rec: rec.entry_id)
    rejected = sorted(rejected, key=lambda drop: drop["entry_id"])
    return survivors, rejected
104
+
105
+
106
def drop_summary(drops: list[dict]) -> dict[str, int]:
    """Tally how many drops were recorded for each reason.

    Keys appear in first-seen order; callers that need a deterministic order
    independent of the input are expected to sort the keys themselves.
    """
    tally: dict[str, int] = {}
    for drop in drops:
        reason = drop["reason"]
        if reason in tally:
            tally[reason] += 1
        else:
            tally[reason] = 1
    return tally
ifsplit/rcsb.py ADDED
@@ -0,0 +1,251 @@
1
+ """Thin, polite RCSB API client (Search v2 + Data GraphQL).
2
+
3
+ No coordinates are ever fetched here — only metadata and sequences (see
4
+ PLAN.md §1.5). Endpoints are centralized so field paths live in one place, and
5
+ all requests retry with backoff on transient failures (429 / 5xx / network).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import time
11
+ from collections.abc import Iterator
12
+
13
+ import httpx
14
+
15
+ from .config import Config
16
+
17
+ SEARCH_URL = "https://search.rcsb.org/rcsbsearch/v2/query"
18
+ DATA_GRAPHQL_URL = "https://data.rcsb.org/graphql"
19
+
20
+ # Conservative page/batch sizes: large enough to keep request counts sane,
21
+ # small enough to stay friendly and within response-size limits. The Data API
22
+ # caps batch endpoints at 1000 ids.
23
+ SEARCH_PAGE_ROWS = 5000
24
+ DATA_BATCH_SIZE = 200
25
+
26
+ _RETRY_STATUS = {429, 500, 502, 503, 504}
27
+
28
+ # GraphQL: everything Stages 3-6 need, no coordinates. Validated live against
29
+ # 4HHB / 1A1F / 1IEP. Notable curation signals:
30
+ # - rcsb_cluster_membership: precomputed cluster id per identity level
31
+ # (30/50/70/90/95/100) for protein entities -> no cluster-file download.
32
+ # - rcsb_entry_info.nonpolymer_bound_components: comp ids that actually contact
33
+ # the protein (the cheap buffer-vs-ligand gate; e.g. 4HHB -> ["HEM"], its
34
+ # PO4/Cl buffer is absent).
35
+ # - rcsb_binding_affinity.comp_id: comps with a measured affinity (sparse but a
36
+ # strong positive "this is a real ligand" signal).
37
+ # - pdbx_vrpt_summary_{geometry,diffraction,em}: wwPDB validation-report metrics
38
+ # (clashscore, Ramachandran/rotamer outliers, R-free, RSRZ). Geometry is
39
+ # reported for X-ray AND EM; diffraction is X-ray-only; each comes back as a
40
+ # 1-element list. Metadata, not coordinates — keeps the no-download invariant.
41
+ # - rcsb_assembly_info.num_prot_na_interface_entities: RCSB-computed count of
42
+ # protein<->nucleic-acid interface entities in the assembly; > 0 verifies a real
43
+ # protein/NA contact (the holo gate for the nucleotide class). A single integer
44
+ # in the assembly-info block — far cheaper than listing every interface object
45
+ # (a ribosome has hundreds). Present for X-ray AND EM.
46
+ _ENTRY_QUERY = """
47
+ query($ids: [String!]!) {
48
+ entries(entry_ids: $ids) {
49
+ rcsb_id
50
+ exptl { method }
51
+ rcsb_entry_info {
52
+ resolution_combined
53
+ deposited_polymer_monomer_count
54
+ nonpolymer_bound_components
55
+ }
56
+ rcsb_accession_info { initial_release_date }
57
+ rcsb_binding_affinity { comp_id }
58
+ pdbx_vrpt_summary_geometry {
59
+ clashscore
60
+ percent_ramachandran_outliers
61
+ percent_rotamer_outliers
62
+ }
63
+ pdbx_vrpt_summary_diffraction {
64
+ DCC_Rfree
65
+ percent_RSRZ_outliers
66
+ }
67
+ pdbx_vrpt_summary_em {
68
+ atom_inclusion_backbone
69
+ }
70
+ polymer_entities {
71
+ rcsb_id
72
+ entity_poly {
73
+ rcsb_entity_polymer_type
74
+ pdbx_seq_one_letter_code_can
75
+ }
76
+ rcsb_cluster_membership { cluster_id identity }
77
+ }
78
+ nonpolymer_entities {
79
+ nonpolymer_comp { chem_comp { id name formula type } }
80
+ }
81
+ assemblies {
82
+ rcsb_id
83
+ rcsb_assembly_info {
84
+ polymer_monomer_count
85
+ num_prot_na_interface_entities
86
+ }
87
+ }
88
+ }
89
+ }
90
+ """
91
+
92
+
93
class RcsbError(RuntimeError):
    """Failure talking to an RCSB service (retries exhausted or a GraphQL-level error)."""
95
+
96
+
97
class RcsbClient:
    """Minimal client for the two RCSB services IF-Split needs.

    Talks to the Search v2 endpoint (entry-id enumeration and counts) and the
    Data GraphQL endpoint (batched metadata). All traffic goes through
    :meth:`_post`, which retries transient failures with exponential backoff.
    The client owns an ``httpx.Client`` and can be used as a context manager.
    """

    def __init__(
        self,
        *,
        timeout: float = 60.0,
        max_retries: int = 4,
        backoff_base: float = 1.5,
        sleep=time.sleep,
    ) -> None:
        """Create the underlying HTTP client.

        ``max_retries`` is the number of retries after the first attempt; the
        wait before retry ``k`` (0-based) is ``backoff_base ** k`` seconds.
        ``sleep`` is the callable used to wait (defaults to ``time.sleep``).
        """
        self._client = httpx.Client(
            timeout=timeout,
            headers={"User-Agent": "IF-Split/0.1 (reproducible PDB splitter)"},
        )
        self._max_retries = max_retries
        self._backoff_base = backoff_base
        self._sleep = sleep

    def __enter__(self) -> RcsbClient:
        return self

    def __exit__(self, *exc) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    # --- low-level POST with retry/backoff ---
    def _post(self, url: str, json_body: dict) -> httpx.Response:
        """POST ``json_body`` to ``url``, retrying transient failures.

        Network/timeout errors and the status codes in ``_RETRY_STATUS`` are
        retried up to ``max_retries`` times. Any other response — including
        non-retryable error statuses — is returned to the caller as-is.

        Raises:
            RcsbError: when every attempt failed; the last underlying error is
                attached as ``__cause__``.
        """
        last_exc: Exception | None = None
        for attempt in range(self._max_retries + 1):
            try:
                resp = self._client.post(url, json=json_body)
            except httpx.HTTPError as exc:  # network/timeout
                last_exc = exc
            else:
                if resp.status_code not in _RETRY_STATUS:
                    return resp
                last_exc = RcsbError(f"{url} -> HTTP {resp.status_code}")
            # No sleep after the final attempt: we are about to give up.
            if attempt < self._max_retries:
                self._sleep(self._backoff_base**attempt)
        # Chain the cause so the original traceback is not lost.
        raise RcsbError(
            f"request to {url} failed after retries: {last_exc}"
        ) from last_exc

    # --- Search API: entry IDs matching the snapshot filters ---
    def _search_query_body(self, cfg: Config) -> dict:
        """Build the Search API query for the snapshot filters in ``cfg``.

        Combines three terminal nodes with AND: experimental method,
        maximum resolution, and release date up to the end of the snapshot day.
        """
        cutoff = f"{cfg.snapshot_date.isoformat()}T23:59:59Z"
        return {
            "query": {
                "type": "group",
                "logical_operator": "and",
                "nodes": [
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": "exptl.method",
                            "operator": "in",
                            "value": list(cfg.experimental_methods),
                        },
                    },
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": "rcsb_entry_info.resolution_combined",
                            "operator": "less_or_equal",
                            "value": cfg.resolution_max_A,
                        },
                    },
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": "rcsb_accession_info.initial_release_date",
                            "operator": "less_or_equal",
                            "value": cutoff,
                        },
                    },
                ],
            },
            "return_type": "entry",
        }

    def count_entries(self, cfg: Config) -> int:
        """Total entries matching the snapshot filters (no paging).

        Returns 0 when the service answers HTTP 204 (no hits), mirroring how
        :meth:`search_entry_ids` treats that status.
        """
        body = self._search_query_body(cfg)
        body["request_options"] = {"return_counts": True}
        resp = self._post(SEARCH_URL, body)
        if resp.status_code == 204:  # no results: empty body, nothing to parse
            return 0
        resp.raise_for_status()
        return int(resp.json()["total_count"])

    def search_entry_ids(
        self, cfg: Config, limit: int | None = None, *, progress=None
    ) -> list[str]:
        """All matching entry IDs, sorted ascending for determinism.

        ``limit`` (dev convenience) takes the first N in sorted order, so a
        limited run is itself reproducible. ``progress`` (optional) is called with
        a status string after each page, so a full-PDB enumeration isn't silent.
        """
        ids: list[str] = []
        start = 0
        while True:
            rows = SEARCH_PAGE_ROWS
            if limit is not None:
                remaining = limit - len(ids)
                if remaining <= 0:
                    break
                rows = min(rows, remaining)
            body = self._search_query_body(cfg)
            body["request_options"] = {
                "paginate": {"start": start, "rows": rows},
                "sort": [
                    {
                        "sort_by": "rcsb_entry_container_identifiers.entry_id",
                        "direction": "asc",
                    }
                ],
            }
            resp = self._post(SEARCH_URL, body)
            if resp.status_code == 204:  # no (more) results
                break
            resp.raise_for_status()
            page = resp.json().get("result_set", [])
            if not page:
                break
            ids.extend(hit["identifier"] for hit in page)
            start += len(page)
            if progress:
                progress(f"search: {len(ids)} entry ids found...")
            # A short page means the result set is exhausted.
            if len(page) < rows:
                break
        return ids

    # --- Data API: batched metadata enrichment ---
    def fetch_entries(self, ids: list[str]) -> Iterator[dict]:
        """Yield raw Data-API entry objects for ``ids``, batched.

        ``None`` placeholders in the returned entry list are skipped.

        Raises:
            RcsbError: when the GraphQL payload reports errors or carries no
                ``data`` object.
        """
        for batch in _chunks(ids, DATA_BATCH_SIZE):
            resp = self._post(
                DATA_GRAPHQL_URL,
                {"query": _ENTRY_QUERY, "variables": {"ids": batch}},
            )
            resp.raise_for_status()
            payload = resp.json()
            if payload.get("errors"):
                raise RcsbError(f"GraphQL errors: {payload['errors'][:1]}")
            data = payload.get("data")
            if data is None:
                # Surface a malformed payload as a client error, not a TypeError.
                raise RcsbError("GraphQL response contained no data")
            for entry in data.get("entries") or []:
                if entry is not None:
                    yield entry
247
+
248
+
249
+ def _chunks(seq: list[str], size: int) -> Iterator[list[str]]:
250
+ for i in range(0, len(seq), size):
251
+ yield seq[i : i + size]
ifsplit/schema.py ADDED
@@ -0,0 +1,241 @@
1
+ """Candidate-record schema + canonical serialization.
2
+
3
+ A ``CandidateRecord`` is one entry's snapshot metadata — everything Stages 3-6
4
+ need, with no coordinates. ``candidates.jsonl`` is the canonical, byte-stable
5
+ serialization of these records (sorted entries, sorted keys), which is what the
6
+ snapshot lock hashes.
7
+
8
+ PDB-ID compatibility: identifiers (entry_id, entity_id) are stored *verbatim* as
9
+ returned by the RCSB Data API in ``rcsb_id`` — never sliced, length-validated, or
10
+ case-folded. This makes the schema agnostic to legacy 4-character IDs (``4HHB``,
11
+ entity ``4HHB_1``) and the extended ``pdb_xxxxxxxx`` form alike.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import hashlib
17
+ import json
18
+ from collections.abc import Iterable
19
+
20
+ from pydantic import BaseModel, ConfigDict
21
+
22
+
23
class PolymerEntity(BaseModel):
    """One polymer entity of an entry: id, polymer type, sequence, cluster ids."""

    model_config = ConfigDict(extra="forbid")

    entity_id: str  # RCSB rcsb_id, verbatim (e.g. "4HHB_1")
    polymer_type: str  # rcsb_entity_polymer_type: Protein / DNA / RNA / NA-hybrid / Other
    seq_len: int
    seq: str
    # RCSB precomputed cluster ids by identity level, e.g. {30: 48, 95: 1239}.
    # Empty for non-protein entities (RCSB clusters proteins only).
    cluster_ids: dict[int, int] = {}

    @property
    def is_protein(self) -> bool:
        """True when the polymer type names a protein (case-insensitive)."""
        return "PROTEIN" in self.polymer_type.upper()

    @property
    def is_nucleic(self) -> bool:
        """True for DNA, RNA, and DNA/RNA hybrid polymer types (case-insensitive)."""
        t = self.polymer_type.upper()
        # "NA-hybrid" contains neither "DNA" nor "RNA", so it needs its own test.
        return "DNA" in t or "RNA" in t or "NA-HYBRID" in t
42
+
43
+
44
class NonpolymerComp(BaseModel):
    """A non-polymer chemical component of an entry; only ``comp_id`` is required."""

    model_config = ConfigDict(extra="forbid")

    # chem_comp id, e.g. "HEM", "ZN" (CCD codes are uppercase)
    comp_id: str
    # Descriptive fields; each stays None when not supplied.
    name: str | None = None
    formula: str | None = None
    comp_type: str | None = None
51
+
52
+
53
class QualityMetrics(BaseModel):
    """Summary metrics taken from the wwPDB validation report (no coordinates).

    Geometry metrics (clashscore, Ramachandran, rotamer) are reported for both
    X-ray and cryo-EM entries; diffraction metrics (R-free, RSRZ) are X-ray
    only; the EM map-fit metric (backbone atom inclusion) is cryo-EM only.
    Every field may be ``None`` when the report does not provide it — a missing
    metric never penalizes an entry (see :func:`ifsplit.parse.quality_drop`).
    """

    model_config = ConfigDict(extra="forbid")

    # all-atom clashes / 1000 atoms (lower better)
    clashscore: float | None = None
    # % backbone Ramachandran outliers
    ramachandran_outlier_pct: float | None = None
    # % sidechain rotamer outliers
    rotamer_outlier_pct: float | None = None
    # diffraction DCC_Rfree (X-ray only)
    rfree: float | None = None
    # diffraction % RSRZ outliers (X-ray only)
    rsrz_outlier_pct: float | None = None
    # EM backbone atom inclusion (higher better)
    em_backbone_inclusion: float | None = None

    @property
    def has_report(self) -> bool:
        """Whether at least one metric is populated."""
        metric_values = (
            self.clashscore,
            self.ramachandran_outlier_pct,
            self.rotamer_outlier_pct,
            self.rfree,
            self.rsrz_outlier_pct,
            self.em_backbone_inclusion,
        )
        return not all(value is None for value in metric_values)
86
+
87
+
88
class CandidateRecord(BaseModel):
    """Snapshot metadata for a single PDB entry (no coordinates)."""

    model_config = ConfigDict(extra="forbid")

    entry_id: str
    methods: list[str]
    resolution_A: float | None
    release_date: str  # YYYY-MM-DD
    deposited_residues: int | None
    assemblies: dict[str, int]  # assembly_id -> polymer_monomer_count
    polymer_entities: list[PolymerEntity]
    nonpolymer_comps: list[NonpolymerComp]
    # Curation signals (Stage 4). comp ids that actually contact the protein
    # (rcsb_entry_info.nonpolymer_bound_components) and comp ids with a measured
    # binding affinity (rcsb_binding_affinity). Both are the buffer-vs-ligand gate.
    bound_components: list[str] = []
    affinity_comp_ids: list[str] = []
    # RCSB-computed count of protein<->nucleic-acid interface entities across the
    # entry's assemblies (rcsb_assembly_info.num_prot_na_interface_entities). > 0
    # verifies a real protein/NA contact (Stage 4 holo gate for the nucleotide
    # class) — distinguishing a true complex from a co-deposited oligo.
    protein_na_interface_count: int = 0
    # wwPDB validation-report summary (Stage 3 quality filters). All fields
    # optional; defaults to an empty report when RCSB has none.
    quality: QualityMetrics = QualityMetrics()

    @classmethod
    def from_data_api(cls, entry: dict) -> CandidateRecord:
        """Construct a record from one raw Data-API entry object.

        Missing or null sections are treated as empty. Lists are sorted and
        component ids upper-cased so equal inputs give equal records; entry and
        entity ids are copied verbatim from ``rcsb_id``.
        """
        # Verbatim canonical id from RCSB (legacy or extended) — never reformat.
        entry_id = entry["rcsb_id"]

        method_names = [
            m["method"] for m in (entry.get("exptl") or []) if m.get("method")
        ]
        method_names.sort()

        entry_info = entry.get("rcsb_entry_info") or {}
        resolutions = entry_info.get("resolution_combined") or []
        # Several resolution values may be listed; keep the smallest.
        best_resolution = min(resolutions) if resolutions else None
        deposited_count = entry_info.get("deposited_polymer_monomer_count")
        bound_ids = {
            comp.upper()
            for comp in (entry_info.get("nonpolymer_bound_components") or [])
        }
        bound = sorted(bound_ids)

        accession = entry.get("rcsb_accession_info") or {}
        released = accession.get("initial_release_date")
        release_date = ""
        if released:
            # Keep only the leading YYYY-MM-DD part of the timestamp.
            release_date = released[:10]

        affinity_ids = {
            hit["comp_id"].upper()
            for hit in (entry.get("rcsb_binding_affinity") or [])
            if hit.get("comp_id")
        }
        affinity = sorted(affinity_ids)

        assembly_counts: dict[str, int] = {}
        interface_total = 0
        for assembly in entry.get("assemblies") or []:
            assembly_id = assembly.get("rcsb_id")
            assembly_info = assembly.get("rcsb_assembly_info") or {}
            monomers = assembly_info.get("polymer_monomer_count")
            if assembly_id is not None and monomers is not None:
                assembly_counts[assembly_id] = monomers
            # NOTE(review): reconstructed at loop level (summed for every
            # assembly, not only those with a monomer count) — confirm against
            # the packaged source, whose indentation is not visible here.
            interface_total += assembly_info.get("num_prot_na_interface_entities") or 0

        polymers: list[PolymerEntity] = []
        for raw_entity in entry.get("polymer_entities") or []:
            poly = raw_entity.get("entity_poly") or {}
            raw_seq = poly.get("pdbx_seq_one_letter_code_can") or ""
            # strip newlines/whitespace the API may insert
            sequence = "".join(raw_seq.split())
            memberships = raw_entity.get("rcsb_cluster_membership") or []
            clusters = {
                int(level): int(cluster)
                for level, cluster in (
                    (cm.get("identity"), cm.get("cluster_id")) for cm in memberships
                )
                if level is not None and cluster is not None
            }
            polymers.append(
                PolymerEntity(
                    entity_id=raw_entity["rcsb_id"],  # verbatim
                    polymer_type=poly.get("rcsb_entity_polymer_type") or "Other",
                    seq_len=len(sequence),
                    seq=sequence,
                    cluster_ids=clusters,
                )
            )
        polymers.sort(key=lambda ent: ent.entity_id)

        # De-duplicate components by upper-cased id; the first occurrence wins.
        comps_by_id: dict[str, NonpolymerComp] = {}
        for raw_comp in entry.get("nonpolymer_entities") or []:
            chem = (raw_comp.get("nonpolymer_comp") or {}).get("chem_comp") or {}
            raw_id = chem.get("id")
            if not raw_id:
                continue
            key = raw_id.upper()
            if key in comps_by_id:
                continue
            comps_by_id[key] = NonpolymerComp(
                comp_id=key,
                name=chem.get("name"),
                formula=chem.get("formula"),
                comp_type=chem.get("type"),
            )
        nonpolymers = [comps_by_id[key] for key in sorted(comps_by_id)]

        # Validation-report summaries arrive as 1-element lists (or null).
        def _only(block) -> dict:
            if not block:
                return {}
            return block[0] or {}

        geometry = _only(entry.get("pdbx_vrpt_summary_geometry"))
        diffraction = _only(entry.get("pdbx_vrpt_summary_diffraction"))
        em_fit = _only(entry.get("pdbx_vrpt_summary_em"))
        quality = QualityMetrics(
            clashscore=geometry.get("clashscore"),
            ramachandran_outlier_pct=geometry.get("percent_ramachandran_outliers"),
            rotamer_outlier_pct=geometry.get("percent_rotamer_outliers"),
            rfree=diffraction.get("DCC_Rfree"),
            rsrz_outlier_pct=diffraction.get("percent_RSRZ_outliers"),
            em_backbone_inclusion=em_fit.get("atom_inclusion_backbone"),
        )

        return cls(
            entry_id=entry_id,
            methods=method_names,
            resolution_A=best_resolution,
            release_date=release_date,
            deposited_residues=deposited_count,
            assemblies=dict(sorted(assembly_counts.items())),
            polymer_entities=polymers,
            nonpolymer_comps=nonpolymers,
            bound_components=bound,
            affinity_comp_ids=affinity,
            protein_na_interface_count=interface_total,
            quality=quality,
        )

    def to_canonical_json(self) -> str:
        """Serialize to one line of compact JSON with sorted keys."""
        payload = self.model_dump(mode="json")
        return json.dumps(payload, sort_keys=True, separators=(",", ":"))
220
+
221
+
222
def canonical_jsonl_bytes(records: Iterable[CandidateRecord]) -> bytes:
    """Render ``records`` as UTF-8 JSON Lines, ordered by ``entry_id``.

    Each record contributes its canonical JSON followed by a newline.
    """
    json_lines = [
        rec.to_canonical_json()
        for rec in sorted(records, key=lambda rec: rec.entry_id)
    ]
    return "".join(f"{line}\n" for line in json_lines).encode("utf-8")
227
+
228
+
229
def sha256_hex(data: bytes) -> str:
    """Return the SHA-256 digest of ``data`` as a lowercase hex string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
231
+
232
+
233
def read_candidates_jsonl(path) -> list[CandidateRecord]:
    """Load a candidates.jsonl file into validated records.

    Lines are stripped of surrounding whitespace and blank lines are skipped;
    each remaining line is validated as one ``CandidateRecord``.
    """
    with open(path, encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [
            CandidateRecord.model_validate_json(text)
            for text in stripped_lines
            if text
        ]