RNApolis 0.7.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnapolis/parser_v2.py +807 -186
- rnapolis/splitter.py +128 -0
- rnapolis/unifier.py +20 -5
- {rnapolis-0.7.0.dist-info → rnapolis-0.8.1.dist-info}/METADATA +1 -1
- {rnapolis-0.7.0.dist-info → rnapolis-0.8.1.dist-info}/RECORD +9 -8
- {rnapolis-0.7.0.dist-info → rnapolis-0.8.1.dist-info}/entry_points.txt +1 -0
- {rnapolis-0.7.0.dist-info → rnapolis-0.8.1.dist-info}/WHEEL +0 -0
- {rnapolis-0.7.0.dist-info → rnapolis-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {rnapolis-0.7.0.dist-info → rnapolis-0.8.1.dist-info}/top_level.txt +0 -0
rnapolis/parser_v2.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
import io
|
2
|
+
import os
|
3
|
+
import string
|
2
4
|
import tempfile
|
3
5
|
from typing import IO, TextIO, Union
|
4
6
|
|
@@ -34,31 +36,46 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
|
|
34
36
|
if isinstance(lines[0], bytes):
|
35
37
|
lines = [line.decode("utf-8") for line in lines]
|
36
38
|
|
39
|
+
current_model = 1
|
37
40
|
for line in lines:
|
38
41
|
record_type = line[:6].strip()
|
39
42
|
|
43
|
+
# Check for MODEL record
|
44
|
+
if record_type == "MODEL":
|
45
|
+
try:
|
46
|
+
current_model = int(line[10:14].strip())
|
47
|
+
except ValueError:
|
48
|
+
# Handle cases where MODEL record might be malformed
|
49
|
+
pass # Keep the previous model number
|
50
|
+
continue
|
51
|
+
|
40
52
|
# Only process ATOM and HETATM records
|
41
53
|
if record_type not in ["ATOM", "HETATM"]:
|
42
54
|
continue
|
43
55
|
|
44
56
|
# Parse fields according to PDB format specification
|
57
|
+
alt_loc = line[16:17].strip()
|
45
58
|
icode = line[26:27].strip()
|
59
|
+
element = line[76:78].strip()
|
60
|
+
charge = line[78:80].strip()
|
61
|
+
|
46
62
|
record = {
|
47
63
|
"record_type": record_type,
|
48
64
|
"serial": line[6:11].strip(),
|
49
65
|
"name": line[12:16].strip(),
|
50
|
-
"altLoc":
|
66
|
+
"altLoc": None if not alt_loc else alt_loc, # Store None if empty
|
51
67
|
"resName": line[17:20].strip(),
|
52
68
|
"chainID": line[21:22].strip(),
|
53
69
|
"resSeq": line[22:26].strip(),
|
54
|
-
"iCode": None if not icode else icode, #
|
70
|
+
"iCode": None if not icode else icode, # Store None if empty
|
55
71
|
"x": line[30:38].strip(),
|
56
72
|
"y": line[38:46].strip(),
|
57
73
|
"z": line[46:54].strip(),
|
58
74
|
"occupancy": line[54:60].strip(),
|
59
75
|
"tempFactor": line[60:66].strip(),
|
60
|
-
"element":
|
61
|
-
"charge":
|
76
|
+
"element": None if not element else element, # Store None if empty
|
77
|
+
"charge": None if not charge else charge, # Store None if empty
|
78
|
+
"model": current_model, # Add the current model number
|
62
79
|
}
|
63
80
|
|
64
81
|
records.append(record)
|
@@ -83,13 +100,23 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
|
|
83
100
|
"tempFactor",
|
84
101
|
"element",
|
85
102
|
"charge",
|
103
|
+
"model",
|
86
104
|
]
|
87
105
|
)
|
88
106
|
|
89
107
|
df = pd.DataFrame(records)
|
90
108
|
|
91
109
|
# Convert numeric columns to appropriate types
|
92
|
-
numeric_columns = [
|
110
|
+
numeric_columns = [
|
111
|
+
"serial",
|
112
|
+
"resSeq",
|
113
|
+
"x",
|
114
|
+
"y",
|
115
|
+
"z",
|
116
|
+
"occupancy",
|
117
|
+
"tempFactor",
|
118
|
+
"model",
|
119
|
+
]
|
93
120
|
for col in numeric_columns:
|
94
121
|
df[col] = pd.to_numeric(df[col], errors="coerce")
|
95
122
|
|
@@ -128,18 +155,37 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
|
|
128
155
|
"""
|
129
156
|
adapter = IoAdapterPy()
|
130
157
|
|
131
|
-
# Handle
|
158
|
+
# Handle string, StringIO, and file-like objects
|
132
159
|
if isinstance(content, str):
|
133
|
-
# Create a temporary file
|
134
|
-
|
135
|
-
|
136
|
-
|
160
|
+
# Create a temporary file for string input
|
161
|
+
with tempfile.NamedTemporaryFile(
|
162
|
+
mode="w+", suffix=".cif", delete=False
|
163
|
+
) as temp_file:
|
137
164
|
temp_file.write(content)
|
138
|
-
temp_file.
|
139
|
-
|
140
|
-
|
141
|
-
|
165
|
+
temp_file_path = temp_file.name
|
166
|
+
try:
|
167
|
+
data = adapter.readFile(temp_file_path)
|
168
|
+
finally:
|
169
|
+
os.remove(temp_file_path) # Clean up the temporary file
|
170
|
+
elif isinstance(content, io.StringIO):
|
171
|
+
# Create a temporary file for StringIO input
|
172
|
+
with tempfile.NamedTemporaryFile(
|
173
|
+
mode="w+", suffix=".cif", delete=False
|
174
|
+
) as temp_file:
|
175
|
+
content.seek(0) # Ensure reading from the start
|
176
|
+
temp_file.write(content.read())
|
177
|
+
temp_file_path = temp_file.name
|
178
|
+
try:
|
179
|
+
data = adapter.readFile(temp_file_path)
|
180
|
+
finally:
|
181
|
+
os.remove(temp_file_path) # Clean up the temporary file
|
182
|
+
elif hasattr(content, "name"):
|
183
|
+
# Assume it's a file-like object with a name attribute (like an open file)
|
142
184
|
data = adapter.readFile(content.name)
|
185
|
+
else:
|
186
|
+
raise TypeError(
|
187
|
+
"Unsupported input type for parse_cif_atoms. Expected str, file-like object with name, or StringIO."
|
188
|
+
)
|
143
189
|
|
144
190
|
# Get the atom_site category
|
145
191
|
category = data[0].getObj("atom_site")
|
@@ -155,47 +201,133 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
|
|
155
201
|
# Create a list of dictionaries for each atom
|
156
202
|
records = []
|
157
203
|
for row in rows:
|
158
|
-
record =
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
204
|
+
record = {}
|
205
|
+
for attr, value in zip(attributes, row):
|
206
|
+
# Store None if value indicates missing data ('?' or '.')
|
207
|
+
if value in ["?", "."]:
|
208
|
+
record[attr] = None
|
209
|
+
else:
|
210
|
+
record[attr] = value
|
165
211
|
records.append(record)
|
166
212
|
|
167
213
|
# Create DataFrame from records
|
168
214
|
df = pd.DataFrame(records)
|
169
215
|
|
170
|
-
#
|
171
|
-
|
172
|
-
"
|
173
|
-
"
|
216
|
+
# Define columns based on mmCIF specification for atom_site
|
217
|
+
float_cols = [
|
218
|
+
"aniso_B[1][1]",
|
219
|
+
"aniso_B[1][1]_esd",
|
220
|
+
"aniso_B[1][2]",
|
221
|
+
"aniso_B[1][2]_esd",
|
222
|
+
"aniso_B[1][3]",
|
223
|
+
"aniso_B[1][3]_esd",
|
224
|
+
"aniso_B[2][2]",
|
225
|
+
"aniso_B[2][2]_esd",
|
226
|
+
"aniso_B[2][3]",
|
227
|
+
"aniso_B[2][3]_esd",
|
228
|
+
"aniso_B[3][3]",
|
229
|
+
"aniso_B[3][3]_esd",
|
230
|
+
"aniso_ratio",
|
231
|
+
"aniso_U[1][1]",
|
232
|
+
"aniso_U[1][1]_esd",
|
233
|
+
"aniso_U[1][2]",
|
234
|
+
"aniso_U[1][2]_esd",
|
235
|
+
"aniso_U[1][3]",
|
236
|
+
"aniso_U[1][3]_esd",
|
237
|
+
"aniso_U[2][2]",
|
238
|
+
"aniso_U[2][2]_esd",
|
239
|
+
"aniso_U[2][3]",
|
240
|
+
"aniso_U[2][3]_esd",
|
241
|
+
"aniso_U[3][3]",
|
242
|
+
"aniso_U[3][3]_esd",
|
243
|
+
"B_equiv_geom_mean",
|
244
|
+
"B_equiv_geom_mean_esd",
|
245
|
+
"B_iso_or_equiv",
|
246
|
+
"B_iso_or_equiv_esd",
|
174
247
|
"Cartn_x",
|
248
|
+
"Cartn_x_esd",
|
175
249
|
"Cartn_y",
|
250
|
+
"Cartn_y_esd",
|
176
251
|
"Cartn_z",
|
252
|
+
"Cartn_z_esd",
|
253
|
+
"fract_x",
|
254
|
+
"fract_x_esd",
|
255
|
+
"fract_y",
|
256
|
+
"fract_y_esd",
|
257
|
+
"fract_z",
|
258
|
+
"fract_z_esd",
|
177
259
|
"occupancy",
|
178
|
-
"
|
260
|
+
"occupancy_esd",
|
261
|
+
"U_equiv_geom_mean",
|
262
|
+
"U_equiv_geom_mean_esd",
|
263
|
+
"U_iso_or_equiv",
|
264
|
+
"U_iso_or_equiv_esd",
|
265
|
+
]
|
266
|
+
int_cols = [
|
267
|
+
"attached_hydrogens",
|
268
|
+
"label_seq_id",
|
269
|
+
"symmetry_multiplicity",
|
270
|
+
"pdbx_PDB_model_num",
|
179
271
|
"pdbx_formal_charge",
|
272
|
+
"pdbx_label_index",
|
180
273
|
]
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
274
|
+
category_cols = [
|
275
|
+
"auth_asym_id",
|
276
|
+
"auth_atom_id",
|
277
|
+
"auth_comp_id",
|
278
|
+
"auth_seq_id",
|
279
|
+
"calc_attached_atom",
|
280
|
+
"calc_flag",
|
281
|
+
"disorder_assembly",
|
282
|
+
"disorder_group",
|
188
283
|
"group_PDB",
|
189
|
-
"
|
284
|
+
"id",
|
285
|
+
"label_alt_id",
|
286
|
+
"label_asym_id",
|
190
287
|
"label_atom_id",
|
191
288
|
"label_comp_id",
|
192
|
-
"
|
193
|
-
"
|
194
|
-
"
|
195
|
-
"
|
289
|
+
"label_entity_id",
|
290
|
+
"thermal_displace_type",
|
291
|
+
"type_symbol",
|
292
|
+
"pdbx_atom_ambiguity",
|
293
|
+
"adp_type",
|
294
|
+
"refinement_flags",
|
295
|
+
"refinement_flags_adp",
|
296
|
+
"refinement_flags_occupancy",
|
297
|
+
"refinement_flags_posn",
|
298
|
+
"pdbx_auth_alt_id",
|
299
|
+
"pdbx_PDB_ins_code",
|
300
|
+
"pdbx_PDB_residue_no",
|
301
|
+
"pdbx_PDB_residue_name",
|
302
|
+
"pdbx_PDB_strand_id",
|
303
|
+
"pdbx_PDB_atom_name",
|
304
|
+
"pdbx_auth_atom_name",
|
305
|
+
"pdbx_auth_comp_id",
|
306
|
+
"pdbx_auth_asym_id",
|
307
|
+
"pdbx_auth_seq_id",
|
308
|
+
"pdbx_tls_group_id",
|
309
|
+
"pdbx_ncs_dom_id",
|
310
|
+
"pdbx_group_NDB",
|
311
|
+
"pdbx_atom_group",
|
312
|
+
"pdbx_label_seq_num",
|
313
|
+
"pdbx_not_in_asym",
|
314
|
+
"pdbx_sifts_xref_db_name",
|
315
|
+
"pdbx_sifts_xref_db_acc",
|
316
|
+
"pdbx_sifts_xref_db_num",
|
317
|
+
"pdbx_sifts_xref_db_res",
|
196
318
|
]
|
197
319
|
|
198
|
-
|
320
|
+
# Convert columns to appropriate types
|
321
|
+
for col in float_cols:
|
322
|
+
if col in df.columns:
|
323
|
+
df[col] = pd.to_numeric(df[col], errors="coerce")
|
324
|
+
|
325
|
+
for col in int_cols:
|
326
|
+
if col in df.columns:
|
327
|
+
# Use Int64 (nullable integer) to handle potential NaNs from coercion
|
328
|
+
df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
|
329
|
+
|
330
|
+
for col in category_cols:
|
199
331
|
if col in df.columns:
|
200
332
|
df[col] = df[col].astype("category")
|
201
333
|
|
@@ -205,170 +337,648 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
|
|
205
337
|
return df
|
206
338
|
|
207
339
|
|
208
|
-
def
|
209
|
-
df: pd.DataFrame, output: Union[str, TextIO, None] = None
|
210
|
-
) -> Union[str, None]:
|
340
|
+
def can_write_pdb(df: pd.DataFrame) -> bool:
|
211
341
|
"""
|
212
|
-
|
342
|
+
Check if the DataFrame can be losslessly represented in PDB format.
|
343
|
+
|
344
|
+
PDB format has limitations on field widths:
|
345
|
+
- Atom serial number (id): max 99999
|
346
|
+
- Chain identifier (auth_asym_id): max 1 character
|
347
|
+
- Residue sequence number (auth_seq_id): max 9999
|
213
348
|
|
214
349
|
Parameters:
|
215
350
|
-----------
|
216
351
|
df : pd.DataFrame
|
217
|
-
DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms
|
218
|
-
output : Union[str, TextIO, None], optional
|
219
|
-
Output file path or file-like object. If None, returns the PDB content as a string.
|
352
|
+
DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms.
|
220
353
|
|
221
354
|
Returns:
|
222
355
|
--------
|
223
|
-
|
224
|
-
|
356
|
+
bool
|
357
|
+
True if the DataFrame can be written to PDB format without data loss/truncation, False otherwise.
|
225
358
|
"""
|
226
|
-
|
227
|
-
buffer = io.StringIO()
|
359
|
+
format_type = df.attrs.get("format")
|
228
360
|
|
229
|
-
|
230
|
-
|
361
|
+
if format_type == "PDB":
|
362
|
+
# Assume data originally from PDB already fits PDB constraints
|
363
|
+
return True
|
231
364
|
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
line = " " * 80
|
365
|
+
if df.empty:
|
366
|
+
# An empty DataFrame can be represented as an empty PDB file
|
367
|
+
return True
|
236
368
|
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
369
|
+
if format_type == "mmCIF":
|
370
|
+
# Check serial number (id)
|
371
|
+
# Convert to numeric first to handle potential categorical type and NaNs
|
372
|
+
if "id" not in df.columns or (
|
373
|
+
pd.to_numeric(df["id"], errors="coerce").max() > 99999
|
374
|
+
):
|
375
|
+
return False
|
243
376
|
|
244
|
-
#
|
245
|
-
if
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
line = line[:6] + serial.rjust(5) + line[11:]
|
377
|
+
# Check chain ID (auth_asym_id) length
|
378
|
+
if "auth_asym_id" not in df.columns or (
|
379
|
+
df["auth_asym_id"].dropna().astype(str).str.len().max() > 1
|
380
|
+
):
|
381
|
+
return False
|
250
382
|
|
251
|
-
#
|
252
|
-
if
|
253
|
-
|
254
|
-
|
255
|
-
|
383
|
+
# Check residue sequence number (auth_seq_id)
|
384
|
+
if "auth_seq_id" not in df.columns or (
|
385
|
+
pd.to_numeric(df["auth_seq_id"], errors="coerce").max() > 9999
|
386
|
+
):
|
387
|
+
return False
|
256
388
|
|
257
|
-
#
|
258
|
-
|
259
|
-
line = line[:12] + atom_name.ljust(4) + line[16:]
|
260
|
-
else:
|
261
|
-
line = line[:12] + " " + atom_name.ljust(3) + line[16:]
|
389
|
+
# All checks passed for mmCIF
|
390
|
+
return True
|
262
391
|
|
263
|
-
|
264
|
-
|
265
|
-
alt_loc = row.get("altLoc", "")
|
266
|
-
else: # mmCIF
|
267
|
-
alt_loc = row.get("label_alt_id", "")
|
268
|
-
line = line[:16] + alt_loc + line[17:]
|
392
|
+
# If format is unknown or not PDB/mmCIF, assume it cannot be safely written
|
393
|
+
return False
|
269
394
|
|
270
|
-
# Set residue name
|
271
|
-
if format_type == "PDB":
|
272
|
-
res_name = row["resName"]
|
273
|
-
else: # mmCIF
|
274
|
-
res_name = row.get("auth_comp_id", row.get("label_comp_id", ""))
|
275
|
-
line = line[:17] + res_name.ljust(3) + line[20:]
|
276
395
|
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
else: # mmCIF
|
281
|
-
chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
|
282
|
-
line = line[:21] + chain_id + line[22:]
|
396
|
+
def fit_to_pdb(df: pd.DataFrame) -> pd.DataFrame:
|
397
|
+
"""
|
398
|
+
Attempts to fit the atom data in a DataFrame to comply with PDB format limitations.
|
283
399
|
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
res_seq = str(int(row.get("auth_seq_id", row.get("label_seq_id", 0))))
|
289
|
-
line = line[:22] + res_seq.rjust(4) + line[26:]
|
400
|
+
If the data already fits (checked by can_write_pdb), returns the original DataFrame.
|
401
|
+
Otherwise, checks if fitting is possible based on total atoms, unique chains,
|
402
|
+
and residues per chain. If fitting is possible, it renumbers atoms, renames chains,
|
403
|
+
and renumbers residues within each chain sequentially starting from 1.
|
290
404
|
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
icode = (
|
296
|
-
row.get("pdbx_PDB_ins_code", "")
|
297
|
-
if pd.notna(row.get("pdbx_PDB_ins_code", ""))
|
298
|
-
else ""
|
299
|
-
)
|
300
|
-
line = line[:26] + icode + line[27:]
|
301
|
-
|
302
|
-
# Set X coordinate
|
303
|
-
if format_type == "PDB":
|
304
|
-
x = float(row["x"])
|
305
|
-
else: # mmCIF
|
306
|
-
x = float(row["Cartn_x"])
|
307
|
-
line = line[:30] + f"{x:8.3f}" + line[38:]
|
405
|
+
Parameters:
|
406
|
+
-----------
|
407
|
+
df : pd.DataFrame
|
408
|
+
DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms.
|
308
409
|
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
410
|
+
Returns:
|
411
|
+
--------
|
412
|
+
pd.DataFrame
|
413
|
+
A new DataFrame with data potentially modified to fit PDB constraints.
|
414
|
+
The 'format' attribute of the returned DataFrame will be set to 'PDB'.
|
415
|
+
|
416
|
+
Raises:
|
417
|
+
-------
|
418
|
+
ValueError
|
419
|
+
If the data cannot be fitted into PDB format constraints (too many atoms,
|
420
|
+
chains, or residues per chain).
|
421
|
+
"""
|
422
|
+
format_type = df.attrs.get("format")
|
423
|
+
|
424
|
+
if not format_type:
|
425
|
+
raise ValueError("DataFrame format attribute is not set.")
|
426
|
+
|
427
|
+
if can_write_pdb(df):
|
428
|
+
return df
|
429
|
+
|
430
|
+
# Determine column names based on format
|
431
|
+
if format_type == "PDB":
|
432
|
+
serial_col = "serial"
|
433
|
+
chain_col = "chainID"
|
434
|
+
resseq_col = "resSeq"
|
435
|
+
icode_col = "iCode"
|
436
|
+
elif format_type == "mmCIF":
|
437
|
+
serial_col = "id"
|
438
|
+
chain_col = "auth_asym_id"
|
439
|
+
resseq_col = "auth_seq_id"
|
440
|
+
icode_col = "pdbx_PDB_ins_code"
|
441
|
+
else:
|
442
|
+
raise ValueError(f"Unsupported DataFrame format: {format_type}")
|
443
|
+
|
444
|
+
# --- Feasibility Checks ---
|
445
|
+
if chain_col not in df.columns:
|
446
|
+
raise ValueError(f"Missing required chain column: {chain_col}")
|
447
|
+
if resseq_col not in df.columns:
|
448
|
+
raise ValueError(f"Missing required residue sequence column: {resseq_col}")
|
449
|
+
|
450
|
+
unique_chains = df[chain_col].unique()
|
451
|
+
num_chains = len(unique_chains)
|
452
|
+
total_atoms = len(df)
|
453
|
+
max_pdb_serial = 99999
|
454
|
+
max_pdb_residue = 9999
|
455
|
+
available_chain_ids = list(
|
456
|
+
string.ascii_uppercase + string.ascii_lowercase + string.digits
|
457
|
+
)
|
458
|
+
max_pdb_chains = len(available_chain_ids)
|
459
|
+
|
460
|
+
# Check 1: Total atoms + TER lines <= 99999
|
461
|
+
if total_atoms + num_chains > max_pdb_serial:
|
462
|
+
raise ValueError(
|
463
|
+
f"Cannot fit to PDB: Total atoms ({total_atoms}) + TER lines ({num_chains}) exceeds PDB limit ({max_pdb_serial})."
|
464
|
+
)
|
315
465
|
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
line = line[:46] + f"{z:8.3f}" + line[54:]
|
466
|
+
# Check 2: Number of chains <= 62
|
467
|
+
if num_chains > max_pdb_chains:
|
468
|
+
raise ValueError(
|
469
|
+
f"Cannot fit to PDB: Number of unique chains ({num_chains}) exceeds PDB limit ({max_pdb_chains})."
|
470
|
+
)
|
322
471
|
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
472
|
+
# Check 3: Max residues per chain <= 9999
|
473
|
+
# More accurate check: group by chain, then count unique (resSeq, iCode) tuples
|
474
|
+
# Use a temporary structure to avoid modifying the original df
|
475
|
+
check_df = pd.DataFrame(
|
476
|
+
{
|
477
|
+
"chain": df[chain_col],
|
478
|
+
"resSeq": df[resseq_col],
|
479
|
+
"iCode": df[icode_col].fillna("") if icode_col in df.columns else "",
|
480
|
+
}
|
481
|
+
)
|
482
|
+
residue_counts = check_df.groupby("chain").apply(
|
483
|
+
lambda x: x[["resSeq", "iCode"]].drop_duplicates().shape[0]
|
484
|
+
)
|
485
|
+
max_residues_per_chain = residue_counts.max() if not residue_counts.empty else 0
|
486
|
+
|
487
|
+
if max_residues_per_chain > max_pdb_residue:
|
488
|
+
raise ValueError(
|
489
|
+
f"Cannot fit to PDB: Maximum residues in a single chain ({max_residues_per_chain}) exceeds PDB limit ({max_pdb_residue})."
|
490
|
+
)
|
329
491
|
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
492
|
+
# --- Perform Fitting ---
|
493
|
+
df_fitted = df.copy()
|
494
|
+
|
495
|
+
# 1. Rename Chains
|
496
|
+
chain_mapping = {
|
497
|
+
orig_chain: available_chain_ids[i] for i, orig_chain in enumerate(unique_chains)
|
498
|
+
}
|
499
|
+
df_fitted[chain_col] = df_fitted[chain_col].map(chain_mapping)
|
500
|
+
# Ensure the chain column is treated as string/object after mapping
|
501
|
+
df_fitted[chain_col] = df_fitted[chain_col].astype(object)
|
502
|
+
|
503
|
+
# 2. Renumber Residues within each new chain
|
504
|
+
new_resseq_col = "new_resSeq" # Temporary column for new numbering
|
505
|
+
df_fitted[new_resseq_col] = -1 # Initialize
|
506
|
+
|
507
|
+
all_new_res_maps = {}
|
508
|
+
for new_chain_id, group in df_fitted.groupby(chain_col):
|
509
|
+
# Identify unique original residues (seq + icode) in order of appearance
|
510
|
+
original_residues = group[[resseq_col, icode_col]].drop_duplicates()
|
511
|
+
# Create mapping: (orig_resSeq, orig_iCode) -> new_resSeq (1-based)
|
512
|
+
residue_mapping = {
|
513
|
+
tuple(res): i + 1
|
514
|
+
for i, res in enumerate(original_residues.itertuples(index=False))
|
515
|
+
}
|
516
|
+
all_new_res_maps[new_chain_id] = residue_mapping
|
517
|
+
|
518
|
+
# Apply mapping to the group
|
519
|
+
res_indices = group.set_index([resseq_col, icode_col]).index
|
520
|
+
df_fitted.loc[group.index, new_resseq_col] = res_indices.map(residue_mapping)
|
521
|
+
|
522
|
+
# Replace original residue number and clear insertion code
|
523
|
+
df_fitted[resseq_col] = df_fitted[new_resseq_col]
|
524
|
+
df_fitted[icode_col] = None # Insertion codes are now redundant
|
525
|
+
df_fitted.drop(columns=[new_resseq_col], inplace=True)
|
526
|
+
# Convert resseq_col back to Int64 if it was before, handling potential NaNs if any step failed
|
527
|
+
df_fitted[resseq_col] = df_fitted[resseq_col].astype("Int64")
|
528
|
+
|
529
|
+
# 3. Renumber Atom Serials
|
530
|
+
new_serial_col = "new_serial"
|
531
|
+
df_fitted[new_serial_col] = -1 # Initialize
|
532
|
+
current_serial = 0
|
533
|
+
last_chain_id_for_serial = None
|
534
|
+
|
535
|
+
# Iterate in the potentially re-sorted order after grouping/mapping
|
536
|
+
# Ensure stable sort order for consistent serial numbering
|
537
|
+
df_fitted.sort_index(
|
538
|
+
inplace=True
|
539
|
+
) # Sort by original index to maintain original atom order as much as possible
|
540
|
+
|
541
|
+
for index, row in df_fitted.iterrows():
|
542
|
+
current_chain_id = row[chain_col]
|
543
|
+
if (
|
544
|
+
last_chain_id_for_serial is not None
|
545
|
+
and current_chain_id != last_chain_id_for_serial
|
546
|
+
):
|
547
|
+
current_serial += 1 # Increment for TER line
|
548
|
+
|
549
|
+
current_serial += 1
|
550
|
+
if current_serial > max_pdb_serial:
|
551
|
+
# This should have been caught by the initial check, but is a safeguard
|
552
|
+
raise ValueError("Serial number exceeded PDB limit during renumbering.")
|
553
|
+
|
554
|
+
df_fitted.loc[index, new_serial_col] = current_serial
|
555
|
+
last_chain_id_for_serial = current_chain_id
|
556
|
+
|
557
|
+
# Replace original serial number
|
558
|
+
df_fitted[serial_col] = df_fitted[new_serial_col]
|
559
|
+
df_fitted.drop(columns=[new_serial_col], inplace=True)
|
560
|
+
# Convert serial_col back to Int64
|
561
|
+
df_fitted[serial_col] = df_fitted[serial_col].astype("Int64")
|
562
|
+
|
563
|
+
# Update attributes and column types for PDB compatibility
|
564
|
+
df_fitted.attrs["format"] = "PDB"
|
565
|
+
|
566
|
+
# Ensure final column types match expected PDB output (especially categories)
|
567
|
+
# Reapply categorical conversion as some operations might change dtypes
|
568
|
+
pdb_categorical_cols = [
|
569
|
+
"record_type",
|
570
|
+
"name",
|
571
|
+
"altLoc",
|
572
|
+
"resName",
|
573
|
+
chain_col,
|
574
|
+
"element",
|
575
|
+
"charge",
|
576
|
+
icode_col,
|
577
|
+
]
|
578
|
+
if "record_type" not in df_fitted.columns and "group_PDB" in df_fitted.columns:
|
579
|
+
df_fitted.rename(
|
580
|
+
columns={"group_PDB": "record_type"}, inplace=True
|
581
|
+
) # Ensure correct name
|
582
|
+
|
583
|
+
for col in pdb_categorical_cols:
|
584
|
+
if col in df_fitted.columns:
|
585
|
+
# Handle None explicitly before converting to category if needed
|
586
|
+
if df_fitted[col].isnull().any():
|
587
|
+
df_fitted[col] = (
|
588
|
+
df_fitted[col].astype(object).fillna("")
|
589
|
+
) # Fill None with empty string for category
|
590
|
+
df_fitted[col] = df_fitted[col].astype("category")
|
591
|
+
|
592
|
+
# Rename columns if necessary from mmCIF to PDB standard names
|
593
|
+
rename_map = {
|
594
|
+
"id": "serial",
|
595
|
+
"auth_asym_id": "chainID",
|
596
|
+
"auth_seq_id": "resSeq",
|
597
|
+
"pdbx_PDB_ins_code": "iCode",
|
598
|
+
"label_atom_id": "name", # Prefer label_atom_id if auth_atom_id not present? PDB uses 'name'
|
599
|
+
"label_comp_id": "resName", # Prefer label_comp_id if auth_comp_id not present? PDB uses 'resName'
|
600
|
+
"type_symbol": "element",
|
601
|
+
"pdbx_formal_charge": "charge",
|
602
|
+
"Cartn_x": "x",
|
603
|
+
"Cartn_y": "y",
|
604
|
+
"Cartn_z": "z",
|
605
|
+
"B_iso_or_equiv": "tempFactor",
|
606
|
+
"group_PDB": "record_type",
|
607
|
+
"pdbx_PDB_model_num": "model",
|
608
|
+
# Add mappings for auth_atom_id -> name, auth_comp_id -> resName if needed,
|
609
|
+
# deciding on precedence if both label_* and auth_* exist.
|
610
|
+
# Current write_pdb prioritizes auth_* when reading mmCIF, so map those.
|
611
|
+
"auth_atom_id": "name",
|
612
|
+
"auth_comp_id": "resName",
|
613
|
+
}
|
614
|
+
|
615
|
+
# Only rename columns that actually exist in the DataFrame
|
616
|
+
actual_rename_map = {k: v for k, v in rename_map.items() if k in df_fitted.columns}
|
617
|
+
df_fitted.rename(columns=actual_rename_map, inplace=True)
|
618
|
+
|
619
|
+
# Ensure essential PDB columns exist, even if empty, if they were created during fitting
|
620
|
+
pdb_essential_cols = [
|
621
|
+
"record_type",
|
622
|
+
"serial",
|
623
|
+
"name",
|
624
|
+
"altLoc",
|
625
|
+
"resName",
|
626
|
+
"chainID",
|
627
|
+
"resSeq",
|
628
|
+
"iCode",
|
629
|
+
"x",
|
630
|
+
"y",
|
631
|
+
"z",
|
632
|
+
"occupancy",
|
633
|
+
"tempFactor",
|
634
|
+
"element",
|
635
|
+
"charge",
|
636
|
+
"model",
|
637
|
+
]
|
638
|
+
for col in pdb_essential_cols:
|
639
|
+
if col not in df_fitted.columns:
|
640
|
+
# This case might occur if input mmCIF was missing fundamental columns mapped to PDB essentials
|
641
|
+
# Decide on default value or raise error. Adding empty series for now.
|
642
|
+
df_fitted[col] = pd.Series(
|
643
|
+
dtype="object"
|
644
|
+
) # Add as object to handle potential None/mixed types initially
|
645
|
+
|
646
|
+
# Re-order columns to standard PDB order for clarity
|
647
|
+
final_pdb_order = [col for col in pdb_essential_cols if col in df_fitted.columns]
|
648
|
+
other_cols = [col for col in df_fitted.columns if col not in final_pdb_order]
|
649
|
+
df_fitted = df_fitted[final_pdb_order + other_cols]
|
650
|
+
|
651
|
+
# --- Final Type Conversions for PDB format ---
|
652
|
+
# Convert numeric columns (similar to parse_pdb_atoms)
|
653
|
+
pdb_numeric_columns = [
|
654
|
+
"serial",
|
655
|
+
"resSeq",
|
656
|
+
"x",
|
657
|
+
"y",
|
658
|
+
"z",
|
659
|
+
"occupancy",
|
660
|
+
"tempFactor",
|
661
|
+
"model",
|
662
|
+
]
|
663
|
+
for col in pdb_numeric_columns:
|
664
|
+
if col in df_fitted.columns:
|
665
|
+
# Use Int64 for integer-like columns that might have been NaN during processing
|
666
|
+
if col in ["serial", "resSeq", "model"]:
|
667
|
+
df_fitted[col] = pd.to_numeric(df_fitted[col], errors="coerce").astype(
|
668
|
+
"Int64"
|
669
|
+
)
|
670
|
+
else: # Floats
|
671
|
+
df_fitted[col] = pd.to_numeric(df_fitted[col], errors="coerce")
|
672
|
+
|
673
|
+
# Convert categorical columns (similar to parse_pdb_atoms)
|
674
|
+
# Note: chainID and iCode were already handled during fitting/renaming
|
675
|
+
pdb_categorical_columns_final = [
|
676
|
+
"record_type",
|
677
|
+
"name",
|
678
|
+
"altLoc",
|
679
|
+
"resName",
|
680
|
+
"chainID", # Already category, but ensure consistency
|
681
|
+
"iCode", # Already category, but ensure consistency
|
682
|
+
"element",
|
683
|
+
"charge",
|
684
|
+
]
|
685
|
+
for col in pdb_categorical_columns_final:
|
686
|
+
if col in df_fitted.columns:
|
687
|
+
# Ensure the column is categorical first
|
688
|
+
if not pd.api.types.is_categorical_dtype(df_fitted[col]):
|
689
|
+
# Convert non-categorical columns, handling potential NaNs
|
690
|
+
if df_fitted[col].isnull().any():
|
691
|
+
df_fitted[col] = (
|
692
|
+
df_fitted[col].astype(object).fillna("").astype("category")
|
693
|
+
)
|
694
|
+
else:
|
695
|
+
df_fitted[col] = df_fitted[col].astype("category")
|
696
|
+
else:
|
697
|
+
# If already categorical, check if '' needs to be added before fillna
|
698
|
+
has_nans = df_fitted[col].isnull().any()
|
699
|
+
if has_nans and "" not in df_fitted[col].cat.categories:
|
700
|
+
# Add '' category explicitly
|
701
|
+
df_fitted[col] = df_fitted[col].cat.add_categories([""])
|
702
|
+
|
703
|
+
# Fill None/NaN with empty string (now safe)
|
704
|
+
if has_nans:
|
705
|
+
df_fitted[col].fillna("", inplace=True)
|
706
|
+
|
707
|
+
return df_fitted
|
708
|
+
|
709
|
+
|
710
|
+
def _format_pdb_atom_line(atom_data: dict) -> str:
|
711
|
+
"""Formats a dictionary of atom data into a PDB ATOM/HETATM line."""
|
712
|
+
# PDB format specification:
|
713
|
+
# COLUMNS DATA TYPE FIELD DEFINITION
|
714
|
+
# -----------------------------------------------------------------------
|
715
|
+
# 1 - 6 Record name "ATOM " or "HETATM"
|
716
|
+
# 7 - 11 Integer serial Atom serial number.
|
717
|
+
# 13 - 16 Atom name Atom name.
|
718
|
+
# 17 Character altLoc Alternate location indicator.
|
719
|
+
# 18 - 20 Residue name resName Residue name.
|
720
|
+
# 22 Character chainID Chain identifier.
|
721
|
+
# 23 - 26 Integer resSeq Residue sequence number.
|
722
|
+
# 27 AChar iCode Code for insertion of residues.
|
723
|
+
# 31 - 38 Real(8.3) x Orthogonal coordinates for X.
|
724
|
+
# 39 - 46 Real(8.3) y Orthogonal coordinates for Y.
|
725
|
+
# 47 - 54 Real(8.3) z Orthogonal coordinates for Z.
|
726
|
+
# 55 - 60 Real(6.2) occupancy Occupancy.
|
727
|
+
# 61 - 66 Real(6.2) tempFactor Temperature factor.
|
728
|
+
# 77 - 78 LString(2) element Element symbol, right-justified.
|
729
|
+
# 79 - 80 LString(2) charge Charge on the atom.
|
730
|
+
|
731
|
+
# Record name (ATOM/HETATM)
|
732
|
+
record_name = atom_data.get("record_name", "ATOM").ljust(6)
|
733
|
+
|
734
|
+
# Serial number
|
735
|
+
serial = str(atom_data.get("serial", 0)).rjust(5)
|
736
|
+
|
737
|
+
# Atom name - special alignment rules
|
738
|
+
atom_name = atom_data.get("name", "")
|
739
|
+
if len(atom_name) < 4 and atom_name[:1].isalpha():
|
740
|
+
# Pad with space on left for 1-3 char names starting with a letter
|
741
|
+
atom_name_fmt = (" " + atom_name).ljust(4)
|
742
|
+
else:
|
743
|
+
# Use as is, left-justified, for 4-char names or those starting with a digit
|
744
|
+
atom_name_fmt = atom_name.ljust(4)
|
745
|
+
|
746
|
+
# Alternate location indicator
|
747
|
+
alt_loc = atom_data.get("altLoc", "")[:1].ljust(1) # Max 1 char
|
748
|
+
|
749
|
+
# Residue name
|
750
|
+
res_name = atom_data.get("resName", "").rjust(
|
751
|
+
3
|
752
|
+
) # Spec says "Residue name", examples often right-justified
|
753
|
+
|
754
|
+
# Chain identifier
|
755
|
+
chain_id = atom_data.get("chainID", "")[:1].ljust(1) # Max 1 char
|
756
|
+
|
757
|
+
# Residue sequence number
|
758
|
+
res_seq = str(atom_data.get("resSeq", 0)).rjust(4)
|
759
|
+
|
760
|
+
# Insertion code
|
761
|
+
icode = atom_data.get("iCode", "")[:1].ljust(1) # Max 1 char
|
762
|
+
|
763
|
+
# Coordinates
|
764
|
+
x = f"{atom_data.get('x', 0.0):8.3f}"
|
765
|
+
y = f"{atom_data.get('y', 0.0):8.3f}"
|
766
|
+
z = f"{atom_data.get('z', 0.0):8.3f}"
|
767
|
+
|
768
|
+
# Occupancy
|
769
|
+
occupancy = f"{atom_data.get('occupancy', 1.0):6.2f}"
|
770
|
+
|
771
|
+
# Temperature factor
|
772
|
+
temp_factor = f"{atom_data.get('tempFactor', 0.0):6.2f}"
|
773
|
+
|
774
|
+
# Element symbol
|
775
|
+
element = atom_data.get("element", "").rjust(2)
|
776
|
+
|
777
|
+
# Charge
|
778
|
+
charge_val = atom_data.get("charge", "")
|
779
|
+
charge_fmt = ""
|
780
|
+
if charge_val:
|
781
|
+
try:
|
782
|
+
# Try converting numeric charge (e.g., +1, -2) to PDB format (1+, 2-)
|
783
|
+
charge_int = int(float(charge_val)) # Use float first for cases like "1.0"
|
784
|
+
if charge_int != 0:
|
785
|
+
charge_fmt = f"{abs(charge_int)}{'+' if charge_int > 0 else '-'}"
|
786
|
+
except ValueError:
|
787
|
+
# If already formatted (e.g., "1+", "FE2+"), use its string representation
|
788
|
+
charge_fmt = str(charge_val)
|
789
|
+
# Ensure it fits and is right-justified
|
790
|
+
charge_fmt = charge_fmt.strip()[:2].rjust(2)
|
791
|
+
else:
|
792
|
+
charge_fmt = " " # Blank if no charge
|
336
793
|
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
794
|
+
# Construct the full line
|
795
|
+
# Ensure spacing is correct according to the spec
|
796
|
+
# 1-6 Record name | 7-11 Serial | 12 Space | 13-16 Name | 17 AltLoc | 18-20 ResName | 21 Space | 22 ChainID | 23-26 ResSeq | 27 iCode | 28-30 Spaces | 31-38 X | 39-46 Y | 47-54 Z | 55-60 Occupancy | 61-66 TempFactor | 67-76 Spaces | 77-78 Element | 79-80 Charge
|
797
|
+
line = (
|
798
|
+
f"{record_name}{serial} {atom_name_fmt}{alt_loc}{res_name} {chain_id}{res_seq}{icode} "
|
799
|
+
f"{x}{y}{z}{occupancy}{temp_factor} " # 10 spaces
|
800
|
+
f"{element}{charge_fmt}"
|
801
|
+
)
|
802
|
+
|
803
|
+
# Ensure the line is exactly 80 characters long
|
804
|
+
return line.ljust(80)
|
343
805
|
|
344
|
-
|
806
|
+
|
807
|
+
def write_pdb(
    df: pd.DataFrame, output: Union[str, TextIO, None] = None
) -> Union[str, None]:
    """
    Write a DataFrame of atom records to PDB format.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms.
        Must contain columns mappable to PDB format fields.
    output : Union[str, TextIO, None], optional
        Output file path or file-like object. If None, returns the PDB content as a string.

    Returns:
    --------
    Union[str, None]
        If output is None, returns the PDB content as a string. Otherwise, returns None.

    Raises:
    -------
    ValueError
        If ``df.attrs["format"]`` is neither "PDB" nor "mmCIF".
    """
    buffer = io.StringIO()
    format_type = df.attrs.get("format", "PDB")  # Assume PDB if not specified

    # Tracking state used to decide when MODEL/ENDMDL and TER records are emitted.
    last_model_num = None
    last_chain_id = None
    last_res_info = None  # Tuple (resSeq, iCode, resName) for TER record
    last_serial = 0

    # Check if DataFrame is empty: emit a bare END record and finish early.
    if df.empty:
        buffer.write("END\n")
        content = buffer.getvalue()
        buffer.close()
        if output is not None:
            if isinstance(output, str):
                with open(output, "w") as f:
                    f.write(content)
            else:
                output.write(content)
            return None
        return content

    for _, row in df.iterrows():
        atom_data = {}

        # --- Data Extraction ---
        # Normalize one row into a format-independent dict so the record
        # emission below does not care whether the source was PDB or mmCIF.
        if format_type == "PDB":
            # Pre-process PDB values, converting None to empty strings for optional fields
            raw_alt_loc = row.get("altLoc")
            pdb_alt_loc = "" if pd.isna(raw_alt_loc) else str(raw_alt_loc)

            raw_icode = row.get("iCode")
            pdb_icode = "" if pd.isna(raw_icode) else str(raw_icode)

            raw_element = row.get("element")
            pdb_element = "" if pd.isna(raw_element) else str(raw_element)

            raw_charge = row.get("charge")
            pdb_charge = "" if pd.isna(raw_charge) else str(raw_charge)

            atom_data = {
                "record_name": row.get("record_type", "ATOM"),
                "serial": int(row.get("serial", 0)),
                "name": str(row.get("name", "")),
                "altLoc": pdb_alt_loc,
                "resName": str(row.get("resName", "")),
                "chainID": str(row.get("chainID", "")),
                "resSeq": int(row.get("resSeq", 0)),
                "iCode": pdb_icode,
                "x": float(row.get("x", 0.0)),
                "y": float(row.get("y", 0.0)),
                "z": float(row.get("z", 0.0)),
                "occupancy": float(row.get("occupancy", 1.0)),
                "tempFactor": float(row.get("tempFactor", 0.0)),
                "element": pdb_element,
                "charge": pdb_charge,
                "model": int(row.get("model", 1)),
            }
        elif format_type == "mmCIF":
            # Pre-process mmCIF values to PDB compatible format, converting None to empty strings
            raw_alt_loc = row.get("label_alt_id")
            pdb_alt_loc = "" if pd.isna(raw_alt_loc) else str(raw_alt_loc)

            raw_icode = row.get("pdbx_PDB_ins_code")
            pdb_icode = "" if pd.isna(raw_icode) else str(raw_icode)

            raw_element = row.get("type_symbol")
            pdb_element = "" if pd.isna(raw_element) else str(raw_element)

            raw_charge = row.get("pdbx_formal_charge")
            pdb_charge = "" if pd.isna(raw_charge) else str(raw_charge)

            # Prefer author-assigned identifiers (auth_*) over label_* when present,
            # matching the convention PDB files use.
            atom_data = {
                "record_name": row.get("group_PDB", "ATOM"),
                "serial": int(row.get("id", 0)),
                "name": str(row.get("auth_atom_id", row.get("label_atom_id", ""))),
                "altLoc": pdb_alt_loc,
                "resName": str(row.get("auth_comp_id", row.get("label_comp_id", ""))),
                "chainID": str(row.get("auth_asym_id", row.get("label_asym_id"))),
                "resSeq": int(row.get("auth_seq_id", row.get("label_seq_id", 0))),
                "iCode": pdb_icode,
                "x": float(row.get("Cartn_x", 0.0)),
                "y": float(row.get("Cartn_y", 0.0)),
                "z": float(row.get("Cartn_z", 0.0)),
                "occupancy": float(row.get("occupancy", 1.0)),
                "tempFactor": float(row.get("B_iso_or_equiv", 0.0)),
                "element": pdb_element,
                "charge": pdb_charge,
                "model": int(row.get("pdbx_PDB_model_num", 1)),
            }
        else:
            raise ValueError(f"Unsupported DataFrame format: {format_type}")

        # --- MODEL/ENDMDL Records ---
        # A change in model number closes the previous model (if any) and
        # opens a new one; rows are assumed to be grouped by model.
        current_model_num = atom_data["model"]
        if current_model_num != last_model_num:
            if last_model_num is not None:
                buffer.write("ENDMDL\n")
            # NOTE(review): inter-record spacing reconstructed from the PDB spec
            # (model serial right-justified in columns 11-14) — the diff rendering
            # collapsed runs of spaces; confirm against the released source.
            buffer.write(f"MODEL     {current_model_num:>4}\n")
            last_model_num = current_model_num
            # Reset chain/residue tracking for the new model
            last_chain_id = None
            last_res_info = None

        # --- TER Records ---
        current_chain_id = atom_data["chainID"]
        current_res_info = (
            atom_data["resSeq"],
            atom_data["iCode"],
            atom_data["resName"],
        )

        # Write TER if chain ID changes within the same model
        if last_chain_id is not None and current_chain_id != last_chain_id:
            # TER serial continues the atom numbering of the chain just closed.
            ter_serial = str(last_serial + 1).rjust(5)
            ter_res_name = last_res_info[2].strip().rjust(3)  # Use last residue's name
            ter_chain_id = last_chain_id
            ter_res_seq = str(last_res_info[0]).rjust(4)  # Use last residue's seq num
            ter_icode = (
                last_res_info[1] if last_res_info[1] else ""
            )  # Use last residue's icode

            # NOTE(review): spacing reconstructed per PDB TER layout (serial cols 7-11,
            # resName cols 18-20, chainID col 22, resSeq cols 23-26, iCode col 27) —
            # original whitespace was collapsed by the diff rendering; verify.
            ter_line = f"TER   {ter_serial}      {ter_res_name} {ter_chain_id}{ter_res_seq}{ter_icode}"
            buffer.write(ter_line.ljust(80) + "\n")

        # --- Format and Write ATOM/HETATM Line ---
        pdb_line = _format_pdb_atom_line(atom_data)
        buffer.write(pdb_line + "\n")

        # --- Update Tracking Variables ---
        last_serial = atom_data["serial"]
        last_chain_id = current_chain_id
        last_res_info = current_res_info

    # --- Final Records ---
    # Add TER record for the very last chain in the last model
    if last_chain_id is not None:
        ter_serial = str(last_serial + 1).rjust(5)
        ter_res_name = last_res_info[2].strip().rjust(3)
        ter_chain_id = last_chain_id
        ter_res_seq = str(last_res_info[0]).rjust(4)
        ter_icode = last_res_info[1] if last_res_info[1] else ""

        # NOTE(review): same reconstructed TER spacing as above — verify.
        ter_line = f"TER   {ter_serial}      {ter_res_name} {ter_chain_id}{ter_res_seq}{ter_icode}"
        buffer.write(ter_line.ljust(80) + "\n")

    # Add ENDMDL if models were used
    if last_model_num is not None:
        buffer.write("ENDMDL\n")

    buffer.write("END\n")

    # --- Output Handling ---
    content = buffer.getvalue()
    buffer.close()

    if output is not None:
        if isinstance(output, str):
            with open(output, "w") as f:
                f.write(content)
        else:
            output.write(content)
        return None
    else:
        return content
|
382
991
|
|
383
992
|
|
384
993
|
def write_cif(
|
@@ -403,7 +1012,7 @@ def write_cif(
|
|
403
1012
|
format_type = df.attrs.get("format", "PDB")
|
404
1013
|
|
405
1014
|
# Create a new DataContainer
|
406
|
-
data_container = DataContainer("
|
1015
|
+
data_container = DataContainer("rnapolis")
|
407
1016
|
|
408
1017
|
# Define the attributes for atom_site category
|
409
1018
|
if format_type == "mmCIF":
|
@@ -432,7 +1041,7 @@ def write_cif(
|
|
432
1041
|
"auth_comp_id", # resName
|
433
1042
|
"auth_asym_id", # chainID
|
434
1043
|
"auth_atom_id", # name
|
435
|
-
"pdbx_PDB_model_num", #
|
1044
|
+
"pdbx_PDB_model_num", # model
|
436
1045
|
]
|
437
1046
|
|
438
1047
|
# Prepare rows for the atom_site category
|
@@ -440,32 +1049,44 @@ def write_cif(
|
|
440
1049
|
|
441
1050
|
for _, row in df.iterrows():
|
442
1051
|
if format_type == "mmCIF":
|
443
|
-
# Use existing mmCIF data
|
444
|
-
row_data = [
|
1052
|
+
# Use existing mmCIF data, converting None to '?' universally
|
1053
|
+
row_data = []
|
1054
|
+
for attr in attributes:
|
1055
|
+
value = row.get(attr)
|
1056
|
+
if pd.isna(value):
|
1057
|
+
# Use '?' as the standard placeholder for missing values
|
1058
|
+
row_data.append("?")
|
1059
|
+
else:
|
1060
|
+
# Ensure all non-missing values are converted to string
|
1061
|
+
row_data.append(str(value))
|
445
1062
|
else: # PDB format
|
446
|
-
# Map PDB data to mmCIF format
|
1063
|
+
# Map PDB data to mmCIF format, converting None to '.' or '?'
|
447
1064
|
entity_id = "1" # Default entity ID
|
448
|
-
model_num = "
|
1065
|
+
model_num = str(int(row["model"]))
|
1066
|
+
|
1067
|
+
# Pre-process optional fields for mmCIF placeholders
|
1068
|
+
element_val = "?" if pd.isna(row.get("element")) else str(row["element"])
|
1069
|
+
altloc_val = "." if pd.isna(row.get("altLoc")) else str(row["altLoc"])
|
1070
|
+
icode_val = "." if pd.isna(row.get("iCode")) else str(row["iCode"])
|
1071
|
+
charge_val = "." if pd.isna(row.get("charge")) else str(row["charge"])
|
449
1072
|
|
450
1073
|
row_data = [
|
451
1074
|
str(row["record_type"]), # group_PDB
|
452
1075
|
str(int(row["serial"])), # id
|
453
|
-
|
1076
|
+
element_val, # type_symbol
|
454
1077
|
str(row["name"]), # label_atom_id
|
455
|
-
|
1078
|
+
altloc_val, # label_alt_id
|
456
1079
|
str(row["resName"]), # label_comp_id
|
457
1080
|
str(row["chainID"]), # label_asym_id
|
458
1081
|
entity_id, # label_entity_id
|
459
1082
|
str(int(row["resSeq"])), # label_seq_id
|
460
|
-
|
461
|
-
if pd.notna(row["iCode"])
|
462
|
-
else "?", # pdbx_PDB_ins_code
|
1083
|
+
icode_val, # pdbx_PDB_ins_code
|
463
1084
|
f"{float(row['x']):.3f}", # Cartn_x
|
464
1085
|
f"{float(row['y']):.3f}", # Cartn_y
|
465
1086
|
f"{float(row['z']):.3f}", # Cartn_z
|
466
1087
|
f"{float(row['occupancy']):.2f}", # occupancy
|
467
1088
|
f"{float(row['tempFactor']):.2f}", # B_iso_or_equiv
|
468
|
-
|
1089
|
+
charge_val, # pdbx_formal_charge
|
469
1090
|
str(int(row["resSeq"])), # auth_seq_id
|
470
1091
|
str(row["resName"]), # auth_comp_id
|
471
1092
|
str(row["chainID"]), # auth_asym_id
|