RNApolis 0.8.0-py3-none-any.whl → 0.8.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnapolis/parser_v2.py +774 -240
- rnapolis/splitter.py +18 -5
- rnapolis/unifier.py +20 -5
- {rnapolis-0.8.0.dist-info → rnapolis-0.8.1.dist-info}/METADATA +1 -1
- {rnapolis-0.8.0.dist-info → rnapolis-0.8.1.dist-info}/RECORD +9 -9
- {rnapolis-0.8.0.dist-info → rnapolis-0.8.1.dist-info}/WHEEL +0 -0
- {rnapolis-0.8.0.dist-info → rnapolis-0.8.1.dist-info}/entry_points.txt +0 -0
- {rnapolis-0.8.0.dist-info → rnapolis-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {rnapolis-0.8.0.dist-info → rnapolis-0.8.1.dist-info}/top_level.txt +0 -0
rnapolis/parser_v2.py
CHANGED
@@ -1,4 +1,6 @@
 import io
+import os
+import string
 import tempfile
 from typing import IO, TextIO, Union
 
@@ -52,23 +54,27 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
             continue
 
         # Parse fields according to PDB format specification
+        alt_loc = line[16:17].strip()
         icode = line[26:27].strip()
+        element = line[76:78].strip()
+        charge = line[78:80].strip()
+
         record = {
             "record_type": record_type,
             "serial": line[6:11].strip(),
             "name": line[12:16].strip(),
-            "altLoc":
+            "altLoc": None if not alt_loc else alt_loc,  # Store None if empty
             "resName": line[17:20].strip(),
             "chainID": line[21:22].strip(),
             "resSeq": line[22:26].strip(),
-            "iCode": None if not icode else icode,  #
+            "iCode": None if not icode else icode,  # Store None if empty
             "x": line[30:38].strip(),
             "y": line[38:46].strip(),
             "z": line[46:54].strip(),
             "occupancy": line[54:60].strip(),
             "tempFactor": line[60:66].strip(),
-            "element":
-            "charge":
+            "element": None if not element else element,  # Store None if empty
+            "charge": None if not charge else charge,  # Store None if empty
             "model": current_model,  # Add the current model number
         }
 
@@ -149,18 +155,37 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
     """
     adapter = IoAdapterPy()
 
-    # Handle
+    # Handle string, StringIO, and file-like objects
     if isinstance(content, str):
-        # Create a temporary file
-
-
-
+        # Create a temporary file for string input
+        with tempfile.NamedTemporaryFile(
+            mode="w+", suffix=".cif", delete=False
+        ) as temp_file:
             temp_file.write(content)
-            temp_file.
-
-
-
+            temp_file_path = temp_file.name
+        try:
+            data = adapter.readFile(temp_file_path)
+        finally:
+            os.remove(temp_file_path)  # Clean up the temporary file
+    elif isinstance(content, io.StringIO):
+        # Create a temporary file for StringIO input
+        with tempfile.NamedTemporaryFile(
+            mode="w+", suffix=".cif", delete=False
+        ) as temp_file:
+            content.seek(0)  # Ensure reading from the start
+            temp_file.write(content.read())
+            temp_file_path = temp_file.name
+        try:
+            data = adapter.readFile(temp_file_path)
+        finally:
+            os.remove(temp_file_path)  # Clean up the temporary file
+    elif hasattr(content, "name"):
+        # Assume it's a file-like object with a name attribute (like an open file)
         data = adapter.readFile(content.name)
+    else:
+        raise TypeError(
+            "Unsupported input type for parse_cif_atoms. Expected str, file-like object with name, or StringIO."
+        )
 
     # Get the atom_site category
     category = data[0].getObj("atom_site")
@@ -176,47 +201,133 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
     # Create a list of dictionaries for each atom
     records = []
     for row in rows:
-        record =
-
-
-
-
-
-
+        record = {}
+        for attr, value in zip(attributes, row):
+            # Store None if value indicates missing data ('?' or '.')
+            if value in ["?", "."]:
+                record[attr] = None
+            else:
+                record[attr] = value
         records.append(record)
 
     # Create DataFrame from records
     df = pd.DataFrame(records)
 
-    #
-
-    "
-    "
+    # Define columns based on mmCIF specification for atom_site
+    float_cols = [
+        "aniso_B[1][1]",
+        "aniso_B[1][1]_esd",
+        "aniso_B[1][2]",
+        "aniso_B[1][2]_esd",
+        "aniso_B[1][3]",
+        "aniso_B[1][3]_esd",
+        "aniso_B[2][2]",
+        "aniso_B[2][2]_esd",
+        "aniso_B[2][3]",
+        "aniso_B[2][3]_esd",
+        "aniso_B[3][3]",
+        "aniso_B[3][3]_esd",
+        "aniso_ratio",
+        "aniso_U[1][1]",
+        "aniso_U[1][1]_esd",
+        "aniso_U[1][2]",
+        "aniso_U[1][2]_esd",
+        "aniso_U[1][3]",
+        "aniso_U[1][3]_esd",
+        "aniso_U[2][2]",
+        "aniso_U[2][2]_esd",
+        "aniso_U[2][3]",
+        "aniso_U[2][3]_esd",
+        "aniso_U[3][3]",
+        "aniso_U[3][3]_esd",
+        "B_equiv_geom_mean",
+        "B_equiv_geom_mean_esd",
+        "B_iso_or_equiv",
+        "B_iso_or_equiv_esd",
         "Cartn_x",
+        "Cartn_x_esd",
         "Cartn_y",
+        "Cartn_y_esd",
         "Cartn_z",
+        "Cartn_z_esd",
+        "fract_x",
+        "fract_x_esd",
+        "fract_y",
+        "fract_y_esd",
+        "fract_z",
+        "fract_z_esd",
         "occupancy",
-        "
+        "occupancy_esd",
+        "U_equiv_geom_mean",
+        "U_equiv_geom_mean_esd",
+        "U_iso_or_equiv",
+        "U_iso_or_equiv_esd",
+    ]
+    int_cols = [
+        "attached_hydrogens",
+        "label_seq_id",
+        "symmetry_multiplicity",
+        "pdbx_PDB_model_num",
         "pdbx_formal_charge",
+        "pdbx_label_index",
     ]
-
-
-
-
-
-
-
+    category_cols = [
+        "auth_asym_id",
+        "auth_atom_id",
+        "auth_comp_id",
+        "auth_seq_id",
+        "calc_attached_atom",
+        "calc_flag",
+        "disorder_assembly",
+        "disorder_group",
         "group_PDB",
-        "
+        "id",
+        "label_alt_id",
+        "label_asym_id",
         "label_atom_id",
         "label_comp_id",
-        "
-        "
-        "
-        "
+        "label_entity_id",
+        "thermal_displace_type",
+        "type_symbol",
+        "pdbx_atom_ambiguity",
+        "adp_type",
+        "refinement_flags",
+        "refinement_flags_adp",
+        "refinement_flags_occupancy",
+        "refinement_flags_posn",
+        "pdbx_auth_alt_id",
+        "pdbx_PDB_ins_code",
+        "pdbx_PDB_residue_no",
+        "pdbx_PDB_residue_name",
+        "pdbx_PDB_strand_id",
+        "pdbx_PDB_atom_name",
+        "pdbx_auth_atom_name",
+        "pdbx_auth_comp_id",
+        "pdbx_auth_asym_id",
+        "pdbx_auth_seq_id",
+        "pdbx_tls_group_id",
+        "pdbx_ncs_dom_id",
+        "pdbx_group_NDB",
+        "pdbx_atom_group",
+        "pdbx_label_seq_num",
+        "pdbx_not_in_asym",
+        "pdbx_sifts_xref_db_name",
+        "pdbx_sifts_xref_db_acc",
+        "pdbx_sifts_xref_db_num",
+        "pdbx_sifts_xref_db_res",
     ]
 
-
+    # Convert columns to appropriate types
+    for col in float_cols:
+        if col in df.columns:
+            df[col] = pd.to_numeric(df[col], errors="coerce")
+
+    for col in int_cols:
+        if col in df.columns:
+            # Use Int64 (nullable integer) to handle potential NaNs from coercion
+            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
+
+    for col in category_cols:
         if col in df.columns:
             df[col] = df[col].astype("category")
 
@@ -226,6 +337,473 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
     return df
 
 
+def can_write_pdb(df: pd.DataFrame) -> bool:
+    """
+    Check if the DataFrame can be losslessly represented in PDB format.
+
+    PDB format has limitations on field widths:
+    - Atom serial number (id): max 99999
+    - Chain identifier (auth_asym_id): max 1 character
+    - Residue sequence number (auth_seq_id): max 9999
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms.
+
+    Returns:
+    --------
+    bool
+        True if the DataFrame can be written to PDB format without data loss/truncation, False otherwise.
+    """
+    format_type = df.attrs.get("format")
+
+    if format_type == "PDB":
+        # Assume data originally from PDB already fits PDB constraints
+        return True
+
+    if df.empty:
+        # An empty DataFrame can be represented as an empty PDB file
+        return True
+
+    if format_type == "mmCIF":
+        # Check serial number (id)
+        # Convert to numeric first to handle potential categorical type and NaNs
+        if "id" not in df.columns or (
+            pd.to_numeric(df["id"], errors="coerce").max() > 99999
+        ):
+            return False
+
+        # Check chain ID (auth_asym_id) length
+        if "auth_asym_id" not in df.columns or (
+            df["auth_asym_id"].dropna().astype(str).str.len().max() > 1
+        ):
+            return False
+
+        # Check residue sequence number (auth_seq_id)
+        if "auth_seq_id" not in df.columns or (
+            pd.to_numeric(df["auth_seq_id"], errors="coerce").max() > 9999
+        ):
+            return False
+
+        # All checks passed for mmCIF
+        return True
+
+    # If format is unknown or not PDB/mmCIF, assume it cannot be safely written
+    return False
+
+
+def fit_to_pdb(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Attempts to fit the atom data in a DataFrame to comply with PDB format limitations.
+
+    If the data already fits (checked by can_write_pdb), returns the original DataFrame.
+    Otherwise, checks if fitting is possible based on total atoms, unique chains,
+    and residues per chain. If fitting is possible, it renumbers atoms, renames chains,
+    and renumbers residues within each chain sequentially starting from 1.
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms.
+
+    Returns:
+    --------
+    pd.DataFrame
+        A new DataFrame with data potentially modified to fit PDB constraints.
+        The 'format' attribute of the returned DataFrame will be set to 'PDB'.
+
+    Raises:
+    -------
+    ValueError
+        If the data cannot be fitted into PDB format constraints (too many atoms,
+        chains, or residues per chain).
+    """
+    format_type = df.attrs.get("format")
+
+    if not format_type:
+        raise ValueError("DataFrame format attribute is not set.")
+
+    if can_write_pdb(df):
+        return df
+
+    # Determine column names based on format
+    if format_type == "PDB":
+        serial_col = "serial"
+        chain_col = "chainID"
+        resseq_col = "resSeq"
+        icode_col = "iCode"
+    elif format_type == "mmCIF":
+        serial_col = "id"
+        chain_col = "auth_asym_id"
+        resseq_col = "auth_seq_id"
+        icode_col = "pdbx_PDB_ins_code"
+    else:
+        raise ValueError(f"Unsupported DataFrame format: {format_type}")
+
+    # --- Feasibility Checks ---
+    if chain_col not in df.columns:
+        raise ValueError(f"Missing required chain column: {chain_col}")
+    if resseq_col not in df.columns:
+        raise ValueError(f"Missing required residue sequence column: {resseq_col}")
+
+    unique_chains = df[chain_col].unique()
+    num_chains = len(unique_chains)
+    total_atoms = len(df)
+    max_pdb_serial = 99999
+    max_pdb_residue = 9999
+    available_chain_ids = list(
+        string.ascii_uppercase + string.ascii_lowercase + string.digits
+    )
+    max_pdb_chains = len(available_chain_ids)
+
+    # Check 1: Total atoms + TER lines <= 99999
+    if total_atoms + num_chains > max_pdb_serial:
+        raise ValueError(
+            f"Cannot fit to PDB: Total atoms ({total_atoms}) + TER lines ({num_chains}) exceeds PDB limit ({max_pdb_serial})."
+        )
+
+    # Check 2: Number of chains <= 62
+    if num_chains > max_pdb_chains:
+        raise ValueError(
+            f"Cannot fit to PDB: Number of unique chains ({num_chains}) exceeds PDB limit ({max_pdb_chains})."
+        )
+
+    # Check 3: Max residues per chain <= 9999
+    # More accurate check: group by chain, then count unique (resSeq, iCode) tuples
+    # Use a temporary structure to avoid modifying the original df
+    check_df = pd.DataFrame(
+        {
+            "chain": df[chain_col],
+            "resSeq": df[resseq_col],
+            "iCode": df[icode_col].fillna("") if icode_col in df.columns else "",
+        }
+    )
+    residue_counts = check_df.groupby("chain").apply(
+        lambda x: x[["resSeq", "iCode"]].drop_duplicates().shape[0]
+    )
+    max_residues_per_chain = residue_counts.max() if not residue_counts.empty else 0
+
+    if max_residues_per_chain > max_pdb_residue:
+        raise ValueError(
+            f"Cannot fit to PDB: Maximum residues in a single chain ({max_residues_per_chain}) exceeds PDB limit ({max_pdb_residue})."
+        )
+
+    # --- Perform Fitting ---
+    df_fitted = df.copy()
+
+    # 1. Rename Chains
+    chain_mapping = {
+        orig_chain: available_chain_ids[i] for i, orig_chain in enumerate(unique_chains)
+    }
+    df_fitted[chain_col] = df_fitted[chain_col].map(chain_mapping)
+    # Ensure the chain column is treated as string/object after mapping
+    df_fitted[chain_col] = df_fitted[chain_col].astype(object)
+
+    # 2. Renumber Residues within each new chain
+    new_resseq_col = "new_resSeq"  # Temporary column for new numbering
+    df_fitted[new_resseq_col] = -1  # Initialize
+
+    all_new_res_maps = {}
+    for new_chain_id, group in df_fitted.groupby(chain_col):
+        # Identify unique original residues (seq + icode) in order of appearance
+        original_residues = group[[resseq_col, icode_col]].drop_duplicates()
+        # Create mapping: (orig_resSeq, orig_iCode) -> new_resSeq (1-based)
+        residue_mapping = {
+            tuple(res): i + 1
+            for i, res in enumerate(original_residues.itertuples(index=False))
+        }
+        all_new_res_maps[new_chain_id] = residue_mapping
+
+        # Apply mapping to the group
+        res_indices = group.set_index([resseq_col, icode_col]).index
+        df_fitted.loc[group.index, new_resseq_col] = res_indices.map(residue_mapping)
+
+    # Replace original residue number and clear insertion code
+    df_fitted[resseq_col] = df_fitted[new_resseq_col]
+    df_fitted[icode_col] = None  # Insertion codes are now redundant
+    df_fitted.drop(columns=[new_resseq_col], inplace=True)
+    # Convert resseq_col back to Int64 if it was before, handling potential NaNs if any step failed
+    df_fitted[resseq_col] = df_fitted[resseq_col].astype("Int64")
+
+    # 3. Renumber Atom Serials
+    new_serial_col = "new_serial"
+    df_fitted[new_serial_col] = -1  # Initialize
+    current_serial = 0
+    last_chain_id_for_serial = None
+
+    # Iterate in the potentially re-sorted order after grouping/mapping
+    # Ensure stable sort order for consistent serial numbering
+    df_fitted.sort_index(
+        inplace=True
+    )  # Sort by original index to maintain original atom order as much as possible
+
+    for index, row in df_fitted.iterrows():
+        current_chain_id = row[chain_col]
+        if (
+            last_chain_id_for_serial is not None
+            and current_chain_id != last_chain_id_for_serial
+        ):
+            current_serial += 1  # Increment for TER line
+
+        current_serial += 1
+        if current_serial > max_pdb_serial:
+            # This should have been caught by the initial check, but is a safeguard
+            raise ValueError("Serial number exceeded PDB limit during renumbering.")
+
+        df_fitted.loc[index, new_serial_col] = current_serial
+        last_chain_id_for_serial = current_chain_id
+
+    # Replace original serial number
+    df_fitted[serial_col] = df_fitted[new_serial_col]
+    df_fitted.drop(columns=[new_serial_col], inplace=True)
+    # Convert serial_col back to Int64
+    df_fitted[serial_col] = df_fitted[serial_col].astype("Int64")
+
+    # Update attributes and column types for PDB compatibility
+    df_fitted.attrs["format"] = "PDB"
+
+    # Ensure final column types match expected PDB output (especially categories)
+    # Reapply categorical conversion as some operations might change dtypes
+    pdb_categorical_cols = [
+        "record_type",
+        "name",
+        "altLoc",
+        "resName",
+        chain_col,
+        "element",
+        "charge",
+        icode_col,
+    ]
+    if "record_type" not in df_fitted.columns and "group_PDB" in df_fitted.columns:
+        df_fitted.rename(
+            columns={"group_PDB": "record_type"}, inplace=True
+        )  # Ensure correct name
+
+    for col in pdb_categorical_cols:
+        if col in df_fitted.columns:
+            # Handle None explicitly before converting to category if needed
+            if df_fitted[col].isnull().any():
+                df_fitted[col] = (
+                    df_fitted[col].astype(object).fillna("")
+                )  # Fill None with empty string for category
+            df_fitted[col] = df_fitted[col].astype("category")
+
+    # Rename columns if necessary from mmCIF to PDB standard names
+    rename_map = {
+        "id": "serial",
+        "auth_asym_id": "chainID",
+        "auth_seq_id": "resSeq",
+        "pdbx_PDB_ins_code": "iCode",
+        "label_atom_id": "name",  # Prefer label_atom_id if auth_atom_id not present? PDB uses 'name'
+        "label_comp_id": "resName",  # Prefer label_comp_id if auth_comp_id not present? PDB uses 'resName'
+        "type_symbol": "element",
+        "pdbx_formal_charge": "charge",
+        "Cartn_x": "x",
+        "Cartn_y": "y",
+        "Cartn_z": "z",
+        "B_iso_or_equiv": "tempFactor",
+        "group_PDB": "record_type",
+        "pdbx_PDB_model_num": "model",
+        # Add mappings for auth_atom_id -> name, auth_comp_id -> resName if needed,
+        # deciding on precedence if both label_* and auth_* exist.
+        # Current write_pdb prioritizes auth_* when reading mmCIF, so map those.
+        "auth_atom_id": "name",
+        "auth_comp_id": "resName",
+    }
+
+    # Only rename columns that actually exist in the DataFrame
+    actual_rename_map = {k: v for k, v in rename_map.items() if k in df_fitted.columns}
+    df_fitted.rename(columns=actual_rename_map, inplace=True)
+
+    # Ensure essential PDB columns exist, even if empty, if they were created during fitting
+    pdb_essential_cols = [
+        "record_type",
+        "serial",
+        "name",
+        "altLoc",
+        "resName",
+        "chainID",
+        "resSeq",
+        "iCode",
+        "x",
+        "y",
+        "z",
+        "occupancy",
+        "tempFactor",
+        "element",
+        "charge",
+        "model",
+    ]
+    for col in pdb_essential_cols:
+        if col not in df_fitted.columns:
+            # This case might occur if input mmCIF was missing fundamental columns mapped to PDB essentials
+            # Decide on default value or raise error. Adding empty series for now.
+            df_fitted[col] = pd.Series(
+                dtype="object"
+            )  # Add as object to handle potential None/mixed types initially
+
+    # Re-order columns to standard PDB order for clarity
+    final_pdb_order = [col for col in pdb_essential_cols if col in df_fitted.columns]
+    other_cols = [col for col in df_fitted.columns if col not in final_pdb_order]
+    df_fitted = df_fitted[final_pdb_order + other_cols]
+
+    # --- Final Type Conversions for PDB format ---
+    # Convert numeric columns (similar to parse_pdb_atoms)
+    pdb_numeric_columns = [
+        "serial",
+        "resSeq",
+        "x",
+        "y",
+        "z",
+        "occupancy",
+        "tempFactor",
+        "model",
+    ]
+    for col in pdb_numeric_columns:
+        if col in df_fitted.columns:
+            # Use Int64 for integer-like columns that might have been NaN during processing
+            if col in ["serial", "resSeq", "model"]:
+                df_fitted[col] = pd.to_numeric(df_fitted[col], errors="coerce").astype(
+                    "Int64"
+                )
+            else:  # Floats
+                df_fitted[col] = pd.to_numeric(df_fitted[col], errors="coerce")
+
+    # Convert categorical columns (similar to parse_pdb_atoms)
+    # Note: chainID and iCode were already handled during fitting/renaming
+    pdb_categorical_columns_final = [
+        "record_type",
+        "name",
+        "altLoc",
+        "resName",
+        "chainID",  # Already category, but ensure consistency
+        "iCode",  # Already category, but ensure consistency
+        "element",
+        "charge",
+    ]
+    for col in pdb_categorical_columns_final:
+        if col in df_fitted.columns:
+            # Ensure the column is categorical first
+            if not pd.api.types.is_categorical_dtype(df_fitted[col]):
+                # Convert non-categorical columns, handling potential NaNs
+                if df_fitted[col].isnull().any():
+                    df_fitted[col] = (
+                        df_fitted[col].astype(object).fillna("").astype("category")
+                    )
+                else:
+                    df_fitted[col] = df_fitted[col].astype("category")
+            else:
+                # If already categorical, check if '' needs to be added before fillna
+                has_nans = df_fitted[col].isnull().any()
+                if has_nans and "" not in df_fitted[col].cat.categories:
+                    # Add '' category explicitly
+                    df_fitted[col] = df_fitted[col].cat.add_categories([""])
+
+                # Fill None/NaN with empty string (now safe)
+                if has_nans:
+                    df_fitted[col].fillna("", inplace=True)
+
+    return df_fitted
+
+
+def _format_pdb_atom_line(atom_data: dict) -> str:
+    """Formats a dictionary of atom data into a PDB ATOM/HETATM line."""
+    # PDB format specification:
+    # COLUMNS   DATA TYPE     FIELD        DEFINITION
+    # -----------------------------------------------------------------------
+    #  1 -  6   Record name   "ATOM  " or "HETATM"
+    #  7 - 11   Integer       serial       Atom serial number.
+    # 13 - 16   Atom          name         Atom name.
+    # 17        Character     altLoc       Alternate location indicator.
+    # 18 - 20   Residue name  resName      Residue name.
+    # 22        Character     chainID      Chain identifier.
+    # 23 - 26   Integer       resSeq       Residue sequence number.
+    # 27        AChar         iCode        Code for insertion of residues.
+    # 31 - 38   Real(8.3)     x            Orthogonal coordinates for X.
+    # 39 - 46   Real(8.3)     y            Orthogonal coordinates for Y.
+    # 47 - 54   Real(8.3)     z            Orthogonal coordinates for Z.
+    # 55 - 60   Real(6.2)     occupancy    Occupancy.
+    # 61 - 66   Real(6.2)     tempFactor   Temperature factor.
+    # 77 - 78   LString(2)    element      Element symbol, right-justified.
+    # 79 - 80   LString(2)    charge       Charge on the atom.
+
+    # Record name (ATOM/HETATM)
+    record_name = atom_data.get("record_name", "ATOM").ljust(6)
+
+    # Serial number
+    serial = str(atom_data.get("serial", 0)).rjust(5)
+
+    # Atom name - special alignment rules
+    atom_name = atom_data.get("name", "")
+    if len(atom_name) < 4 and atom_name[:1].isalpha():
+        # Pad with space on left for 1-3 char names starting with a letter
+        atom_name_fmt = (" " + atom_name).ljust(4)
+    else:
+        # Use as is, left-justified, for 4-char names or those starting with a digit
+        atom_name_fmt = atom_name.ljust(4)
+
+    # Alternate location indicator
+    alt_loc = atom_data.get("altLoc", "")[:1].ljust(1)  # Max 1 char
+
+    # Residue name
+    res_name = atom_data.get("resName", "").rjust(
+        3
+    )  # Spec says "Residue name", examples often right-justified
+
+    # Chain identifier
+    chain_id = atom_data.get("chainID", "")[:1].ljust(1)  # Max 1 char
+
+    # Residue sequence number
+    res_seq = str(atom_data.get("resSeq", 0)).rjust(4)
+
+    # Insertion code
+    icode = atom_data.get("iCode", "")[:1].ljust(1)  # Max 1 char
+
+    # Coordinates
+    x = f"{atom_data.get('x', 0.0):8.3f}"
+    y = f"{atom_data.get('y', 0.0):8.3f}"
+    z = f"{atom_data.get('z', 0.0):8.3f}"
+
+    # Occupancy
+    occupancy = f"{atom_data.get('occupancy', 1.0):6.2f}"
+
+    # Temperature factor
+    temp_factor = f"{atom_data.get('tempFactor', 0.0):6.2f}"
+
+    # Element symbol
+    element = atom_data.get("element", "").rjust(2)
+
+    # Charge
+    charge_val = atom_data.get("charge", "")
+    charge_fmt = ""
+    if charge_val:
+        try:
+            # Try converting numeric charge (e.g., +1, -2) to PDB format (1+, 2-)
+            charge_int = int(float(charge_val))  # Use float first for cases like "1.0"
+            if charge_int != 0:
+                charge_fmt = f"{abs(charge_int)}{'+' if charge_int > 0 else '-'}"
+        except ValueError:
+            # If already formatted (e.g., "1+", "FE2+"), use its string representation
+            charge_fmt = str(charge_val)
+        # Ensure it fits and is right-justified
+        charge_fmt = charge_fmt.strip()[:2].rjust(2)
+    else:
+        charge_fmt = "  "  # Blank if no charge
+
+    # Construct the full line
+    # Ensure spacing is correct according to the spec
+    # 1-6 Record name | 7-11 Serial | 12 Space | 13-16 Name | 17 AltLoc | 18-20 ResName | 21 Space | 22 ChainID | 23-26 ResSeq | 27 iCode | 28-30 Spaces | 31-38 X | 39-46 Y | 47-54 Z | 55-60 Occupancy | 61-66 TempFactor | 67-76 Spaces | 77-78 Element | 79-80 Charge
+    line = (
+        f"{record_name}{serial} {atom_name_fmt}{alt_loc}{res_name} {chain_id}{res_seq}{icode}   "
+        f"{x}{y}{z}{occupancy}{temp_factor}          "  # 10 spaces
+        f"{element}{charge_fmt}"
+    )
+
+    # Ensure the line is exactly 80 characters long
+    return line.ljust(80)
+
+
 def write_pdb(
     df: pd.DataFrame, output: Union[str, TextIO, None] = None
 ) -> Union[str, None]:
@@ -235,7 +813,8 @@ def write_pdb(
     Parameters:
     -----------
     df : pd.DataFrame
-        DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms
+        DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms.
+        Must contain columns mappable to PDB format fields.
     output : Union[str, TextIO, None], optional
         Output file path or file-like object. If None, returns the PDB content as a string.
 
@@ -244,218 +823,162 @@ def write_pdb(
     Union[str, None]
         If output is None, returns the PDB content as a string. Otherwise, returns None.
     """
-    # Create a buffer to store the PDB content
     buffer = io.StringIO()
+    format_type = df.attrs.get("format", "PDB")  # Assume PDB if not specified
 
-
-    format_type = df.attrs.get("format", "PDB")
-
-    # Variables to track chain changes for TER records
+    last_model_num = None
     last_chain_id = None
-
-
-
-
-
-
-
-
+    last_res_info = None  # Tuple (resSeq, iCode, resName) for TER record
+    last_serial = 0
+
+    # Check if DataFrame is empty
+    if df.empty:
+        buffer.write("END\n")
+        content = buffer.getvalue()
+        buffer.close()
+        if output is not None:
+            if isinstance(output, str):
+                with open(output, "w") as f:
+                    f.write(content)
+            else:
+                output.write(content)
+            return None
+        return content
+
+    for _, row in df.iterrows():
+        atom_data = {}
+
+        # --- Data Extraction ---
         if format_type == "PDB":
-
-
-
+            # Pre-process PDB values, converting None to empty strings for optional fields
+            raw_alt_loc = row.get("altLoc")
+            pdb_alt_loc = "" if pd.isna(raw_alt_loc) else str(raw_alt_loc)
+
+            raw_icode = row.get("iCode")
+            pdb_icode = "" if pd.isna(raw_icode) else str(raw_icode)
+
+            raw_element = row.get("element")
+            pdb_element = "" if pd.isna(raw_element) else str(raw_element)
+
+            raw_charge = row.get("charge")
+            pdb_charge = "" if pd.isna(raw_charge) else str(raw_charge)
+
+            atom_data = {
+                "record_name": row.get("record_type", "ATOM"),
+                "serial": int(row.get("serial", 0)),
+                "name": str(row.get("name", "")),
+                "altLoc": pdb_alt_loc,
+                "resName": str(row.get("resName", "")),
+                "chainID": str(row.get("chainID", "")),
+                "resSeq": int(row.get("resSeq", 0)),
+                "iCode": pdb_icode,
+                "x": float(row.get("x", 0.0)),
+                "y": float(row.get("y", 0.0)),
+                "z": float(row.get("z", 0.0)),
+                "occupancy": float(row.get("occupancy", 1.0)),
+                "tempFactor": float(row.get("tempFactor", 0.0)),
+                "element": pdb_element,
+                "charge": pdb_charge,
+                "model": int(row.get("model", 1)),
+            }
+        elif format_type == "mmCIF":
+            # Pre-process mmCIF values to PDB compatible format, converting None to empty strings
+            raw_alt_loc = row.get("label_alt_id")
+            pdb_alt_loc = "" if pd.isna(raw_alt_loc) else str(raw_alt_loc)
+
+            raw_icode = row.get("pdbx_PDB_ins_code")
+            pdb_icode = "" if pd.isna(raw_icode) else str(raw_icode)
+
+            raw_element = row.get("type_symbol")
+            pdb_element = "" if pd.isna(raw_element) else str(raw_element)
+
+            raw_charge = row.get("pdbx_formal_charge")
+            pdb_charge = "" if pd.isna(raw_charge) else str(raw_charge)
+
+            atom_data = {
+                "record_name": row.get("group_PDB", "ATOM"),
+                "serial": int(row.get("id", 0)),
+                "name": str(row.get("auth_atom_id", row.get("label_atom_id", ""))),
+                "altLoc": pdb_alt_loc,
+                "resName": str(row.get("auth_comp_id", row.get("label_comp_id", ""))),
+                "chainID": str(row.get("auth_asym_id", row.get("label_asym_id"))),
+                "resSeq": int(row.get("auth_seq_id", row.get("label_seq_id", 0))),
+                "iCode": pdb_icode,
+                "x": float(row.get("Cartn_x", 0.0)),
+                "y": float(row.get("Cartn_y", 0.0)),
+                "z": float(row.get("Cartn_z", 0.0)),
+                "occupancy": float(row.get("occupancy", 1.0)),
+                "tempFactor": float(row.get("B_iso_or_equiv", 0.0)),
+                "element": pdb_element,
+                "charge": pdb_charge,
+                "model": int(row.get("pdbx_PDB_model_num", 1)),
+            }
+        else:
+            raise ValueError(f"Unsupported DataFrame format: {format_type}")
+
+        # --- MODEL/ENDMDL Records ---
+        current_model_num = atom_data["model"]
+        if current_model_num != last_model_num:
+            if last_model_num is not None:
+                buffer.write("ENDMDL\n")
+            buffer.write(f"MODEL {current_model_num:>4}\n")
+            last_model_num = current_model_num
+            # Reset chain/residue tracking for the new model
+            last_chain_id = None
+            last_res_info = None
+
+        # --- TER Records ---
+        current_chain_id = atom_data["chainID"]
+        current_res_info = (
+            atom_data["resSeq"],
+            atom_data["iCode"],
+            atom_data["resName"],
+        )
 
-        # Write TER
+        # Write TER if chain ID changes within the same model
         if last_chain_id is not None and current_chain_id != last_chain_id:
-            # Format TER record according to PDB specification
-            # Columns:
-            # 1-6: "TER "
-            # 7-11: Serial number (right-justified)
-            # 18-20: Residue name (right-justified)
-            # 22: Chain ID
-            # 23-26: Residue sequence number (right-justified)
-            # 27: Insertion code
             ter_serial = str(last_serial + 1).rjust(5)
-            ter_res_name =
+            ter_res_name = last_res_info[2].strip().rjust(3)  # Use last residue's name
             ter_chain_id = last_chain_id
-            ter_res_seq =
-            ter_icode =
+            ter_res_seq = str(last_res_info[0]).rjust(4)  # Use last residue's seq num
+            ter_icode = (
+                last_res_info[1] if last_res_info[1] else ""
+            )  # Use last residue's icode
 
-            # Construct the TER line ensuring correct spacing for all fields
-            # TER (1-6), serial (7-11), space (12-17), resName (18-20), space (21),
-            # chainID (22), resSeq (23-26), iCode (27)
             ter_line = f"TER {ter_serial} {ter_res_name} {ter_chain_id}{ter_res_seq}{ter_icode}"
             buffer.write(ter_line.ljust(80) + "\n")
 
-        #
-
-
-        # Set record type (ATOM or HETATM)
-        if format_type == "PDB":
-            record_type = row["record_type"]
-        else:  # mmCIF
-            record_type = row.get("group_PDB", "ATOM")
-        line = record_type.ljust(6) + line[6:]
-
-        # Set atom serial number
-        if format_type == "PDB":
-            serial = str(int(row["serial"]))
-        else:  # mmCIF
-            serial = str(int(row["id"]))
-        line = line[:6] + serial.rjust(5) + line[11:]
-
-        # Set atom name
-        if format_type == "PDB":
-            atom_name = row["name"]
-        else:  # mmCIF
-            atom_name = row.get("auth_atom_id", row.get("label_atom_id", ""))
-
-        # Right-justify atom name if it starts with a number
-        if atom_name and atom_name[0].isdigit():
-            line = line[:12] + atom_name.ljust(4) + line[16:]
-        else:
-            line = line[:12] + " " + atom_name.ljust(3) + line[16:]
-
-        # Set alternate location indicator
-        if format_type == "PDB":
-            alt_loc = row.get("altLoc", "")
-        else:  # mmCIF
-            alt_loc = row.get("label_alt_id", "")
-        line = line[:16] + alt_loc + line[17:]
-
-        # Set residue name
-        if format_type == "PDB":
-            res_name = row["resName"]
-        else:  # mmCIF
-            res_name = row.get("auth_comp_id", row.get("label_comp_id", ""))
-        line = line[:17] + res_name.ljust(3) + line[20:]
-
-        # Set chain identifier
-        if format_type == "PDB":
-            chain_id = row["chainID"]
-        else:  # mmCIF
-            chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
-        line = line[:21] + chain_id + line[22:]
-
-        # Set residue sequence number
-        if format_type == "PDB":
-            res_seq = str(int(row["resSeq"]))
-        else:  # mmCIF
-            res_seq = str(int(row.get("auth_seq_id", row.get("label_seq_id", 0))))
-        line = line[:22] + res_seq.rjust(4) + line[26:]
-
-        # Set insertion code
-        if format_type == "PDB":
-            icode = row["iCode"] if pd.notna(row["iCode"]) else ""
-        else:  # mmCIF
-            icode = (
-                row.get("pdbx_PDB_ins_code", "")
-                if pd.notna(row.get("pdbx_PDB_ins_code", ""))
-                else ""
-            )
-        line = line[:26] + icode + line[27:]
-
-        # Set X coordinate
-        if format_type == "PDB":
-            x = float(row["x"])
-        else:  # mmCIF
-            x = float(row["Cartn_x"])
-        line = line[:30] + f"{x:8.3f}" + line[38:]
+        # --- Format and Write ATOM/HETATM Line ---
+        pdb_line = _format_pdb_atom_line(atom_data)
+        buffer.write(pdb_line + "\n")
 
-        #
-
-
-
-            y = float(row["Cartn_y"])
-        line = line[:38] + f"{y:8.3f}" + line[46:]
-
-        # Set Z coordinate
-        if format_type == "PDB":
-            z = float(row["z"])
-        else:  # mmCIF
-            z = float(row["Cartn_z"])
-        line = line[:46] + f"{z:8.3f}" + line[54:]
-
-        # Set occupancy
-        if format_type == "PDB":
-            occupancy = float(row["occupancy"])
-        else:  # mmCIF
-            occupancy = float(row.get("occupancy", 1.0))
-        line = line[:54] + f"{occupancy:6.2f}" + line[60:]
+        # --- Update Tracking Variables ---
+        last_serial = atom_data["serial"]
+        last_chain_id = current_chain_id
+        last_res_info = current_res_info
 
-
-
-            temp_factor = float(row["tempFactor"])
-        else:  # mmCIF
-            temp_factor = float(row.get("B_iso_or_equiv", 0.0))
-        line = line[:60] + f"{temp_factor:6.2f}" + line[66:]
-
-        # Set element symbol
-        if format_type == "PDB":
-            element = row["element"]
-        else:  # mmCIF
-            element = row.get("type_symbol", "")
-        line = line[:76] + element.rjust(2) + line[78:]
-
-        # Set charge
-        if format_type == "PDB":
-            charge = row["charge"]
-        else:  # mmCIF
-            charge = row.get("pdbx_formal_charge", "")
-        if charge and charge not in ["?", "."]:
-            # Convert numeric charge to PDB format (e.g., "1+" or "2-")
-            try:
-                charge_val = int(charge)
-                if charge_val != 0:
-                    charge = f"{abs(charge_val)}{'+' if charge_val > 0 else '-'}"
-                else:
-                    charge = ""
-            except ValueError:
-                pass
-        line = line[:78] + charge + line[80:]
-
-        # Write the line to the buffer
-        buffer.write(line.rstrip() + "\n")
-
-        # Update last atom info for potential TER record
-        if format_type == "PDB":
-            last_serial = int(row["serial"])
-            last_res_name = row["resName"]
-            last_chain_id = row["chainID"]
-            last_res_seq = str(int(row["resSeq"]))
-            last_icode = row["iCode"] if pd.notna(row["iCode"]) else ""
-        else:  # mmCIF
-            last_serial = int(row["id"])
-            last_res_name = row.get("auth_comp_id", row.get("label_comp_id", ""))
-            last_chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
-            last_res_seq = str(int(row.get("auth_seq_id", row.get("label_seq_id", 0))))
-            last_icode = (
-                row.get("pdbx_PDB_ins_code", "")
-                if pd.notna(row.get("pdbx_PDB_ins_code", ""))
-                else ""
-            )
-
-    # Add TER record for the last chain
+    # --- Final Records ---
+    # Add TER record for the very last chain in the last model
     if last_chain_id is not None:
-        # Format TER record according to PDB specification
         ter_serial = str(last_serial + 1).rjust(5)
-        ter_res_name =
+        ter_res_name = last_res_info[2].strip().rjust(3)
         ter_chain_id = last_chain_id
-        ter_res_seq =
-        ter_icode =
+        ter_res_seq = str(last_res_info[0]).rjust(4)
+        ter_icode = last_res_info[1] if last_res_info[1] else ""
 
-        # Construct the TER line ensuring correct spacing for all fields
         ter_line = f"TER {ter_serial} {ter_res_name} {ter_chain_id}{ter_res_seq}{ter_icode}"
         buffer.write(ter_line.ljust(80) + "\n")
 
-    # Add
+    # Add ENDMDL if models were used
+    if last_model_num is not None:
+        buffer.write("ENDMDL\n")
+
     buffer.write("END\n")
 
-    #
+    # --- Output Handling ---
     content = buffer.getvalue()
     buffer.close()
 
-    # Write to output if provided
     if output is not None:
         if isinstance(output, str):
             with open(output, "w") as f:
|
|
463
986
|
else:
|
464
987
|
output.write(content)
|
465
988
|
return None
|
466
|
-
|
467
|
-
|
468
|
-
return content
|
989
|
+
else:
|
990
|
+
return content
|
469
991
|
|
470
992
|
|
471
993
|
def write_cif(
|
@@ -490,7 +1012,7 @@ def write_cif(
     format_type = df.attrs.get("format", "PDB")
 
     # Create a new DataContainer
-    data_container = DataContainer("
+    data_container = DataContainer("rnapolis")
 
     # Define the attributes for atom_site category
     if format_type == "mmCIF":
@@ -519,7 +1041,7 @@ def write_cif(
             "auth_comp_id",  # resName
             "auth_asym_id",  # chainID
             "auth_atom_id",  # name
-            "pdbx_PDB_model_num",  #
+            "pdbx_PDB_model_num",  # model
         ]
 
     # Prepare rows for the atom_site category
@@ -527,32 +1049,44 @@ def write_cif(
 
     for _, row in df.iterrows():
         if format_type == "mmCIF":
-            # Use existing mmCIF data
-            row_data = [
+            # Use existing mmCIF data, converting None to '?' universally
+            row_data = []
+            for attr in attributes:
+                value = row.get(attr)
+                if pd.isna(value):
+                    # Use '?' as the standard placeholder for missing values
+                    row_data.append("?")
+                else:
+                    # Ensure all non-missing values are converted to string
+                    row_data.append(str(value))
         else:  # PDB format
-            # Map PDB data to mmCIF format
+            # Map PDB data to mmCIF format, converting None to '.' or '?'
             entity_id = "1"  # Default entity ID
-            model_num = "
+            model_num = str(int(row["model"]))
+
+            # Pre-process optional fields for mmCIF placeholders
+            element_val = "?" if pd.isna(row.get("element")) else str(row["element"])
+            altloc_val = "." if pd.isna(row.get("altLoc")) else str(row["altLoc"])
+            icode_val = "." if pd.isna(row.get("iCode")) else str(row["iCode"])
+            charge_val = "." if pd.isna(row.get("charge")) else str(row["charge"])
 
             row_data = [
                 str(row["record_type"]),  # group_PDB
                 str(int(row["serial"])),  # id
-
+                element_val,  # type_symbol
                 str(row["name"]),  # label_atom_id
-
+                altloc_val,  # label_alt_id
                 str(row["resName"]),  # label_comp_id
                 str(row["chainID"]),  # label_asym_id
                 entity_id,  # label_entity_id
                 str(int(row["resSeq"])),  # label_seq_id
-
-                if pd.notna(row["iCode"])
-                else "?",  # pdbx_PDB_ins_code
+                icode_val,  # pdbx_PDB_ins_code
                 f"{float(row['x']):.3f}",  # Cartn_x
                 f"{float(row['y']):.3f}",  # Cartn_y
                 f"{float(row['z']):.3f}",  # Cartn_z
                 f"{float(row['occupancy']):.2f}",  # occupancy
                 f"{float(row['tempFactor']):.2f}",  # B_iso_or_equiv
-
+                charge_val,  # pdbx_formal_charge
                 str(int(row["resSeq"])),  # auth_seq_id
                 str(row["resName"]),  # auth_comp_id
                 str(row["chainID"]),  # auth_asym_id
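
The functions introduced in this file (can_write_pdb, fit_to_pdb, _format_pdb_atom_line) are designed to be combined with the existing parse_*/write_* helpers: parse a structure into a DataFrame, check or coerce it against PDB field-width limits, then serialize. A minimal sketch of that flow, assuming a local file named example.cif (the path is a placeholder, not part of the package):

    from rnapolis.parser_v2 import parse_cif_atoms, can_write_pdb, fit_to_pdb, write_pdb

    with open("example.cif") as f:   # any file-like object with a .name attribute works
        df = parse_cif_atoms(f)
    if not can_write_pdb(df):        # serials, chain IDs, or residue numbers exceed PDB limits
        df = fit_to_pdb(df)          # renumbers atoms/residues and renames chains, or raises ValueError
    pdb_text = write_pdb(df)         # with output=None the PDB content is returned as a string
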
rnapolis/splitter.py
CHANGED
@@ -3,10 +3,14 @@ import argparse
 import os
 import sys
 
-import pandas as pd
-
 from rnapolis.parser import is_cif
-from rnapolis.parser_v2 import
+from rnapolis.parser_v2 import (
+    fit_to_pdb,
+    parse_cif_atoms,
+    parse_pdb_atoms,
+    write_cif,
+    write_pdb,
+)
 
 
 def main():
@@ -97,12 +101,21 @@ def main():
 
         try:
             if output_format == "PDB":
-
+                df_to_write = fit_to_pdb(model_df)
+                write_pdb(df_to_write, output_path)
             else:  # mmCIF
                 write_cif(model_df, output_path)
+        except ValueError as e:
+            # Handle errors specifically from fit_to_pdb
+            print(
+                f"Error fitting model {model_num} from {args.file} to PDB: {e}. Skipping model.",
+                file=sys.stderr,
+            )
+            continue
         except Exception as e:
+            # Handle general writing errors
             print(
-                f"Error writing file {output_path}: {e}",
+                f"Error writing file {output_path} for model {model_num}: {e}",
                 file=sys.stderr,
             )
             # Optionally continue to next model or exit
rnapolis/unifier.py
CHANGED
@@ -7,7 +7,13 @@ from collections import Counter
 import pandas as pd
 
 from rnapolis.parser import is_cif
-from rnapolis.parser_v2 import
+from rnapolis.parser_v2 import (
+    fit_to_pdb,
+    parse_cif_atoms,
+    parse_pdb_atoms,
+    write_cif,
+    write_pdb,
+)
 from rnapolis.tertiary_v2 import Structure
 
 
@@ -140,13 +146,22 @@ def main():
 
         ext = ".pdb" if format == "PDB" else ".cif"
 
-
-        df = pd.concat([residue.atoms for residue in residues])
+        df = pd.concat([residue.atoms for residue in residues])
 
+        try:
             if format == "PDB":
-
+                df_to_write = fit_to_pdb(df)
+                with open(f"{args.output}/{base}{ext}", "w") as f:
+                    write_pdb(df_to_write, f)
             else:
-
+                with open(f"{args.output}/{base}{ext}", "w") as f:
+                    write_cif(df, f)
+        except ValueError as e:
+            print(
+                f"Error processing {path} for PDB output: {e}. Skipping file.",
+                file=sys.stderr,
+            )
+            continue
 
 
 if __name__ == "__main__":
{rnapolis-0.8.0.dist-info → rnapolis-0.8.1.dist-info}/RECORD
CHANGED
@@ -12,17 +12,17 @@ rnapolis/mmcif_pdbx_v50.dic,sha256=5QFx1ssDaehR4_DQ-tS9VQux262SiLXaqcwmwwejF5c,5
 rnapolis/molecule_filter.py,sha256=jgcpJxx_oXEBX0d30v4k_FdwRouRUPUsEtCYWgLGpD4,7310
 rnapolis/motif_extractor.py,sha256=Lfn1iEkhkP9eZD3GPEWNAfy00QO7QPCc8wM_XS1ory8,1147
 rnapolis/parser.py,sha256=3g4mtFvpiEENFcSBBtx_E_x1vJPF9BujWnts0kb9XjE,16340
-rnapolis/parser_v2.py,sha256=
+rnapolis/parser_v2.py,sha256=qG6CO3or7zmuJu368g9Nzokiqdeip4yjD14F163uH6w,40618
 rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
-rnapolis/splitter.py,sha256=
+rnapolis/splitter.py,sha256=x-Zn21mkiMgvYPptUFD9BbdNIvoaM6b8GzGf6uYXEwE,4052
 rnapolis/tertiary.py,sha256=6t9ZB4w33-5n_M3sns1RoFXCOTgVAgGH4WDNG5OG9Kg,23426
 rnapolis/tertiary_v2.py,sha256=I1uyHWIUePNGO5m-suoL4ibtz02qAJUMvYm0BUKUygY,22480
 rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
-rnapolis/unifier.py,sha256=
+rnapolis/unifier.py,sha256=2ge7IB9FdRgzSAiVD39U_ciwtdDJ2fGzf8mUIudbrqY,5820
 rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
-rnapolis-0.8.
-rnapolis-0.8.
-rnapolis-0.8.
-rnapolis-0.8.
-rnapolis-0.8.
-rnapolis-0.8.
+rnapolis-0.8.1.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
+rnapolis-0.8.1.dist-info/METADATA,sha256=NOg9-s2n313HElku8z06JiBvEhPf6oV9RR7ur20hwys,54537
+rnapolis-0.8.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+rnapolis-0.8.1.dist-info/entry_points.txt,sha256=H00KoN54wU3dFOofAu3H_3PADmZOBTB1hXf5TUU2uzo,438
+rnapolis-0.8.1.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
+rnapolis-0.8.1.dist-info/RECORD,,
File without changes
File without changes
File without changes
File without changes