RNApolis 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rnapolis/parser_v2.py CHANGED
@@ -1,4 +1,6 @@
1
1
  import io
2
+ import os
3
+ import string
2
4
  import tempfile
3
5
  from typing import IO, TextIO, Union
4
6
 
@@ -52,23 +54,27 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
52
54
  continue
53
55
 
54
56
  # Parse fields according to PDB format specification
57
+ alt_loc = line[16:17].strip()
55
58
  icode = line[26:27].strip()
59
+ element = line[76:78].strip()
60
+ charge = line[78:80].strip()
61
+
56
62
  record = {
57
63
  "record_type": record_type,
58
64
  "serial": line[6:11].strip(),
59
65
  "name": line[12:16].strip(),
60
- "altLoc": line[16:17].strip(),
66
+ "altLoc": None if not alt_loc else alt_loc, # Store None if empty
61
67
  "resName": line[17:20].strip(),
62
68
  "chainID": line[21:22].strip(),
63
69
  "resSeq": line[22:26].strip(),
64
- "iCode": None if not icode else icode, # Convert empty string to None
70
+ "iCode": None if not icode else icode, # Store None if empty
65
71
  "x": line[30:38].strip(),
66
72
  "y": line[38:46].strip(),
67
73
  "z": line[46:54].strip(),
68
74
  "occupancy": line[54:60].strip(),
69
75
  "tempFactor": line[60:66].strip(),
70
- "element": line[76:78].strip(),
71
- "charge": line[78:80].strip(),
76
+ "element": None if not element else element, # Store None if empty
77
+ "charge": None if not charge else charge, # Store None if empty
72
78
  "model": current_model, # Add the current model number
73
79
  }
74
80
 
@@ -149,18 +155,37 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
149
155
  """
150
156
  adapter = IoAdapterPy()
151
157
 
152
- # Handle both string content and file-like objects
158
+ # Handle string, StringIO, and file-like objects
153
159
  if isinstance(content, str):
154
- # Create a temporary file to use with the adapter
155
- import tempfile
156
-
157
- with tempfile.NamedTemporaryFile(mode="w+", suffix=".cif") as temp_file:
160
+ # Create a temporary file for string input
161
+ with tempfile.NamedTemporaryFile(
162
+ mode="w+", suffix=".cif", delete=False
163
+ ) as temp_file:
158
164
  temp_file.write(content)
159
- temp_file.flush()
160
- data = adapter.readFile(temp_file.name)
161
- else:
162
- # Assume it's a file-like object with a name attribute
165
+ temp_file_path = temp_file.name
166
+ try:
167
+ data = adapter.readFile(temp_file_path)
168
+ finally:
169
+ os.remove(temp_file_path) # Clean up the temporary file
170
+ elif isinstance(content, io.StringIO):
171
+ # Create a temporary file for StringIO input
172
+ with tempfile.NamedTemporaryFile(
173
+ mode="w+", suffix=".cif", delete=False
174
+ ) as temp_file:
175
+ content.seek(0) # Ensure reading from the start
176
+ temp_file.write(content.read())
177
+ temp_file_path = temp_file.name
178
+ try:
179
+ data = adapter.readFile(temp_file_path)
180
+ finally:
181
+ os.remove(temp_file_path) # Clean up the temporary file
182
+ elif hasattr(content, "name"):
183
+ # Assume it's a file-like object with a name attribute (like an open file)
163
184
  data = adapter.readFile(content.name)
185
+ else:
186
+ raise TypeError(
187
+ "Unsupported input type for parse_cif_atoms. Expected str, file-like object with name, or StringIO."
188
+ )
164
189
 
165
190
  # Get the atom_site category
166
191
  category = data[0].getObj("atom_site")
@@ -176,47 +201,133 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
176
201
  # Create a list of dictionaries for each atom
177
202
  records = []
178
203
  for row in rows:
179
- record = dict(zip(attributes, row))
180
-
181
- # Convert "?" or "." in insertion code to None
182
- if "pdbx_PDB_ins_code" in record:
183
- if record["pdbx_PDB_ins_code"] in ["?", ".", ""]:
184
- record["pdbx_PDB_ins_code"] = None
185
-
204
+ record = {}
205
+ for attr, value in zip(attributes, row):
206
+ # Store None if value indicates missing data ('?' or '.')
207
+ if value in ["?", "."]:
208
+ record[attr] = None
209
+ else:
210
+ record[attr] = value
186
211
  records.append(record)
187
212
 
188
213
  # Create DataFrame from records
189
214
  df = pd.DataFrame(records)
190
215
 
191
- # Convert numeric columns to appropriate types
192
- numeric_columns = [
193
- "id",
194
- "auth_seq_id",
216
+ # Define columns based on mmCIF specification for atom_site
217
+ float_cols = [
218
+ "aniso_B[1][1]",
219
+ "aniso_B[1][1]_esd",
220
+ "aniso_B[1][2]",
221
+ "aniso_B[1][2]_esd",
222
+ "aniso_B[1][3]",
223
+ "aniso_B[1][3]_esd",
224
+ "aniso_B[2][2]",
225
+ "aniso_B[2][2]_esd",
226
+ "aniso_B[2][3]",
227
+ "aniso_B[2][3]_esd",
228
+ "aniso_B[3][3]",
229
+ "aniso_B[3][3]_esd",
230
+ "aniso_ratio",
231
+ "aniso_U[1][1]",
232
+ "aniso_U[1][1]_esd",
233
+ "aniso_U[1][2]",
234
+ "aniso_U[1][2]_esd",
235
+ "aniso_U[1][3]",
236
+ "aniso_U[1][3]_esd",
237
+ "aniso_U[2][2]",
238
+ "aniso_U[2][2]_esd",
239
+ "aniso_U[2][3]",
240
+ "aniso_U[2][3]_esd",
241
+ "aniso_U[3][3]",
242
+ "aniso_U[3][3]_esd",
243
+ "B_equiv_geom_mean",
244
+ "B_equiv_geom_mean_esd",
245
+ "B_iso_or_equiv",
246
+ "B_iso_or_equiv_esd",
195
247
  "Cartn_x",
248
+ "Cartn_x_esd",
196
249
  "Cartn_y",
250
+ "Cartn_y_esd",
197
251
  "Cartn_z",
252
+ "Cartn_z_esd",
253
+ "fract_x",
254
+ "fract_x_esd",
255
+ "fract_y",
256
+ "fract_y_esd",
257
+ "fract_z",
258
+ "fract_z_esd",
198
259
  "occupancy",
199
- "B_iso_or_equiv",
260
+ "occupancy_esd",
261
+ "U_equiv_geom_mean",
262
+ "U_equiv_geom_mean_esd",
263
+ "U_iso_or_equiv",
264
+ "U_iso_or_equiv_esd",
265
+ ]
266
+ int_cols = [
267
+ "attached_hydrogens",
268
+ "label_seq_id",
269
+ "symmetry_multiplicity",
270
+ "pdbx_PDB_model_num",
200
271
  "pdbx_formal_charge",
272
+ "pdbx_label_index",
201
273
  ]
202
-
203
- for col in numeric_columns:
204
- if col in df.columns:
205
- df[col] = pd.to_numeric(df[col], errors="coerce")
206
-
207
- # Convert categorical columns
208
- categorical_columns = [
274
+ category_cols = [
275
+ "auth_asym_id",
276
+ "auth_atom_id",
277
+ "auth_comp_id",
278
+ "auth_seq_id",
279
+ "calc_attached_atom",
280
+ "calc_flag",
281
+ "disorder_assembly",
282
+ "disorder_group",
209
283
  "group_PDB",
210
- "type_symbol",
284
+ "id",
285
+ "label_alt_id",
286
+ "label_asym_id",
211
287
  "label_atom_id",
212
288
  "label_comp_id",
213
- "label_asym_id",
214
- "auth_atom_id",
215
- "auth_comp_id",
216
- "auth_asym_id",
289
+ "label_entity_id",
290
+ "thermal_displace_type",
291
+ "type_symbol",
292
+ "pdbx_atom_ambiguity",
293
+ "adp_type",
294
+ "refinement_flags",
295
+ "refinement_flags_adp",
296
+ "refinement_flags_occupancy",
297
+ "refinement_flags_posn",
298
+ "pdbx_auth_alt_id",
299
+ "pdbx_PDB_ins_code",
300
+ "pdbx_PDB_residue_no",
301
+ "pdbx_PDB_residue_name",
302
+ "pdbx_PDB_strand_id",
303
+ "pdbx_PDB_atom_name",
304
+ "pdbx_auth_atom_name",
305
+ "pdbx_auth_comp_id",
306
+ "pdbx_auth_asym_id",
307
+ "pdbx_auth_seq_id",
308
+ "pdbx_tls_group_id",
309
+ "pdbx_ncs_dom_id",
310
+ "pdbx_group_NDB",
311
+ "pdbx_atom_group",
312
+ "pdbx_label_seq_num",
313
+ "pdbx_not_in_asym",
314
+ "pdbx_sifts_xref_db_name",
315
+ "pdbx_sifts_xref_db_acc",
316
+ "pdbx_sifts_xref_db_num",
317
+ "pdbx_sifts_xref_db_res",
217
318
  ]
218
319
 
219
- for col in categorical_columns:
320
+ # Convert columns to appropriate types
321
+ for col in float_cols:
322
+ if col in df.columns:
323
+ df[col] = pd.to_numeric(df[col], errors="coerce")
324
+
325
+ for col in int_cols:
326
+ if col in df.columns:
327
+ # Use Int64 (nullable integer) to handle potential NaNs from coercion
328
+ df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
329
+
330
+ for col in category_cols:
220
331
  if col in df.columns:
221
332
  df[col] = df[col].astype("category")
222
333
 
@@ -226,6 +337,473 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
226
337
  return df
227
338
 
228
339
 
340
def can_write_pdb(df: pd.DataFrame) -> bool:
    """
    Check if the DataFrame can be losslessly represented in PDB format.

    PDB format uses fixed-width fields, which limits the values that fit:
    - Atom serial number (id): 5 columns, i.e. -9999 to 99999
    - Chain identifier (auth_asym_id): max 1 character
    - Residue sequence number (auth_seq_id): 4 columns, i.e. -999 to 9999

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms.
        The source format is read from ``df.attrs["format"]``.

    Returns:
    --------
    bool
        True if the DataFrame can be written to PDB format without data loss/truncation, False otherwise.
        Data marked as "PDB" and empty DataFrames are always accepted; data with an
        unknown format, or mmCIF data missing one of the checked columns, is rejected.
    """
    max_serial, min_serial = 99999, -9999
    max_res_seq, min_res_seq = 9999, -999

    format_type = df.attrs.get("format")

    if format_type == "PDB":
        # Assume data originally from PDB already fits PDB constraints
        return True

    if df.empty:
        # An empty DataFrame can be represented as an empty PDB file
        return True

    if format_type != "mmCIF":
        # If format is unknown or not PDB/mmCIF, assume it cannot be safely written
        return False

    if any(col not in df.columns for col in ("id", "auth_asym_id", "auth_seq_id")):
        return False

    # Convert to numeric first to handle string/categorical columns; values that
    # cannot be parsed become NaN and are ignored by the comparisons below.
    # Element-wise comparison + any() stays well-defined even when every value
    # is missing (max()/min() would yield NaN or pd.NA in that case).
    serials = pd.to_numeric(df["id"], errors="coerce")
    if serials.gt(max_serial).any() or serials.lt(min_serial).any():
        return False

    # Chain identifier occupies a single column in PDB
    chain_lengths = df["auth_asym_id"].dropna().astype(str).str.len()
    if chain_lengths.gt(1).any():
        return False

    # The minus sign takes one of the four columns, hence the asymmetric range
    res_seqs = pd.to_numeric(df["auth_seq_id"], errors="coerce")
    if res_seqs.gt(max_res_seq).any() or res_seqs.lt(min_res_seq).any():
        return False

    # All checks passed for mmCIF
    return True
394
+
395
+
396
def fit_to_pdb(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attempts to fit the atom data in a DataFrame to comply with PDB format limitations.

    If the data already fits (checked by can_write_pdb), the original DataFrame is
    returned as-is: same object, same column names and same 'format' attribute.
    Otherwise, checks if fitting is possible based on total atoms, unique chains,
    and residues per chain. If fitting is possible, it renames chains, renumbers
    residues within each chain sequentially starting from 1 (clearing insertion
    codes), renumbers atoms (leaving a gap for a TER line at every chain change),
    and renames mmCIF columns to their PDB counterparts.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms.
        Its ``attrs["format"]`` must be set.

    Returns:
    --------
    pd.DataFrame
        Either ``df`` itself (when it already fits), or a new DataFrame modified to
        fit PDB constraints whose 'format' attribute is set to 'PDB'. In the latter
        case the standard PDB columns come first, followed by any remaining columns.

    Raises:
    -------
    ValueError
        If the 'format' attribute is missing or unsupported, if the chain or residue
        number column is missing, or if the data cannot be fitted into PDB format
        constraints (too many atoms, chains, or residues per chain).
    """
    format_type = df.attrs.get("format")

    if not format_type:
        raise ValueError("DataFrame format attribute is not set.")

    if can_write_pdb(df):
        return df

    # Determine column names based on format
    if format_type == "PDB":
        serial_col = "serial"
        chain_col = "chainID"
        resseq_col = "resSeq"
        icode_col = "iCode"
    elif format_type == "mmCIF":
        serial_col = "id"
        chain_col = "auth_asym_id"
        resseq_col = "auth_seq_id"
        icode_col = "pdbx_PDB_ins_code"
    else:
        raise ValueError(f"Unsupported DataFrame format: {format_type}")

    # --- Feasibility Checks ---
    if chain_col not in df.columns:
        raise ValueError(f"Missing required chain column: {chain_col}")
    if resseq_col not in df.columns:
        raise ValueError(f"Missing required residue sequence column: {resseq_col}")

    max_pdb_serial = 99999
    max_pdb_residue = 9999
    available_chain_ids = (
        string.ascii_uppercase + string.ascii_lowercase + string.digits
    )
    max_pdb_chains = len(available_chain_ids)

    def _missing_to(values, replacement):
        # None/NaN/pd.NA never compare equal to themselves reliably, so they are
        # normalised to one hashable marker before being used as dictionary keys.
        return [replacement if pd.isna(value) else value for value in values]

    orig_chains = _missing_to(df[chain_col].tolist(), None)
    orig_resseqs = _missing_to(df[resseq_col].tolist(), None)
    if icode_col in df.columns:
        orig_icodes = _missing_to(df[icode_col].tolist(), "")
    else:
        # No insertion codes available: residues are identified by number only
        orig_icodes = [""] * len(df)

    # Single pass over the rows: chains are registered in order of appearance and,
    # within each chain, every distinct (resSeq, iCode) pair gets the next number.
    residue_numbers = {}  # original chain -> {(resSeq, iCode): new resSeq}
    new_resseqs = []
    for chain, resseq, icode in zip(orig_chains, orig_resseqs, orig_icodes):
        per_chain = residue_numbers.setdefault(chain, {})
        new_resseqs.append(per_chain.setdefault((resseq, icode), len(per_chain) + 1))

    num_chains = len(residue_numbers)
    total_atoms = len(df)

    # Check 1: Total atoms + TER lines <= 99999
    if total_atoms + num_chains > max_pdb_serial:
        raise ValueError(
            f"Cannot fit to PDB: Total atoms ({total_atoms}) + TER lines ({num_chains}) exceeds PDB limit ({max_pdb_serial})."
        )

    # Check 2: Number of chains <= 62
    if num_chains > max_pdb_chains:
        raise ValueError(
            f"Cannot fit to PDB: Number of unique chains ({num_chains}) exceeds PDB limit ({max_pdb_chains})."
        )

    # Check 3: Max residues per chain <= 9999
    max_residues_per_chain = max(
        (len(mapping) for mapping in residue_numbers.values()), default=0
    )
    if max_residues_per_chain > max_pdb_residue:
        raise ValueError(
            f"Cannot fit to PDB: Maximum residues in a single chain ({max_residues_per_chain}) exceeds PDB limit ({max_pdb_residue})."
        )

    # --- Perform Fitting ---
    df_fitted = df.copy()

    # 1. Rename Chains (dict preserves order of first appearance)
    chain_mapping = {
        orig_chain: available_chain_ids[i]
        for i, orig_chain in enumerate(residue_numbers)
    }
    # Lists/arrays are assigned positionally, so this is safe for any index
    df_fitted[chain_col] = [chain_mapping[chain] for chain in orig_chains]

    # 2. Renumber Residues within each chain; insertion codes become redundant
    df_fitted[resseq_col] = pd.array(new_resseqs, dtype="Int64")
    df_fitted[icode_col] = None

    # 3. Renumber Atom Serials
    # Sort by original index to maintain original atom order as much as possible;
    # mergesort keeps rows with equal labels in their current relative order.
    df_fitted = df_fitted.sort_index(kind="mergesort")

    new_serials = []
    current_serial = 0
    previous_chain = None
    for current_chain in df_fitted[chain_col].tolist():
        if previous_chain is not None and current_chain != previous_chain:
            current_serial += 1  # Reserve a serial for the TER line
        current_serial += 1
        new_serials.append(current_serial)
        previous_chain = current_chain

    # Serials only grow, so checking the last one is enough. The initial check
    # assumes one TER per chain; interleaved chains can need more, hence this guard.
    if current_serial > max_pdb_serial:
        raise ValueError("Serial number exceeded PDB limit during renumbering.")
    df_fitted[serial_col] = pd.array(new_serials, dtype="Int64")

    # --- Rename columns from mmCIF to PDB standard names ---
    # For each PDB column the first existing candidate wins, so at most one source
    # is renamed per target and no duplicate column names can be produced.
    # auth_* is listed before label_* following the original note that write_pdb
    # prioritizes auth_* values for mmCIF data.
    rename_candidates = {
        "record_type": ("group_PDB",),
        "serial": ("id",),
        "name": ("auth_atom_id", "label_atom_id"),
        "resName": ("auth_comp_id", "label_comp_id"),
        "chainID": ("auth_asym_id",),
        "resSeq": ("auth_seq_id",),
        "iCode": ("pdbx_PDB_ins_code",),
        "x": ("Cartn_x",),
        "y": ("Cartn_y",),
        "z": ("Cartn_z",),
        "tempFactor": ("B_iso_or_equiv",),
        "element": ("type_symbol",),
        "charge": ("pdbx_formal_charge",),
        "model": ("pdbx_PDB_model_num",),
    }
    rename_map = {}
    for target, sources in rename_candidates.items():
        if target in df_fitted.columns:
            continue  # Already present under its PDB name
        source = next((s for s in sources if s in df_fitted.columns), None)
        if source is not None:
            rename_map[source] = target
    df_fitted = df_fitted.rename(columns=rename_map)

    # Ensure essential PDB columns exist, even if empty
    pdb_essential_cols = [
        "record_type",
        "serial",
        "name",
        "altLoc",
        "resName",
        "chainID",
        "resSeq",
        "iCode",
        "x",
        "y",
        "z",
        "occupancy",
        "tempFactor",
        "element",
        "charge",
        "model",
    ]
    for col in pdb_essential_cols:
        if col not in df_fitted.columns:
            # Input lacked a column mapped to this PDB field; add it as all-missing
            df_fitted[col] = None

    # Re-order columns to standard PDB order for clarity
    other_cols = [col for col in df_fitted.columns if col not in pdb_essential_cols]
    df_fitted = df_fitted[pdb_essential_cols + other_cols].copy()

    # --- Final Type Conversions for PDB format ---
    # Nullable Int64 for integer-like columns so missing values survive
    for col in ("serial", "resSeq", "model"):
        df_fitted[col] = pd.to_numeric(df_fitted[col], errors="coerce").astype("Int64")
    for col in ("x", "y", "z", "occupancy", "tempFactor"):
        df_fitted[col] = pd.to_numeric(df_fitted[col], errors="coerce")

    # Categorical columns: missing values are stored as empty strings
    for col in (
        "record_type",
        "name",
        "altLoc",
        "resName",
        "chainID",
        "iCode",
        "element",
        "charge",
    ):
        column = df_fitted[col]
        if column.isna().any():
            column = column.astype(object).where(column.notna(), "")
        df_fitted[col] = column.astype("category")

    # Set last so the attribute is guaranteed to be on the returned object
    df_fitted.attrs["format"] = "PDB"

    return df_fitted
708
+
709
+
710
def _format_pdb_atom_line(atom_data: dict) -> str:
    """
    Formats a dictionary of atom data into a PDB ATOM/HETATM line.

    Missing values (absent key, None, NaN or pd.NA) in the text fields and in the
    charge are rendered as blanks.

    Parameters:
    -----------
    atom_data : dict
        Atom fields keyed by "record_name", "serial", "name", "altLoc", "resName",
        "chainID", "resSeq", "iCode", "x", "y", "z", "occupancy", "tempFactor",
        "element" and "charge". Coordinates, occupancy and tempFactor must be
        numbers, as they are formatted with fixed-point format specifiers.

    Returns:
    --------
    str
        The formatted line without a trailing newline, padded with spaces to at
        least 80 characters. Only altLoc, chainID, iCode and charge are truncated
        to their field width; other oversized values make the line longer.
    """
    # PDB format specification:
    # COLUMNS        DATA TYPE     FIELD        DEFINITION
    # -----------------------------------------------------------------------
    #  1 -  6        Record name   "ATOM  " or "HETATM"
    #  7 - 11        Integer       serial       Atom serial number.
    # 13 - 16        Atom          name         Atom name.
    # 17             Character     altLoc       Alternate location indicator.
    # 18 - 20        Residue name  resName      Residue name.
    # 22             Character     chainID      Chain identifier.
    # 23 - 26        Integer       resSeq       Residue sequence number.
    # 27             AChar         iCode        Code for insertion of residues.
    # 31 - 38        Real(8.3)     x            Orthogonal coordinates for X.
    # 39 - 46        Real(8.3)     y            Orthogonal coordinates for Y.
    # 47 - 54        Real(8.3)     z            Orthogonal coordinates for Z.
    # 55 - 60        Real(6.2)     occupancy    Occupancy.
    # 61 - 66        Real(6.2)     tempFactor   Temperature factor.
    # 77 - 78        LString(2)    element      Element symbol, right-justified.
    # 79 - 80        LString(2)    charge       Charge on the atom.

    def _text(key: str, default: str = "") -> str:
        # Text field lookup that maps missing markers to the default
        value = atom_data.get(key, default)
        if value is None or pd.isna(value):
            return default
        return str(value)

    # Record name (ATOM/HETATM)
    record_name = _text("record_name", "ATOM").ljust(6)

    # Serial number
    serial = str(atom_data.get("serial", 0)).rjust(5)

    # Atom name - special alignment rules
    atom_name = _text("name")
    if len(atom_name) < 4 and atom_name[:1].isalpha():
        # Pad with space on left for 1-3 char names starting with a letter
        atom_name_fmt = (" " + atom_name).ljust(4)
    else:
        # Use as is, left-justified, for 4-char names or those starting with a digit
        atom_name_fmt = atom_name.ljust(4)

    # Single-character fields are truncated to their one column
    alt_loc = _text("altLoc")[:1].ljust(1)
    chain_id = _text("chainID")[:1].ljust(1)
    icode = _text("iCode")[:1].ljust(1)

    # Residue name, right-justified in its three columns
    res_name = _text("resName").rjust(3)

    # Residue sequence number
    res_seq = str(atom_data.get("resSeq", 0)).rjust(4)

    # Coordinates, occupancy and temperature factor
    x = f"{atom_data.get('x', 0.0):8.3f}"
    y = f"{atom_data.get('y', 0.0):8.3f}"
    z = f"{atom_data.get('z', 0.0):8.3f}"
    occupancy = f"{atom_data.get('occupancy', 1.0):6.2f}"
    temp_factor = f"{atom_data.get('tempFactor', 0.0):6.2f}"

    # Element symbol
    element = _text("element").rjust(2)

    # Charge
    charge_val = atom_data.get("charge", "")
    if charge_val is None or pd.isna(charge_val):
        # Without this, NaN would fall into the fallback below and print as "na"
        charge_val = ""
    charge_fmt = ""
    if charge_val:
        try:
            # Try converting numeric charge (e.g., +1, -2) to PDB format (1+, 2-)
            charge_int = int(float(charge_val))  # Use float first for cases like "1.0"
            if charge_int != 0:
                charge_fmt = f"{abs(charge_int)}{'+' if charge_int > 0 else '-'}"
        except (TypeError, ValueError, OverflowError):
            # If already formatted (e.g., "1+"), use its string representation
            charge_fmt = str(charge_val)
    # Ensure it fits and is right-justified (blank when there is no charge)
    charge_fmt = charge_fmt.strip()[:2].rjust(2)

    # Construct the full line; the literal gaps are column 12, column 21,
    # columns 28-30 and columns 67-76 of the specification above
    line = (
        f"{record_name}{serial} {atom_name_fmt}{alt_loc}{res_name} {chain_id}{res_seq}{icode}   "
        f"{x}{y}{z}{occupancy}{temp_factor}          "  # 10 spaces
        f"{element}{charge_fmt}"
    )

    # Pad the line to the full 80-character record width
    return line.ljust(80)
805
+
806
+
229
807
  def write_pdb(
230
808
  df: pd.DataFrame, output: Union[str, TextIO, None] = None
231
809
  ) -> Union[str, None]:
@@ -235,7 +813,8 @@ def write_pdb(
235
813
  Parameters:
236
814
  -----------
237
815
  df : pd.DataFrame
238
- DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms
816
+ DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms.
817
+ Must contain columns mappable to PDB format fields.
239
818
  output : Union[str, TextIO, None], optional
240
819
  Output file path or file-like object. If None, returns the PDB content as a string.
241
820
 
@@ -244,218 +823,162 @@ def write_pdb(
244
823
  Union[str, None]
245
824
  If output is None, returns the PDB content as a string. Otherwise, returns None.
246
825
  """
247
- # Create a buffer to store the PDB content
248
826
  buffer = io.StringIO()
827
+ format_type = df.attrs.get("format", "PDB") # Assume PDB if not specified
249
828
 
250
- # Get the format of the DataFrame
251
- format_type = df.attrs.get("format", "PDB")
252
-
253
- # Variables to track chain changes for TER records
829
+ last_model_num = None
254
830
  last_chain_id = None
255
- last_res_seq = None
256
- last_res_name = None
257
- last_serial = None
258
- last_icode = None
259
-
260
- # Process each row in the DataFrame
261
- for index, row in df.iterrows():
262
- # Get current chain ID
831
+ last_res_info = None # Tuple (resSeq, iCode, resName) for TER record
832
+ last_serial = 0
833
+
834
+ # Check if DataFrame is empty
835
+ if df.empty:
836
+ buffer.write("END\n")
837
+ content = buffer.getvalue()
838
+ buffer.close()
839
+ if output is not None:
840
+ if isinstance(output, str):
841
+ with open(output, "w") as f:
842
+ f.write(content)
843
+ else:
844
+ output.write(content)
845
+ return None
846
+ return content
847
+
848
+ for _, row in df.iterrows():
849
+ atom_data = {}
850
+
851
+ # --- Data Extraction ---
263
852
  if format_type == "PDB":
264
- current_chain_id = row["chainID"]
265
- else: # mmCIF
266
- current_chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
853
+ # Pre-process PDB values, converting None to empty strings for optional fields
854
+ raw_alt_loc = row.get("altLoc")
855
+ pdb_alt_loc = "" if pd.isna(raw_alt_loc) else str(raw_alt_loc)
856
+
857
+ raw_icode = row.get("iCode")
858
+ pdb_icode = "" if pd.isna(raw_icode) else str(raw_icode)
859
+
860
+ raw_element = row.get("element")
861
+ pdb_element = "" if pd.isna(raw_element) else str(raw_element)
862
+
863
+ raw_charge = row.get("charge")
864
+ pdb_charge = "" if pd.isna(raw_charge) else str(raw_charge)
865
+
866
+ atom_data = {
867
+ "record_name": row.get("record_type", "ATOM"),
868
+ "serial": int(row.get("serial", 0)),
869
+ "name": str(row.get("name", "")),
870
+ "altLoc": pdb_alt_loc,
871
+ "resName": str(row.get("resName", "")),
872
+ "chainID": str(row.get("chainID", "")),
873
+ "resSeq": int(row.get("resSeq", 0)),
874
+ "iCode": pdb_icode,
875
+ "x": float(row.get("x", 0.0)),
876
+ "y": float(row.get("y", 0.0)),
877
+ "z": float(row.get("z", 0.0)),
878
+ "occupancy": float(row.get("occupancy", 1.0)),
879
+ "tempFactor": float(row.get("tempFactor", 0.0)),
880
+ "element": pdb_element,
881
+ "charge": pdb_charge,
882
+ "model": int(row.get("model", 1)),
883
+ }
884
+ elif format_type == "mmCIF":
885
+ # Pre-process mmCIF values to PDB compatible format, converting None to empty strings
886
+ raw_alt_loc = row.get("label_alt_id")
887
+ pdb_alt_loc = "" if pd.isna(raw_alt_loc) else str(raw_alt_loc)
888
+
889
+ raw_icode = row.get("pdbx_PDB_ins_code")
890
+ pdb_icode = "" if pd.isna(raw_icode) else str(raw_icode)
891
+
892
+ raw_element = row.get("type_symbol")
893
+ pdb_element = "" if pd.isna(raw_element) else str(raw_element)
894
+
895
+ raw_charge = row.get("pdbx_formal_charge")
896
+ pdb_charge = "" if pd.isna(raw_charge) else str(raw_charge)
897
+
898
+ atom_data = {
899
+ "record_name": row.get("group_PDB", "ATOM"),
900
+ "serial": int(row.get("id", 0)),
901
+ "name": str(row.get("auth_atom_id", row.get("label_atom_id", ""))),
902
+ "altLoc": pdb_alt_loc,
903
+ "resName": str(row.get("auth_comp_id", row.get("label_comp_id", ""))),
904
+ "chainID": str(row.get("auth_asym_id", row.get("label_asym_id"))),
905
+ "resSeq": int(row.get("auth_seq_id", row.get("label_seq_id", 0))),
906
+ "iCode": pdb_icode,
907
+ "x": float(row.get("Cartn_x", 0.0)),
908
+ "y": float(row.get("Cartn_y", 0.0)),
909
+ "z": float(row.get("Cartn_z", 0.0)),
910
+ "occupancy": float(row.get("occupancy", 1.0)),
911
+ "tempFactor": float(row.get("B_iso_or_equiv", 0.0)),
912
+ "element": pdb_element,
913
+ "charge": pdb_charge,
914
+ "model": int(row.get("pdbx_PDB_model_num", 1)),
915
+ }
916
+ else:
917
+ raise ValueError(f"Unsupported DataFrame format: {format_type}")
918
+
919
+ # --- MODEL/ENDMDL Records ---
920
+ current_model_num = atom_data["model"]
921
+ if current_model_num != last_model_num:
922
+ if last_model_num is not None:
923
+ buffer.write("ENDMDL\n")
924
+ buffer.write(f"MODEL {current_model_num:>4}\n")
925
+ last_model_num = current_model_num
926
+ # Reset chain/residue tracking for the new model
927
+ last_chain_id = None
928
+ last_res_info = None
929
+
930
+ # --- TER Records ---
931
+ current_chain_id = atom_data["chainID"]
932
+ current_res_info = (
933
+ atom_data["resSeq"],
934
+ atom_data["iCode"],
935
+ atom_data["resName"],
936
+ )
267
937
 
268
- # Write TER record if chain changes
938
+ # Write TER if chain ID changes within the same model
269
939
  if last_chain_id is not None and current_chain_id != last_chain_id:
270
- # Format TER record according to PDB specification
271
- # Columns:
272
- # 1-6: "TER "
273
- # 7-11: Serial number (right-justified)
274
- # 18-20: Residue name (right-justified)
275
- # 22: Chain ID
276
- # 23-26: Residue sequence number (right-justified)
277
- # 27: Insertion code
278
940
  ter_serial = str(last_serial + 1).rjust(5)
279
- ter_res_name = last_res_name.strip().ljust(3) # Strip and left-justify
941
+ ter_res_name = last_res_info[2].strip().rjust(3) # Use last residue's name
280
942
  ter_chain_id = last_chain_id
281
- ter_res_seq = last_res_seq.rjust(4)
282
- ter_icode = last_icode if last_icode else "" # Use last recorded iCode
943
+ ter_res_seq = str(last_res_info[0]).rjust(4) # Use last residue's seq num
944
+ ter_icode = (
945
+ last_res_info[1] if last_res_info[1] else ""
946
+ ) # Use last residue's icode
283
947
 
284
- # Construct the TER line ensuring correct spacing for all fields
285
- # TER (1-6), serial (7-11), space (12-17), resName (18-20), space (21),
286
- # chainID (22), resSeq (23-26), iCode (27)
287
948
  ter_line = f"TER {ter_serial} {ter_res_name} {ter_chain_id}{ter_res_seq}{ter_icode}"
288
949
  buffer.write(ter_line.ljust(80) + "\n")
289
950
 
290
- # Initialize the line with spaces
291
- line = " " * 80
292
-
293
- # Set record type (ATOM or HETATM)
294
- if format_type == "PDB":
295
- record_type = row["record_type"]
296
- else: # mmCIF
297
- record_type = row.get("group_PDB", "ATOM")
298
- line = record_type.ljust(6) + line[6:]
299
-
300
- # Set atom serial number
301
- if format_type == "PDB":
302
- serial = str(int(row["serial"]))
303
- else: # mmCIF
304
- serial = str(int(row["id"]))
305
- line = line[:6] + serial.rjust(5) + line[11:]
306
-
307
- # Set atom name
308
- if format_type == "PDB":
309
- atom_name = row["name"]
310
- else: # mmCIF
311
- atom_name = row.get("auth_atom_id", row.get("label_atom_id", ""))
312
-
313
- # Right-justify atom name if it starts with a number
314
- if atom_name and atom_name[0].isdigit():
315
- line = line[:12] + atom_name.ljust(4) + line[16:]
316
- else:
317
- line = line[:12] + " " + atom_name.ljust(3) + line[16:]
318
-
319
- # Set alternate location indicator
320
- if format_type == "PDB":
321
- alt_loc = row.get("altLoc", "")
322
- else: # mmCIF
323
- alt_loc = row.get("label_alt_id", "")
324
- line = line[:16] + alt_loc + line[17:]
325
-
326
- # Set residue name
327
- if format_type == "PDB":
328
- res_name = row["resName"]
329
- else: # mmCIF
330
- res_name = row.get("auth_comp_id", row.get("label_comp_id", ""))
331
- line = line[:17] + res_name.ljust(3) + line[20:]
332
-
333
- # Set chain identifier
334
- if format_type == "PDB":
335
- chain_id = row["chainID"]
336
- else: # mmCIF
337
- chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
338
- line = line[:21] + chain_id + line[22:]
339
-
340
- # Set residue sequence number
341
- if format_type == "PDB":
342
- res_seq = str(int(row["resSeq"]))
343
- else: # mmCIF
344
- res_seq = str(int(row.get("auth_seq_id", row.get("label_seq_id", 0))))
345
- line = line[:22] + res_seq.rjust(4) + line[26:]
346
-
347
- # Set insertion code
348
- if format_type == "PDB":
349
- icode = row["iCode"] if pd.notna(row["iCode"]) else ""
350
- else: # mmCIF
351
- icode = (
352
- row.get("pdbx_PDB_ins_code", "")
353
- if pd.notna(row.get("pdbx_PDB_ins_code", ""))
354
- else ""
355
- )
356
- line = line[:26] + icode + line[27:]
357
-
358
- # Set X coordinate
359
- if format_type == "PDB":
360
- x = float(row["x"])
361
- else: # mmCIF
362
- x = float(row["Cartn_x"])
363
- line = line[:30] + f"{x:8.3f}" + line[38:]
951
+ # --- Format and Write ATOM/HETATM Line ---
952
+ pdb_line = _format_pdb_atom_line(atom_data)
953
+ buffer.write(pdb_line + "\n")
364
954
 
365
- # Set Y coordinate
366
- if format_type == "PDB":
367
- y = float(row["y"])
368
- else: # mmCIF
369
- y = float(row["Cartn_y"])
370
- line = line[:38] + f"{y:8.3f}" + line[46:]
371
-
372
- # Set Z coordinate
373
- if format_type == "PDB":
374
- z = float(row["z"])
375
- else: # mmCIF
376
- z = float(row["Cartn_z"])
377
- line = line[:46] + f"{z:8.3f}" + line[54:]
378
-
379
- # Set occupancy
380
- if format_type == "PDB":
381
- occupancy = float(row["occupancy"])
382
- else: # mmCIF
383
- occupancy = float(row.get("occupancy", 1.0))
384
- line = line[:54] + f"{occupancy:6.2f}" + line[60:]
955
+ # --- Update Tracking Variables ---
956
+ last_serial = atom_data["serial"]
957
+ last_chain_id = current_chain_id
958
+ last_res_info = current_res_info
385
959
 
386
- # Set temperature factor
387
- if format_type == "PDB":
388
- temp_factor = float(row["tempFactor"])
389
- else: # mmCIF
390
- temp_factor = float(row.get("B_iso_or_equiv", 0.0))
391
- line = line[:60] + f"{temp_factor:6.2f}" + line[66:]
392
-
393
- # Set element symbol
394
- if format_type == "PDB":
395
- element = row["element"]
396
- else: # mmCIF
397
- element = row.get("type_symbol", "")
398
- line = line[:76] + element.rjust(2) + line[78:]
399
-
400
- # Set charge
401
- if format_type == "PDB":
402
- charge = row["charge"]
403
- else: # mmCIF
404
- charge = row.get("pdbx_formal_charge", "")
405
- if charge and charge not in ["?", "."]:
406
- # Convert numeric charge to PDB format (e.g., "1+" or "2-")
407
- try:
408
- charge_val = int(charge)
409
- if charge_val != 0:
410
- charge = f"{abs(charge_val)}{'+' if charge_val > 0 else '-'}"
411
- else:
412
- charge = ""
413
- except ValueError:
414
- pass
415
- line = line[:78] + charge + line[80:]
416
-
417
- # Write the line to the buffer
418
- buffer.write(line.rstrip() + "\n")
419
-
420
- # Update last atom info for potential TER record
421
- if format_type == "PDB":
422
- last_serial = int(row["serial"])
423
- last_res_name = row["resName"]
424
- last_chain_id = row["chainID"]
425
- last_res_seq = str(int(row["resSeq"]))
426
- last_icode = row["iCode"] if pd.notna(row["iCode"]) else ""
427
- else: # mmCIF
428
- last_serial = int(row["id"])
429
- last_res_name = row.get("auth_comp_id", row.get("label_comp_id", ""))
430
- last_chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
431
- last_res_seq = str(int(row.get("auth_seq_id", row.get("label_seq_id", 0))))
432
- last_icode = (
433
- row.get("pdbx_PDB_ins_code", "")
434
- if pd.notna(row.get("pdbx_PDB_ins_code", ""))
435
- else ""
436
- )
437
-
438
- # Add TER record for the last chain
960
+ # --- Final Records ---
961
+ # Add TER record for the very last chain in the last model
439
962
  if last_chain_id is not None:
440
- # Format TER record according to PDB specification
441
963
  ter_serial = str(last_serial + 1).rjust(5)
442
- ter_res_name = last_res_name.strip().ljust(3) # Strip and left-justify
964
+ ter_res_name = last_res_info[2].strip().rjust(3)
443
965
  ter_chain_id = last_chain_id
444
- ter_res_seq = last_res_seq.rjust(4)
445
- ter_icode = last_icode if last_icode else "" # Use last recorded iCode
966
+ ter_res_seq = str(last_res_info[0]).rjust(4)
967
+ ter_icode = last_res_info[1] if last_res_info[1] else ""
446
968
 
447
- # Construct the TER line ensuring correct spacing for all fields
448
969
  ter_line = f"TER {ter_serial} {ter_res_name} {ter_chain_id}{ter_res_seq}{ter_icode}"
449
970
  buffer.write(ter_line.ljust(80) + "\n")
450
971
 
451
- # Add END record
972
+ # Add ENDMDL if models were used
973
+ if last_model_num is not None:
974
+ buffer.write("ENDMDL\n")
975
+
452
976
  buffer.write("END\n")
453
977
 
454
- # Get the content as a string
978
+ # --- Output Handling ---
455
979
  content = buffer.getvalue()
456
980
  buffer.close()
457
981
 
458
- # Write to output if provided
459
982
  if output is not None:
460
983
  if isinstance(output, str):
461
984
  with open(output, "w") as f:
@@ -463,9 +986,8 @@ def write_pdb(
463
986
  else:
464
987
  output.write(content)
465
988
  return None
466
-
467
- # Return the content as a string
468
- return content
989
+ else:
990
+ return content
469
991
 
470
992
 
471
993
  def write_cif(
@@ -490,7 +1012,7 @@ def write_cif(
490
1012
  format_type = df.attrs.get("format", "PDB")
491
1013
 
492
1014
  # Create a new DataContainer
493
- data_container = DataContainer("data_structure")
1015
+ data_container = DataContainer("rnapolis")
494
1016
 
495
1017
  # Define the attributes for atom_site category
496
1018
  if format_type == "mmCIF":
@@ -519,7 +1041,7 @@ def write_cif(
519
1041
  "auth_comp_id", # resName
520
1042
  "auth_asym_id", # chainID
521
1043
  "auth_atom_id", # name
522
- "pdbx_PDB_model_num", # (generated)
1044
+ "pdbx_PDB_model_num", # model
523
1045
  ]
524
1046
 
525
1047
  # Prepare rows for the atom_site category
@@ -527,32 +1049,44 @@ def write_cif(
527
1049
 
528
1050
  for _, row in df.iterrows():
529
1051
  if format_type == "mmCIF":
530
- # Use existing mmCIF data
531
- row_data = [str(row.get(attr, "?")) for attr in attributes]
1052
+ # Use existing mmCIF data, converting None to '?' universally
1053
+ row_data = []
1054
+ for attr in attributes:
1055
+ value = row.get(attr)
1056
+ if pd.isna(value):
1057
+ # Use '?' as the standard placeholder for missing values
1058
+ row_data.append("?")
1059
+ else:
1060
+ # Ensure all non-missing values are converted to string
1061
+ row_data.append(str(value))
532
1062
  else: # PDB format
533
- # Map PDB data to mmCIF format
1063
+ # Map PDB data to mmCIF format, converting None to '.' or '?'
534
1064
  entity_id = "1" # Default entity ID
535
- model_num = "1" # Default model number
1065
+ model_num = str(int(row["model"]))
1066
+
1067
+ # Pre-process optional fields for mmCIF placeholders
1068
+ element_val = "?" if pd.isna(row.get("element")) else str(row["element"])
1069
+ altloc_val = "." if pd.isna(row.get("altLoc")) else str(row["altLoc"])
1070
+ icode_val = "." if pd.isna(row.get("iCode")) else str(row["iCode"])
1071
+ charge_val = "." if pd.isna(row.get("charge")) else str(row["charge"])
536
1072
 
537
1073
  row_data = [
538
1074
  str(row["record_type"]), # group_PDB
539
1075
  str(int(row["serial"])), # id
540
- str(row["element"]), # type_symbol
1076
+ element_val, # type_symbol
541
1077
  str(row["name"]), # label_atom_id
542
- str(row.get("altLoc", "")), # label_alt_id
1078
+ altloc_val, # label_alt_id
543
1079
  str(row["resName"]), # label_comp_id
544
1080
  str(row["chainID"]), # label_asym_id
545
1081
  entity_id, # label_entity_id
546
1082
  str(int(row["resSeq"])), # label_seq_id
547
- str(row["iCode"])
548
- if pd.notna(row["iCode"])
549
- else "?", # pdbx_PDB_ins_code
1083
+ icode_val, # pdbx_PDB_ins_code
550
1084
  f"{float(row['x']):.3f}", # Cartn_x
551
1085
  f"{float(row['y']):.3f}", # Cartn_y
552
1086
  f"{float(row['z']):.3f}", # Cartn_z
553
1087
  f"{float(row['occupancy']):.2f}", # occupancy
554
1088
  f"{float(row['tempFactor']):.2f}", # B_iso_or_equiv
555
- str(row.get("charge", "")) or "?", # pdbx_formal_charge
1089
+ charge_val, # pdbx_formal_charge
556
1090
  str(int(row["resSeq"])), # auth_seq_id
557
1091
  str(row["resName"]), # auth_comp_id
558
1092
  str(row["chainID"]), # auth_asym_id
rnapolis/splitter.py CHANGED
@@ -3,10 +3,14 @@ import argparse
3
3
  import os
4
4
  import sys
5
5
 
6
- import pandas as pd
7
-
8
6
  from rnapolis.parser import is_cif
9
- from rnapolis.parser_v2 import parse_cif_atoms, parse_pdb_atoms, write_cif, write_pdb
7
+ from rnapolis.parser_v2 import (
8
+ fit_to_pdb,
9
+ parse_cif_atoms,
10
+ parse_pdb_atoms,
11
+ write_cif,
12
+ write_pdb,
13
+ )
10
14
 
11
15
 
12
16
  def main():
@@ -97,12 +101,21 @@ def main():
97
101
 
98
102
  try:
99
103
  if output_format == "PDB":
100
- write_pdb(model_df, output_path)
104
+ df_to_write = fit_to_pdb(model_df)
105
+ write_pdb(df_to_write, output_path)
101
106
  else: # mmCIF
102
107
  write_cif(model_df, output_path)
108
+ except ValueError as e:
109
+ # Handle errors specifically from fit_to_pdb
110
+ print(
111
+ f"Error fitting model {model_num} from {args.file} to PDB: {e}. Skipping model.",
112
+ file=sys.stderr,
113
+ )
114
+ continue
103
115
  except Exception as e:
116
+ # Handle general writing errors
104
117
  print(
105
- f"Error writing file {output_path}: {e}",
118
+ f"Error writing file {output_path} for model {model_num}: {e}",
106
119
  file=sys.stderr,
107
120
  )
108
121
  # Optionally continue to next model or exit
rnapolis/unifier.py CHANGED
@@ -7,7 +7,13 @@ from collections import Counter
7
7
  import pandas as pd
8
8
 
9
9
  from rnapolis.parser import is_cif
10
- from rnapolis.parser_v2 import parse_cif_atoms, parse_pdb_atoms, write_cif, write_pdb
10
+ from rnapolis.parser_v2 import (
11
+ fit_to_pdb,
12
+ parse_cif_atoms,
13
+ parse_pdb_atoms,
14
+ write_cif,
15
+ write_pdb,
16
+ )
11
17
  from rnapolis.tertiary_v2 import Structure
12
18
 
13
19
 
@@ -140,13 +146,22 @@ def main():
140
146
 
141
147
  ext = ".pdb" if format == "PDB" else ".cif"
142
148
 
143
- with open(f"{args.output}/{base}{ext}", "w") as f:
144
- df = pd.concat([residue.atoms for residue in residues])
149
+ df = pd.concat([residue.atoms for residue in residues])
145
150
 
151
+ try:
146
152
  if format == "PDB":
147
- write_pdb(df, f)
153
+ df_to_write = fit_to_pdb(df)
154
+ with open(f"{args.output}/{base}{ext}", "w") as f:
155
+ write_pdb(df_to_write, f)
148
156
  else:
149
- write_cif(df, f)
157
+ with open(f"{args.output}/{base}{ext}", "w") as f:
158
+ write_cif(df, f)
159
+ except ValueError as e:
160
+ print(
161
+ f"Error processing {path} for PDB output: {e}. Skipping file.",
162
+ file=sys.stderr,
163
+ )
164
+ continue
150
165
 
151
166
 
152
167
  if __name__ == "__main__":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: RNApolis
3
- Version: 0.8.0
3
+ Version: 0.8.1
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -12,17 +12,17 @@ rnapolis/mmcif_pdbx_v50.dic,sha256=5QFx1ssDaehR4_DQ-tS9VQux262SiLXaqcwmwwejF5c,5
12
12
  rnapolis/molecule_filter.py,sha256=jgcpJxx_oXEBX0d30v4k_FdwRouRUPUsEtCYWgLGpD4,7310
13
13
  rnapolis/motif_extractor.py,sha256=Lfn1iEkhkP9eZD3GPEWNAfy00QO7QPCc8wM_XS1ory8,1147
14
14
  rnapolis/parser.py,sha256=3g4mtFvpiEENFcSBBtx_E_x1vJPF9BujWnts0kb9XjE,16340
15
- rnapolis/parser_v2.py,sha256=eUccbTXCD5I7q0GVbaGWmjj0CT5d2VK8x9tr0gtrRuA,19801
15
+ rnapolis/parser_v2.py,sha256=qG6CO3or7zmuJu368g9Nzokiqdeip4yjD14F163uH6w,40618
16
16
  rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
17
- rnapolis/splitter.py,sha256=8mMZ2ZmhqptPUjmkDOFbLvC-dvWpuvJ0beSoeaD5pzk,3642
17
+ rnapolis/splitter.py,sha256=x-Zn21mkiMgvYPptUFD9BbdNIvoaM6b8GzGf6uYXEwE,4052
18
18
  rnapolis/tertiary.py,sha256=6t9ZB4w33-5n_M3sns1RoFXCOTgVAgGH4WDNG5OG9Kg,23426
19
19
  rnapolis/tertiary_v2.py,sha256=I1uyHWIUePNGO5m-suoL4ibtz02qAJUMvYm0BUKUygY,22480
20
20
  rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
21
- rnapolis/unifier.py,sha256=DR1_IllgaAYT9_FUE6XC9B-2wgqbBHs2D1MjyZT2j2g,5438
21
+ rnapolis/unifier.py,sha256=2ge7IB9FdRgzSAiVD39U_ciwtdDJ2fGzf8mUIudbrqY,5820
22
22
  rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
23
- rnapolis-0.8.0.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
24
- rnapolis-0.8.0.dist-info/METADATA,sha256=zD_byFTP6xNdYCQdu5bslqSE_noBjSagzhn2EOSlcYE,54537
25
- rnapolis-0.8.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
26
- rnapolis-0.8.0.dist-info/entry_points.txt,sha256=H00KoN54wU3dFOofAu3H_3PADmZOBTB1hXf5TUU2uzo,438
27
- rnapolis-0.8.0.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
28
- rnapolis-0.8.0.dist-info/RECORD,,
23
+ rnapolis-0.8.1.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
24
+ rnapolis-0.8.1.dist-info/METADATA,sha256=NOg9-s2n313HElku8z06JiBvEhPf6oV9RR7ur20hwys,54537
25
+ rnapolis-0.8.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
26
+ rnapolis-0.8.1.dist-info/entry_points.txt,sha256=H00KoN54wU3dFOofAu3H_3PADmZOBTB1hXf5TUU2uzo,438
27
+ rnapolis-0.8.1.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
28
+ rnapolis-0.8.1.dist-info/RECORD,,