RNApolis 0.7.0__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rnapolis/parser_v2.py CHANGED
@@ -1,4 +1,6 @@
1
1
  import io
2
+ import os
3
+ import string
2
4
  import tempfile
3
5
  from typing import IO, TextIO, Union
4
6
 
@@ -34,31 +36,46 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
34
36
  if isinstance(lines[0], bytes):
35
37
  lines = [line.decode("utf-8") for line in lines]
36
38
 
39
+ current_model = 1
37
40
  for line in lines:
38
41
  record_type = line[:6].strip()
39
42
 
43
+ # Check for MODEL record
44
+ if record_type == "MODEL":
45
+ try:
46
+ current_model = int(line[10:14].strip())
47
+ except ValueError:
48
+ # Handle cases where MODEL record might be malformed
49
+ pass # Keep the previous model number
50
+ continue
51
+
40
52
  # Only process ATOM and HETATM records
41
53
  if record_type not in ["ATOM", "HETATM"]:
42
54
  continue
43
55
 
44
56
  # Parse fields according to PDB format specification
57
+ alt_loc = line[16:17].strip()
45
58
  icode = line[26:27].strip()
59
+ element = line[76:78].strip()
60
+ charge = line[78:80].strip()
61
+
46
62
  record = {
47
63
  "record_type": record_type,
48
64
  "serial": line[6:11].strip(),
49
65
  "name": line[12:16].strip(),
50
- "altLoc": line[16:17].strip(),
66
+ "altLoc": None if not alt_loc else alt_loc, # Store None if empty
51
67
  "resName": line[17:20].strip(),
52
68
  "chainID": line[21:22].strip(),
53
69
  "resSeq": line[22:26].strip(),
54
- "iCode": None if not icode else icode, # Convert empty string to None
70
+ "iCode": None if not icode else icode, # Store None if empty
55
71
  "x": line[30:38].strip(),
56
72
  "y": line[38:46].strip(),
57
73
  "z": line[46:54].strip(),
58
74
  "occupancy": line[54:60].strip(),
59
75
  "tempFactor": line[60:66].strip(),
60
- "element": line[76:78].strip(),
61
- "charge": line[78:80].strip(),
76
+ "element": None if not element else element, # Store None if empty
77
+ "charge": None if not charge else charge, # Store None if empty
78
+ "model": current_model, # Add the current model number
62
79
  }
63
80
 
64
81
  records.append(record)
@@ -83,13 +100,23 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
83
100
  "tempFactor",
84
101
  "element",
85
102
  "charge",
103
+ "model",
86
104
  ]
87
105
  )
88
106
 
89
107
  df = pd.DataFrame(records)
90
108
 
91
109
  # Convert numeric columns to appropriate types
92
- numeric_columns = ["serial", "resSeq", "x", "y", "z", "occupancy", "tempFactor"]
110
+ numeric_columns = [
111
+ "serial",
112
+ "resSeq",
113
+ "x",
114
+ "y",
115
+ "z",
116
+ "occupancy",
117
+ "tempFactor",
118
+ "model",
119
+ ]
93
120
  for col in numeric_columns:
94
121
  df[col] = pd.to_numeric(df[col], errors="coerce")
95
122
 
@@ -128,18 +155,37 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
128
155
  """
129
156
  adapter = IoAdapterPy()
130
157
 
131
- # Handle both string content and file-like objects
158
+ # Handle string, StringIO, and file-like objects
132
159
  if isinstance(content, str):
133
- # Create a temporary file to use with the adapter
134
- import tempfile
135
-
136
- with tempfile.NamedTemporaryFile(mode="w+", suffix=".cif") as temp_file:
160
+ # Create a temporary file for string input
161
+ with tempfile.NamedTemporaryFile(
162
+ mode="w+", suffix=".cif", delete=False
163
+ ) as temp_file:
137
164
  temp_file.write(content)
138
- temp_file.flush()
139
- data = adapter.readFile(temp_file.name)
140
- else:
141
- # Assume it's a file-like object with a name attribute
165
+ temp_file_path = temp_file.name
166
+ try:
167
+ data = adapter.readFile(temp_file_path)
168
+ finally:
169
+ os.remove(temp_file_path) # Clean up the temporary file
170
+ elif isinstance(content, io.StringIO):
171
+ # Create a temporary file for StringIO input
172
+ with tempfile.NamedTemporaryFile(
173
+ mode="w+", suffix=".cif", delete=False
174
+ ) as temp_file:
175
+ content.seek(0) # Ensure reading from the start
176
+ temp_file.write(content.read())
177
+ temp_file_path = temp_file.name
178
+ try:
179
+ data = adapter.readFile(temp_file_path)
180
+ finally:
181
+ os.remove(temp_file_path) # Clean up the temporary file
182
+ elif hasattr(content, "name"):
183
+ # Assume it's a file-like object with a name attribute (like an open file)
142
184
  data = adapter.readFile(content.name)
185
+ else:
186
+ raise TypeError(
187
+ "Unsupported input type for parse_cif_atoms. Expected str, file-like object with name, or StringIO."
188
+ )
143
189
 
144
190
  # Get the atom_site category
145
191
  category = data[0].getObj("atom_site")
@@ -155,47 +201,133 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
155
201
  # Create a list of dictionaries for each atom
156
202
  records = []
157
203
  for row in rows:
158
- record = dict(zip(attributes, row))
159
-
160
- # Convert "?" or "." in insertion code to None
161
- if "pdbx_PDB_ins_code" in record:
162
- if record["pdbx_PDB_ins_code"] in ["?", ".", ""]:
163
- record["pdbx_PDB_ins_code"] = None
164
-
204
+ record = {}
205
+ for attr, value in zip(attributes, row):
206
+ # Store None if value indicates missing data ('?' or '.')
207
+ if value in ["?", "."]:
208
+ record[attr] = None
209
+ else:
210
+ record[attr] = value
165
211
  records.append(record)
166
212
 
167
213
  # Create DataFrame from records
168
214
  df = pd.DataFrame(records)
169
215
 
170
- # Convert numeric columns to appropriate types
171
- numeric_columns = [
172
- "id",
173
- "auth_seq_id",
216
+ # Define columns based on mmCIF specification for atom_site
217
+ float_cols = [
218
+ "aniso_B[1][1]",
219
+ "aniso_B[1][1]_esd",
220
+ "aniso_B[1][2]",
221
+ "aniso_B[1][2]_esd",
222
+ "aniso_B[1][3]",
223
+ "aniso_B[1][3]_esd",
224
+ "aniso_B[2][2]",
225
+ "aniso_B[2][2]_esd",
226
+ "aniso_B[2][3]",
227
+ "aniso_B[2][3]_esd",
228
+ "aniso_B[3][3]",
229
+ "aniso_B[3][3]_esd",
230
+ "aniso_ratio",
231
+ "aniso_U[1][1]",
232
+ "aniso_U[1][1]_esd",
233
+ "aniso_U[1][2]",
234
+ "aniso_U[1][2]_esd",
235
+ "aniso_U[1][3]",
236
+ "aniso_U[1][3]_esd",
237
+ "aniso_U[2][2]",
238
+ "aniso_U[2][2]_esd",
239
+ "aniso_U[2][3]",
240
+ "aniso_U[2][3]_esd",
241
+ "aniso_U[3][3]",
242
+ "aniso_U[3][3]_esd",
243
+ "B_equiv_geom_mean",
244
+ "B_equiv_geom_mean_esd",
245
+ "B_iso_or_equiv",
246
+ "B_iso_or_equiv_esd",
174
247
  "Cartn_x",
248
+ "Cartn_x_esd",
175
249
  "Cartn_y",
250
+ "Cartn_y_esd",
176
251
  "Cartn_z",
252
+ "Cartn_z_esd",
253
+ "fract_x",
254
+ "fract_x_esd",
255
+ "fract_y",
256
+ "fract_y_esd",
257
+ "fract_z",
258
+ "fract_z_esd",
177
259
  "occupancy",
178
- "B_iso_or_equiv",
260
+ "occupancy_esd",
261
+ "U_equiv_geom_mean",
262
+ "U_equiv_geom_mean_esd",
263
+ "U_iso_or_equiv",
264
+ "U_iso_or_equiv_esd",
265
+ ]
266
+ int_cols = [
267
+ "attached_hydrogens",
268
+ "label_seq_id",
269
+ "symmetry_multiplicity",
270
+ "pdbx_PDB_model_num",
179
271
  "pdbx_formal_charge",
272
+ "pdbx_label_index",
180
273
  ]
181
-
182
- for col in numeric_columns:
183
- if col in df.columns:
184
- df[col] = pd.to_numeric(df[col], errors="coerce")
185
-
186
- # Convert categorical columns
187
- categorical_columns = [
274
+ category_cols = [
275
+ "auth_asym_id",
276
+ "auth_atom_id",
277
+ "auth_comp_id",
278
+ "auth_seq_id",
279
+ "calc_attached_atom",
280
+ "calc_flag",
281
+ "disorder_assembly",
282
+ "disorder_group",
188
283
  "group_PDB",
189
- "type_symbol",
284
+ "id",
285
+ "label_alt_id",
286
+ "label_asym_id",
190
287
  "label_atom_id",
191
288
  "label_comp_id",
192
- "label_asym_id",
193
- "auth_atom_id",
194
- "auth_comp_id",
195
- "auth_asym_id",
289
+ "label_entity_id",
290
+ "thermal_displace_type",
291
+ "type_symbol",
292
+ "pdbx_atom_ambiguity",
293
+ "adp_type",
294
+ "refinement_flags",
295
+ "refinement_flags_adp",
296
+ "refinement_flags_occupancy",
297
+ "refinement_flags_posn",
298
+ "pdbx_auth_alt_id",
299
+ "pdbx_PDB_ins_code",
300
+ "pdbx_PDB_residue_no",
301
+ "pdbx_PDB_residue_name",
302
+ "pdbx_PDB_strand_id",
303
+ "pdbx_PDB_atom_name",
304
+ "pdbx_auth_atom_name",
305
+ "pdbx_auth_comp_id",
306
+ "pdbx_auth_asym_id",
307
+ "pdbx_auth_seq_id",
308
+ "pdbx_tls_group_id",
309
+ "pdbx_ncs_dom_id",
310
+ "pdbx_group_NDB",
311
+ "pdbx_atom_group",
312
+ "pdbx_label_seq_num",
313
+ "pdbx_not_in_asym",
314
+ "pdbx_sifts_xref_db_name",
315
+ "pdbx_sifts_xref_db_acc",
316
+ "pdbx_sifts_xref_db_num",
317
+ "pdbx_sifts_xref_db_res",
196
318
  ]
197
319
 
198
- for col in categorical_columns:
320
+ # Convert columns to appropriate types
321
+ for col in float_cols:
322
+ if col in df.columns:
323
+ df[col] = pd.to_numeric(df[col], errors="coerce")
324
+
325
+ for col in int_cols:
326
+ if col in df.columns:
327
+ # Use Int64 (nullable integer) to handle potential NaNs from coercion
328
+ df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
329
+
330
+ for col in category_cols:
199
331
  if col in df.columns:
200
332
  df[col] = df[col].astype("category")
201
333
 
@@ -205,170 +337,648 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
205
337
  return df
206
338
 
207
339
 
208
- def write_pdb(
209
- df: pd.DataFrame, output: Union[str, TextIO, None] = None
210
- ) -> Union[str, None]:
340
+ def can_write_pdb(df: pd.DataFrame) -> bool:
211
341
  """
212
- Write a DataFrame of atom records to PDB format.
342
+ Check if the DataFrame can be losslessly represented in PDB format.
343
+
344
+ PDB format has limitations on field widths:
345
+ - Atom serial number (id): max 99999
346
+ - Chain identifier (auth_asym_id): max 1 character
347
+ - Residue sequence number (auth_seq_id): max 9999
213
348
 
214
349
  Parameters:
215
350
  -----------
216
351
  df : pd.DataFrame
217
- DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms
218
- output : Union[str, TextIO, None], optional
219
- Output file path or file-like object. If None, returns the PDB content as a string.
352
+ DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms.
220
353
 
221
354
  Returns:
222
355
  --------
223
- Union[str, None]
224
- If output is None, returns the PDB content as a string. Otherwise, returns None.
356
+ bool
357
+ True if the DataFrame can be written to PDB format without data loss/truncation, False otherwise.
225
358
  """
226
- # Create a buffer to store the PDB content
227
- buffer = io.StringIO()
359
+ format_type = df.attrs.get("format")
228
360
 
229
- # Get the format of the DataFrame
230
- format_type = df.attrs.get("format", "PDB")
361
+ if format_type == "PDB":
362
+ # Assume data originally from PDB already fits PDB constraints
363
+ return True
231
364
 
232
- # Process each row in the DataFrame
233
- for _, row in df.iterrows():
234
- # Initialize the line with spaces
235
- line = " " * 80
365
+ if df.empty:
366
+ # An empty DataFrame can be represented as an empty PDB file
367
+ return True
236
368
 
237
- # Set record type (ATOM or HETATM)
238
- if format_type == "PDB":
239
- record_type = row["record_type"]
240
- else: # mmCIF
241
- record_type = row.get("group_PDB", "ATOM")
242
- line = record_type.ljust(6) + line[6:]
369
+ if format_type == "mmCIF":
370
+ # Check serial number (id)
371
+ # Convert to numeric first to handle potential categorical type and NaNs
372
+ if "id" not in df.columns or (
373
+ pd.to_numeric(df["id"], errors="coerce").max() > 99999
374
+ ):
375
+ return False
243
376
 
244
- # Set atom serial number
245
- if format_type == "PDB":
246
- serial = str(int(row["serial"]))
247
- else: # mmCIF
248
- serial = str(int(row["id"]))
249
- line = line[:6] + serial.rjust(5) + line[11:]
377
+ # Check chain ID (auth_asym_id) length
378
+ if "auth_asym_id" not in df.columns or (
379
+ df["auth_asym_id"].dropna().astype(str).str.len().max() > 1
380
+ ):
381
+ return False
250
382
 
251
- # Set atom name
252
- if format_type == "PDB":
253
- atom_name = row["name"]
254
- else: # mmCIF
255
- atom_name = row.get("auth_atom_id", row.get("label_atom_id", ""))
383
+ # Check residue sequence number (auth_seq_id)
384
+ if "auth_seq_id" not in df.columns or (
385
+ pd.to_numeric(df["auth_seq_id"], errors="coerce").max() > 9999
386
+ ):
387
+ return False
256
388
 
257
- # Right-justify atom name if it starts with a number
258
- if atom_name and atom_name[0].isdigit():
259
- line = line[:12] + atom_name.ljust(4) + line[16:]
260
- else:
261
- line = line[:12] + " " + atom_name.ljust(3) + line[16:]
389
+ # All checks passed for mmCIF
390
+ return True
262
391
 
263
- # Set alternate location indicator
264
- if format_type == "PDB":
265
- alt_loc = row.get("altLoc", "")
266
- else: # mmCIF
267
- alt_loc = row.get("label_alt_id", "")
268
- line = line[:16] + alt_loc + line[17:]
392
+ # If format is unknown or not PDB/mmCIF, assume it cannot be safely written
393
+ return False
269
394
 
270
- # Set residue name
271
- if format_type == "PDB":
272
- res_name = row["resName"]
273
- else: # mmCIF
274
- res_name = row.get("auth_comp_id", row.get("label_comp_id", ""))
275
- line = line[:17] + res_name.ljust(3) + line[20:]
276
395
 
277
- # Set chain identifier
278
- if format_type == "PDB":
279
- chain_id = row["chainID"]
280
- else: # mmCIF
281
- chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
282
- line = line[:21] + chain_id + line[22:]
396
+ def fit_to_pdb(df: pd.DataFrame) -> pd.DataFrame:
397
+ """
398
+ Attempts to fit the atom data in a DataFrame to comply with PDB format limitations.
283
399
 
284
- # Set residue sequence number
285
- if format_type == "PDB":
286
- res_seq = str(int(row["resSeq"]))
287
- else: # mmCIF
288
- res_seq = str(int(row.get("auth_seq_id", row.get("label_seq_id", 0))))
289
- line = line[:22] + res_seq.rjust(4) + line[26:]
400
+ If the data already fits (checked by can_write_pdb), returns the original DataFrame.
401
+ Otherwise, checks if fitting is possible based on total atoms, unique chains,
402
+ and residues per chain. If fitting is possible, it renumbers atoms, renames chains,
403
+ and renumbers residues within each chain sequentially starting from 1.
290
404
 
291
- # Set insertion code
292
- if format_type == "PDB":
293
- icode = row["iCode"] if pd.notna(row["iCode"]) else ""
294
- else: # mmCIF
295
- icode = (
296
- row.get("pdbx_PDB_ins_code", "")
297
- if pd.notna(row.get("pdbx_PDB_ins_code", ""))
298
- else ""
299
- )
300
- line = line[:26] + icode + line[27:]
301
-
302
- # Set X coordinate
303
- if format_type == "PDB":
304
- x = float(row["x"])
305
- else: # mmCIF
306
- x = float(row["Cartn_x"])
307
- line = line[:30] + f"{x:8.3f}" + line[38:]
405
+ Parameters:
406
+ -----------
407
+ df : pd.DataFrame
408
+ DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms.
308
409
 
309
- # Set Y coordinate
310
- if format_type == "PDB":
311
- y = float(row["y"])
312
- else: # mmCIF
313
- y = float(row["Cartn_y"])
314
- line = line[:38] + f"{y:8.3f}" + line[46:]
410
+ Returns:
411
+ --------
412
+ pd.DataFrame
413
+ A new DataFrame with data potentially modified to fit PDB constraints.
414
+ The 'format' attribute of the returned DataFrame will be set to 'PDB'.
415
+
416
+ Raises:
417
+ -------
418
+ ValueError
419
+ If the data cannot be fitted into PDB format constraints (too many atoms,
420
+ chains, or residues per chain).
421
+ """
422
+ format_type = df.attrs.get("format")
423
+
424
+ if not format_type:
425
+ raise ValueError("DataFrame format attribute is not set.")
426
+
427
+ if can_write_pdb(df):
428
+ return df
429
+
430
+ # Determine column names based on format
431
+ if format_type == "PDB":
432
+ serial_col = "serial"
433
+ chain_col = "chainID"
434
+ resseq_col = "resSeq"
435
+ icode_col = "iCode"
436
+ elif format_type == "mmCIF":
437
+ serial_col = "id"
438
+ chain_col = "auth_asym_id"
439
+ resseq_col = "auth_seq_id"
440
+ icode_col = "pdbx_PDB_ins_code"
441
+ else:
442
+ raise ValueError(f"Unsupported DataFrame format: {format_type}")
443
+
444
+ # --- Feasibility Checks ---
445
+ if chain_col not in df.columns:
446
+ raise ValueError(f"Missing required chain column: {chain_col}")
447
+ if resseq_col not in df.columns:
448
+ raise ValueError(f"Missing required residue sequence column: {resseq_col}")
449
+
450
+ unique_chains = df[chain_col].unique()
451
+ num_chains = len(unique_chains)
452
+ total_atoms = len(df)
453
+ max_pdb_serial = 99999
454
+ max_pdb_residue = 9999
455
+ available_chain_ids = list(
456
+ string.ascii_uppercase + string.ascii_lowercase + string.digits
457
+ )
458
+ max_pdb_chains = len(available_chain_ids)
459
+
460
+ # Check 1: Total atoms + TER lines <= 99999
461
+ if total_atoms + num_chains > max_pdb_serial:
462
+ raise ValueError(
463
+ f"Cannot fit to PDB: Total atoms ({total_atoms}) + TER lines ({num_chains}) exceeds PDB limit ({max_pdb_serial})."
464
+ )
315
465
 
316
- # Set Z coordinate
317
- if format_type == "PDB":
318
- z = float(row["z"])
319
- else: # mmCIF
320
- z = float(row["Cartn_z"])
321
- line = line[:46] + f"{z:8.3f}" + line[54:]
466
+ # Check 2: Number of chains <= 62
467
+ if num_chains > max_pdb_chains:
468
+ raise ValueError(
469
+ f"Cannot fit to PDB: Number of unique chains ({num_chains}) exceeds PDB limit ({max_pdb_chains})."
470
+ )
322
471
 
323
- # Set occupancy
324
- if format_type == "PDB":
325
- occupancy = float(row["occupancy"])
326
- else: # mmCIF
327
- occupancy = float(row.get("occupancy", 1.0))
328
- line = line[:54] + f"{occupancy:6.2f}" + line[60:]
472
+ # Check 3: Max residues per chain <= 9999
473
+ # More accurate check: group by chain, then count unique (resSeq, iCode) tuples
474
+ # Use a temporary structure to avoid modifying the original df
475
+ check_df = pd.DataFrame(
476
+ {
477
+ "chain": df[chain_col],
478
+ "resSeq": df[resseq_col],
479
+ "iCode": df[icode_col].fillna("") if icode_col in df.columns else "",
480
+ }
481
+ )
482
+ residue_counts = check_df.groupby("chain").apply(
483
+ lambda x: x[["resSeq", "iCode"]].drop_duplicates().shape[0]
484
+ )
485
+ max_residues_per_chain = residue_counts.max() if not residue_counts.empty else 0
486
+
487
+ if max_residues_per_chain > max_pdb_residue:
488
+ raise ValueError(
489
+ f"Cannot fit to PDB: Maximum residues in a single chain ({max_residues_per_chain}) exceeds PDB limit ({max_pdb_residue})."
490
+ )
329
491
 
330
- # Set temperature factor
331
- if format_type == "PDB":
332
- temp_factor = float(row["tempFactor"])
333
- else: # mmCIF
334
- temp_factor = float(row.get("B_iso_or_equiv", 0.0))
335
- line = line[:60] + f"{temp_factor:6.2f}" + line[66:]
492
+ # --- Perform Fitting ---
493
+ df_fitted = df.copy()
494
+
495
+ # 1. Rename Chains
496
+ chain_mapping = {
497
+ orig_chain: available_chain_ids[i] for i, orig_chain in enumerate(unique_chains)
498
+ }
499
+ df_fitted[chain_col] = df_fitted[chain_col].map(chain_mapping)
500
+ # Ensure the chain column is treated as string/object after mapping
501
+ df_fitted[chain_col] = df_fitted[chain_col].astype(object)
502
+
503
+ # 2. Renumber Residues within each new chain
504
+ new_resseq_col = "new_resSeq" # Temporary column for new numbering
505
+ df_fitted[new_resseq_col] = -1 # Initialize
506
+
507
+ all_new_res_maps = {}
508
+ for new_chain_id, group in df_fitted.groupby(chain_col):
509
+ # Identify unique original residues (seq + icode) in order of appearance
510
+ original_residues = group[[resseq_col, icode_col]].drop_duplicates()
511
+ # Create mapping: (orig_resSeq, orig_iCode) -> new_resSeq (1-based)
512
+ residue_mapping = {
513
+ tuple(res): i + 1
514
+ for i, res in enumerate(original_residues.itertuples(index=False))
515
+ }
516
+ all_new_res_maps[new_chain_id] = residue_mapping
517
+
518
+ # Apply mapping to the group
519
+ res_indices = group.set_index([resseq_col, icode_col]).index
520
+ df_fitted.loc[group.index, new_resseq_col] = res_indices.map(residue_mapping)
521
+
522
+ # Replace original residue number and clear insertion code
523
+ df_fitted[resseq_col] = df_fitted[new_resseq_col]
524
+ df_fitted[icode_col] = None # Insertion codes are now redundant
525
+ df_fitted.drop(columns=[new_resseq_col], inplace=True)
526
+ # Convert resseq_col back to Int64 if it was before, handling potential NaNs if any step failed
527
+ df_fitted[resseq_col] = df_fitted[resseq_col].astype("Int64")
528
+
529
+ # 3. Renumber Atom Serials
530
+ new_serial_col = "new_serial"
531
+ df_fitted[new_serial_col] = -1 # Initialize
532
+ current_serial = 0
533
+ last_chain_id_for_serial = None
534
+
535
+ # Iterate in the potentially re-sorted order after grouping/mapping
536
+ # Ensure stable sort order for consistent serial numbering
537
+ df_fitted.sort_index(
538
+ inplace=True
539
+ ) # Sort by original index to maintain original atom order as much as possible
540
+
541
+ for index, row in df_fitted.iterrows():
542
+ current_chain_id = row[chain_col]
543
+ if (
544
+ last_chain_id_for_serial is not None
545
+ and current_chain_id != last_chain_id_for_serial
546
+ ):
547
+ current_serial += 1 # Increment for TER line
548
+
549
+ current_serial += 1
550
+ if current_serial > max_pdb_serial:
551
+ # This should have been caught by the initial check, but is a safeguard
552
+ raise ValueError("Serial number exceeded PDB limit during renumbering.")
553
+
554
+ df_fitted.loc[index, new_serial_col] = current_serial
555
+ last_chain_id_for_serial = current_chain_id
556
+
557
+ # Replace original serial number
558
+ df_fitted[serial_col] = df_fitted[new_serial_col]
559
+ df_fitted.drop(columns=[new_serial_col], inplace=True)
560
+ # Convert serial_col back to Int64
561
+ df_fitted[serial_col] = df_fitted[serial_col].astype("Int64")
562
+
563
+ # Update attributes and column types for PDB compatibility
564
+ df_fitted.attrs["format"] = "PDB"
565
+
566
+ # Ensure final column types match expected PDB output (especially categories)
567
+ # Reapply categorical conversion as some operations might change dtypes
568
+ pdb_categorical_cols = [
569
+ "record_type",
570
+ "name",
571
+ "altLoc",
572
+ "resName",
573
+ chain_col,
574
+ "element",
575
+ "charge",
576
+ icode_col,
577
+ ]
578
+ if "record_type" not in df_fitted.columns and "group_PDB" in df_fitted.columns:
579
+ df_fitted.rename(
580
+ columns={"group_PDB": "record_type"}, inplace=True
581
+ ) # Ensure correct name
582
+
583
+ for col in pdb_categorical_cols:
584
+ if col in df_fitted.columns:
585
+ # Handle None explicitly before converting to category if needed
586
+ if df_fitted[col].isnull().any():
587
+ df_fitted[col] = (
588
+ df_fitted[col].astype(object).fillna("")
589
+ ) # Fill None with empty string for category
590
+ df_fitted[col] = df_fitted[col].astype("category")
591
+
592
+ # Rename columns if necessary from mmCIF to PDB standard names
593
+ rename_map = {
594
+ "id": "serial",
595
+ "auth_asym_id": "chainID",
596
+ "auth_seq_id": "resSeq",
597
+ "pdbx_PDB_ins_code": "iCode",
598
+ "label_atom_id": "name", # Prefer label_atom_id if auth_atom_id not present? PDB uses 'name'
599
+ "label_comp_id": "resName", # Prefer label_comp_id if auth_comp_id not present? PDB uses 'resName'
600
+ "type_symbol": "element",
601
+ "pdbx_formal_charge": "charge",
602
+ "Cartn_x": "x",
603
+ "Cartn_y": "y",
604
+ "Cartn_z": "z",
605
+ "B_iso_or_equiv": "tempFactor",
606
+ "group_PDB": "record_type",
607
+ "pdbx_PDB_model_num": "model",
608
+ # Add mappings for auth_atom_id -> name, auth_comp_id -> resName if needed,
609
+ # deciding on precedence if both label_* and auth_* exist.
610
+ # Current write_pdb prioritizes auth_* when reading mmCIF, so map those.
611
+ "auth_atom_id": "name",
612
+ "auth_comp_id": "resName",
613
+ }
614
+
615
+ # Only rename columns that actually exist in the DataFrame
616
+ actual_rename_map = {k: v for k, v in rename_map.items() if k in df_fitted.columns}
617
+ df_fitted.rename(columns=actual_rename_map, inplace=True)
618
+
619
+ # Ensure essential PDB columns exist, even if empty, if they were created during fitting
620
+ pdb_essential_cols = [
621
+ "record_type",
622
+ "serial",
623
+ "name",
624
+ "altLoc",
625
+ "resName",
626
+ "chainID",
627
+ "resSeq",
628
+ "iCode",
629
+ "x",
630
+ "y",
631
+ "z",
632
+ "occupancy",
633
+ "tempFactor",
634
+ "element",
635
+ "charge",
636
+ "model",
637
+ ]
638
+ for col in pdb_essential_cols:
639
+ if col not in df_fitted.columns:
640
+ # This case might occur if input mmCIF was missing fundamental columns mapped to PDB essentials
641
+ # Decide on default value or raise error. Adding empty series for now.
642
+ df_fitted[col] = pd.Series(
643
+ dtype="object"
644
+ ) # Add as object to handle potential None/mixed types initially
645
+
646
+ # Re-order columns to standard PDB order for clarity
647
+ final_pdb_order = [col for col in pdb_essential_cols if col in df_fitted.columns]
648
+ other_cols = [col for col in df_fitted.columns if col not in final_pdb_order]
649
+ df_fitted = df_fitted[final_pdb_order + other_cols]
650
+
651
+ # --- Final Type Conversions for PDB format ---
652
+ # Convert numeric columns (similar to parse_pdb_atoms)
653
+ pdb_numeric_columns = [
654
+ "serial",
655
+ "resSeq",
656
+ "x",
657
+ "y",
658
+ "z",
659
+ "occupancy",
660
+ "tempFactor",
661
+ "model",
662
+ ]
663
+ for col in pdb_numeric_columns:
664
+ if col in df_fitted.columns:
665
+ # Use Int64 for integer-like columns that might have been NaN during processing
666
+ if col in ["serial", "resSeq", "model"]:
667
+ df_fitted[col] = pd.to_numeric(df_fitted[col], errors="coerce").astype(
668
+ "Int64"
669
+ )
670
+ else: # Floats
671
+ df_fitted[col] = pd.to_numeric(df_fitted[col], errors="coerce")
672
+
673
+ # Convert categorical columns (similar to parse_pdb_atoms)
674
+ # Note: chainID and iCode were already handled during fitting/renaming
675
+ pdb_categorical_columns_final = [
676
+ "record_type",
677
+ "name",
678
+ "altLoc",
679
+ "resName",
680
+ "chainID", # Already category, but ensure consistency
681
+ "iCode", # Already category, but ensure consistency
682
+ "element",
683
+ "charge",
684
+ ]
685
+ for col in pdb_categorical_columns_final:
686
+ if col in df_fitted.columns:
687
+ # Ensure the column is categorical first
688
+ if not pd.api.types.is_categorical_dtype(df_fitted[col]):
689
+ # Convert non-categorical columns, handling potential NaNs
690
+ if df_fitted[col].isnull().any():
691
+ df_fitted[col] = (
692
+ df_fitted[col].astype(object).fillna("").astype("category")
693
+ )
694
+ else:
695
+ df_fitted[col] = df_fitted[col].astype("category")
696
+ else:
697
+ # If already categorical, check if '' needs to be added before fillna
698
+ has_nans = df_fitted[col].isnull().any()
699
+ if has_nans and "" not in df_fitted[col].cat.categories:
700
+ # Add '' category explicitly
701
+ df_fitted[col] = df_fitted[col].cat.add_categories([""])
702
+
703
+ # Fill None/NaN with empty string (now safe)
704
+ if has_nans:
705
+ df_fitted[col].fillna("", inplace=True)
706
+
707
+ return df_fitted
708
+
709
+
710
+ def _format_pdb_atom_line(atom_data: dict) -> str:
711
+ """Formats a dictionary of atom data into a PDB ATOM/HETATM line."""
712
+ # PDB format specification:
713
+ # COLUMNS DATA TYPE FIELD DEFINITION
714
+ # -----------------------------------------------------------------------
715
+ # 1 - 6 Record name "ATOM " or "HETATM"
716
+ # 7 - 11 Integer serial Atom serial number.
717
+ # 13 - 16 Atom name Atom name.
718
+ # 17 Character altLoc Alternate location indicator.
719
+ # 18 - 20 Residue name resName Residue name.
720
+ # 22 Character chainID Chain identifier.
721
+ # 23 - 26 Integer resSeq Residue sequence number.
722
+ # 27 AChar iCode Code for insertion of residues.
723
+ # 31 - 38 Real(8.3) x Orthogonal coordinates for X.
724
+ # 39 - 46 Real(8.3) y Orthogonal coordinates for Y.
725
+ # 47 - 54 Real(8.3) z Orthogonal coordinates for Z.
726
+ # 55 - 60 Real(6.2) occupancy Occupancy.
727
+ # 61 - 66 Real(6.2) tempFactor Temperature factor.
728
+ # 77 - 78 LString(2) element Element symbol, right-justified.
729
+ # 79 - 80 LString(2) charge Charge on the atom.
730
+
731
+ # Record name (ATOM/HETATM)
732
+ record_name = atom_data.get("record_name", "ATOM").ljust(6)
733
+
734
+ # Serial number
735
+ serial = str(atom_data.get("serial", 0)).rjust(5)
736
+
737
+ # Atom name - special alignment rules
738
+ atom_name = atom_data.get("name", "")
739
+ if len(atom_name) < 4 and atom_name[:1].isalpha():
740
+ # Pad with space on left for 1-3 char names starting with a letter
741
+ atom_name_fmt = (" " + atom_name).ljust(4)
742
+ else:
743
+ # Use as is, left-justified, for 4-char names or those starting with a digit
744
+ atom_name_fmt = atom_name.ljust(4)
745
+
746
+ # Alternate location indicator
747
+ alt_loc = atom_data.get("altLoc", "")[:1].ljust(1) # Max 1 char
748
+
749
+ # Residue name
750
+ res_name = atom_data.get("resName", "").rjust(
751
+ 3
752
+ ) # Spec says "Residue name", examples often right-justified
753
+
754
+ # Chain identifier
755
+ chain_id = atom_data.get("chainID", "")[:1].ljust(1) # Max 1 char
756
+
757
+ # Residue sequence number
758
+ res_seq = str(atom_data.get("resSeq", 0)).rjust(4)
759
+
760
+ # Insertion code
761
+ icode = atom_data.get("iCode", "")[:1].ljust(1) # Max 1 char
762
+
763
+ # Coordinates
764
+ x = f"{atom_data.get('x', 0.0):8.3f}"
765
+ y = f"{atom_data.get('y', 0.0):8.3f}"
766
+ z = f"{atom_data.get('z', 0.0):8.3f}"
767
+
768
+ # Occupancy
769
+ occupancy = f"{atom_data.get('occupancy', 1.0):6.2f}"
770
+
771
+ # Temperature factor
772
+ temp_factor = f"{atom_data.get('tempFactor', 0.0):6.2f}"
773
+
774
+ # Element symbol
775
+ element = atom_data.get("element", "").rjust(2)
776
+
777
+ # Charge
778
+ charge_val = atom_data.get("charge", "")
779
+ charge_fmt = ""
780
+ if charge_val:
781
+ try:
782
+ # Try converting numeric charge (e.g., +1, -2) to PDB format (1+, 2-)
783
+ charge_int = int(float(charge_val)) # Use float first for cases like "1.0"
784
+ if charge_int != 0:
785
+ charge_fmt = f"{abs(charge_int)}{'+' if charge_int > 0 else '-'}"
786
+ except ValueError:
787
+ # If already formatted (e.g., "1+", "FE2+"), use its string representation
788
+ charge_fmt = str(charge_val)
789
+ # Ensure it fits and is right-justified
790
+ charge_fmt = charge_fmt.strip()[:2].rjust(2)
791
+ else:
792
+ charge_fmt = " " # Blank if no charge
336
793
 
337
- # Set element symbol
338
- if format_type == "PDB":
339
- element = row["element"]
340
- else: # mmCIF
341
- element = row.get("type_symbol", "")
342
- line = line[:76] + element.rjust(2) + line[78:]
794
+ # Construct the full line
795
+ # Ensure spacing is correct according to the spec
796
+ # 1-6 Record name | 7-11 Serial | 12 Space | 13-16 Name | 17 AltLoc | 18-20 ResName | 21 Space | 22 ChainID | 23-26 ResSeq | 27 iCode | 28-30 Spaces | 31-38 X | 39-46 Y | 47-54 Z | 55-60 Occupancy | 61-66 TempFactor | 67-76 Spaces | 77-78 Element | 79-80 Charge
797
+ line = (
798
+ f"{record_name}{serial} {atom_name_fmt}{alt_loc}{res_name} {chain_id}{res_seq}{icode} "
799
+ f"{x}{y}{z}{occupancy}{temp_factor} " # 10 spaces
800
+ f"{element}{charge_fmt}"
801
+ )
802
+
803
+ # Ensure the line is exactly 80 characters long
804
+ return line.ljust(80)
343
805
 
344
- # Set charge
806
+
807
def write_pdb(
    df: pd.DataFrame, output: Union[str, TextIO, None] = None
) -> Union[str, None]:
    """
    Write a DataFrame of atom records to PDB format.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing atom records, as created by parse_pdb_atoms or
        parse_cif_atoms. Must contain columns mappable to PDB format fields;
        df.attrs["format"] selects the column mapping ("PDB" or "mmCIF").
    output : Union[str, TextIO, None], optional
        Output file path or file-like object. If None, returns the PDB content
        as a string.

    Returns:
    --------
    Union[str, None]
        If output is None, returns the PDB content as a string. Otherwise,
        returns None.

    Raises:
    -------
    ValueError
        If df.attrs["format"] is neither "PDB" nor "mmCIF".
    """

    def _text(value, placeholders=()) -> str:
        # Normalize a cell to a plain string: missing values (None/NaN) and
        # any listed placeholder tokens become "". mmCIF uses "?" and "." as
        # "unknown"/"not applicable" markers, which must not leak into PDB
        # fixed-width fields (the pre-refactor code filtered these for charge).
        if pd.isna(value):
            return ""
        text = str(value)
        return "" if text in placeholders else text

    def _write_ter(buf, serial, res_info, chain_id) -> None:
        # TER record layout: cols 1-6 "TER   ", 7-11 serial, 18-20 resName,
        # 22 chainID, 23-26 resSeq, 27 iCode; padded to 80 columns.
        res_seq, icode, res_name = res_info
        ter_line = (
            f"TER   {str(serial).rjust(5)}      "
            f"{res_name.strip().rjust(3)} {chain_id}{str(res_seq).rjust(4)}{icode or ''}"
        )
        buf.write(ter_line.ljust(80) + "\n")

    def _finish(buf) -> Union[str, None]:
        # Common tail: END record, then write to the requested destination
        # (path or file-like) or return the text when no output was given.
        buf.write("END\n")
        content = buf.getvalue()
        buf.close()
        if output is not None:
            if isinstance(output, str):
                with open(output, "w") as f:
                    f.write(content)
            else:
                output.write(content)
            return None
        return content

    buffer = io.StringIO()
    format_type = df.attrs.get("format", "PDB")  # Assume PDB if not specified

    # An empty DataFrame still yields a minimal, valid PDB file.
    if df.empty:
        return _finish(buffer)

    last_model_num = None
    last_chain_id = None
    last_res_info = None  # (resSeq, iCode, resName) of the previous atom
    last_serial = 0

    for _, row in df.iterrows():
        # --- Data Extraction: map source columns to PDB field names ---
        if format_type == "PDB":
            atom_data = {
                "record_name": row.get("record_type", "ATOM"),
                "serial": int(row.get("serial", 0)),
                "name": str(row.get("name", "")),
                "altLoc": _text(row.get("altLoc")),
                "resName": str(row.get("resName", "")),
                "chainID": str(row.get("chainID", "")),
                "resSeq": int(row.get("resSeq", 0)),
                "iCode": _text(row.get("iCode")),
                "x": float(row.get("x", 0.0)),
                "y": float(row.get("y", 0.0)),
                "z": float(row.get("z", 0.0)),
                "occupancy": float(row.get("occupancy", 1.0)),
                "tempFactor": float(row.get("tempFactor", 0.0)),
                "element": _text(row.get("element")),
                "charge": _text(row.get("charge")),
                "model": int(row.get("model", 1)),
            }
        elif format_type == "mmCIF":
            # Prefer author-assigned identifiers, falling back to label_* ones.
            # "?" / "." placeholders are stripped so they never reach the
            # fixed-width PDB columns.
            cif_missing = ("?", ".")
            atom_data = {
                "record_name": row.get("group_PDB", "ATOM"),
                "serial": int(row.get("id", 0)),
                "name": str(row.get("auth_atom_id", row.get("label_atom_id", ""))),
                "altLoc": _text(row.get("label_alt_id"), cif_missing),
                "resName": str(row.get("auth_comp_id", row.get("label_comp_id", ""))),
                # Final default "" prevents a missing chain from becoming the
                # literal string "None".
                "chainID": str(row.get("auth_asym_id", row.get("label_asym_id", ""))),
                "resSeq": int(row.get("auth_seq_id", row.get("label_seq_id", 0))),
                "iCode": _text(row.get("pdbx_PDB_ins_code"), cif_missing),
                "x": float(row.get("Cartn_x", 0.0)),
                "y": float(row.get("Cartn_y", 0.0)),
                "z": float(row.get("Cartn_z", 0.0)),
                "occupancy": float(row.get("occupancy", 1.0)),
                "tempFactor": float(row.get("B_iso_or_equiv", 0.0)),
                "element": _text(row.get("type_symbol"), cif_missing),
                "charge": _text(row.get("pdbx_formal_charge"), cif_missing),
                "model": int(row.get("pdbx_PDB_model_num", 1)),
            }
        else:
            raise ValueError(f"Unsupported DataFrame format: {format_type}")

        # --- MODEL/ENDMDL Records ---
        current_model_num = atom_data["model"]
        if current_model_num != last_model_num:
            if last_model_num is not None:
                # Close the previous model: terminate its last chain (the
                # original only TER-terminated the final model), then ENDMDL.
                if last_chain_id is not None:
                    _write_ter(buffer, last_serial + 1, last_res_info, last_chain_id)
                buffer.write("ENDMDL\n")
            # MODEL serial sits right-justified in columns 11-14, matching
            # parse_pdb_atoms which reads line[10:14].
            buffer.write(f"MODEL     {current_model_num:>4}\n")
            last_model_num = current_model_num
            # Reset chain/residue tracking for the new model.
            last_chain_id = None
            last_res_info = None

        # --- TER Records: terminate a chain when the chain ID changes ---
        current_chain_id = atom_data["chainID"]
        current_res_info = (
            atom_data["resSeq"],
            atom_data["iCode"],
            atom_data["resName"],
        )
        if last_chain_id is not None and current_chain_id != last_chain_id:
            _write_ter(buffer, last_serial + 1, last_res_info, last_chain_id)

        # --- Format and Write ATOM/HETATM Line ---
        buffer.write(_format_pdb_atom_line(atom_data) + "\n")

        # --- Update Tracking Variables ---
        last_serial = atom_data["serial"]
        last_chain_id = current_chain_id
        last_res_info = current_res_info

    # --- Final Records ---
    # Terminate the very last chain, close the last model, then END.
    if last_chain_id is not None:
        _write_ter(buffer, last_serial + 1, last_res_info, last_chain_id)
    if last_model_num is not None:
        buffer.write("ENDMDL\n")
    return _finish(buffer)
382
991
 
383
992
 
384
993
  def write_cif(
@@ -403,7 +1012,7 @@ def write_cif(
403
1012
  format_type = df.attrs.get("format", "PDB")
404
1013
 
405
1014
  # Create a new DataContainer
406
- data_container = DataContainer("data_structure")
1015
+ data_container = DataContainer("rnapolis")
407
1016
 
408
1017
  # Define the attributes for atom_site category
409
1018
  if format_type == "mmCIF":
@@ -432,7 +1041,7 @@ def write_cif(
432
1041
  "auth_comp_id", # resName
433
1042
  "auth_asym_id", # chainID
434
1043
  "auth_atom_id", # name
435
- "pdbx_PDB_model_num", # (generated)
1044
+ "pdbx_PDB_model_num", # model
436
1045
  ]
437
1046
 
438
1047
  # Prepare rows for the atom_site category
@@ -440,32 +1049,44 @@ def write_cif(
440
1049
 
441
1050
  for _, row in df.iterrows():
442
1051
  if format_type == "mmCIF":
443
- # Use existing mmCIF data
444
- row_data = [str(row.get(attr, "?")) for attr in attributes]
1052
+ # Use existing mmCIF data, converting None to '?' universally
1053
+ row_data = []
1054
+ for attr in attributes:
1055
+ value = row.get(attr)
1056
+ if pd.isna(value):
1057
+ # Use '?' as the standard placeholder for missing values
1058
+ row_data.append("?")
1059
+ else:
1060
+ # Ensure all non-missing values are converted to string
1061
+ row_data.append(str(value))
445
1062
  else: # PDB format
446
- # Map PDB data to mmCIF format
1063
+ # Map PDB data to mmCIF format, converting None to '.' or '?'
447
1064
  entity_id = "1" # Default entity ID
448
- model_num = "1" # Default model number
1065
+ model_num = str(int(row["model"]))
1066
+
1067
+ # Pre-process optional fields for mmCIF placeholders
1068
+ element_val = "?" if pd.isna(row.get("element")) else str(row["element"])
1069
+ altloc_val = "." if pd.isna(row.get("altLoc")) else str(row["altLoc"])
1070
+ icode_val = "." if pd.isna(row.get("iCode")) else str(row["iCode"])
1071
+ charge_val = "." if pd.isna(row.get("charge")) else str(row["charge"])
449
1072
 
450
1073
  row_data = [
451
1074
  str(row["record_type"]), # group_PDB
452
1075
  str(int(row["serial"])), # id
453
- str(row["element"]), # type_symbol
1076
+ element_val, # type_symbol
454
1077
  str(row["name"]), # label_atom_id
455
- str(row.get("altLoc", "")), # label_alt_id
1078
+ altloc_val, # label_alt_id
456
1079
  str(row["resName"]), # label_comp_id
457
1080
  str(row["chainID"]), # label_asym_id
458
1081
  entity_id, # label_entity_id
459
1082
  str(int(row["resSeq"])), # label_seq_id
460
- str(row["iCode"])
461
- if pd.notna(row["iCode"])
462
- else "?", # pdbx_PDB_ins_code
1083
+ icode_val, # pdbx_PDB_ins_code
463
1084
  f"{float(row['x']):.3f}", # Cartn_x
464
1085
  f"{float(row['y']):.3f}", # Cartn_y
465
1086
  f"{float(row['z']):.3f}", # Cartn_z
466
1087
  f"{float(row['occupancy']):.2f}", # occupancy
467
1088
  f"{float(row['tempFactor']):.2f}", # B_iso_or_equiv
468
- str(row.get("charge", "")) or "?", # pdbx_formal_charge
1089
+ charge_val, # pdbx_formal_charge
469
1090
  str(int(row["resSeq"])), # auth_seq_id
470
1091
  str(row["resName"]), # auth_comp_id
471
1092
  str(row["chainID"]), # auth_asym_id