RNApolis 0.4.17__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rnapolis/parser_v2.py ADDED
@@ -0,0 +1,506 @@
+import io
+import tempfile
+from typing import IO, TextIO, Union
+
+import pandas as pd
+from mmcif.io.IoAdapterPy import IoAdapterPy
+from mmcif.io.PdbxReader import DataCategory, DataContainer
+
+
+def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
+    """
+    Parse PDB file content and extract ATOM and HETATM records into a pandas DataFrame.
+
+    Parameters:
+    -----------
+    content : Union[str, IO[str]]
+        Content of a PDB file as a string or file-like object
+
+    Returns:
+    --------
+    pd.DataFrame
+        DataFrame containing parsed ATOM and HETATM records with columns corresponding to PDB format
+    """
+    records = []
+
+    # Handle both string content and file-like objects
+    if isinstance(content, str):
+        lines = content.splitlines()
+    else:
+        # Read all lines from the file-like object
+        content.seek(0)  # Ensure we're at the beginning of the file
+        lines = content.readlines()
+        # Convert bytes to string if needed
+        if lines and isinstance(lines[0], bytes):
+            lines = [line.decode("utf-8") for line in lines]
+
+    for line in lines:
+        record_type = line[:6].strip()
+
+        # Only process ATOM and HETATM records
+        if record_type not in ["ATOM", "HETATM"]:
+            continue
+
+        # Parse fields according to PDB format specification
+        icode = line[26:27].strip()
+        record = {
+            "record_type": record_type,
+            "serial": line[6:11].strip(),
+            "name": line[12:16].strip(),
+            "altLoc": line[16:17].strip(),
+            "resName": line[17:20].strip(),
+            "chainID": line[21:22].strip(),
+            "resSeq": line[22:26].strip(),
+            "iCode": None if not icode else icode,  # Convert empty string to None
+            "x": line[30:38].strip(),
+            "y": line[38:46].strip(),
+            "z": line[46:54].strip(),
+            "occupancy": line[54:60].strip(),
+            "tempFactor": line[60:66].strip(),
+            "element": line[76:78].strip(),
+            "charge": line[78:80].strip(),
+        }
+
+        records.append(record)
+
+    # Create DataFrame from records
+    if not records:
+        # Return empty DataFrame with correct columns if no records found
+        return pd.DataFrame(
+            columns=[
+                "record_type",
+                "serial",
+                "name",
+                "altLoc",
+                "resName",
+                "chainID",
+                "resSeq",
+                "iCode",
+                "x",
+                "y",
+                "z",
+                "occupancy",
+                "tempFactor",
+                "element",
+                "charge",
+            ]
+        )
+
+    df = pd.DataFrame(records)
+
+    # Convert numeric columns to appropriate types
+    numeric_columns = ["serial", "resSeq", "x", "y", "z", "occupancy", "tempFactor"]
+    for col in numeric_columns:
+        df[col] = pd.to_numeric(df[col], errors="coerce")
+
+    # Convert categorical columns
+    categorical_columns = [
+        "record_type",
+        "name",
+        "altLoc",
+        "resName",
+        "chainID",
+        "element",
+        "charge",
+    ]
+    for col in categorical_columns:
+        df[col] = df[col].astype("category")
+
+    # Add format attribute to the DataFrame
+    df.attrs["format"] = "PDB"
+
+    return df
+
+
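
A minimal usage sketch for parse_pdb_atoms, assuming a hypothetical structure.pdb on disk; the function accepts either a string or an open file handle:

from rnapolis.parser_v2 import parse_pdb_atoms

with open("structure.pdb") as f:          # hypothetical input path
    atoms = parse_pdb_atoms(f)

print(atoms.attrs["format"])              # "PDB"
print(atoms[["name", "resName", "chainID", "resSeq"]].head())
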
+def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
+    """
+    Parse mmCIF file content and extract atom_site records into a pandas DataFrame.
+
+    Parameters:
+    -----------
+    content : Union[str, IO[str]]
+        Content of a mmCIF file as a string or file-like object
+
+    Returns:
+    --------
+    pd.DataFrame
+        DataFrame containing parsed atom_site records with columns corresponding to mmCIF format
+    """
+    adapter = IoAdapterPy()
+
+    # Handle both string content and file-like objects
+    if isinstance(content, str):
+        # Create a temporary file to use with the adapter
+        with tempfile.NamedTemporaryFile(mode="w+", suffix=".cif") as temp_file:
+            temp_file.write(content)
+            temp_file.flush()
+            data = adapter.readFile(temp_file.name)
+    else:
+        # Assume it's a file-like object with a name attribute
+        data = adapter.readFile(content.name)
+
+    # Get the atom_site category
+    category = data[0].getObj("atom_site")
+
+    if not category:
+        # Return empty DataFrame if no atom_site category found
+        return pd.DataFrame()
+
+    # Extract attribute names and data rows
+    attributes = category.getAttributeList()
+    rows = category.getRowList()
+
+    # Create a list of dictionaries for each atom
+    records = []
+    for row in rows:
+        record = dict(zip(attributes, row))
+
+        # Convert "?" or "." in insertion code to None
+        if "pdbx_PDB_ins_code" in record:
+            if record["pdbx_PDB_ins_code"] in ["?", ".", ""]:
+                record["pdbx_PDB_ins_code"] = None
+
+        records.append(record)
+
+    # Create DataFrame from records
+    df = pd.DataFrame(records)
+
+    # Convert numeric columns to appropriate types
+    numeric_columns = [
+        "id",
+        "auth_seq_id",
+        "Cartn_x",
+        "Cartn_y",
+        "Cartn_z",
+        "occupancy",
+        "B_iso_or_equiv",
+        "pdbx_formal_charge",
+    ]
+
+    for col in numeric_columns:
+        if col in df.columns:
+            df[col] = pd.to_numeric(df[col], errors="coerce")
+
+    # Convert categorical columns
+    categorical_columns = [
+        "group_PDB",
+        "type_symbol",
+        "label_atom_id",
+        "label_comp_id",
+        "label_asym_id",
+        "auth_atom_id",
+        "auth_comp_id",
+        "auth_asym_id",
+    ]
+
+    for col in categorical_columns:
+        if col in df.columns:
+            df[col] = df[col].astype("category")
+
+    # Add format attribute to the DataFrame
+    df.attrs["format"] = "mmCIF"
+
+    return df
+
+
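
A similar sketch for parse_cif_atoms, again with a hypothetical file name; note that a file-like argument must expose a usable .name attribute, because the adapter reads from that path, while string input is routed through a temporary .cif file:

from rnapolis.parser_v2 import parse_cif_atoms

with open("structure.cif") as f:          # hypothetical input path
    atoms = parse_cif_atoms(f)            # read via f.name

print(atoms.attrs["format"])              # "mmCIF"
print(atoms[["label_atom_id", "auth_asym_id", "auth_seq_id"]].head())
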
+def write_pdb(
+    df: pd.DataFrame, output: Union[str, TextIO, None] = None
+) -> Union[str, None]:
+    """
+    Write a DataFrame of atom records to PDB format.
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms
+    output : Union[str, TextIO, None], optional
+        Output file path or file-like object. If None, returns the PDB content as a string.
+
+    Returns:
+    --------
+    Union[str, None]
+        If output is None, returns the PDB content as a string. Otherwise, returns None.
+    """
+    # Create a buffer to store the PDB content
+    buffer = io.StringIO()
+
+    # Get the format of the DataFrame
+    format_type = df.attrs.get("format", "PDB")
+
+    # Process each row in the DataFrame
+    for _, row in df.iterrows():
+        # Initialize the line with spaces
+        line = " " * 80
+
+        # Set record type (ATOM or HETATM)
+        if format_type == "PDB":
+            record_type = row["record_type"]
+        else:  # mmCIF
+            record_type = row.get("group_PDB", "ATOM")
+        line = record_type.ljust(6) + line[6:]
+
+        # Set atom serial number
+        if format_type == "PDB":
+            serial = str(int(row["serial"]))
+        else:  # mmCIF
+            serial = str(int(row["id"]))
+        line = line[:6] + serial.rjust(5) + line[11:]
+
+        # Set atom name
+        if format_type == "PDB":
+            atom_name = row["name"]
+        else:  # mmCIF
+            atom_name = row.get("auth_atom_id", row.get("label_atom_id", ""))
+
+        # Names starting with a digit or four characters long fill columns 13-16
+        if atom_name and (atom_name[0].isdigit() or len(atom_name) >= 4):
+            line = line[:12] + atom_name.ljust(4) + line[16:]
+        else:
+            line = line[:12] + " " + atom_name.ljust(3) + line[16:]
+
+        # Set alternate location indicator (pad with a space to keep columns aligned)
+        if format_type == "PDB":
+            alt_loc = row.get("altLoc", "")
+        else:  # mmCIF
+            alt_loc = row.get("label_alt_id", "")
+        line = line[:16] + (alt_loc or " ") + line[17:]
+
+        # Set residue name
+        if format_type == "PDB":
+            res_name = row["resName"]
+        else:  # mmCIF
+            res_name = row.get("auth_comp_id", row.get("label_comp_id", ""))
+        line = line[:17] + res_name.ljust(3) + line[20:]
+
+        # Set chain identifier (pad with a space to keep columns aligned)
+        if format_type == "PDB":
+            chain_id = row["chainID"]
+        else:  # mmCIF
+            chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
+        line = line[:21] + (chain_id or " ") + line[22:]
+
+        # Set residue sequence number
+        if format_type == "PDB":
+            res_seq = str(int(row["resSeq"]))
+        else:  # mmCIF
+            res_seq = str(int(row.get("auth_seq_id", row.get("label_seq_id", 0))))
+        line = line[:22] + res_seq.rjust(4) + line[26:]
+
+        # Set insertion code (pad with a space to keep columns aligned)
+        if format_type == "PDB":
+            icode = row["iCode"] if pd.notna(row["iCode"]) else ""
+        else:  # mmCIF
+            icode = (
+                row.get("pdbx_PDB_ins_code", "")
+                if pd.notna(row.get("pdbx_PDB_ins_code", ""))
+                else ""
+            )
+        line = line[:26] + (icode or " ") + line[27:]
+
+        # Set X coordinate
+        if format_type == "PDB":
+            x = float(row["x"])
+        else:  # mmCIF
+            x = float(row["Cartn_x"])
+        line = line[:30] + f"{x:8.3f}" + line[38:]
+
+        # Set Y coordinate
+        if format_type == "PDB":
+            y = float(row["y"])
+        else:  # mmCIF
+            y = float(row["Cartn_y"])
+        line = line[:38] + f"{y:8.3f}" + line[46:]
+
+        # Set Z coordinate
+        if format_type == "PDB":
+            z = float(row["z"])
+        else:  # mmCIF
+            z = float(row["Cartn_z"])
+        line = line[:46] + f"{z:8.3f}" + line[54:]
+
+        # Set occupancy
+        if format_type == "PDB":
+            occupancy = float(row["occupancy"])
+        else:  # mmCIF
+            occupancy = float(row.get("occupancy", 1.0))
+        line = line[:54] + f"{occupancy:6.2f}" + line[60:]
+
+        # Set temperature factor
+        if format_type == "PDB":
+            temp_factor = float(row["tempFactor"])
+        else:  # mmCIF
+            temp_factor = float(row.get("B_iso_or_equiv", 0.0))
+        line = line[:60] + f"{temp_factor:6.2f}" + line[66:]
+
+        # Set element symbol
+        if format_type == "PDB":
+            element = row["element"]
+        else:  # mmCIF
+            element = row.get("type_symbol", "")
+        line = line[:76] + element.rjust(2) + line[78:]
+
+        # Set charge
+        if format_type == "PDB":
+            charge = row["charge"]
+        else:  # mmCIF
+            charge = row.get("pdbx_formal_charge", "")
+            if pd.isna(charge) or charge == 0:  # missing or zero after numeric conversion
+                charge = ""
+        if charge and charge not in ["?", "."]:
+            # Convert numeric charge to PDB format (e.g., "1+" or "2-")
+            try:
+                charge_val = int(charge)
+                if charge_val != 0:
+                    charge = f"{abs(charge_val)}{'+' if charge_val > 0 else '-'}"
+                else:
+                    charge = ""
+            except ValueError:
+                pass
+        line = line[:78] + charge + line[80:]
+
+        # Write the line to the buffer
+        buffer.write(line.rstrip() + "\n")
+
+    # Add END record
+    buffer.write("END\n")
+
+    # Get the content as a string
+    content = buffer.getvalue()
+    buffer.close()
+
+    # Write to output if provided
+    if output is not None:
+        if isinstance(output, str):
+            with open(output, "w") as f:
+                f.write(content)
+        else:
+            output.write(content)
+        return None
+
+    # Return the content as a string
+    return content
+
+
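
Taken together with parse_cif_atoms, write_pdb gives an mmCIF-to-PDB conversion path; a sketch with hypothetical file names:

from rnapolis.parser_v2 import parse_cif_atoms, write_pdb

atoms = parse_cif_atoms(open("structure.cif"))   # hypothetical input
pdb_text = write_pdb(atoms)                      # output=None returns the text
write_pdb(atoms, "converted.pdb")                # or write straight to a path
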
+def write_cif(
+    df: pd.DataFrame, output: Union[str, TextIO, None] = None
+) -> Union[str, None]:
+    """
+    Write a DataFrame of atom records to mmCIF format.
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms
+    output : Union[str, TextIO, None], optional
+        Output file path or file-like object. If None, returns the mmCIF content as a string.
+
+    Returns:
+    --------
+    Union[str, None]
+        If output is None, returns the mmCIF content as a string. Otherwise, returns None.
+    """
+    # Get the format of the DataFrame
+    format_type = df.attrs.get("format", "PDB")
+
+    # Create a new DataContainer
+    data_container = DataContainer("data_structure")
+
+    # Define the attributes for the atom_site category
+    if format_type == "mmCIF":
+        # Use existing mmCIF attributes
+        attributes = list(df.columns)
+    else:  # PDB format
+        # Map PDB columns to mmCIF attributes
+        attributes = [
+            "group_PDB",  # record_type
+            "id",  # serial
+            "type_symbol",  # element
+            "label_atom_id",  # name
+            "label_alt_id",  # altLoc
+            "label_comp_id",  # resName
+            "label_asym_id",  # chainID
+            "label_entity_id",  # (generated)
+            "label_seq_id",  # resSeq
+            "pdbx_PDB_ins_code",  # iCode
+            "Cartn_x",  # x
+            "Cartn_y",  # y
+            "Cartn_z",  # z
+            "occupancy",  # occupancy
+            "B_iso_or_equiv",  # tempFactor
+            "pdbx_formal_charge",  # charge
+            "auth_seq_id",  # resSeq
+            "auth_comp_id",  # resName
+            "auth_asym_id",  # chainID
+            "auth_atom_id",  # name
+            "pdbx_PDB_model_num",  # (generated)
+        ]
+
+    # Prepare rows for the atom_site category
+    rows = []
+
+    for _, row in df.iterrows():
+        if format_type == "mmCIF":
+            # Use existing mmCIF data; represent missing values as "?"
+            row_data = [
+                "?" if pd.isna(row.get(attr)) else str(row.get(attr))
+                for attr in attributes
+            ]
+        else:  # PDB format
+            # Map PDB data to mmCIF format
+            entity_id = "1"  # Default entity ID
+            model_num = "1"  # Default model number
+
+            row_data = [
+                str(row["record_type"]),  # group_PDB
+                str(int(row["serial"])),  # id
+                str(row["element"]),  # type_symbol
+                str(row["name"]),  # label_atom_id
+                str(row.get("altLoc", "")) or ".",  # label_alt_id
+                str(row["resName"]),  # label_comp_id
+                str(row["chainID"]),  # label_asym_id
+                entity_id,  # label_entity_id
+                str(int(row["resSeq"])),  # label_seq_id
+                str(row["iCode"])
+                if pd.notna(row["iCode"])
+                else "?",  # pdbx_PDB_ins_code
+                f"{float(row['x']):.3f}",  # Cartn_x
+                f"{float(row['y']):.3f}",  # Cartn_y
+                f"{float(row['z']):.3f}",  # Cartn_z
+                f"{float(row['occupancy']):.2f}",  # occupancy
+                f"{float(row['tempFactor']):.2f}",  # B_iso_or_equiv
+                str(row.get("charge", "")) or "?",  # pdbx_formal_charge
+                str(int(row["resSeq"])),  # auth_seq_id
+                str(row["resName"]),  # auth_comp_id
+                str(row["chainID"]),  # auth_asym_id
+                str(row["name"]),  # auth_atom_id
+                model_num,  # pdbx_PDB_model_num
+            ]
+
+        rows.append(row_data)
+
+    # Create the atom_site category
+    atom_site_category = DataCategory("atom_site", attributes, rows)
+
+    # Add the category to the data container
+    data_container.append(atom_site_category)
+
+    # Create an IoAdapter for writing
+    adapter = IoAdapterPy()
+
+    # Handle output
+    if output is None:
+        # Return as string - write to a temporary file and read it back
+        with tempfile.NamedTemporaryFile(mode="w+", suffix=".cif") as temp_file:
+            adapter.writeFile(temp_file.name, [data_container])
+            temp_file.flush()
+            temp_file.seek(0)
+            return temp_file.read()
+    elif isinstance(output, str):
+        # Write to a file path
+        adapter.writeFile(output, [data_container])
+        return None
+    else:
+        # Write to a file-like object
+        with tempfile.NamedTemporaryFile(mode="w+", suffix=".cif") as temp_file:
+            adapter.writeFile(temp_file.name, [data_container])
+            temp_file.flush()
+            temp_file.seek(0)
+            output.write(temp_file.read())
+        return None
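
And the reverse direction through write_cif, as an illustrative sketch with hypothetical file names rather than package documentation:

from rnapolis.parser_v2 import parse_pdb_atoms, write_cif

atoms = parse_pdb_atoms(open("structure.pdb"))   # hypothetical input
cif_text = write_cif(atoms)                      # output=None returns the text
write_cif(atoms, "converted.cif")                # or write to a path or file-like object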