RNApolis 0.6.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rnapolis/adapter.py ADDED
@@ -0,0 +1,537 @@
1
+ #! /usr/bin/env python
2
+ import argparse
3
+ import csv
4
+ import logging
5
+ import os
6
+ from enum import Enum
7
+ from typing import Dict, List, Optional, Tuple
8
+
9
+ import orjson
10
+
11
+ from rnapolis.common import (
12
+ BR,
13
+ BaseInteractions,
14
+ BasePair,
15
+ BasePhosphate,
16
+ BaseRibose,
17
+ BPh,
18
+ BpSeq,
19
+ LeontisWesthof,
20
+ OtherInteraction,
21
+ Residue,
22
+ ResidueAuth,
23
+ Stacking,
24
+ StackingTopology,
25
+ Structure2D,
26
+ )
27
+ from rnapolis.parser import read_3d_structure
28
+ from rnapolis.tertiary import Mapping2D3D, Structure3D
29
+ from rnapolis.util import handle_input_file
30
+
31
+
32
class ExternalTool(Enum):
    """External annotation tools whose output files this module can parse."""

    # The string values are what the command-line ``--tool`` option accepts.
    FR3D = "fr3d"
    DSSR = "dssr"
35
+
36
+
37
+ logging.basicConfig(level=os.getenv("LOGLEVEL", "INFO").upper())
38
+
39
+
40
def parse_unit_id(nt: str) -> Residue:
    """Build a Residue from a pipe-separated FR3D unit ID.

    Fields are read by position: index 2, index 4 (converted to ``int``),
    index 7 (only when present and non-empty, otherwise ``None``) and index 3
    are handed to ``ResidueAuth`` in that order -- presumably chain, residue
    number, insertion code and residue name.

    Raises:
        IndexError: if the ID has fewer than five fields.
        ValueError: if field 4 is not an integer.
    """
    parts = nt.split("|")
    has_insertion_code = len(parts) >= 8 and parts[7] != ""
    insertion_code = parts[7] if has_insertion_code else None
    chain = parts[2]
    number = int(parts[4])
    name = parts[3]
    # The first Residue argument is left as None; only the auth part is known.
    return Residue(None, ResidueAuth(chain, number, insertion_code, name))
46
+
47
+
48
def unify_classification(fr3d_name: str) -> tuple:
    """Translate an FR3D interaction label into ``(category, classification)``.

    The category is one of "base-pair", "stacking", "base-ribose",
    "base-phosphate" or "other". For "other" the classification is ``None``;
    otherwise it is the matching enum member.
    """
    raw_name = fr3d_name  # untouched copy, used only in log messages
    name = fr3d_name

    # Leading 'n' (e.g. ncWW -> cWW, ns55 -> s55) is dropped.
    if name.startswith("n"):
        name = name[1:]
        logging.debug(f"Detected 'n' prefix: removed from {raw_name} -> {name}")

    # Trailing 'a' marks an alternative base pair (e.g. cWWa -> cWW).
    if len(name) >= 3 and name.endswith("a"):
        name = name[:-1]
        logging.debug(
            f"Detected alternative base pair: removed 'a' suffix from {raw_name} -> {name}"
        )

    # Base-ribose: a single digit followed by "BR" (0BR ... 9BR).
    if len(name) == 3 and name.endswith("BR") and name[0].isdigit():
        try:
            return ("base-ribose", BR["_" + name[0]])
        except (ValueError, KeyError):
            logging.debug(f"Unknown base-ribose interaction: {raw_name}")
            return ("other", None)

    # Base-phosphate: a single digit followed by "BPh" (0BPh ... 9BPh).
    if len(name) == 4 and name.endswith("BPh") and name[0].isdigit():
        try:
            return ("base-phosphate", BPh["_" + name[0]])
        except (ValueError, KeyError):
            logging.debug(f"Unknown base-phosphate interaction: {raw_name}")
            return ("other", None)

    # Stacking: s33, s35, s53 or s55.
    if len(name) == 3 and name[0] == "s" and set(name[1:]) <= {"3", "5"}:
        topology_by_code = {
            "s33": "downward",
            "s55": "upward",
            "s35": "outward",
            "s53": "inward",
        }
        return ("stacking", getattr(StackingTopology, topology_by_code[name]))

    # Base pair: c/t followed by two edge letters, in any letter case.
    if len(name) == 3 and name[0].lower() in ("c", "t"):
        lw_key = name[0].lower() + name[1].upper() + name[2].upper()
        try:
            return ("base-pair", LeontisWesthof[lw_key])
        except KeyError:
            logging.debug(
                f"Fr3d unknown interaction from service: {raw_name} -> {name}"
            )
            return ("other", None)

    # Anything else is reported as an unclassified interaction.
    logging.debug(f"Fr3d unknown interaction: {name}")
    return ("other", None)
120
+
121
+
122
def _process_interaction_line(
    line: str,
    interactions_data: Dict[str, list],
):
    """
    Parse one tab-separated interaction line and store the result.

    Args:
        line: Interaction line; the first three tab-separated fields are the
            first unit ID, the interaction label and the second unit ID
        interactions_data: Dictionary of lists keyed by "base_pairs",
            "stackings", "base_ribose_interactions",
            "base_phosphate_interactions" and "other_interactions"; the
            parsed interaction is appended to the matching list

    Returns:
        True if the line was processed, False if it had fewer than three
        fields or parsing raised ValueError/IndexError
    """
    try:
        fields = line.split("\t")
        if len(fields) < 3:
            logging.warning(f"Invalid interaction line format: {line}")
            return False

        first_id, label, second_id = fields[0], fields[1], fields[2]

        first = parse_unit_id(first_id)
        second = parse_unit_id(second_id)

        # Map the tool-specific label onto the internal category/enum pair.
        category, classification = unify_classification(label)

        if category == "base-pair":
            target = interactions_data["base_pairs"]
            target.append(BasePair(first, second, classification, None))
        elif category == "stacking":
            target = interactions_data["stackings"]
            target.append(Stacking(first, second, classification))
        elif category == "base-ribose":
            target = interactions_data["base_ribose_interactions"]
            target.append(BaseRibose(first, second, classification))
        elif category == "base-phosphate":
            target = interactions_data["base_phosphate_interactions"]
            target.append(BasePhosphate(first, second, classification))
        elif category == "other":
            target = interactions_data["other_interactions"]
            target.append(OtherInteraction(first, second))

        return True
    except (ValueError, IndexError) as error:
        logging.warning(f"Error parsing interaction: {error}")
        return False
179
+
180
+
181
def match_dssr_name_to_residue(
    structure3d: Structure3D, nt_id: Optional[str]
) -> Optional[Residue]:
    """Find the residue of ``structure3d`` whose ``full_name`` matches ``nt_id``.

    Only the part of ``nt_id`` after the last ":" is compared. Returns the
    first matching residue, or ``None`` when ``nt_id`` is ``None`` or nothing
    matches (the latter is logged as a warning).
    """
    if nt_id is None:
        return None

    wanted = nt_id.split(":")[-1]
    found = next(
        (candidate for candidate in structure3d.residues if candidate.full_name == wanted),
        None,
    )
    if found is None:
        logging.warning(f"Failed to find residue {wanted}")
    return found
191
+
192
+
193
def match_dssr_lw(lw: Optional[str]) -> Optional[LeontisWesthof]:
    """Return the LeontisWesthof member named ``lw``, or ``None`` if there is none.

    Args:
        lw: Classification string taken from the DSSR "LW" field; may be
            ``None`` when the field is absent

    Returns:
        The enum member whose name equals ``lw``; ``None`` for ``None``,
        non-string values and names that are not members
    """
    if not isinstance(lw, str):
        return None
    # Look the name up among real members only. dir() also lists dunder and
    # method names, for which LeontisWesthof[lw] would raise KeyError.
    return LeontisWesthof.__members__.get(lw)
195
+
196
+
197
def parse_dssr_output(
    file_path: str, structure3d: Structure3D, model: Optional[int] = None
) -> BaseInteractions:
    """
    Parse DSSR JSON output and convert to BaseInteractions.

    Args:
        file_path: Path to DSSR JSON output file
        structure3d: The 3D structure parsed from PDB/mmCIF; DSSR nucleotide
            names are matched against its residues
        model: Model number to use (if None, use first model); only relevant
            when the JSON has a top-level "models" list

    Returns:
        BaseInteractions object containing the base pairs and stackings found
        by DSSR; the remaining interaction lists are empty. Pairs whose
        residues cannot be matched or whose "LW" value is not recognised are
        dropped.
    """
    base_pairs: List[BasePair] = []
    stackings: List[Stacking] = []

    # Read bytes so that decoding does not depend on the locale's default
    # text encoding; orjson accepts bytes directly.
    with open(file_path, "rb") as f:
        dssr = orjson.loads(f.read())

    # Handle multi-model files
    if "models" in dssr:
        models = dssr.get("models") or []
        if not models:
            logging.warning(f"DSSR output {file_path} has an empty 'models' list")
        elif model is None:
            # If model is None, use the first model
            dssr = models[0].get("parameters", {})
        else:
            # Otherwise find the specified model
            for result in models:
                if result.get("model", None) == model:
                    dssr = result.get("parameters", {})
                    break
            else:
                # Keep going with the top-level object as before, but make
                # the mismatch visible instead of failing silently.
                logging.warning(
                    f"Model {model} not found in DSSR output {file_path}"
                )

    for pair in dssr.get("pairs", []):
        nt1 = match_dssr_name_to_residue(structure3d, pair.get("nt1", None))
        nt2 = match_dssr_name_to_residue(structure3d, pair.get("nt2", None))
        lw = match_dssr_lw(pair.get("LW", None))

        if nt1 is not None and nt2 is not None and lw is not None:
            base_pairs.append(BasePair(nt1, nt2, lw, None))

    for stack in dssr.get("stacks", []):
        nts_long = stack.get("nts_long")
        if not nts_long:
            # Nothing to match; avoids a pointless "Failed to find residue"
            # warning for an empty name.
            continue
        nts = [
            match_dssr_name_to_residue(structure3d, nt) for nt in nts_long.split(",")
        ]
        # Consecutive nucleotides in a stack are reported as pairwise
        # stackings; DSSR gives no topology, hence None.
        for nt1, nt2 in zip(nts, nts[1:]):
            if nt1 is not None and nt2 is not None:
                stackings.append(Stacking(nt1, nt2, None))

    return BaseInteractions(base_pairs, stackings, [], [], [])
249
+
250
+
251
def parse_external_output(
    file_path: str,
    tool: ExternalTool,
    structure3d: Structure3D,
    model: Optional[int] = None,
) -> BaseInteractions:
    """
    Parse the output from an external tool (FR3D, DSSR, etc.) and convert it to BaseInteractions.

    Args:
        file_path: Path to the external tool output file
        tool: The external tool that generated the output
        structure3d: The 3D structure parsed from PDB/mmCIF (used by the DSSR
            parser only)
        model: Model number forwarded to the DSSR parser (if None, the DSSR
            parser uses the first model); ignored for FR3D

    Returns:
        BaseInteractions object containing the interactions found by the external tool

    Raises:
        ValueError: if ``tool`` is not a supported ExternalTool
    """
    if tool == ExternalTool.FR3D:
        return parse_fr3d_output(file_path)
    if tool == ExternalTool.DSSR:
        # Forward the model so callers can select something other than the
        # first model of a multi-model DSSR file.
        return parse_dssr_output(file_path, structure3d, model)
    raise ValueError(f"Unsupported external tool: {tool}")
271
+
272
+
273
def parse_fr3d_output(file_path: str) -> BaseInteractions:
    """
    Read an FR3D output file and collect its interactions.

    Args:
        file_path: Path to a concatenated FR3D output file containing basepair,
            stacking, and backbone interactions

    Returns:
        BaseInteractions object containing the interactions found by FR3D
    """
    collected = {
        "base_pairs": [],
        "stackings": [],
        "base_ribose_interactions": [],
        "base_phosphate_interactions": [],
        "other_interactions": [],
    }

    with open(file_path, "r") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            # Blank lines and '#' comments carry no interaction.
            if entry and not entry.startswith("#"):
                # Lines that fail to parse are logged by the helper and skipped.
                _process_interaction_line(entry, collected)

    return BaseInteractions(
        collected["base_pairs"],
        collected["stackings"],
        collected["base_ribose_interactions"],
        collected["base_phosphate_interactions"],
        collected["other_interactions"],
    )
311
+
312
+
313
def process_external_tool_output(
    structure3d: Structure3D,
    external_file_path: str,
    tool: ExternalTool,
    model: Optional[int] = None,
    find_gaps: bool = False,
    all_dot_brackets: bool = False,
) -> Tuple[Structure2D, List[str]]:
    """
    Process external tool output and create a secondary structure representation.

    This function can be used from other code to process external tool outputs
    and get a Structure2D object with the secondary structure information.

    Args:
        structure3d: The 3D structure parsed from PDB/mmCIF
        external_file_path: Path to the external tool output file
        tool: The external tool that generated the output (FR3D, DSSR, etc.)
        model: Model number to use (if None, use first model); selects the
            model inside multi-model DSSR output
        find_gaps: Whether to detect gaps in the structure
        all_dot_brackets: Whether to return all possible dot-bracket notations

    Returns:
        A tuple containing the Structure2D object and a list of dot-bracket notations
    """
    # Parse external tool output. DSSR is dispatched directly so that the
    # requested model actually reaches its parser; previously the model was
    # dropped here and the first model was always used.
    if tool == ExternalTool.DSSR:
        base_interactions = parse_dssr_output(external_file_path, structure3d, model)
    else:
        base_interactions = parse_external_output(external_file_path, tool, structure3d)

    # Extract secondary structure using the external tool's interactions
    return extract_secondary_structure_from_external(
        structure3d, base_interactions, model, find_gaps, all_dot_brackets
    )
345
+
346
+
347
def extract_secondary_structure_from_external(
    tertiary_structure: Structure3D,
    base_interactions: BaseInteractions,
    model: Optional[int] = None,
    find_gaps: bool = False,
    all_dot_brackets: bool = False,
) -> Tuple[Structure2D, List[str]]:
    """
    Build a Structure2D from a 3D structure and externally computed interactions.

    Args:
        tertiary_structure: The 3D structure parsed from PDB/mmCIF
        base_interactions: Interactions parsed from external tool output
        model: Accepted for interface symmetry; not read by this function
        find_gaps: Whether to detect gaps in the structure
        all_dot_brackets: Whether to return all possible dot-bracket notations

    Returns:
        A tuple containing the Structure2D object and a list of dot-bracket
        notations (a single-element list unless all_dot_brackets is set)
    """
    mapping = Mapping2D3D(
        tertiary_structure,
        base_interactions.basePairs,
        base_interactions.stackings,
        find_gaps,
    )
    stems, single_strands, hairpins, loops = mapping.bpseq.elements
    secondary = Structure2D(
        base_interactions,
        str(mapping.bpseq),
        mapping.dot_bracket,
        mapping.extended_dot_bracket,
        stems,
        single_strands,
        hairpins,
        loops,
    )
    # mapping.all_dot_brackets is only touched when explicitly requested.
    notations = mapping.all_dot_brackets if all_dot_brackets else [secondary.dotBracket]
    return secondary, notations
388
+
389
+
390
def write_json(path: str, structure2d: BaseInteractions):
    """Serialise ``structure2d`` with orjson and write the bytes to ``path``."""
    with open(path, "wb") as stream:
        payload = orjson.dumps(structure2d)
        stream.write(payload)
393
+
394
+
395
def write_csv(path: str, structure2d: Structure2D):
    """Write every interaction of ``structure2d`` to a CSV file.

    One row is written per interaction with the columns nt1, nt2, type,
    classification-1 and classification-2. Missing classifications are
    written as empty strings.

    Args:
        path: Destination file; created or overwritten
        structure2d: Structure whose ``baseInteractions`` are exported
    """

    def label(member) -> str:
        # Enum members contribute their value; an absent member an empty cell.
        return member.value if member is not None else ""

    interactions = structure2d.baseInteractions

    # newline="" is required by the csv module so that the writer controls
    # line endings itself (otherwise rows end in \r\r\n on Windows).
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["nt1", "nt2", "type", "classification-1", "classification-2"])
        for base_pair in interactions.basePairs:
            saenger = base_pair.saenger
            writer.writerow(
                [
                    base_pair.nt1.full_name,
                    base_pair.nt2.full_name,
                    "base pair",
                    label(base_pair.lw),
                    (saenger.value or "") if saenger is not None else "",
                ]
            )
        for stacking in interactions.stackings:
            writer.writerow(
                [
                    stacking.nt1.full_name,
                    stacking.nt2.full_name,
                    "stacking",
                    label(stacking.topology),
                    "",
                ]
            )
        for base_phosphate in interactions.basePhosphateInteractions:
            writer.writerow(
                [
                    base_phosphate.nt1.full_name,
                    base_phosphate.nt2.full_name,
                    "base-phosphate interaction",
                    label(base_phosphate.bph),
                    "",
                ]
            )
        for base_ribose in interactions.baseRiboseInteractions:
            writer.writerow(
                [
                    base_ribose.nt1.full_name,
                    base_ribose.nt2.full_name,
                    "base-ribose interaction",
                    label(base_ribose.br),
                    "",
                ]
            )
        for other in interactions.otherInteractions:
            writer.writerow(
                [
                    other.nt1.full_name,
                    other.nt2.full_name,
                    "other interaction",
                    "",
                    "",
                ]
            )
453
+
454
+
455
def write_bpseq(path: str, bpseq: BpSeq):
    """Write the string form of ``bpseq`` to the text file at ``path``."""
    with open(path, "w") as stream:
        text = str(bpseq)
        stream.write(text)
458
+
459
+
460
def main():
    """Command-line entry point: derive a secondary structure from external tool output.

    Reads the 3D structure and the external tool file, optionally writes CSV,
    JSON and BPSEQ outputs, and prints a dot-bracket representation to the
    standard output.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("input", help="Path to PDB or mmCIF file")
    cli.add_argument(
        "--external",
        required=True,
        help="Path to external tool output file (FR3D, DSSR, etc.)",
    )
    cli.add_argument(
        "--tool",
        choices=[member.value for member in ExternalTool],
        required=True,
        help="External tool that generated the output file",
    )
    cli.add_argument(
        "-a",
        "--all-dot-brackets",
        action="store_true",
        help="(optional) print all dot-brackets, not only optimal one (exclusive with -e/--extended)",
    )
    cli.add_argument("-b", "--bpseq", help="(optional) path to output BPSEQ file")
    cli.add_argument("-c", "--csv", help="(optional) path to output CSV file")
    cli.add_argument(
        "-j",
        "--json",
        help="(optional) path to output JSON file",
    )
    cli.add_argument(
        "-e",
        "--extended",
        action="store_true",
        help="(optional) if set, the program will print extended secondary structure to the standard output",
    )
    cli.add_argument(
        "-f",
        "--find-gaps",
        action="store_true",
        help="(optional) if set, the program will detect gaps and break the PDB chain into two or more strands",
    )
    cli.add_argument("-d", "--dot", help="(optional) path to output DOT file")
    args = cli.parse_args()

    input_handle = handle_input_file(args.input)
    structure3d = read_3d_structure(input_handle, None)

    # No model is selected from the command line, hence the explicit None.
    structure2d, dot_brackets = process_external_tool_output(
        structure3d,
        args.external,
        ExternalTool(args.tool),
        None,
        args.find_gaps,
        args.all_dot_brackets,
    )

    if args.csv:
        write_csv(args.csv, structure2d)

    if args.json:
        write_json(args.json, structure2d)

    if args.bpseq:
        write_bpseq(args.bpseq, structure2d.bpseq)

    # NOTE(review): -e and -a are not rejected together; -e simply wins.
    if args.extended:
        printed = [structure2d.extendedDotBracket]
    elif args.all_dot_brackets:
        printed = dot_brackets
    else:
        printed = [structure2d.dotBracket]
    for notation in printed:
        print(notation)

    # NOTE(review): the DOT graph goes to standard output; the path given to
    # -d/--dot is not written to -- confirm whether that is intended.
    if args.dot:
        print(BpSeq.from_string(structure2d.bpseq).graphviz)
534
+
535
+
536
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
rnapolis/parser_v2.py CHANGED
@@ -34,9 +34,19 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
34
34
  if isinstance(lines[0], bytes):
35
35
  lines = [line.decode("utf-8") for line in lines]
36
36
 
37
+ current_model = 1
37
38
  for line in lines:
38
39
  record_type = line[:6].strip()
39
40
 
41
+ # Check for MODEL record
42
+ if record_type == "MODEL":
43
+ try:
44
+ current_model = int(line[10:14].strip())
45
+ except ValueError:
46
+ # Handle cases where MODEL record might be malformed
47
+ pass # Keep the previous model number
48
+ continue
49
+
40
50
  # Only process ATOM and HETATM records
41
51
  if record_type not in ["ATOM", "HETATM"]:
42
52
  continue
@@ -59,6 +69,7 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
59
69
  "tempFactor": line[60:66].strip(),
60
70
  "element": line[76:78].strip(),
61
71
  "charge": line[78:80].strip(),
72
+ "model": current_model, # Add the current model number
62
73
  }
63
74
 
64
75
  records.append(record)
@@ -83,13 +94,23 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
83
94
  "tempFactor",
84
95
  "element",
85
96
  "charge",
97
+ "model",
86
98
  ]
87
99
  )
88
100
 
89
101
  df = pd.DataFrame(records)
90
102
 
91
103
  # Convert numeric columns to appropriate types
92
- numeric_columns = ["serial", "resSeq", "x", "y", "z", "occupancy", "tempFactor"]
104
+ numeric_columns = [
105
+ "serial",
106
+ "resSeq",
107
+ "x",
108
+ "y",
109
+ "z",
110
+ "occupancy",
111
+ "tempFactor",
112
+ "model",
113
+ ]
93
114
  for col in numeric_columns:
94
115
  df[col] = pd.to_numeric(df[col], errors="coerce")
95
116
 
@@ -229,8 +250,43 @@ def write_pdb(
229
250
  # Get the format of the DataFrame
230
251
  format_type = df.attrs.get("format", "PDB")
231
252
 
253
+ # Variables to track chain changes for TER records
254
+ last_chain_id = None
255
+ last_res_seq = None
256
+ last_res_name = None
257
+ last_serial = None
258
+ last_icode = None
259
+
232
260
  # Process each row in the DataFrame
233
- for _, row in df.iterrows():
261
+ for index, row in df.iterrows():
262
+ # Get current chain ID
263
+ if format_type == "PDB":
264
+ current_chain_id = row["chainID"]
265
+ else: # mmCIF
266
+ current_chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
267
+
268
+ # Write TER record if chain changes
269
+ if last_chain_id is not None and current_chain_id != last_chain_id:
270
+ # Format TER record according to PDB specification
271
+ # Columns:
272
+ # 1-6: "TER "
273
+ # 7-11: Serial number (right-justified)
274
+ # 18-20: Residue name (right-justified)
275
+ # 22: Chain ID
276
+ # 23-26: Residue sequence number (right-justified)
277
+ # 27: Insertion code
278
+ ter_serial = str(last_serial + 1).rjust(5)
279
+ ter_res_name = last_res_name.strip().ljust(3) # Strip and left-justify
280
+ ter_chain_id = last_chain_id
281
+ ter_res_seq = last_res_seq.rjust(4)
282
+ ter_icode = last_icode if last_icode else "" # Use last recorded iCode
283
+
284
+ # Construct the TER line ensuring correct spacing for all fields
285
+ # TER (1-6), serial (7-11), space (12-17), resName (18-20), space (21),
286
+ # chainID (22), resSeq (23-26), iCode (27)
287
+ ter_line = f"TER {ter_serial} {ter_res_name} {ter_chain_id}{ter_res_seq}{ter_icode}"
288
+ buffer.write(ter_line.ljust(80) + "\n")
289
+
234
290
  # Initialize the line with spaces
235
291
  line = " " * 80
236
292
 
@@ -361,6 +417,37 @@ def write_pdb(
361
417
  # Write the line to the buffer
362
418
  buffer.write(line.rstrip() + "\n")
363
419
 
420
+ # Update last atom info for potential TER record
421
+ if format_type == "PDB":
422
+ last_serial = int(row["serial"])
423
+ last_res_name = row["resName"]
424
+ last_chain_id = row["chainID"]
425
+ last_res_seq = str(int(row["resSeq"]))
426
+ last_icode = row["iCode"] if pd.notna(row["iCode"]) else ""
427
+ else: # mmCIF
428
+ last_serial = int(row["id"])
429
+ last_res_name = row.get("auth_comp_id", row.get("label_comp_id", ""))
430
+ last_chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
431
+ last_res_seq = str(int(row.get("auth_seq_id", row.get("label_seq_id", 0))))
432
+ last_icode = (
433
+ row.get("pdbx_PDB_ins_code", "")
434
+ if pd.notna(row.get("pdbx_PDB_ins_code", ""))
435
+ else ""
436
+ )
437
+
438
+ # Add TER record for the last chain
439
+ if last_chain_id is not None:
440
+ # Format TER record according to PDB specification
441
+ ter_serial = str(last_serial + 1).rjust(5)
442
+ ter_res_name = last_res_name.strip().ljust(3) # Strip and left-justify
443
+ ter_chain_id = last_chain_id
444
+ ter_res_seq = last_res_seq.rjust(4)
445
+ ter_icode = last_icode if last_icode else "" # Use last recorded iCode
446
+
447
+ # Construct the TER line ensuring correct spacing for all fields
448
+ ter_line = f"TER {ter_serial} {ter_res_name} {ter_chain_id}{ter_res_seq}{ter_icode}"
449
+ buffer.write(ter_line.ljust(80) + "\n")
450
+
364
451
  # Add END record
365
452
  buffer.write("END\n")
366
453
 
rnapolis/splitter.py ADDED
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import os
4
+ import sys
5
+
6
+ import pandas as pd
7
+
8
+ from rnapolis.parser import is_cif
9
+ from rnapolis.parser_v2 import parse_cif_atoms, parse_pdb_atoms, write_cif, write_pdb
10
+
11
+
12
def main():
    """Split a multi-model PDB or mmCIF file into one output file per model."""
    cli = argparse.ArgumentParser(
        description="Split a multi-model PDB or mmCIF file into separate files per model."
    )
    cli.add_argument("--output", "-o", help="Output directory", required=True)
    cli.add_argument(
        "--format",
        "-f",
        help="Output format (possible values: PDB, mmCIF, keep. Default: keep)",
        default="keep",
    )
    cli.add_argument("file", help="Input PDB or mmCIF file to split")
    args = cli.parse_args()

    # Bail out early when there is nothing to read.
    if not os.path.exists(args.file):
        print(f"Error: Input file not found: {args.file}", file=sys.stderr)
        sys.exit(1)

    # Parse the atoms; the model column name depends on the detected format.
    try:
        with open(args.file) as handle:
            if is_cif(handle):
                atoms_df = parse_cif_atoms(handle)
                input_format, model_column = "mmCIF", "pdbx_PDB_model_num"
            else:
                atoms_df = parse_pdb_atoms(handle)
                input_format, model_column = "PDB", "model"
    except Exception as error:
        print(f"Error parsing file {args.file}: {error}", file=sys.stderr)
        sys.exit(1)

    # An empty structure is reported but is not treated as a failure.
    if atoms_df.empty:
        print(f"Warning: No atoms found in {args.file}", file=sys.stderr)
        sys.exit(0)

    if model_column not in atoms_df.columns:
        print(
            f"Error: Model column '{model_column}' not found in the parsed data from {args.file}.",
            file=sys.stderr,
        )
        print(
            "This might indicate an issue with the input file or the parser.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Resolve the requested output format (case-insensitive).
    requested = args.format.upper()
    if requested == "KEEP":
        output_format = input_format
    elif requested in ("PDB", "MMCIF"):
        output_format = requested
    else:
        print(
            f"Error: Invalid output format '{args.format}'. Choose PDB, mmCIF, or keep.",
            file=sys.stderr,
        )
        sys.exit(1)

    os.makedirs(args.output, exist_ok=True)

    models = atoms_df.groupby(model_column)
    base_name = os.path.splitext(os.path.basename(args.file))[0]

    as_pdb = output_format == "PDB"
    ext = ".pdb" if as_pdb else ".cif"

    for model_num, group in models:
        # Work on a copy so the attrs assignment does not touch the source frame.
        model_df = group.copy()

        # The writers read the "format" attribute to know the column layout.
        model_df.attrs["format"] = input_format

        output_path = os.path.join(args.output, f"{base_name}_model_{model_num}{ext}")

        print(f"Writing model {model_num} to {output_path}...")

        try:
            if as_pdb:
                write_pdb(model_df, output_path)
            else:
                write_cif(model_df, output_path)
        except Exception as error:
            # A failed model is reported and the remaining models are still written.
            print(
                f"Error writing file {output_path}: {error}",
                file=sys.stderr,
            )

    print("Splitting complete.")
112
+
113
+
114
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: RNApolis
3
- Version: 0.6.2
3
+ Version: 0.8.0
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -1,3 +1,4 @@
1
+ rnapolis/adapter.py,sha256=n7f5e8dbP-grJI7L9GycYAbMjpMvTuUM5aXiiCqG91k,18239
1
2
  rnapolis/aligner.py,sha256=o7rQyjAZ3n4VXcnSPY3HVB8nLNRkVbl552O3NVh0mfg,3429
2
3
  rnapolis/annotator.py,sha256=hRRzRmneYxbg2tvwVHMWLfzmJb4szV0JL_6EOC09Gwg,22101
3
4
  rnapolis/clashfinder.py,sha256=AC9_tIx7QIk57sELq_aKfU1u3UMOXbgcccQeGHhMR6c,8517
@@ -11,16 +12,17 @@ rnapolis/mmcif_pdbx_v50.dic,sha256=5QFx1ssDaehR4_DQ-tS9VQux262SiLXaqcwmwwejF5c,5
11
12
  rnapolis/molecule_filter.py,sha256=jgcpJxx_oXEBX0d30v4k_FdwRouRUPUsEtCYWgLGpD4,7310
12
13
  rnapolis/motif_extractor.py,sha256=Lfn1iEkhkP9eZD3GPEWNAfy00QO7QPCc8wM_XS1ory8,1147
13
14
  rnapolis/parser.py,sha256=3g4mtFvpiEENFcSBBtx_E_x1vJPF9BujWnts0kb9XjE,16340
14
- rnapolis/parser_v2.py,sha256=ltesVKBiIKk9JlM02ttTJzLm1g5MHdPzDgQTcl40GP8,16257
15
+ rnapolis/parser_v2.py,sha256=eUccbTXCD5I7q0GVbaGWmjj0CT5d2VK8x9tr0gtrRuA,19801
15
16
  rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
17
+ rnapolis/splitter.py,sha256=8mMZ2ZmhqptPUjmkDOFbLvC-dvWpuvJ0beSoeaD5pzk,3642
16
18
  rnapolis/tertiary.py,sha256=6t9ZB4w33-5n_M3sns1RoFXCOTgVAgGH4WDNG5OG9Kg,23426
17
19
  rnapolis/tertiary_v2.py,sha256=I1uyHWIUePNGO5m-suoL4ibtz02qAJUMvYm0BUKUygY,22480
18
20
  rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
19
21
  rnapolis/unifier.py,sha256=DR1_IllgaAYT9_FUE6XC9B-2wgqbBHs2D1MjyZT2j2g,5438
20
22
  rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
21
- rnapolis-0.6.2.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
22
- rnapolis-0.6.2.dist-info/METADATA,sha256=2epFKLVBOoNmJHGZSSSF4bNEdOq2eB_KpWKmannB7rY,54537
23
- rnapolis-0.6.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
24
- rnapolis-0.6.2.dist-info/entry_points.txt,sha256=kS_Ji3_6UaomxkOaYpGHh4aZKaIh9CAfzoexbaS3y50,372
25
- rnapolis-0.6.2.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
26
- rnapolis-0.6.2.dist-info/RECORD,,
23
+ rnapolis-0.8.0.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
24
+ rnapolis-0.8.0.dist-info/METADATA,sha256=zD_byFTP6xNdYCQdu5bslqSE_noBjSagzhn2EOSlcYE,54537
25
+ rnapolis-0.8.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
26
+ rnapolis-0.8.0.dist-info/entry_points.txt,sha256=H00KoN54wU3dFOofAu3H_3PADmZOBTB1hXf5TUU2uzo,438
27
+ rnapolis-0.8.0.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
28
+ rnapolis-0.8.0.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  [console_scripts]
2
+ adapter = rnapolis.adapter:main
2
3
  aligner = rnapolis.aligner:main
3
4
  annotator = rnapolis.annotator:main
4
5
  clashfinder = rnapolis.clashfinder:main
@@ -6,5 +7,6 @@ metareader = rnapolis.metareader:main
6
7
  molecule-filter = rnapolis.molecule_filter:main
7
8
  motif-extractor = rnapolis.motif_extractor:main
8
9
  rfam-folder = rnapolis.rfam_folder:main
10
+ splitter = rnapolis.splitter:main
9
11
  transformer = rnapolis.transformer:main
10
12
  unifier = rnapolis.unifier:main