RNApolis 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rnapolis/parser_v2.py CHANGED
@@ -34,9 +34,19 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
34
34
  if isinstance(lines[0], bytes):
35
35
  lines = [line.decode("utf-8") for line in lines]
36
36
 
37
+ current_model = 1
37
38
  for line in lines:
38
39
  record_type = line[:6].strip()
39
40
 
41
+ # Check for MODEL record
42
+ if record_type == "MODEL":
43
+ try:
44
+ current_model = int(line[10:14].strip())
45
+ except ValueError:
46
+ # Handle cases where MODEL record might be malformed
47
+ pass # Keep the previous model number
48
+ continue
49
+
40
50
  # Only process ATOM and HETATM records
41
51
  if record_type not in ["ATOM", "HETATM"]:
42
52
  continue
@@ -59,6 +69,7 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
59
69
  "tempFactor": line[60:66].strip(),
60
70
  "element": line[76:78].strip(),
61
71
  "charge": line[78:80].strip(),
72
+ "model": current_model, # Add the current model number
62
73
  }
63
74
 
64
75
  records.append(record)
@@ -83,13 +94,23 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
83
94
  "tempFactor",
84
95
  "element",
85
96
  "charge",
97
+ "model",
86
98
  ]
87
99
  )
88
100
 
89
101
  df = pd.DataFrame(records)
90
102
 
91
103
  # Convert numeric columns to appropriate types
92
- numeric_columns = ["serial", "resSeq", "x", "y", "z", "occupancy", "tempFactor"]
104
+ numeric_columns = [
105
+ "serial",
106
+ "resSeq",
107
+ "x",
108
+ "y",
109
+ "z",
110
+ "occupancy",
111
+ "tempFactor",
112
+ "model",
113
+ ]
93
114
  for col in numeric_columns:
94
115
  df[col] = pd.to_numeric(df[col], errors="coerce")
95
116
 
@@ -229,8 +250,43 @@ def write_pdb(
229
250
  # Get the format of the DataFrame
230
251
  format_type = df.attrs.get("format", "PDB")
231
252
 
253
+ # Variables to track chain changes for TER records
254
+ last_chain_id = None
255
+ last_res_seq = None
256
+ last_res_name = None
257
+ last_serial = None
258
+ last_icode = None
259
+
232
260
  # Process each row in the DataFrame
233
- for _, row in df.iterrows():
261
+ for index, row in df.iterrows():
262
+ # Get current chain ID
263
+ if format_type == "PDB":
264
+ current_chain_id = row["chainID"]
265
+ else: # mmCIF
266
+ current_chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
267
+
268
+ # Write TER record if chain changes
269
+ if last_chain_id is not None and current_chain_id != last_chain_id:
270
+ # Format TER record according to PDB specification
271
+ # Columns:
272
+ # 1-6: "TER "
273
+ # 7-11: Serial number (right-justified)
274
+ # 18-20: Residue name (right-justified)
275
+ # 22: Chain ID
276
+ # 23-26: Residue sequence number (right-justified)
277
+ # 27: Insertion code
278
+ ter_serial = str(last_serial + 1).rjust(5)
279
+ ter_res_name = last_res_name.strip().ljust(3) # Strip and left-justify
280
+ ter_chain_id = last_chain_id
281
+ ter_res_seq = last_res_seq.rjust(4)
282
+ ter_icode = last_icode if last_icode else "" # Use last recorded iCode
283
+
284
+ # Construct the TER line ensuring correct spacing for all fields
285
+ # TER (1-6), serial (7-11), space (12-17), resName (18-20), space (21),
286
+ # chainID (22), resSeq (23-26), iCode (27)
287
+ ter_line = f"TER {ter_serial} {ter_res_name} {ter_chain_id}{ter_res_seq}{ter_icode}"
288
+ buffer.write(ter_line.ljust(80) + "\n")
289
+
234
290
  # Initialize the line with spaces
235
291
  line = " " * 80
236
292
 
@@ -361,6 +417,37 @@ def write_pdb(
361
417
  # Write the line to the buffer
362
418
  buffer.write(line.rstrip() + "\n")
363
419
 
420
+ # Update last atom info for potential TER record
421
+ if format_type == "PDB":
422
+ last_serial = int(row["serial"])
423
+ last_res_name = row["resName"]
424
+ last_chain_id = row["chainID"]
425
+ last_res_seq = str(int(row["resSeq"]))
426
+ last_icode = row["iCode"] if pd.notna(row["iCode"]) else ""
427
+ else: # mmCIF
428
+ last_serial = int(row["id"])
429
+ last_res_name = row.get("auth_comp_id", row.get("label_comp_id", ""))
430
+ last_chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
431
+ last_res_seq = str(int(row.get("auth_seq_id", row.get("label_seq_id", 0))))
432
+ last_icode = (
433
+ row.get("pdbx_PDB_ins_code", "")
434
+ if pd.notna(row.get("pdbx_PDB_ins_code", ""))
435
+ else ""
436
+ )
437
+
438
+ # Add TER record for the last chain
439
+ if last_chain_id is not None:
440
+ # Format TER record according to PDB specification
441
+ ter_serial = str(last_serial + 1).rjust(5)
442
+ ter_res_name = last_res_name.strip().ljust(3) # Strip and left-justify
443
+ ter_chain_id = last_chain_id
444
+ ter_res_seq = last_res_seq.rjust(4)
445
+ ter_icode = last_icode if last_icode else "" # Use last recorded iCode
446
+
447
+ # Construct the TER line ensuring correct spacing for all fields
448
+ ter_line = f"TER {ter_serial} {ter_res_name} {ter_chain_id}{ter_res_seq}{ter_icode}"
449
+ buffer.write(ter_line.ljust(80) + "\n")
450
+
364
451
  # Add END record
365
452
  buffer.write("END\n")
366
453
 
rnapolis/splitter.py ADDED
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import os
4
+ import sys
5
+
6
+ import pandas as pd
7
+
8
+ from rnapolis.parser import is_cif
9
+ from rnapolis.parser_v2 import parse_cif_atoms, parse_pdb_atoms, write_cif, write_pdb
10
+
11
+
12
def main():
    """Split a multi-model PDB or mmCIF file into one output file per model.

    Command-line arguments:
        file            Input PDB or mmCIF file to split.
        --output / -o   Output directory (created if it does not exist).
        --format / -f   Output format: PDB, mmCIF or keep (case-insensitive,
                        default: keep, i.e. same as the detected input format).

    Exit status:
        0 -- every model was written, or the input contained no atoms.
        1 -- invalid ``--format`` value, missing input file, parse failure,
             missing model column, or at least one model could not be written.
    """
    parser = argparse.ArgumentParser(
        description="Split a multi-model PDB or mmCIF file into separate files per model."
    )
    parser.add_argument("--output", "-o", help="Output directory", required=True)
    parser.add_argument(
        "--format",
        "-f",
        help="Output format (possible values: PDB, mmCIF, keep. Default: keep)",
        default="keep",
    )
    parser.add_argument("file", help="Input PDB or mmCIF file to split")
    args = parser.parse_args()

    # Validate the requested format up front so a typo in --format is
    # reported before the (potentially expensive) parsing of the input.
    requested_format = args.format.upper()
    if requested_format not in ("KEEP", "PDB", "MMCIF"):
        print(
            f"Error: Invalid output format '{args.format}'. Choose PDB, mmCIF, or keep.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Check if input file exists
    if not os.path.exists(args.file):
        print(f"Error: Input file not found: {args.file}", file=sys.stderr)
        sys.exit(1)

    # Read and parse the input file; the model column name depends on the
    # detected input format.
    input_format = "mmCIF"
    try:
        with open(args.file) as f:
            if is_cif(f):
                atoms_df = parse_cif_atoms(f)
                model_column = "pdbx_PDB_model_num"
            else:
                atoms_df = parse_pdb_atoms(f)
                input_format = "PDB"
                model_column = "model"
    except Exception as e:
        # Top-level CLI boundary: report any parser/I/O failure and exit
        # with an error status instead of dumping a traceback.
        print(f"Error parsing file {args.file}: {e}", file=sys.stderr)
        sys.exit(1)

    if atoms_df.empty:
        print(f"Warning: No atoms found in {args.file}", file=sys.stderr)
        sys.exit(0)

    # Check if model column exists
    if model_column not in atoms_df.columns:
        print(
            f"Error: Model column '{model_column}' not found in the parsed data from {args.file}.",
            file=sys.stderr,
        )
        print(
            "This might indicate an issue with the input file or the parser.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Resolve "keep" now that the input format is known
    output_format = input_format if requested_format == "KEEP" else requested_format

    # Ensure output directory exists
    os.makedirs(args.output, exist_ok=True)

    # Get base name and extension for output files (same for every model)
    base_name = os.path.splitext(os.path.basename(args.file))[0]
    ext = ".pdb" if output_format == "PDB" else ".cif"

    # Write each model to a separate file, remembering which ones failed so
    # the exit status reflects the outcome.
    failed_outputs = []
    for model_num, model_df in atoms_df.groupby(model_column):
        # Work on a copy to avoid SettingWithCopyWarning
        model_df = model_df.copy()

        # The format attribute describes the layout of the parsed data (the
        # input format), not the requested output format.
        model_df.attrs["format"] = input_format

        output_filename = f"{base_name}_model_{model_num}{ext}"
        output_path = os.path.join(args.output, output_filename)

        print(f"Writing model {model_num} to {output_path}...")

        try:
            if output_format == "PDB":
                write_pdb(model_df, output_path)
            else:  # mmCIF
                write_cif(model_df, output_path)
        except Exception as e:
            # Keep going so the remaining models are still written, but
            # record the failure for the final exit status.
            print(
                f"Error writing file {output_path}: {e}",
                file=sys.stderr,
            )
            failed_outputs.append(output_path)

    if failed_outputs:
        print(
            f"Error: {len(failed_outputs)} model file(s) could not be written.",
            file=sys.stderr,
        )
        sys.exit(1)

    print("Splitting complete.")


if __name__ == "__main__":
    main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: RNApolis
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -12,16 +12,17 @@ rnapolis/mmcif_pdbx_v50.dic,sha256=5QFx1ssDaehR4_DQ-tS9VQux262SiLXaqcwmwwejF5c,5
12
12
  rnapolis/molecule_filter.py,sha256=jgcpJxx_oXEBX0d30v4k_FdwRouRUPUsEtCYWgLGpD4,7310
13
13
  rnapolis/motif_extractor.py,sha256=Lfn1iEkhkP9eZD3GPEWNAfy00QO7QPCc8wM_XS1ory8,1147
14
14
  rnapolis/parser.py,sha256=3g4mtFvpiEENFcSBBtx_E_x1vJPF9BujWnts0kb9XjE,16340
15
- rnapolis/parser_v2.py,sha256=ltesVKBiIKk9JlM02ttTJzLm1g5MHdPzDgQTcl40GP8,16257
15
+ rnapolis/parser_v2.py,sha256=eUccbTXCD5I7q0GVbaGWmjj0CT5d2VK8x9tr0gtrRuA,19801
16
16
  rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
17
+ rnapolis/splitter.py,sha256=8mMZ2ZmhqptPUjmkDOFbLvC-dvWpuvJ0beSoeaD5pzk,3642
17
18
  rnapolis/tertiary.py,sha256=6t9ZB4w33-5n_M3sns1RoFXCOTgVAgGH4WDNG5OG9Kg,23426
18
19
  rnapolis/tertiary_v2.py,sha256=I1uyHWIUePNGO5m-suoL4ibtz02qAJUMvYm0BUKUygY,22480
19
20
  rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
20
21
  rnapolis/unifier.py,sha256=DR1_IllgaAYT9_FUE6XC9B-2wgqbBHs2D1MjyZT2j2g,5438
21
22
  rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
22
- rnapolis-0.7.0.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
23
- rnapolis-0.7.0.dist-info/METADATA,sha256=Rrnbq7pKHvcPKcHPp9nWjAZZ6x8PflF2WaJCTxaRgbo,54537
24
- rnapolis-0.7.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
25
- rnapolis-0.7.0.dist-info/entry_points.txt,sha256=D630mec6slaw_QMmzDNeBIy7p0pTtuOGnu8xTjmx8VA,404
26
- rnapolis-0.7.0.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
27
- rnapolis-0.7.0.dist-info/RECORD,,
23
+ rnapolis-0.8.0.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
24
+ rnapolis-0.8.0.dist-info/METADATA,sha256=zD_byFTP6xNdYCQdu5bslqSE_noBjSagzhn2EOSlcYE,54537
25
+ rnapolis-0.8.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
26
+ rnapolis-0.8.0.dist-info/entry_points.txt,sha256=H00KoN54wU3dFOofAu3H_3PADmZOBTB1hXf5TUU2uzo,438
27
+ rnapolis-0.8.0.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
28
+ rnapolis-0.8.0.dist-info/RECORD,,
@@ -7,5 +7,6 @@ metareader = rnapolis.metareader:main
7
7
  molecule-filter = rnapolis.molecule_filter:main
8
8
  motif-extractor = rnapolis.motif_extractor:main
9
9
  rfam-folder = rnapolis.rfam_folder:main
10
+ splitter = rnapolis.splitter:main
10
11
  transformer = rnapolis.transformer:main
11
12
  unifier = rnapolis.unifier:main