RNApolis 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnapolis/parser_v2.py +89 -2
- rnapolis/splitter.py +115 -0
- {rnapolis-0.7.0.dist-info → rnapolis-0.8.0.dist-info}/METADATA +1 -1
- {rnapolis-0.7.0.dist-info → rnapolis-0.8.0.dist-info}/RECORD +8 -7
- {rnapolis-0.7.0.dist-info → rnapolis-0.8.0.dist-info}/entry_points.txt +1 -0
- {rnapolis-0.7.0.dist-info → rnapolis-0.8.0.dist-info}/WHEEL +0 -0
- {rnapolis-0.7.0.dist-info → rnapolis-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {rnapolis-0.7.0.dist-info → rnapolis-0.8.0.dist-info}/top_level.txt +0 -0
rnapolis/parser_v2.py
CHANGED
@@ -34,9 +34,19 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
|
|
34
34
|
if isinstance(lines[0], bytes):
|
35
35
|
lines = [line.decode("utf-8") for line in lines]
|
36
36
|
|
37
|
+
current_model = 1
|
37
38
|
for line in lines:
|
38
39
|
record_type = line[:6].strip()
|
39
40
|
|
41
|
+
# Check for MODEL record
|
42
|
+
if record_type == "MODEL":
|
43
|
+
try:
|
44
|
+
current_model = int(line[10:14].strip())
|
45
|
+
except ValueError:
|
46
|
+
# Handle cases where MODEL record might be malformed
|
47
|
+
pass # Keep the previous model number
|
48
|
+
continue
|
49
|
+
|
40
50
|
# Only process ATOM and HETATM records
|
41
51
|
if record_type not in ["ATOM", "HETATM"]:
|
42
52
|
continue
|
@@ -59,6 +69,7 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
|
|
59
69
|
"tempFactor": line[60:66].strip(),
|
60
70
|
"element": line[76:78].strip(),
|
61
71
|
"charge": line[78:80].strip(),
|
72
|
+
"model": current_model, # Add the current model number
|
62
73
|
}
|
63
74
|
|
64
75
|
records.append(record)
|
@@ -83,13 +94,23 @@ def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
|
|
83
94
|
"tempFactor",
|
84
95
|
"element",
|
85
96
|
"charge",
|
97
|
+
"model",
|
86
98
|
]
|
87
99
|
)
|
88
100
|
|
89
101
|
df = pd.DataFrame(records)
|
90
102
|
|
91
103
|
# Convert numeric columns to appropriate types
|
92
|
-
numeric_columns = [
|
104
|
+
numeric_columns = [
|
105
|
+
"serial",
|
106
|
+
"resSeq",
|
107
|
+
"x",
|
108
|
+
"y",
|
109
|
+
"z",
|
110
|
+
"occupancy",
|
111
|
+
"tempFactor",
|
112
|
+
"model",
|
113
|
+
]
|
93
114
|
for col in numeric_columns:
|
94
115
|
df[col] = pd.to_numeric(df[col], errors="coerce")
|
95
116
|
|
@@ -229,8 +250,43 @@ def write_pdb(
|
|
229
250
|
# Get the format of the DataFrame
|
230
251
|
format_type = df.attrs.get("format", "PDB")
|
231
252
|
|
253
|
+
# Variables to track chain changes for TER records
|
254
|
+
last_chain_id = None
|
255
|
+
last_res_seq = None
|
256
|
+
last_res_name = None
|
257
|
+
last_serial = None
|
258
|
+
last_icode = None
|
259
|
+
|
232
260
|
# Process each row in the DataFrame
|
233
|
-
for
|
261
|
+
for index, row in df.iterrows():
|
262
|
+
# Get current chain ID
|
263
|
+
if format_type == "PDB":
|
264
|
+
current_chain_id = row["chainID"]
|
265
|
+
else: # mmCIF
|
266
|
+
current_chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
|
267
|
+
|
268
|
+
# Write TER record if chain changes
|
269
|
+
if last_chain_id is not None and current_chain_id != last_chain_id:
|
270
|
+
# Format TER record according to PDB specification
|
271
|
+
# Columns:
|
272
|
+
# 1-6: "TER "
|
273
|
+
# 7-11: Serial number (right-justified)
|
274
|
+
# 18-20: Residue name (right-justified)
|
275
|
+
# 22: Chain ID
|
276
|
+
# 23-26: Residue sequence number (right-justified)
|
277
|
+
# 27: Insertion code
|
278
|
+
ter_serial = str(last_serial + 1).rjust(5)
|
279
|
+
ter_res_name = last_res_name.strip().ljust(3) # Strip and left-justify
|
280
|
+
ter_chain_id = last_chain_id
|
281
|
+
ter_res_seq = last_res_seq.rjust(4)
|
282
|
+
ter_icode = last_icode if last_icode else "" # Use last recorded iCode
|
283
|
+
|
284
|
+
# Construct the TER line ensuring correct spacing for all fields
|
285
|
+
# TER (1-6), serial (7-11), space (12-17), resName (18-20), space (21),
|
286
|
+
# chainID (22), resSeq (23-26), iCode (27)
|
287
|
+
ter_line = f"TER {ter_serial} {ter_res_name} {ter_chain_id}{ter_res_seq}{ter_icode}"
|
288
|
+
buffer.write(ter_line.ljust(80) + "\n")
|
289
|
+
|
234
290
|
# Initialize the line with spaces
|
235
291
|
line = " " * 80
|
236
292
|
|
@@ -361,6 +417,37 @@ def write_pdb(
|
|
361
417
|
# Write the line to the buffer
|
362
418
|
buffer.write(line.rstrip() + "\n")
|
363
419
|
|
420
|
+
# Update last atom info for potential TER record
|
421
|
+
if format_type == "PDB":
|
422
|
+
last_serial = int(row["serial"])
|
423
|
+
last_res_name = row["resName"]
|
424
|
+
last_chain_id = row["chainID"]
|
425
|
+
last_res_seq = str(int(row["resSeq"]))
|
426
|
+
last_icode = row["iCode"] if pd.notna(row["iCode"]) else ""
|
427
|
+
else: # mmCIF
|
428
|
+
last_serial = int(row["id"])
|
429
|
+
last_res_name = row.get("auth_comp_id", row.get("label_comp_id", ""))
|
430
|
+
last_chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
|
431
|
+
last_res_seq = str(int(row.get("auth_seq_id", row.get("label_seq_id", 0))))
|
432
|
+
last_icode = (
|
433
|
+
row.get("pdbx_PDB_ins_code", "")
|
434
|
+
if pd.notna(row.get("pdbx_PDB_ins_code", ""))
|
435
|
+
else ""
|
436
|
+
)
|
437
|
+
|
438
|
+
# Add TER record for the last chain
|
439
|
+
if last_chain_id is not None:
|
440
|
+
# Format TER record according to PDB specification
|
441
|
+
ter_serial = str(last_serial + 1).rjust(5)
|
442
|
+
ter_res_name = last_res_name.strip().ljust(3) # Strip and left-justify
|
443
|
+
ter_chain_id = last_chain_id
|
444
|
+
ter_res_seq = last_res_seq.rjust(4)
|
445
|
+
ter_icode = last_icode if last_icode else "" # Use last recorded iCode
|
446
|
+
|
447
|
+
# Construct the TER line ensuring correct spacing for all fields
|
448
|
+
ter_line = f"TER {ter_serial} {ter_res_name} {ter_chain_id}{ter_res_seq}{ter_icode}"
|
449
|
+
buffer.write(ter_line.ljust(80) + "\n")
|
450
|
+
|
364
451
|
# Add END record
|
365
452
|
buffer.write("END\n")
|
366
453
|
|
rnapolis/splitter.py
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
import argparse
|
3
|
+
import os
|
4
|
+
import sys
|
5
|
+
|
6
|
+
import pandas as pd
|
7
|
+
|
8
|
+
from rnapolis.parser import is_cif
|
9
|
+
from rnapolis.parser_v2 import parse_cif_atoms, parse_pdb_atoms, write_cif, write_pdb
|
10
|
+
|
11
|
+
|
12
|
+
def main():
    """Split a multi-model PDB or mmCIF file into one output file per model.

    Command-line arguments (read from ``sys.argv`` through argparse):
        file: input PDB or mmCIF file to split.
        --output / -o: directory receiving the per-model files; it is created
            when missing.
        --format / -f: ``PDB``, ``mmCIF`` or ``keep`` (case-insensitive).
            ``keep`` (the default) writes in the detected input format.

    Exit status:
        0 when every model was written, or when the input contains no atoms.
        1 when ``--format`` is invalid, the input is missing or cannot be
        parsed, the model column is absent, or at least one model could not
        be written.
    """
    parser = argparse.ArgumentParser(
        description="Split a multi-model PDB or mmCIF file into separate files per model."
    )
    parser.add_argument("--output", "-o", help="Output directory", required=True)
    parser.add_argument(
        "--format",
        "-f",
        help="Output format (possible values: PDB, mmCIF, keep. Default: keep)",
        default="keep",
    )
    parser.add_argument("file", help="Input PDB or mmCIF file to split")
    args = parser.parse_args()

    # Validate the requested format up front so that a typo is reported
    # immediately instead of after the whole input file has been parsed.
    requested_format = args.format.upper()
    if requested_format not in ("PDB", "MMCIF", "KEEP"):
        print(
            f"Error: Invalid output format '{args.format}'. Choose PDB, mmCIF, or keep.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Check if input file exists
    if not os.path.exists(args.file):
        print(f"Error: Input file not found: {args.file}", file=sys.stderr)
        sys.exit(1)

    # Read and parse the input file; the model column name depends on the
    # detected input format.
    input_format = "mmCIF"
    try:
        with open(args.file) as f:
            if is_cif(f):
                atoms_df = parse_cif_atoms(f)
                model_column = "pdbx_PDB_model_num"
            else:
                atoms_df = parse_pdb_atoms(f)
                input_format = "PDB"
                model_column = "model"
    except Exception as e:  # top-level CLI boundary: report and exit
        print(f"Error parsing file {args.file}: {e}", file=sys.stderr)
        sys.exit(1)

    if atoms_df.empty:
        print(f"Warning: No atoms found in {args.file}", file=sys.stderr)
        sys.exit(0)

    # Check if model column exists
    if model_column not in atoms_df.columns:
        print(
            f"Error: Model column '{model_column}' not found in the parsed data from {args.file}.",
            file=sys.stderr,
        )
        print(
            "This might indicate an issue with the input file or the parser.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Resolve "keep" now that the input format is known.
    output_format = input_format if requested_format == "KEEP" else requested_format

    # groupby() drops rows whose key is missing, so say so instead of
    # silently losing atoms.
    unassigned = int(atoms_df[model_column].isna().sum())
    if unassigned:
        print(
            f"Warning: {unassigned} atom(s) without a model number were skipped.",
            file=sys.stderr,
        )

    # Ensure output directory exists
    os.makedirs(args.output, exist_ok=True)

    # Base name and extension are the same for every model.
    base_name = os.path.splitext(os.path.basename(args.file))[0]
    ext = ".pdb" if output_format == "PDB" else ".cif"

    failed_paths = []

    # Write each model to a separate file
    for model_num, model_df in atoms_df.groupby(model_column):
        # Work on a copy so the attribute set below does not touch atoms_df.
        model_df = model_df.copy()

        # The frame still has the *input* column layout; write_pdb reads this
        # attribute to pick the right columns (presumably write_cif does the
        # same - confirm against parser_v2).
        model_df.attrs["format"] = input_format

        # A float group key (e.g. a column coerced to float64) would yield
        # names like "x_model_1.0.pdb"; use the integral form when exact.
        if isinstance(model_num, float) and model_num.is_integer():
            model_num = int(model_num)

        output_filename = f"{base_name}_model_{model_num}{ext}"
        output_path = os.path.join(args.output, output_filename)

        print(f"Writing model {model_num} to {output_path}...")

        try:
            if output_format == "PDB":
                write_pdb(model_df, output_path)
            else:  # mmCIF
                write_cif(model_df, output_path)
        except Exception as e:
            # Best effort: keep going with the remaining models, but remember
            # the failure so the exit status reflects it.
            print(
                f"Error writing file {output_path}: {e}",
                file=sys.stderr,
            )
            failed_paths.append(output_path)

    if failed_paths:
        print(
            f"Error: failed to write {len(failed_paths)} file(s).",
            file=sys.stderr,
        )
        sys.exit(1)

    print("Splitting complete.")
|
112
|
+
|
113
|
+
|
114
|
+
if __name__ == "__main__":
|
115
|
+
main()
|
@@ -12,16 +12,17 @@ rnapolis/mmcif_pdbx_v50.dic,sha256=5QFx1ssDaehR4_DQ-tS9VQux262SiLXaqcwmwwejF5c,5
|
|
12
12
|
rnapolis/molecule_filter.py,sha256=jgcpJxx_oXEBX0d30v4k_FdwRouRUPUsEtCYWgLGpD4,7310
|
13
13
|
rnapolis/motif_extractor.py,sha256=Lfn1iEkhkP9eZD3GPEWNAfy00QO7QPCc8wM_XS1ory8,1147
|
14
14
|
rnapolis/parser.py,sha256=3g4mtFvpiEENFcSBBtx_E_x1vJPF9BujWnts0kb9XjE,16340
|
15
|
-
rnapolis/parser_v2.py,sha256=
|
15
|
+
rnapolis/parser_v2.py,sha256=eUccbTXCD5I7q0GVbaGWmjj0CT5d2VK8x9tr0gtrRuA,19801
|
16
16
|
rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
|
17
|
+
rnapolis/splitter.py,sha256=8mMZ2ZmhqptPUjmkDOFbLvC-dvWpuvJ0beSoeaD5pzk,3642
|
17
18
|
rnapolis/tertiary.py,sha256=6t9ZB4w33-5n_M3sns1RoFXCOTgVAgGH4WDNG5OG9Kg,23426
|
18
19
|
rnapolis/tertiary_v2.py,sha256=I1uyHWIUePNGO5m-suoL4ibtz02qAJUMvYm0BUKUygY,22480
|
19
20
|
rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
|
20
21
|
rnapolis/unifier.py,sha256=DR1_IllgaAYT9_FUE6XC9B-2wgqbBHs2D1MjyZT2j2g,5438
|
21
22
|
rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
|
22
|
-
rnapolis-0.
|
23
|
-
rnapolis-0.
|
24
|
-
rnapolis-0.
|
25
|
-
rnapolis-0.
|
26
|
-
rnapolis-0.
|
27
|
-
rnapolis-0.
|
23
|
+
rnapolis-0.8.0.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
|
24
|
+
rnapolis-0.8.0.dist-info/METADATA,sha256=zD_byFTP6xNdYCQdu5bslqSE_noBjSagzhn2EOSlcYE,54537
|
25
|
+
rnapolis-0.8.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
26
|
+
rnapolis-0.8.0.dist-info/entry_points.txt,sha256=H00KoN54wU3dFOofAu3H_3PADmZOBTB1hXf5TUU2uzo,438
|
27
|
+
rnapolis-0.8.0.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
|
28
|
+
rnapolis-0.8.0.dist-info/RECORD,,
|
@@ -7,5 +7,6 @@ metareader = rnapolis.metareader:main
|
|
7
7
|
molecule-filter = rnapolis.molecule_filter:main
|
8
8
|
motif-extractor = rnapolis.motif_extractor:main
|
9
9
|
rfam-folder = rnapolis.rfam_folder:main
|
10
|
+
splitter = rnapolis.splitter:main
|
10
11
|
transformer = rnapolis.transformer:main
|
11
12
|
unifier = rnapolis.unifier:main
|
File without changes
|
File without changes
|
File without changes
|