RNApolis 0.4.16__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnapolis/mmcif_pdbx_v50.dic +173762 -0
- rnapolis/parser_v2.py +202 -0
- rnapolis/tertiary_v2.py +618 -0
- {RNApolis-0.4.16.dist-info → rnapolis-0.5.0.dist-info}/METADATA +1 -1
- {RNApolis-0.4.16.dist-info → rnapolis-0.5.0.dist-info}/RECORD +9 -6
- {RNApolis-0.4.16.dist-info → rnapolis-0.5.0.dist-info}/WHEEL +1 -1
- {RNApolis-0.4.16.dist-info → rnapolis-0.5.0.dist-info}/LICENSE +0 -0
- {RNApolis-0.4.16.dist-info → rnapolis-0.5.0.dist-info}/entry_points.txt +0 -0
- {RNApolis-0.4.16.dist-info → rnapolis-0.5.0.dist-info}/top_level.txt +0 -0
rnapolis/parser_v2.py
ADDED
@@ -0,0 +1,202 @@
|
|
1
|
+
from typing import IO, Union
|
2
|
+
|
3
|
+
import pandas as pd
|
4
|
+
from mmcif.io.IoAdapterPy import IoAdapterPy
|
5
|
+
|
6
|
+
|
7
|
+
def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
    """
    Parse PDB file content and extract ATOM and HETATM records into a pandas DataFrame.

    Parameters:
    -----------
    content : Union[str, IO[str]]
        Content of a PDB file as a string or file-like object. A file-like
        object is rewound with ``seek(0)`` before reading; if it yields bytes,
        the lines are decoded as UTF-8.

    Returns:
    --------
    pd.DataFrame
        DataFrame containing parsed ATOM and HETATM records with columns corresponding to PDB format.
        ``df.attrs["format"]`` is always set to ``"PDB"``, including when no
        ATOM/HETATM records are found (in which case the frame is empty but
        still carries the full set of columns).
    """
    # Single source of truth for the column layout, shared by the empty and
    # the populated result.
    columns = [
        "record_type",
        "serial",
        "name",
        "altLoc",
        "resName",
        "chainID",
        "resSeq",
        "iCode",
        "x",
        "y",
        "z",
        "occupancy",
        "tempFactor",
        "element",
        "charge",
    ]

    # Handle both string content and file-like objects
    if isinstance(content, str):
        lines = content.splitlines()
    else:
        content.seek(0)  # Ensure we're at the beginning of the file
        lines = content.readlines()
        # Convert bytes to string if needed. The `lines and` guard matters:
        # an empty file yields an empty list and lines[0] would raise.
        if lines and isinstance(lines[0], bytes):
            lines = [line.decode("utf-8") for line in lines]

    records = []
    for line in lines:
        record_type = line[:6].strip()

        # Only process ATOM and HETATM records
        if record_type not in ("ATOM", "HETATM"):
            continue

        # Fixed-column slicing; slices past the end of a short line simply
        # produce empty strings.
        icode = line[26:27].strip()
        records.append(
            {
                "record_type": record_type,
                "serial": line[6:11].strip(),
                "name": line[12:16].strip(),
                "altLoc": line[16:17].strip(),
                "resName": line[17:20].strip(),
                "chainID": line[21:22].strip(),
                "resSeq": line[22:26].strip(),
                "iCode": icode if icode else None,  # Empty insertion code -> None
                "x": line[30:38].strip(),
                "y": line[38:46].strip(),
                "z": line[46:54].strip(),
                "occupancy": line[54:60].strip(),
                "tempFactor": line[60:66].strip(),
                "element": line[76:78].strip(),
                "charge": line[78:80].strip(),
            }
        )

    if not records:
        # Empty result: keep the columns and tag the format so callers can
        # treat it the same way as a populated frame.
        empty = pd.DataFrame(columns=columns)
        empty.attrs["format"] = "PDB"
        return empty

    df = pd.DataFrame(records, columns=columns)

    # Convert numeric columns; unparseable values become NaN
    for col in ("serial", "resSeq", "x", "y", "z", "occupancy", "tempFactor"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Convert categorical columns
    for col in (
        "record_type",
        "name",
        "altLoc",
        "resName",
        "chainID",
        "element",
        "charge",
    ):
        df[col] = df[col].astype("category")

    # Add format attribute to the DataFrame
    df.attrs["format"] = "PDB"

    return df
|
110
|
+
|
111
|
+
|
112
|
+
def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
    """
    Parse mmCIF file content and extract atom_site records into a pandas DataFrame.

    Parameters:
    -----------
    content : Union[str, IO[str]]
        Content of a mmCIF file as a string or file-like object. A file-like
        object whose ``name`` points to an existing file is read from that
        path; any other file-like object (e.g. ``io.StringIO``) is read in
        full and passed to the reader through a temporary file.

    Returns:
    --------
    pd.DataFrame
        DataFrame containing parsed atom_site records with columns corresponding to mmCIF format.
        ``df.attrs["format"]`` is always set to ``"mmCIF"``. The frame is
        empty when the input has no data block or no atom_site category.
    """
    # Function-scope imports keep the block self-contained.
    import os
    import tempfile

    def _empty_frame() -> pd.DataFrame:
        # Empty result that still carries the format tag.
        frame = pd.DataFrame()
        frame.attrs["format"] = "mmCIF"
        return frame

    adapter = IoAdapterPy()

    # Decide whether the adapter can read straight from disk or whether the
    # text has to be spooled to a temporary file first.
    source_path = None
    text = None
    if isinstance(content, str):
        text = content
    else:
        name = getattr(content, "name", None)
        if isinstance(name, (str, os.PathLike)) and os.path.isfile(name):
            source_path = os.fspath(name)
        else:
            # No usable path (in-memory buffer, fd-backed stream, ...):
            # fall back to the stream's own content.
            if getattr(content, "seekable", lambda: False)():
                content.seek(0)
            text = content.read()
            if isinstance(text, bytes):
                text = text.decode("utf-8")

    if source_path is not None:
        data = adapter.readFile(source_path)
    else:
        # Write into a temporary directory and close the file before the
        # adapter opens it: re-opening a still-open NamedTemporaryFile by
        # name is not possible on Windows.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = os.path.join(temp_dir, "input.cif")
            # NOTE(review): assumes the adapter reads files as UTF-8 by
            # default — confirm against the mmcif IoAdapterPy documentation.
            with open(temp_path, "w", encoding="utf-8") as temp_file:
                temp_file.write(text)
            data = adapter.readFile(temp_path)

    # The reader may yield no data blocks (e.g. empty input); guard before
    # indexing the first one.
    if not data:
        return _empty_frame()

    # Get the atom_site category
    category = data[0].getObj("atom_site")

    if not category:
        # Return empty DataFrame if no atom_site category found
        return _empty_frame()

    # Extract attribute names and data rows
    attributes = category.getAttributeList()
    rows = category.getRowList()

    # Create a list of dictionaries for each atom
    records = []
    for row in rows:
        record = dict(zip(attributes, row))

        # Convert "?", "." or empty insertion code to None
        if record.get("pdbx_PDB_ins_code") in ("?", ".", ""):
            record["pdbx_PDB_ins_code"] = None

        records.append(record)

    # Create DataFrame from records
    df = pd.DataFrame(records)

    # Convert numeric columns; unparseable values such as "?" become NaN
    for col in (
        "id",
        "auth_seq_id",
        "Cartn_x",
        "Cartn_y",
        "Cartn_z",
        "occupancy",
        "B_iso_or_equiv",
        "pdbx_formal_charge",
    ):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Convert categorical columns
    for col in (
        "group_PDB",
        "type_symbol",
        "label_atom_id",
        "label_comp_id",
        "label_asym_id",
        "auth_atom_id",
        "auth_comp_id",
        "auth_asym_id",
    ):
        if col in df.columns:
            df[col] = df[col].astype("category")

    # Add format attribute to the DataFrame
    df.attrs["format"] = "mmCIF"

    return df
|