RNApolis 0.4.16__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rnapolis/parser_v2.py ADDED
@@ -0,0 +1,202 @@
1
+ from typing import IO, Union
2
+
3
+ import pandas as pd
4
+ from mmcif.io.IoAdapterPy import IoAdapterPy
5
+
6
+
7
def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
    """
    Parse PDB file content and extract ATOM and HETATM records into a pandas DataFrame.

    Every field is cut from a fixed character range of the line and stripped of
    surrounding whitespace; lines shorter than a given range simply yield an
    empty string for that field.

    Parameters:
    -----------
    content : Union[str, IO[str]]
        Content of a PDB file as a string or file-like object. A file-like
        object is rewound with ``seek(0)`` and read completely; if it yields
        bytes, the lines are decoded as UTF-8.

    Returns:
    --------
    pd.DataFrame
        DataFrame containing parsed ATOM and HETATM records with columns corresponding to PDB format.
        ``serial``, ``resSeq``, ``x``, ``y``, ``z``, ``occupancy`` and ``tempFactor``
        are converted to numbers (unparsable values become NaN); ``record_type``,
        ``name``, ``altLoc``, ``resName``, ``chainID``, ``element`` and ``charge``
        are categorical; ``iCode`` is None when blank. When no ATOM/HETATM record
        is found, an empty DataFrame with the same columns is returned.
        ``df.attrs["format"]`` is always set to ``"PDB"``.
    """
    # Single source of truth for the column order, shared by the empty and
    # the populated result.
    columns = [
        "record_type",
        "serial",
        "name",
        "altLoc",
        "resName",
        "chainID",
        "resSeq",
        "iCode",
        "x",
        "y",
        "z",
        "occupancy",
        "tempFactor",
        "element",
        "charge",
    ]
    numeric_columns = ["serial", "resSeq", "x", "y", "z", "occupancy", "tempFactor"]
    categorical_columns = [
        "record_type",
        "name",
        "altLoc",
        "resName",
        "chainID",
        "element",
        "charge",
    ]

    # Handle both string content and file-like objects
    if isinstance(content, str):
        lines = content.splitlines()
    else:
        content.seek(0)  # Ensure we're at the beginning of the file
        lines = content.readlines()
        # An empty stream yields no lines, so check before peeking at the
        # first one to decide whether decoding is needed.
        if lines and isinstance(lines[0], bytes):
            lines = [line.decode("utf-8") for line in lines]

    records = []
    for line in lines:
        record_type = line[:6].strip()

        # Only process ATOM and HETATM records
        if record_type not in ("ATOM", "HETATM"):
            continue

        # Parse fields according to PDB format specification
        icode = line[26:27].strip()
        records.append(
            {
                "record_type": record_type,
                "serial": line[6:11].strip(),
                "name": line[12:16].strip(),
                "altLoc": line[16:17].strip(),
                "resName": line[17:20].strip(),
                "chainID": line[21:22].strip(),
                "resSeq": line[22:26].strip(),
                "iCode": icode or None,  # Convert empty string to None
                "x": line[30:38].strip(),
                "y": line[38:46].strip(),
                "z": line[46:54].strip(),
                "occupancy": line[54:60].strip(),
                "tempFactor": line[60:66].strip(),
                "element": line[76:78].strip(),
                "charge": line[78:80].strip(),
            }
        )

    if not records:
        # Return empty DataFrame with correct columns if no records found;
        # it carries the same format tag as a populated result.
        empty = pd.DataFrame(columns=columns)
        empty.attrs["format"] = "PDB"
        return empty

    df = pd.DataFrame(records)

    # Convert numeric columns to appropriate types
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Convert categorical columns
    for col in categorical_columns:
        df[col] = df[col].astype("category")

    # Add format attribute to the DataFrame
    df.attrs["format"] = "PDB"

    return df
110
+
111
+
112
def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
    """
    Parse mmCIF file content and extract atom_site records into a pandas DataFrame.

    Parameters:
    -----------
    content : Union[str, IO[str]]
        Content of a mmCIF file as a string or file-like object. A file-like
        object whose ``name`` points to an existing file is read from that
        path; any other file-like object (e.g. an in-memory stream) is read
        in full and passed to the reader through a temporary file.

    Returns:
    --------
    pd.DataFrame
        DataFrame containing parsed atom_site records with columns corresponding to mmCIF format.
        Insertion codes given as "?", "." or "" become None. Known numeric
        columns are converted to numbers (unparsable values become NaN) and
        known label/auth identifier columns become categorical, in both cases
        only when present. If the reader returns no data container, or the
        first container has no atom_site rows, an empty DataFrame without
        columns is returned. ``df.attrs["format"]`` is always set to ``"mmCIF"``.
    """
    data = _read_cif_containers(content)

    # Only the first data container is inspected. Guard against an empty
    # container list so an unreadable/empty input yields an empty frame
    # rather than an IndexError.
    category = data[0].getObj("atom_site") if data else None

    if not category:
        # Return empty DataFrame if no atom_site category found
        empty = pd.DataFrame()
        empty.attrs["format"] = "mmCIF"
        return empty

    # Extract attribute names and data rows
    attributes = category.getAttributeList()
    rows = category.getRowList()

    # Create a list of dictionaries for each atom
    records = []
    for row in rows:
        record = dict(zip(attributes, row))

        # Convert "?" or "." in insertion code to None
        if record.get("pdbx_PDB_ins_code") in ("?", ".", ""):
            record["pdbx_PDB_ins_code"] = None

        records.append(record)

    # Create DataFrame from records
    df = pd.DataFrame(records)

    # Convert numeric columns to appropriate types
    numeric_columns = [
        "id",
        "auth_seq_id",
        "Cartn_x",
        "Cartn_y",
        "Cartn_z",
        "occupancy",
        "B_iso_or_equiv",
        "pdbx_formal_charge",
    ]

    for col in numeric_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Convert categorical columns
    categorical_columns = [
        "group_PDB",
        "type_symbol",
        "label_atom_id",
        "label_comp_id",
        "label_asym_id",
        "auth_atom_id",
        "auth_comp_id",
        "auth_asym_id",
    ]

    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].astype("category")

    # Add format attribute to the DataFrame
    df.attrs["format"] = "mmCIF"

    return df


def _read_cif_containers(content):
    """Return whatever ``IoAdapterPy.readFile`` yields for the given string or file-like object."""
    import os
    import tempfile

    adapter = IoAdapterPy()

    if not isinstance(content, str):
        # Keep the original behavior for real files: hand the path to the reader.
        name = getattr(content, "name", None)
        if isinstance(name, (str, os.PathLike)) and os.path.isfile(name):
            return adapter.readFile(name)

        # In-memory or otherwise path-less stream: read it from the start.
        if hasattr(content, "seek"):
            try:
                content.seek(0)
            except (OSError, ValueError):
                # Non-seekable stream; read from the current position instead.
                pass
        content = content.read()
        if isinstance(content, bytes):
            content = content.decode("utf-8")

    # The reader is given a file path, so the text is written to a
    # temporary file. The file is closed before being read back because
    # reopening a still-open NamedTemporaryFile by name is not portable
    # (it may fail on Windows); the directory is removed afterwards.
    # NOTE(review): written as UTF-8 — confirm this matches the encoding
    # the mmcif reader uses when opening the file.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = os.path.join(temp_dir, "input.cif")
        with open(temp_path, "w", encoding="utf-8") as temp_file:
            temp_file.write(content)
        return adapter.readFile(temp_path)