pynmrstar 3.3.5__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pynmrstar might be problematic. Click here for more details.
- cnmrstar.cpython-312-darwin.so +0 -0
- pynmrstar/__init__.py +55 -0
- pynmrstar/_internal.py +292 -0
- pynmrstar/definitions.py +32 -0
- pynmrstar/entry.py +970 -0
- pynmrstar/exceptions.py +43 -0
- pynmrstar/loop.py +1197 -0
- pynmrstar/parser.py +287 -0
- pynmrstar/reference_files/comments.str +538 -0
- pynmrstar/reference_files/data_types.csv +24 -0
- pynmrstar/reference_files/schema.csv +6726 -0
- pynmrstar/saveframe.py +1015 -0
- pynmrstar/schema.py +367 -0
- pynmrstar/utils.py +134 -0
- pynmrstar-3.3.5.dist-info/LICENSE +21 -0
- pynmrstar-3.3.5.dist-info/METADATA +59 -0
- pynmrstar-3.3.5.dist-info/RECORD +19 -0
- pynmrstar-3.3.5.dist-info/WHEEL +5 -0
- pynmrstar-3.3.5.dist-info/top_level.txt +2 -0
pynmrstar/parser.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from pynmrstar import definitions, cnmrstar, entry as entry_mod, loop as loop_mod, saveframe as saveframe_mod, schema as schema_mod
|
|
6
|
+
from pynmrstar.exceptions import ParsingError
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger('pynmrstar')
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Parser(object):
    """Builds an Entry from NMR-STAR text using the ``cnmrstar`` tokenizer.

    Parses an entry. You should not ever use this class directly.

    The instance tracks the most recent token (``token``), the line it was
    found on (``line_number``) and how it was delimited (``delimiter``); a
    delimiter of a single space is what the parser treats as "unquoted".
    """

    def __init__(self, entry_to_parse_into: Optional['entry_mod.Entry'] = None) -> None:
        """Create a parser that fills ``entry_to_parse_into``.

        If no entry is supplied, an empty one is created with
        ``Entry.from_scratch("")`` and populated instead.
        """

        # Just make an entry to parse into if called with no entry passed
        if entry_to_parse_into is None:
            entry_to_parse_into = entry_mod.Entry.from_scratch("")

        self.ent: entry_mod.Entry = entry_to_parse_into
        # Reset to None at the end of parse(); nothing in this class stores data in it.
        self.full_data: Optional[str] = ""
        # Becomes None once the tokenizer has run out of input.
        self.token: Optional[str] = ""
        self.source: str = "unknown"
        self.delimiter: str = " "
        self.line_number: int = 0

    def get_token(self) -> Optional[str]:
        """ Returns the next token in the parsing process.

        Stores the token, its line number and its delimiter on the instance
        (taken from ``cnmrstar.get_token_full()``). parse() treats a returned
        ``None`` as the end of the input.

        Raises ParsingError if the tokenizer raises ValueError."""

        try:
            self.token, self.line_number, self.delimiter = cnmrstar.get_token_full()
        except ValueError as err:
            raise ParsingError(str(err)) from err

        return self.token

    @staticmethod
    def load_data(data: str) -> None:
        """ Loads data in preparation of parsing and cleans up newlines
        and massages the data to make parsing work properly when multi-line
        values aren't as expected. Useful for manually getting tokens from
        the parser.

        The cleaned-up text is handed to ``cnmrstar.load_string()``; nothing
        is returned."""

        # Fix DOS line endings
        data = data.replace("\r\n", "\n").replace("\r", "\n")
        # Change '\n; data ' started multi-lines to '\n;\ndata'
        data = re.sub(r'\n;([^\n]+?)\n', r'\n;\n\1\n', data)

        cnmrstar.load_string(data)

    def parse(self,
              data: str,
              source: str = "unknown",
              raise_parse_warnings: bool = False,
              convert_data_types: bool = False,
              schema: Optional['schema_mod.Schema'] = None) -> 'entry_mod.Entry':
        """ Parses the string provided as data as an NMR-STAR entry
        and returns the parsed entry. Raises ParsingError on exceptions.

        ``source`` is recorded on the parser and passed to every saveframe and
        loop that is created. ``convert_data_types`` and ``schema`` are
        forwarded unchanged to ``Saveframe.add_tag`` and ``Loop.add_data``.

        Set raise_parse_warnings to raise an exception if the file has
        something technically incorrect, but still parsable. When it is
        False the same conditions are only logged as warnings.

        Following is a list of the types of errors that would trigger
        raise_parse_warnings:

        * A loop with no data was found.
        * A loop with no tags or values was found.

        Multi-line values should look like this:
        \\n;\\nThe multi-line\\nvalue here.\\n;\\n
        A value that instead looks like this:
        \\n; The multi-line\\nvalue here.\\n;\\n
        is rewritten into the proper form by load_data() before tokenising;
        nothing in this method warns or raises for it.

        Note that the tokenizer is only reset (``cnmrstar.reset()``) when
        parsing succeeds."""

        self.load_data(data)
        self.get_token()

        # No token at all means there was nothing to parse; fail with a ParsingError
        # rather than an AttributeError from calling .lower() on None.
        if self.token is None:
            raise ParsingError("Invalid file. NMR-STAR files must start with 'data_' followed by the data name. "
                               "The data provided did not contain any tokens.",
                               self.line_number)

        # Make sure this is actually a STAR file
        if not self.token.lower().startswith("data_"):
            raise ParsingError("Invalid file. NMR-STAR files must start with 'data_' followed by the data name. "
                               f"Did you accidentally select the wrong file? Your file started with '{self.token}'.",
                               self.line_number)

        # Make sure there is a data name
        elif len(self.token) < 6:
            raise ParsingError("'data_' must be followed by data name. Simply 'data_' is not allowed.",
                               self.line_number)

        if self.delimiter != " ":
            raise ParsingError("The data_ keyword may not be quoted or semicolon-delimited.",
                               self.line_number)

        # Set the entry_id
        self.ent._entry_id = self.token[5:]
        self.source = source

        # We are expecting to get saveframes
        while self.get_token() is not None:

            if not self.token.lower().startswith("save_"):
                raise ParsingError(f"Only 'save_NAME' is valid in the body of a NMR-STAR file. Found '{self.token}'.",
                                   self.line_number)

            if len(self.token) < 6:
                raise ParsingError("'save_' must be followed by saveframe name. You have a 'save_' tag which is "
                                   "illegal without a specified saveframe name.", self.line_number)

            if self.delimiter != " ":
                raise ParsingError("The save_ keyword may not be quoted or semicolon-delimited.",
                                   self.line_number)

            # Add the saveframe
            cur_frame: Optional[saveframe_mod.Saveframe] = saveframe_mod.Saveframe.from_scratch(self.token[5:],
                                                                                                source=source)
            self.ent.add_saveframe(cur_frame)

            # We are in a saveframe
            while self.get_token() is not None:

                if self.token.lower() == "loop_":
                    if self.delimiter != " ":
                        raise ParsingError("The loop_ keyword may not be quoted or semicolon-delimited.",
                                           self.line_number)

                    cur_loop: Optional[loop_mod.Loop] = loop_mod.Loop.from_scratch(source=source)

                    # We are in a loop
                    cur_data = []
                    seen_data = False
                    in_loop = True
                    while in_loop and self.get_token() is not None:

                        # Add a tag if it isn't quoted - if quoted, it should be treated as a data value
                        if self.token.startswith("_") and self.delimiter == " ":
                            try:
                                cur_loop.add_tag(self.token)
                            except ValueError as err:
                                raise ParsingError(str(err), self.line_number) from err

                        # On to data
                        else:

                            # Now that we have the tags we can add the loop
                            #  to the current saveframe
                            try:
                                cur_frame.add_loop(cur_loop)
                            except ValueError as err:
                                raise ParsingError(str(err), self.line_number) from err

                            # We are in the data block of a loop
                            while self.token is not None:
                                if self.token.lower() == "stop_":
                                    if self.delimiter != " ":
                                        raise ParsingError(
                                            "The stop_ keyword may not be quoted or semicolon-delimited.",
                                            self.line_number)
                                    if len(cur_loop.tags) == 0:
                                        if raise_parse_warnings:
                                            raise ParsingError("Loop with no tags.", self.line_number)
                                        else:
                                            logger.warning('Loop with no tags in parsed file on line: %s',
                                                           self.line_number)
                                        cur_loop = None
                                    if not seen_data:
                                        if raise_parse_warnings:
                                            raise ParsingError("Loop with no data.", self.line_number)
                                        else:
                                            logger.warning("Loop with no data on line: %s", self.line_number)

                                    # cur_data can only be non-empty when the loop has tags, because
                                    # a value seen before any tag raises below.
                                    if len(cur_data) > 0:
                                        if len(cur_data) % len(cur_loop.tags) != 0:
                                            raise ParsingError(f"The loop being parsed, '{cur_loop.category}' does "
                                                               f"not have the expected number of data elements. This "
                                                               f"indicates that either one or more tag values are "
                                                               f"either missing from or duplicated in this loop.",
                                                               self.line_number)
                                        try:
                                            cur_loop.add_data(cur_data,
                                                              rearrange=True,
                                                              convert_data_types=convert_data_types,
                                                              schema=schema)
                                        # If there is an issue with the loops during parsing, raise a parse error
                                        #  rather than the ValueError that would be raised if they made the mistake
                                        #  directly
                                        except ValueError as err:
                                            raise ParsingError(str(err), self.line_number) from err
                                        cur_data = []

                                    cur_loop = None
                                    in_loop = False
                                    break
                                elif self.token.startswith("_") and self.delimiter == " ":
                                    raise ParsingError("Cannot have more loop tags after loop data. Or perhaps this "
                                                       f"was a data value which was not quoted (but must be, "
                                                       f"if it starts with '_')? Value: '{self.token}'.",
                                                       self.line_number)
                                else:
                                    if len(cur_loop.tags) == 0:
                                        raise ParsingError("Data value found in loop before any loop tags were "
                                                           f"defined. Value: '{self.token}'",
                                                           self.line_number)

                                    if self.token.lower() in definitions.RESERVED_KEYWORDS and self.delimiter == " ":
                                        error = "Cannot use keywords as data values unless quoted or semi-colon " \
                                                "delimited. Perhaps this is a loop that wasn't properly terminated " \
                                                "with a 'stop_' keyword before the saveframe ended or another loop " \
                                                f"began? Value found where 'stop_' or another data value expected: " \
                                                f"'{self.token}'."
                                        if len(cur_data) > 0:
                                            error += f" Last loop data element parsed: '{cur_data[-1]}'."
                                        raise ParsingError(error, self.line_number)
                                    cur_data.append(self.token)
                                    seen_data = True

                                # Get the next token
                                self.get_token()

                    # The loop above only ends on 'stop_' or when the tokenizer runs out of input
                    if not self.token:
                        raise ParsingError("Loop improperly terminated at end of file. Loops must end with the "
                                           "'stop_' token, but the file ended without the stop token.",
                                           self.line_number)
                    if self.token.lower() != 'stop_':
                        raise ParsingError(f"Loop improperly terminated at end of file. Loops must end with the "
                                           f"'stop_' token, but the token '{self.token}' was found instead.",
                                           self.line_number)

                # Close saveframe
                elif self.token.lower() == "save_":
                    # NOTE(review): unlike the other keyword checks, this one also lets a ';'
                    # delimiter through, which disagrees with the error text - presumably a
                    # deliberate accommodation of the tokenizer; confirm before tightening.
                    if self.delimiter not in " ;":
                        raise ParsingError("The save_ keyword may not be quoted or semicolon-delimited.",
                                           self.line_number)

                    if cur_frame.tag_prefix is None:
                        raise ParsingError("The tag prefix was never set! Either the saveframe had no tags, you "
                                           "tried to read a version 2.1 file, or there is something else wrong with "
                                           f"your file. Saveframe error occurred within: '{cur_frame.name}'",
                                           line_number=self.line_number)
                    break

                # Invalid content in saveframe
                elif not self.token.startswith("_"):
                    if cur_frame.name == 'internaluseyoushouldntseethis_frame':
                        raise ParsingError(f"Invalid token found in loop contents. Expecting 'loop_' "
                                           f"but found: '{self.token}'", line_number=self.line_number)
                    else:
                        raise ParsingError(f"Invalid token found in saveframe '{cur_frame.name}'. Expecting a tag, "
                                           f"loop, or 'save_' token but found: '{self.token}'",
                                           line_number=self.line_number)

                # Add a tag
                else:
                    if self.delimiter != " ":
                        raise ParsingError(f"Saveframe tags may not be quoted or semicolon-delimited. Quoted tag: '"
                                           f"{self.token}'.",
                                           self.line_number)
                    cur_tag: Optional[str] = self.token

                    # We are in a saveframe and waiting for the saveframe tag
                    self.get_token()

                    # The input ended right after a tag name; report it as a parse error
                    # instead of failing on None below.
                    if self.token is None:
                        raise ParsingError(f"The file ended before a value was found for the tag '{cur_tag}'.",
                                           self.line_number)

                    if self.delimiter == " ":
                        if self.token.lower() in definitions.RESERVED_KEYWORDS:
                            raise ParsingError("Cannot use keywords as data values unless quoted or semi-colon "
                                               f"delimited. Illegal value: '{self.token}'", self.line_number)
                        if self.token.startswith("_"):
                            raise ParsingError(
                                "Cannot have a tag value start with an underscore unless the entire value "
                                "is quoted. You may be missing a data value on the previous line. "
                                f"Illegal value: '{self.token}'", self.line_number)
                    try:
                        cur_frame.add_tag(cur_tag,
                                          self.token,
                                          convert_data_types=convert_data_types,
                                          schema=schema)
                    except ValueError as err:
                        raise ParsingError(str(err), line_number=self.line_number) from err

            # Reaching here without a closing 'save_' means the input ran out mid-saveframe
            if not self.token or self.token.lower() != "save_":
                raise ParsingError("Saveframe improperly terminated at end of file. Saveframes must be terminated "
                                   "with the 'save_' token.",
                                   self.line_number)

        # Free the memory of the original copy of the data we parsed
        self.full_data = None

        # Reset the parser
        cnmrstar.reset()

        return self.ent
|