pynmrstar 3.3.5__cp313-cp313-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pynmrstar might be problematic. Click here for more details.

pynmrstar/parser.py ADDED
@@ -0,0 +1,287 @@
1
+ import logging
2
+ import re
3
+ from typing import Optional
4
+
5
+ from pynmrstar import definitions, cnmrstar, entry as entry_mod, loop as loop_mod, saveframe as saveframe_mod, schema as schema_mod
6
+ from pynmrstar.exceptions import ParsingError
7
+
8
+ logger = logging.getLogger('pynmrstar')
9
+
10
+
11
class Parser(object):
    """Token-driven parser that assembles an NMR-STAR entry from text.

    Tokenizing is delegated to the ``cnmrstar`` module (``load_string``,
    ``get_token_full`` and ``reset`` are called as module-level functions);
    this class pulls tokens one at a time and turns them into the entry,
    its saveframes, their tags and their loops.

    You should not ever use this class directly.
    """

    def __init__(self, entry_to_parse_into: Optional['entry_mod.Entry'] = None) -> None:
        """Create a parser that fills ``entry_to_parse_into``.

        :param entry_to_parse_into: The entry object to populate. When
            omitted, a fresh entry is created with ``Entry.from_scratch("")``.
        """

        # Just make an entry to parse into if called with no entry passed
        if entry_to_parse_into is None:
            entry_to_parse_into = entry_mod.Entry.from_scratch("")

        self.ent: entry_mod.Entry = entry_to_parse_into
        # Set to None at the end of a successful parse()
        self.full_data: Optional[str] = ""
        # Most recent token; None once the tokenizer has run out of input
        self.token: Optional[str] = ""
        self.source: str = "unknown"
        # Delimiter reported by the tokenizer for the current token. A single
        # space is what the parser treats as "not quoted".
        self.delimiter: str = " "
        self.line_number: int = 0

    def get_token(self) -> Optional[str]:
        """Fetch the next token from the tokenizer and return it.

        Updates ``self.token``, ``self.line_number`` and ``self.delimiter``
        as a side effect.

        :return: The token, or ``None`` once there are no more tokens (the
            parse loops use ``None`` as their end-of-input signal).
        :raises ParsingError: If the tokenizer raises ``ValueError``.
        """

        try:
            self.token, self.line_number, self.delimiter = cnmrstar.get_token_full()
        except ValueError as err:
            raise ParsingError(str(err)) from err

        return self.token

    @staticmethod
    def load_data(data: str) -> None:
        """Normalize ``data`` and hand it to the tokenizer.

        Line endings are converted to ``\\n`` and multi-line values whose
        text starts on the same line as the opening semicolon are rewritten
        so that the semicolon stands alone on its line. Useful for manually
        getting tokens from the parser.

        :param data: The raw NMR-STAR text.
        """

        # Fix DOS (and bare carriage return) line endings
        data = data.replace("\r\n", "\n").replace("\r", "\n")
        # Change '\n; data ' started multi-lines to '\n;\ndata'
        data = re.sub(r'\n;([^\n]+?)\n', r'\n;\n\1\n', data)

        cnmrstar.load_string(data)

    def parse(self,
              data: str,
              source: str = "unknown",
              raise_parse_warnings: bool = False,
              convert_data_types: bool = False,
              schema: Optional['schema_mod.Schema'] = None) -> 'entry_mod.Entry':
        """Parse ``data`` as an NMR-STAR entry and return the parsed entry.

        :param data: The NMR-STAR text to parse.
        :param source: Label recorded on the parser and passed to every
            saveframe and loop that is created.
        :param raise_parse_warnings: When true, raise instead of logging a
            warning if the file has something technically incorrect but
            still parsable. In this method that means:

            * a loop with no data was found;
            * a loop with no tags or values was found.

        :param convert_data_types: Forwarded to ``add_tag`` / ``add_data``.
        :param schema: Forwarded to ``add_tag`` / ``add_data``.
        :return: The entry this parser was constructed with, now populated.
        :raises ParsingError: On any structural problem in the file.
        """

        self.load_data(data)
        self.get_token()

        # No tokens at all (e.g. empty input): report it as a parse error
        # rather than failing on None below.
        if self.token is None:
            raise ParsingError("Invalid file. NMR-STAR files must start with 'data_' followed by the data name. "
                               "The provided data contained no tokens.",
                               self.line_number)

        # Make sure this is actually a STAR file
        if not self.token.lower().startswith("data_"):
            raise ParsingError("Invalid file. NMR-STAR files must start with 'data_' followed by the data name. "
                               f"Did you accidentally select the wrong file? Your file started with '{self.token}'.",
                               self.line_number)

        # Make sure there is a data name
        elif len(self.token) < 6:
            raise ParsingError("'data_' must be followed by data name. Simply 'data_' is not allowed.",
                               self.line_number)

        if self.delimiter != " ":
            raise ParsingError("The data_ keyword may not be quoted or semicolon-delimited.",
                               self.line_number)

        # Set the entry_id (everything after the 'data_' prefix)
        self.ent._entry_id = self.token[5:]
        self.source = source

        # We are expecting to get saveframes
        while self.get_token() is not None:

            if not self.token.lower().startswith("save_"):
                raise ParsingError(f"Only 'save_NAME' is valid in the body of a NMR-STAR file. Found '{self.token}'.",
                                   self.line_number)

            if len(self.token) < 6:
                raise ParsingError("'save_' must be followed by saveframe name. You have a 'save_' tag which is "
                                   "illegal without a specified saveframe name.", self.line_number)

            if self.delimiter != " ":
                raise ParsingError("The save_ keyword may not be quoted or semicolon-delimited.",
                                   self.line_number)

            # Add the saveframe
            cur_frame: Optional[saveframe_mod.Saveframe] = saveframe_mod.Saveframe.from_scratch(self.token[5:],
                                                                                                source=source)
            self.ent.add_saveframe(cur_frame)

            # We are in a saveframe
            while self.get_token() is not None:

                if self.token.lower() == "loop_":
                    if self.delimiter != " ":
                        raise ParsingError("The loop_ keyword may not be quoted or semicolon-delimited.",
                                           self.line_number)

                    cur_loop: Optional[loop_mod.Loop] = loop_mod.Loop.from_scratch(source=source)

                    # We are in a loop
                    cur_data = []
                    seen_data = False
                    in_loop = True
                    while in_loop and self.get_token() is not None:

                        # Add a tag if it isn't quoted - if quoted, it should be treated as a data value
                        if self.token.startswith("_") and self.delimiter == " ":
                            try:
                                cur_loop.add_tag(self.token)
                            except ValueError as err:
                                raise ParsingError(str(err), self.line_number) from err

                        # On to data
                        else:

                            # Now that we have the tags we can add the loop
                            # to the current saveframe
                            try:
                                cur_frame.add_loop(cur_loop)
                            except ValueError as err:
                                raise ParsingError(str(err), self.line_number) from err

                            # We are in the data block of a loop
                            while self.token is not None:
                                if self.token.lower() == "stop_":
                                    if self.delimiter != " ":
                                        raise ParsingError(
                                            "The stop_ keyword may not be quoted or semicolon-delimited.",
                                            self.line_number)
                                    if len(cur_loop.tags) == 0:
                                        if raise_parse_warnings:
                                            raise ParsingError("Loop with no tags.", self.line_number)
                                        else:
                                            logger.warning('Loop with no tags in parsed file on line: %s',
                                                           self.line_number)
                                        # Safe to drop the reference: with no tags no data
                                        # value can have been collected (that is rejected
                                        # in the data branch), so cur_data is empty here.
                                        cur_loop = None
                                    if not seen_data:
                                        if raise_parse_warnings:
                                            raise ParsingError("Loop with no data.", self.line_number)
                                        else:
                                            logger.warning("Loop with no data on line: %s", self.line_number)

                                    if len(cur_data) > 0:
                                        if len(cur_data) % len(cur_loop.tags) != 0:
                                            raise ParsingError(f"The loop being parsed, '{cur_loop.category}' does "
                                                               f"not have the expected number of data elements. This "
                                                               f"indicates that either one or more tag values are "
                                                               f"either missing from or duplicated in this loop.",
                                                               self.line_number)
                                        try:
                                            cur_loop.add_data(cur_data,
                                                              rearrange=True,
                                                              convert_data_types=convert_data_types,
                                                              schema=schema)
                                        # If there is an issue with the loops during parsing, raise a parse error
                                        # rather than the ValueError that would be raised if they made the mistake
                                        # directly
                                        except ValueError as err:
                                            raise ParsingError(str(err), self.line_number) from err
                                        cur_data = []

                                    cur_loop = None
                                    in_loop = False
                                    # Leave 'stop_' in self.token; it is checked after the loop
                                    break
                                elif self.token.startswith("_") and self.delimiter == " ":
                                    raise ParsingError("Cannot have more loop tags after loop data. Or perhaps this "
                                                       f"was a data value which was not quoted (but must be, "
                                                       f"if it starts with '_')? Value: '{self.token}'.",
                                                       self.line_number)
                                else:
                                    if len(cur_loop.tags) == 0:
                                        raise ParsingError("Data value found in loop before any loop tags were "
                                                           f"defined. Value: '{self.token}'",
                                                           self.line_number)

                                    if self.token.lower() in definitions.RESERVED_KEYWORDS and self.delimiter == " ":
                                        error = "Cannot use keywords as data values unless quoted or semi-colon " \
                                                "delimited. Perhaps this is a loop that wasn't properly terminated " \
                                                "with a 'stop_' keyword before the saveframe ended or another loop " \
                                                f"began? Value found where 'stop_' or another data value expected: " \
                                                f"'{self.token}'."
                                        if len(cur_data) > 0:
                                            error += f" Last loop data element parsed: '{cur_data[-1]}'."
                                        raise ParsingError(error, self.line_number)
                                    cur_data.append(self.token)
                                    seen_data = True

                                # Get the next token
                                self.get_token()

                    # The loop is only complete if the last token seen was 'stop_'
                    if not self.token:
                        raise ParsingError(f"Loop improperly terminated at end of file. Loops must end with the "
                                           f"'stop_' token, but the file ended without the stop token.",
                                           self.line_number)
                    if self.token.lower() != 'stop_':
                        raise ParsingError(f"Loop improperly terminated at end of file. Loops must end with the "
                                           f"'stop_' token, but the token '{self.token}' was found instead.",
                                           self.line_number)

                # Close saveframe
                elif self.token.lower() == "save_":
                    # NOTE(review): a ';' delimiter is accepted here even though the
                    # message says semicolon-delimited is not allowed - confirm this is
                    # intentional before tightening the check.
                    if self.delimiter not in " ;":
                        raise ParsingError("The save_ keyword may not be quoted or semicolon-delimited.",
                                           self.line_number)

                    if cur_frame.tag_prefix is None:
                        raise ParsingError("The tag prefix was never set! Either the saveframe had no tags, you "
                                           "tried to read a version 2.1 file, or there is something else wrong with "
                                           f"your file. Saveframe error occurred within: '{cur_frame.name}'",
                                           line_number=self.line_number)
                    break

                # Invalid content in saveframe
                elif not self.token.startswith("_"):
                    if cur_frame.name == 'internaluseyoushouldntseethis_frame':
                        raise ParsingError(f"Invalid token found in loop contents. Expecting 'loop_' "
                                           f"but found: '{self.token}'", line_number=self.line_number)
                    else:
                        raise ParsingError(f"Invalid token found in saveframe '{cur_frame.name}'. Expecting a tag, "
                                           f"loop, or 'save_' token but found: '{self.token}'",
                                           line_number=self.line_number)

                # Add a tag
                else:
                    if self.delimiter != " ":
                        raise ParsingError(f"Saveframe tags may not be quoted or semicolon-delimited. Quoted tag: '"
                                           f"{self.token}'.",
                                           self.line_number)
                    cur_tag: Optional[str] = self.token

                    # We are in a saveframe and waiting for the saveframe tag value
                    self.get_token()

                    # The file ended right after a tag name: report it as a parse
                    # error instead of failing on None below.
                    if self.token is None:
                        raise ParsingError(f"The file ended before a value was found for the tag '{cur_tag}'.",
                                           self.line_number)

                    if self.delimiter == " ":
                        if self.token.lower() in definitions.RESERVED_KEYWORDS:
                            raise ParsingError("Cannot use keywords as data values unless quoted or semi-colon "
                                               f"delimited. Illegal value: '{self.token}'", self.line_number)
                        if self.token.startswith("_"):
                            raise ParsingError(
                                "Cannot have a tag value start with an underscore unless the entire value "
                                "is quoted. You may be missing a data value on the previous line. "
                                f"Illegal value: '{self.token}'", self.line_number)
                    try:
                        cur_frame.add_tag(cur_tag,
                                          self.token,
                                          convert_data_types=convert_data_types,
                                          schema=schema)
                    except ValueError as err:
                        raise ParsingError(str(err), line_number=self.line_number) from err

            # The saveframe loop must have ended on its closing 'save_' token
            if not self.token or self.token.lower() != "save_":
                raise ParsingError("Saveframe improperly terminated at end of file. Saveframes must be terminated "
                                   "with the 'save_' token.",
                                   self.line_number)

        # Free the memory of the original copy of the data we parsed
        self.full_data = None

        # Reset the parser
        cnmrstar.reset()

        return self.ent